/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 210 - (show annotations) (download)
Wed Aug 8 14:24:50 2007 UTC (6 years, 11 months ago) by ph10
Original Path: code/trunk/pcre_compile.c
File MIME type: text/plain
File size: 187339 byte(s)
Add Perl 5.10's backtracking verbs.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
/* NOTE(review): these three names are defined ahead of the #include of
pcre_internal.h that follows, presumably because macros in that header are
written in terms of them - confirm against the header before moving them. */
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: it sets bit number (b)
in the byte array (a), counting from the least significant bit of a[0]. Both
arguments are parenthesized in the expansion so that an expression argument
such as SETBIT(map, c+1) selects the intended byte and bit. Note that (b) is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) (a)[(b)/8] |= (1 << ((b)%8))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
/* NOTE(review): INT_MAX is not defined in this file; it is presumably made
available (from <limits.h>) by pcre_internal.h - confirm. */
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
/* NOTE(review): the comment above talks about entries of LINK_SIZE bytes, so
this size is presumably a byte count - confirm at the point of use. */
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
/* check_escape() indexes the ASCII table with (c - '0') for characters in the
range '0' to 'z', and the EBCDIC table with (c - 0x48). A non-zero entry is
used directly as the result of the escape; a zero entry sends check_escape()
on to its switch for further processing. */
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE) */
144
145 typedef struct verbitem {
146 const char *name;
147 int len;
148 int op;
149 } verbitem;
150
151 static verbitem verbs[] = {
152 { "ACCEPT", 6, OP_ACCEPT },
153 { "COMMIT", 6, OP_COMMIT },
154 { "F", 1, OP_FAIL },
155 { "FAIL", 4, OP_FAIL },
156 { "PRUNE", 5, OP_PRUNE },
157 { "SKIP", 4, OP_SKIP },
158 { "THEN", 4, OP_THEN }
159 };
160
161 static int verbcount = sizeof(verbs)/sizeof(verbitem);
162
163
164 /* Tables of names of POSIX character classes and their lengths. The list is
165 terminated by a zero length entry. The first three must be alpha, lower, upper,
166 as this is assumed for handling case independence. */
167
/* posix_names holds 14 names; posix_name_lengths holds the 14 corresponding
lengths followed by the terminating zero, so it has one more entry than the
name table. The two tables must be kept in step when a class is added. */
168 static const char *const posix_names[] = {
169 "alpha", "lower", "upper",
170 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
171 "print", "punct", "space", "word", "xdigit" };
172
173 static const uschar posix_name_lengths[] = {
174 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175
176 /* Table of class bit maps for each POSIX class. Each class is formed from a
177 base map, with an optional addition or removal of another map. Then, for some
178 classes, there is some additional tweaking: for [:blank:] the vertical space
179 characters are removed, and for [:alpha:] and [:alnum:] the underscore
180 character is removed. The triples in the table consist of the base map offset,
181 second map offset or -1 if no second map, and a non-negative value for map
182 addition or a negative value for map subtraction (if there are two maps). The
183 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184 remove vertical space characters, 2 => remove underscore. */
185
/* One triple per class, listed in the same order as posix_names (see the
per-row comments). NOTE(review): presumably indexed as 3 * (class index) by
the class-compiling code, which is not in this part of the file - confirm. */
186 static const int posix_class_maps[] = {
187 cbit_word, cbit_digit, -2, /* alpha */
188 cbit_lower, -1, 0, /* lower */
189 cbit_upper, -1, 0, /* upper */
190 cbit_word, -1, 2, /* alnum - word without underscore */
191 cbit_print, cbit_cntrl, 0, /* ascii */
192 cbit_space, -1, 1, /* blank - a GNU extension */
193 cbit_cntrl, -1, 0, /* cntrl */
194 cbit_digit, -1, 0, /* digit */
195 cbit_graph, -1, 0, /* graph */
196 cbit_print, -1, 0, /* print */
197 cbit_punct, -1, 0, /* punct */
198 cbit_space, -1, 0, /* space */
199 cbit_word, -1, 0, /* word - a Perl extension */
200 cbit_xdigit,-1, 0 /* xdigit */
201 };
202
203
/* Two-level stringification: XSTRING(m) expands the macro m first and then
turns the result into a string literal, so that numeric limits can be embedded
in the message texts below. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used.

The array itself is const as well as the strings it points to: the table is
fixed at build time, and an element can still be assigned to a "const char *"
variable. The position of each message in the list is its error number. */

static const char * const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",  /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  "(*VERB) with an argument is not supported",
  /* 60 */
  "(*VERB) not recognized"
};
287
288
289 /* Table to identify digits and hex digits. This is used when compiling
290 patterns. Note that the tables in chartables are dependent on the locale, and
291 may mark arbitrary characters as digits - but the PCRE compiling code expects
292 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
293 a private table here. It costs 256 bytes, but it is a lot faster than doing
294 character value tests (at least in some simple cases I timed), and in some
295 applications one wants PCRE to compile efficiently as well as match
296 efficiently.
297
298 For convenience, we use the same bit definitions as in chartables:
299
300 0x04 decimal digit
301 0x08 hexadecimal digit
302
303 Then we can use ctype_digit and ctype_xdigit in the code. */
304
/* In both variants of digitab the decimal digits carry 0x0c (decimal digit and
hexadecimal digit bits together) and the letters a-f and A-F carry 0x08
(hexadecimal digit only); every other entry is zero. In the EBCDIC build a
second table, ebcdic_chartab, is also defined; check_escape() tests it with
the mask 0x0E to decide whether a character is alphameric. */
305 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
306 static const unsigned char digitab[] =
307 {
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
314 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
315 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
316 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
317 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
320 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
321 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
340
341 #else /* This is the "abnormal" case, for EBCDIC systems */
342 static const unsigned char digitab[] =
343 {
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
360 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
361 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
362 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
368 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
374 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
375 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
376
377 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
378 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
379 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
380 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
382 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
386 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
387 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
389 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
391 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
394 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
395 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
396 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
397 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
398 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
399 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
400 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
401 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
402 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
403 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
404 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
405 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
406 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
407 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
408 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
409 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
410 #endif
411
412
413 /* Forward declaration (a prototype, not the definition) of compile_regex(),
so that it can be called before its definition to allow mutual recursion. The
definition itself is not in this part of the file. */
414
415 static BOOL
416 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
417 int *, int *, branch_chain *, compile_data *, int *);
418
419
420
421 /*************************************************
422 * Handle escapes *
423 *************************************************/
424
425 /* This function is called when a \ has been encountered. It either returns a
426 positive value for a simple escape such as \n, or a negative value which
427 encodes one of the more complicated things such as \d. A backreference to group
428 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
429 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
430 ptr is pointing at the \. On exit, it is on the final character of the escape
431 sequence.
432
433 Arguments:
434 ptrptr points to the pattern position pointer
435 errorcodeptr points to the errorcode variable
436 bracount number of previous extracting brackets
437 options the options bits
438 isclass TRUE if inside a character class
439
440 Returns: zero or positive => a data character
441 negative => a special escape sequence
442 on error, errorptr is set
443 */
444
445 static int
446 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
447 int options, BOOL isclass)
448 {
449 BOOL utf8 = (options & PCRE_UTF8) != 0;
450 const uschar *ptr = *ptrptr + 1;
451 int c, i;
452
453 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
454 ptr--; /* Set pointer back to the last byte */
455
456 /* If backslash is at the end of the pattern, it's an error. */
457
458 if (c == 0) *errorcodeptr = ERR1;
459
460 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
461 a table. A non-zero result is something that can be returned immediately.
462 Otherwise further processing may be required. */
463
464 #ifndef EBCDIC /* ASCII coding */
465 else if (c < '0' || c > 'z') {} /* Not alphameric */
466 else if ((i = escapes[c - '0']) != 0) c = i;
467
468 #else /* EBCDIC coding */
469 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
470 else if ((i = escapes[c - 0x48]) != 0) c = i;
471 #endif
472
473 /* Escapes that need further processing, or are illegal. */
474
475 else
476 {
477 const uschar *oldptr;
478 BOOL braced, negated;
479
480 switch (c)
481 {
482 /* A number of Perl escapes are not handled by PCRE. We give an explicit
483 error. */
484
485 case 'l':
486 case 'L':
487 case 'N':
488 case 'u':
489 case 'U':
490 *errorcodeptr = ERR37;
491 break;
492
493 /* \g must be followed by a number, either plain or braced. If positive, it
494 is an absolute backreference. If negative, it is a relative backreference.
495 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
496 reference to a named group. This is part of Perl's movement towards a
497 unified syntax for back references. As this is synonymous with \k{name}, we
498 fudge it up by pretending it really was \k. */
499
500 case 'g':
501 if (ptr[1] == '{')
502 {
503 const uschar *p;
504 for (p = ptr+2; *p != 0 && *p != '}'; p++)
505 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
506 if (*p != 0 && *p != '}')
507 {
508 c = -ESC_k;
509 break;
510 }
511 braced = TRUE;
512 ptr++;
513 }
514 else braced = FALSE;
515
516 if (ptr[1] == '-')
517 {
518 negated = TRUE;
519 ptr++;
520 }
521 else negated = FALSE;
522
523 c = 0;
524 while ((digitab[ptr[1]] & ctype_digit) != 0)
525 c = c * 10 + *(++ptr) - '0';
526
527 if (c == 0 || (braced && *(++ptr) != '}'))
528 {
529 *errorcodeptr = ERR57;
530 return 0;
531 }
532
533 if (negated)
534 {
535 if (c > bracount)
536 {
537 *errorcodeptr = ERR15;
538 return 0;
539 }
540 c = bracount - (c - 1);
541 }
542
543 c = -(ESC_REF + c);
544 break;
545
546 /* The handling of escape sequences consisting of a string of digits
547 starting with one that is not zero is not straightforward. By experiment,
548 the way Perl works seems to be as follows:
549
550 Outside a character class, the digits are read as a decimal number. If the
551 number is less than 10, or if there are that many previous extracting
552 left brackets, then it is a back reference. Otherwise, up to three octal
553 digits are read to form an escaped byte. Thus \123 is likely to be octal
554 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
555 value is greater than 377, the least significant 8 bits are taken. Inside a
556 character class, \ followed by a digit is always an octal number. */
557
558 case '1': case '2': case '3': case '4': case '5':
559 case '6': case '7': case '8': case '9':
560
561 if (!isclass)
562 {
563 oldptr = ptr;
564 c -= '0';
565 while ((digitab[ptr[1]] & ctype_digit) != 0)
566 c = c * 10 + *(++ptr) - '0';
567 if (c < 10 || c <= bracount)
568 {
569 c = -(ESC_REF + c);
570 break;
571 }
572 ptr = oldptr; /* Put the pointer back and fall through */
573 }
574
575 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
576 generates a binary zero byte and treats the digit as a following literal.
577 Thus we have to pull back the pointer by one. */
578
579 if ((c = *ptr) >= '8')
580 {
581 ptr--;
582 c = 0;
583 break;
584 }
585
586 /* \0 always starts an octal number, but we may drop through to here with a
587 larger first octal digit. The original code used just to take the least
588 significant 8 bits of octal numbers (I think this is what early Perls used
589 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
590 than 3 octal digits. */
591
592 case '0':
593 c -= '0';
594 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
595 c = c * 8 + *(++ptr) - '0';
596 if (!utf8 && c > 255) *errorcodeptr = ERR51;
597 break;
598
599 /* \x is complicated. \x{ddd} is a character number which can be greater
600 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
601 treated as a data character. */
602
603 case 'x':
604 if (ptr[1] == '{')
605 {
606 const uschar *pt = ptr + 2;
607 int count = 0;
608
609 c = 0;
610 while ((digitab[*pt] & ctype_xdigit) != 0)
611 {
612 register int cc = *pt++;
613 if (c == 0 && cc == '0') continue; /* Leading zeroes */
614 count++;
615
616 #ifndef EBCDIC /* ASCII coding */
617 if (cc >= 'a') cc -= 32; /* Convert to upper case */
618 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
619 #else /* EBCDIC coding */
620 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
621 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
622 #endif
623 }
624
625 if (*pt == '}')
626 {
627 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
628 ptr = pt;
629 break;
630 }
631
632 /* If the sequence of hex digits does not end with '}', then we don't
633 recognize this construct; fall through to the normal \x handling. */
634 }
635
636 /* Read just a single-byte hex-defined char */
637
638 c = 0;
639 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
640 {
641 int cc; /* Some compilers don't like ++ */
642 cc = *(++ptr); /* in initializers */
643 #ifndef EBCDIC /* ASCII coding */
644 if (cc >= 'a') cc -= 32; /* Convert to upper case */
645 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
646 #else /* EBCDIC coding */
647 if (cc <= 'z') cc += 64; /* Convert to upper case */
648 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
649 #endif
650 }
651 break;
652
653 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
654 This coding is ASCII-specific, but then the whole concept of \cx is
655 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
656
657 case 'c':
658 c = *(++ptr);
659 if (c == 0)
660 {
661 *errorcodeptr = ERR2;
662 return 0;
663 }
664
665 #ifndef EBCDIC /* ASCII coding */
666 if (c >= 'a' && c <= 'z') c -= 32;
667 c ^= 0x40;
668 #else /* EBCDIC coding */
669 if (c >= 'a' && c <= 'z') c += 64;
670 c ^= 0xC0;
671 #endif
672 break;
673
674 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
675 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
676 for Perl compatibility, it is a literal. This code looks a bit odd, but
677 there used to be some cases other than the default, and there may be again
678 in future, so I haven't "optimized" it. */
679
680 default:
681 if ((options & PCRE_EXTRA) != 0) switch(c)
682 {
683 default:
684 *errorcodeptr = ERR3;
685 break;
686 }
687 break;
688 }
689 }
690
691 *ptrptr = ptr;
692 return c;
693 }
694
695
696
697 #ifdef SUPPORT_UCP
698 /*************************************************
699 * Handle \P and \p *
700 *************************************************/
701
702 /* This function is called after \P or \p has been encountered, provided that
703 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
704 pointing at the P or p. On exit, it is pointing at the final character of the
705 escape sequence.
706
707 Argument:
708 ptrptr points to the pattern position pointer
709 negptr points to a boolean that is set TRUE for negation else FALSE
710 dptr points to an int that is set to the detailed property value
711 errorcodeptr points to the error code variable
712
713 Returns: type value from ucp_type_table, or -1 for an invalid type
714 */
715
716 static int
717 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
718 {
719 int c, i, bot, top;
720 const uschar *ptr = *ptrptr;
721 char name[32];
722
723 c = *(++ptr);
724 if (c == 0) goto ERROR_RETURN;
725
726 *negptr = FALSE;
727
728 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
729 negation. */
730
731 if (c == '{')
732 {
733 if (ptr[1] == '^')
734 {
735 *negptr = TRUE;
736 ptr++;
737 }
738 for (i = 0; i < (int)sizeof(name) - 1; i++)
739 {
740 c = *(++ptr);
741 if (c == 0) goto ERROR_RETURN;
742 if (c == '}') break;
743 name[i] = c;
744 }
745 if (c !='}') goto ERROR_RETURN;
746 name[i] = 0;
747 }
748
749 /* Otherwise there is just one following character */
750
751 else
752 {
753 name[0] = c;
754 name[1] = 0;
755 }
756
757 *ptrptr = ptr;
758
759 /* Search for a recognized property name using binary chop */
760
761 bot = 0;
762 top = _pcre_utt_size;
763
764 while (bot < top)
765 {
766 i = (bot + top) >> 1;
767 c = strcmp(name, _pcre_utt[i].name);
768 if (c == 0)
769 {
770 *dptr = _pcre_utt[i].value;
771 return _pcre_utt[i].type;
772 }
773 if (c > 0) bot = i + 1; else top = i;
774 }
775
776 *errorcodeptr = ERR47;
777 *ptrptr = ptr;
778 return -1;
779
780 ERROR_RETURN:
781 *errorcodeptr = ERR46;
782 *ptrptr = ptr;
783 return -1;
784 }
785 #endif
786
787
788
789
790 /*************************************************
791 * Check for counted repeat *
792 *************************************************/
793
794 /* This function is called when a '{' is encountered in a place where it might
795 start a quantifier. It looks ahead to see if it really is a quantifier or not.
796 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
797 where the ddds are digits.
798
799 Arguments:
800 p pointer to the first char after '{'
801
802 Returns: TRUE or FALSE
803 */
804
805 static BOOL
806 is_counted_repeat(const uschar *p)
807 {
808 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
809 while ((digitab[*p] & ctype_digit) != 0) p++;
810 if (*p == '}') return TRUE;
811
812 if (*p++ != ',') return FALSE;
813 if (*p == '}') return TRUE;
814
815 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
816 while ((digitab[*p] & ctype_digit) != 0) p++;
817
818 return (*p == '}');
819 }
820
821
822
823 /*************************************************
824 * Read repeat counts *
825 *************************************************/
826
827 /* Read an item of the form {n,m} and return the values. This is called only
828 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
829 so the syntax is guaranteed to be correct, but we need to check the values.
830
831 Arguments:
832 p pointer to first char after '{'
833 minp pointer to int for min
834 maxp pointer to int for max
835 returned as -1 if no max
836 errorcodeptr points to error code variable
837
838 Returns: pointer to '}' on success;
839 current ptr on error, with errorcodeptr set non-zero
840 */
841
842 static const uschar *
843 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
844 {
845 int min = 0;
846 int max = -1;
847
848 /* Read the minimum value and do a paranoid check: a negative value indicates
849 an integer overflow. */
850
851 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
852 if (min < 0 || min > 65535)
853 {
854 *errorcodeptr = ERR5;
855 return p;
856 }
857
858 /* Read the maximum value if there is one, and again do a paranoid on its size.
859 Also, max must not be less than min. */
860
861 if (*p == '}') max = min; else
862 {
863 if (*(++p) != '}')
864 {
865 max = 0;
866 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
867 if (max < 0 || max > 65535)
868 {
869 *errorcodeptr = ERR5;
870 return p;
871 }
872 if (max < min)
873 {
874 *errorcodeptr = ERR4;
875 return p;
876 }
877 }
878 }
879
880 /* Fill in the required variables, and pass back the pointer to the terminating
881 '}'. */
882
883 *minp = min;
884 *maxp = max;
885 return p;
886 }
887
888
889
890 /*************************************************
891 * Find forward referenced subpattern *
892 *************************************************/
893
894 /* This function scans along a pattern's text looking for capturing
895 subpatterns, and counting them. If it finds a named pattern that matches the
896 name it is given, it returns its number. Alternatively, if the name is NULL, it
897 returns when it reaches a given numbered subpattern. This is used for forward
898 references to subpatterns. We know that if (?P< is encountered, the name will
899 be terminated by '>' because that is checked in the first pass.
900
901 Arguments:
902 ptr current position in the pattern
903 count current count of capturing parens so far encountered
904 name name to seek, or NULL if seeking a numbered subpattern
905 lorn name length, or subpattern number if name is NULL
906 xmode TRUE if we are in /x mode
907
908 Returns: the number of the named subpattern, or -1 if not found
909 */
910
911 static int
912 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
913 BOOL xmode)
914 {
915 const uschar *thisname;
916
917 for (; *ptr != 0; ptr++)
918 {
919 int term;
920
921 /* Skip over backslashed characters and also entire \Q...\E */
922
923 if (*ptr == '\\')
924 {
925 if (*(++ptr) == 0) return -1;
926 if (*ptr == 'Q') for (;;)
927 {
928 while (*(++ptr) != 0 && *ptr != '\\');
929 if (*ptr == 0) return -1;
930 if (*(++ptr) == 'E') break;
931 }
932 continue;
933 }
934
935 /* Skip over character classes */
936
937 if (*ptr == '[')
938 {
939 while (*(++ptr) != ']')
940 {
941 if (*ptr == '\\')
942 {
943 if (*(++ptr) == 0) return -1;
944 if (*ptr == 'Q') for (;;)
945 {
946 while (*(++ptr) != 0 && *ptr != '\\');
947 if (*ptr == 0) return -1;
948 if (*(++ptr) == 'E') break;
949 }
950 continue;
951 }
952 }
953 continue;
954 }
955
956 /* Skip comments in /x mode */
957
958 if (xmode && *ptr == '#')
959 {
960 while (*(++ptr) != 0 && *ptr != '\n');
961 if (*ptr == 0) return -1;
962 continue;
963 }
964
965 /* An opening parens must now be a real metacharacter */
966
967 if (*ptr != '(') continue;
968 if (ptr[1] != '?' && ptr[1] != '*')
969 {
970 count++;
971 if (name == NULL && count == lorn) return count;
972 continue;
973 }
974
975 ptr += 2;
976 if (*ptr == 'P') ptr++; /* Allow optional P */
977
978 /* We have to disambiguate (?<! and (?<= from (?<name> */
979
980 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
981 *ptr != '\'')
982 continue;
983
984 count++;
985
986 if (name == NULL && count == lorn) return count;
987 term = *ptr++;
988 if (term == '<') term = '>';
989 thisname = ptr;
990 while (*ptr != term) ptr++;
991 if (name != NULL && lorn == ptr - thisname &&
992 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
993 return count;
994 }
995
996 return -1;
997 }
998
999
1000
1001 /*************************************************
1002 * Find first significant op code *
1003 *************************************************/
1004
1005 /* This is called by several functions that scan a compiled expression looking
1006 for a fixed first character, or an anchoring op code etc. It skips over things
1007 that do not influence this. For some calls, a change of option is important.
1008 For some calls, it makes sense to skip negative forward and all backward
1009 assertions, and also the \b assertion; for others it does not.
1010
1011 Arguments:
1012 code pointer to the start of the group
1013 options pointer to external options
1014 optbit the option bit whose changing is significant, or
1015 zero if none are
1016 skipassert TRUE if certain assertions are to be skipped
1017
1018 Returns: pointer to the first significant opcode
1019 */
1020
1021 static const uschar*
1022 first_significant_code(const uschar *code, int *options, int optbit,
1023 BOOL skipassert)
1024 {
1025 for (;;)
1026 {
1027 switch ((int)*code)
1028 {
1029 case OP_OPT:
1030 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1031 *options = (int)code[1];
1032 code += 2;
1033 break;
1034
1035 case OP_ASSERT_NOT:
1036 case OP_ASSERTBACK:
1037 case OP_ASSERTBACK_NOT:
1038 if (!skipassert) return code;
1039 do code += GET(code, 1); while (*code == OP_ALT);
1040 code += _pcre_OP_lengths[*code];
1041 break;
1042
1043 case OP_WORD_BOUNDARY:
1044 case OP_NOT_WORD_BOUNDARY:
1045 if (!skipassert) return code;
1046 /* Fall through */
1047
1048 case OP_CALLOUT:
1049 case OP_CREF:
1050 case OP_RREF:
1051 case OP_DEF:
1052 code += _pcre_OP_lengths[*code];
1053 break;
1054
1055 default:
1056 return code;
1057 }
1058 }
1059 /* Control never reaches here */
1060 }
1061
1062
1063
1064
1065 /*************************************************
1066 * Find the fixed length of a pattern *
1067 *************************************************/
1068
1069 /* Scan a pattern and compute the fixed length of subject that will match it,
1070 if the length is fixed. This is needed for dealing with backward assertions.
1071 In UTF8 mode, the result is in characters rather than bytes.
1072
1073 Arguments:
1074 code points to the start of the pattern (the bracket)
1075 options the compiling options
1076
1077 Returns: the fixed length, or -1 if there is no fixed length,
1078 or -2 if \C was encountered
1079 */
1080
1081 static int
1082 find_fixedlength(uschar *code, int options)
1083 {
1084 int length = -1;
1085
1086 register int branchlength = 0;
1087 register uschar *cc = code + 1 + LINK_SIZE;
1088
1089 /* Scan along the opcodes for this branch. If we get to the end of the
1090 branch, check the length against that of the other branches. */
1091
1092 for (;;)
1093 {
1094 int d;
1095 register int op = *cc;
1096
1097 switch (op)
1098 {
1099 case OP_CBRA:
1100 case OP_BRA:
1101 case OP_ONCE:
1102 case OP_COND:
1103 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1104 if (d < 0) return d;
1105 branchlength += d;
1106 do cc += GET(cc, 1); while (*cc == OP_ALT);
1107 cc += 1 + LINK_SIZE;
1108 break;
1109
1110 /* Reached end of a branch; if it's a ket it is the end of a nested
1111 call. If it's ALT it is an alternation in a nested call. If it is
1112 END it's the end of the outer call. All can be handled by the same code. */
1113
1114 case OP_ALT:
1115 case OP_KET:
1116 case OP_KETRMAX:
1117 case OP_KETRMIN:
1118 case OP_END:
1119 if (length < 0) length = branchlength;
1120 else if (length != branchlength) return -1;
1121 if (*cc != OP_ALT) return length;
1122 cc += 1 + LINK_SIZE;
1123 branchlength = 0;
1124 break;
1125
1126 /* Skip over assertive subpatterns */
1127
1128 case OP_ASSERT:
1129 case OP_ASSERT_NOT:
1130 case OP_ASSERTBACK:
1131 case OP_ASSERTBACK_NOT:
1132 do cc += GET(cc, 1); while (*cc == OP_ALT);
1133 /* Fall through */
1134
1135 /* Skip over things that don't match chars */
1136
1137 case OP_REVERSE:
1138 case OP_CREF:
1139 case OP_RREF:
1140 case OP_DEF:
1141 case OP_OPT:
1142 case OP_CALLOUT:
1143 case OP_SOD:
1144 case OP_SOM:
1145 case OP_EOD:
1146 case OP_EODN:
1147 case OP_CIRC:
1148 case OP_DOLL:
1149 case OP_NOT_WORD_BOUNDARY:
1150 case OP_WORD_BOUNDARY:
1151 cc += _pcre_OP_lengths[*cc];
1152 break;
1153
1154 /* Handle literal characters */
1155
1156 case OP_CHAR:
1157 case OP_CHARNC:
1158 case OP_NOT:
1159 branchlength++;
1160 cc += 2;
1161 #ifdef SUPPORT_UTF8
1162 if ((options & PCRE_UTF8) != 0)
1163 {
1164 while ((*cc & 0xc0) == 0x80) cc++;
1165 }
1166 #endif
1167 break;
1168
1169 /* Handle exact repetitions. The count is already in characters, but we
1170 need to skip over a multibyte character in UTF8 mode. */
1171
1172 case OP_EXACT:
1173 branchlength += GET2(cc,1);
1174 cc += 4;
1175 #ifdef SUPPORT_UTF8
1176 if ((options & PCRE_UTF8) != 0)
1177 {
1178 while((*cc & 0x80) == 0x80) cc++;
1179 }
1180 #endif
1181 break;
1182
1183 case OP_TYPEEXACT:
1184 branchlength += GET2(cc,1);
1185 cc += 4;
1186 break;
1187
1188 /* Handle single-char matchers */
1189
1190 case OP_PROP:
1191 case OP_NOTPROP:
1192 cc += 2;
1193 /* Fall through */
1194
1195 case OP_NOT_DIGIT:
1196 case OP_DIGIT:
1197 case OP_NOT_WHITESPACE:
1198 case OP_WHITESPACE:
1199 case OP_NOT_WORDCHAR:
1200 case OP_WORDCHAR:
1201 case OP_ANY:
1202 branchlength++;
1203 cc++;
1204 break;
1205
1206 /* The single-byte matcher isn't allowed */
1207
1208 case OP_ANYBYTE:
1209 return -2;
1210
1211 /* Check a class for variable quantification */
1212
1213 #ifdef SUPPORT_UTF8
1214 case OP_XCLASS:
1215 cc += GET(cc, 1) - 33;
1216 /* Fall through */
1217 #endif
1218
1219 case OP_CLASS:
1220 case OP_NCLASS:
1221 cc += 33;
1222
1223 switch (*cc)
1224 {
1225 case OP_CRSTAR:
1226 case OP_CRMINSTAR:
1227 case OP_CRQUERY:
1228 case OP_CRMINQUERY:
1229 return -1;
1230
1231 case OP_CRRANGE:
1232 case OP_CRMINRANGE:
1233 if (GET2(cc,1) != GET2(cc,3)) return -1;
1234 branchlength += GET2(cc,1);
1235 cc += 5;
1236 break;
1237
1238 default:
1239 branchlength++;
1240 }
1241 break;
1242
1243 /* Anything else is variable length */
1244
1245 default:
1246 return -1;
1247 }
1248 }
1249 /* Control never gets here */
1250 }
1251
1252
1253
1254
1255 /*************************************************
1256 * Scan compiled regex for numbered bracket *
1257 *************************************************/
1258
1259 /* This little function scans through a compiled pattern until it finds a
1260 capturing bracket with the given number.
1261
1262 Arguments:
1263 code points to start of expression
1264 utf8 TRUE in UTF-8 mode
1265 number the required bracket number
1266
1267 Returns: pointer to the opcode for the bracket, or NULL if not found
1268 */
1269
1270 static const uschar *
1271 find_bracket(const uschar *code, BOOL utf8, int number)
1272 {
1273 for (;;)
1274 {
1275 register int c = *code;
1276 if (c == OP_END) return NULL;
1277
1278 /* XCLASS is used for classes that cannot be represented just by a bit
1279 map. This includes negated single high-valued characters. The length in
1280 the table is zero; the actual length is stored in the compiled code. */
1281
1282 if (c == OP_XCLASS) code += GET(code, 1);
1283
1284 /* Handle capturing bracket */
1285
1286 else if (c == OP_CBRA)
1287 {
1288 int n = GET2(code, 1+LINK_SIZE);
1289 if (n == number) return (uschar *)code;
1290 code += _pcre_OP_lengths[c];
1291 }
1292
1293 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1294 a multi-byte character. The length in the table is a minimum, so we have to
1295 arrange to skip the extra bytes. */
1296
1297 else
1298 {
1299 code += _pcre_OP_lengths[c];
1300 #ifdef SUPPORT_UTF8
1301 if (utf8) switch(c)
1302 {
1303 case OP_CHAR:
1304 case OP_CHARNC:
1305 case OP_EXACT:
1306 case OP_UPTO:
1307 case OP_MINUPTO:
1308 case OP_POSUPTO:
1309 case OP_STAR:
1310 case OP_MINSTAR:
1311 case OP_POSSTAR:
1312 case OP_PLUS:
1313 case OP_MINPLUS:
1314 case OP_POSPLUS:
1315 case OP_QUERY:
1316 case OP_MINQUERY:
1317 case OP_POSQUERY:
1318 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1319 break;
1320 }
1321 #endif
1322 }
1323 }
1324 }
1325
1326
1327
1328 /*************************************************
1329 * Scan compiled regex for recursion reference *
1330 *************************************************/
1331
1332 /* This little function scans through a compiled pattern until it finds an
1333 instance of OP_RECURSE.
1334
1335 Arguments:
1336 code points to start of expression
1337 utf8 TRUE in UTF-8 mode
1338
1339 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1340 */
1341
1342 static const uschar *
1343 find_recurse(const uschar *code, BOOL utf8)
1344 {
1345 for (;;)
1346 {
1347 register int c = *code;
1348 if (c == OP_END) return NULL;
1349 if (c == OP_RECURSE) return code;
1350
1351 /* XCLASS is used for classes that cannot be represented just by a bit
1352 map. This includes negated single high-valued characters. The length in
1353 the table is zero; the actual length is stored in the compiled code. */
1354
1355 if (c == OP_XCLASS) code += GET(code, 1);
1356
1357 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1358 that are followed by a character may be followed by a multi-byte character.
1359 The length in the table is a minimum, so we have to arrange to skip the extra
1360 bytes. */
1361
1362 else
1363 {
1364 code += _pcre_OP_lengths[c];
1365 #ifdef SUPPORT_UTF8
1366 if (utf8) switch(c)
1367 {
1368 case OP_CHAR:
1369 case OP_CHARNC:
1370 case OP_EXACT:
1371 case OP_UPTO:
1372 case OP_MINUPTO:
1373 case OP_POSUPTO:
1374 case OP_STAR:
1375 case OP_MINSTAR:
1376 case OP_POSSTAR:
1377 case OP_PLUS:
1378 case OP_MINPLUS:
1379 case OP_POSPLUS:
1380 case OP_QUERY:
1381 case OP_MINQUERY:
1382 case OP_POSQUERY:
1383 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1384 break;
1385 }
1386 #endif
1387 }
1388 }
1389 }
1390
1391
1392
1393 /*************************************************
1394 * Scan compiled branch for non-emptiness *
1395 *************************************************/
1396
1397 /* This function scans through a branch of a compiled pattern to see whether it
1398 can match the empty string or not. It is called from could_be_empty()
1399 below and from compile_branch() when checking for an unlimited repeat of a
1400 group that can match nothing. Note that first_significant_code() skips over
1401 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1402 struck an inner bracket whose current branch will already have been scanned.
1403
1404 Arguments:
1405 code points to start of search
1406 endcode points to where to stop
1407 utf8 TRUE if in UTF8 mode
1408
1409 Returns: TRUE if what is matched could be empty
1410 */
1411
1412 static BOOL
1413 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1414 {
1415 register int c;
1416 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1417 code < endcode;
1418 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1419 {
1420 const uschar *ccode;
1421
1422 c = *code;
1423
1424 /* Groups with zero repeats can of course be empty; skip them. */
1425
1426 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1427 {
1428 code += _pcre_OP_lengths[c];
1429 do code += GET(code, 1); while (*code == OP_ALT);
1430 c = *code;
1431 continue;
1432 }
1433
1434 /* For other groups, scan the branches. */
1435
1436 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1437 {
1438 BOOL empty_branch;
1439 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1440
1441 /* Scan a closed bracket */
1442
1443 empty_branch = FALSE;
1444 do
1445 {
1446 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1447 empty_branch = TRUE;
1448 code += GET(code, 1);
1449 }
1450 while (*code == OP_ALT);
1451 if (!empty_branch) return FALSE; /* All branches are non-empty */
1452 c = *code;
1453 continue;
1454 }
1455
1456 /* Handle the other opcodes */
1457
1458 switch (c)
1459 {
1460 /* Check for quantifiers after a class */
1461
1462 #ifdef SUPPORT_UTF8
1463 case OP_XCLASS:
1464 ccode = code + GET(code, 1);
1465 goto CHECK_CLASS_REPEAT;
1466 #endif
1467
1468 case OP_CLASS:
1469 case OP_NCLASS:
1470 ccode = code + 33;
1471
1472 #ifdef SUPPORT_UTF8
1473 CHECK_CLASS_REPEAT:
1474 #endif
1475
1476 switch (*ccode)
1477 {
1478 case OP_CRSTAR: /* These could be empty; continue */
1479 case OP_CRMINSTAR:
1480 case OP_CRQUERY:
1481 case OP_CRMINQUERY:
1482 break;
1483
1484 default: /* Non-repeat => class must match */
1485 case OP_CRPLUS: /* These repeats aren't empty */
1486 case OP_CRMINPLUS:
1487 return FALSE;
1488
1489 case OP_CRRANGE:
1490 case OP_CRMINRANGE:
1491 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1492 break;
1493 }
1494 break;
1495
1496 /* Opcodes that must match a character */
1497
1498 case OP_PROP:
1499 case OP_NOTPROP:
1500 case OP_EXTUNI:
1501 case OP_NOT_DIGIT:
1502 case OP_DIGIT:
1503 case OP_NOT_WHITESPACE:
1504 case OP_WHITESPACE:
1505 case OP_NOT_WORDCHAR:
1506 case OP_WORDCHAR:
1507 case OP_ANY:
1508 case OP_ANYBYTE:
1509 case OP_CHAR:
1510 case OP_CHARNC:
1511 case OP_NOT:
1512 case OP_PLUS:
1513 case OP_MINPLUS:
1514 case OP_POSPLUS:
1515 case OP_EXACT:
1516 case OP_NOTPLUS:
1517 case OP_NOTMINPLUS:
1518 case OP_NOTPOSPLUS:
1519 case OP_NOTEXACT:
1520 case OP_TYPEPLUS:
1521 case OP_TYPEMINPLUS:
1522 case OP_TYPEPOSPLUS:
1523 case OP_TYPEEXACT:
1524 return FALSE;
1525
1526 /* End of branch */
1527
1528 case OP_KET:
1529 case OP_KETRMAX:
1530 case OP_KETRMIN:
1531 case OP_ALT:
1532 return TRUE;
1533
1534 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1535 MINUPTO, and POSUPTO may be followed by a multibyte character */
1536
1537 #ifdef SUPPORT_UTF8
1538 case OP_STAR:
1539 case OP_MINSTAR:
1540 case OP_POSSTAR:
1541 case OP_QUERY:
1542 case OP_MINQUERY:
1543 case OP_POSQUERY:
1544 case OP_UPTO:
1545 case OP_MINUPTO:
1546 case OP_POSUPTO:
1547 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1548 break;
1549 #endif
1550 }
1551 }
1552
1553 return TRUE;
1554 }
1555
1556
1557
1558 /*************************************************
1559 * Scan compiled regex for non-emptiness *
1560 *************************************************/
1561
1562 /* This function is called to check for left recursive calls. We want to check
1563 the current branch of the current pattern to see if it could match the empty
1564 string. If it could, we must look outwards for branches at other levels,
1565 stopping when we pass beyond the bracket which is the subject of the recursion.
1566
1567 Arguments:
1568 code points to start of the recursion
1569 endcode points to where to stop (current RECURSE item)
1570 bcptr points to the chain of current (unclosed) branch starts
1571 utf8 TRUE if in UTF-8 mode
1572
1573 Returns: TRUE if what is matched could be empty
1574 */
1575
1576 static BOOL
1577 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1578 BOOL utf8)
1579 {
1580 while (bcptr != NULL && bcptr->current >= code)
1581 {
1582 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1583 bcptr = bcptr->outer;
1584 }
1585 return TRUE;
1586 }
1587
1588
1589
1590 /*************************************************
1591 * Check for POSIX class syntax *
1592 *************************************************/
1593
1594 /* This function is called when the sequence "[:" or "[." or "[=" is
1595 encountered in a character class. It checks whether this is followed by an
1596 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1597 ".]" or "=]".
1598
1599 Argument:
1600 ptr pointer to the initial [
1601 endptr where to return the end pointer
1602 cd pointer to compile data
1603
1604 Returns: TRUE or FALSE
1605 */
1606
1607 static BOOL
1608 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1609 {
1610 int terminator; /* Don't combine these lines; the Solaris cc */
1611 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1612 if (*(++ptr) == '^') ptr++;
1613 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1614 if (*ptr == terminator && ptr[1] == ']')
1615 {
1616 *endptr = ptr;
1617 return TRUE;
1618 }
1619 return FALSE;
1620 }
1621
1622
1623
1624
1625 /*************************************************
1626 * Check POSIX class name *
1627 *************************************************/
1628
1629 /* This function is called to check the name given in a POSIX-style class entry
1630 such as [:alnum:].
1631
1632 Arguments:
1633 ptr points to the first letter
1634 len the length of the name
1635
1636 Returns: a value representing the name, or -1 if unknown
1637 */
1638
1639 static int
1640 check_posix_name(const uschar *ptr, int len)
1641 {
1642 register int yield = 0;
1643 while (posix_name_lengths[yield] != 0)
1644 {
1645 if (len == posix_name_lengths[yield] &&
1646 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1647 yield++;
1648 }
1649 return -1;
1650 }
1651
1652
1653 /*************************************************
1654 * Adjust OP_RECURSE items in repeated group *
1655 *************************************************/
1656
1657 /* OP_RECURSE items contain an offset from the start of the regex to the group
1658 that is referenced. This means that groups can be replicated for fixed
1659 repetition simply by copying (because the recursion is allowed to refer to
1660 earlier groups that are outside the current group). However, when a group is
1661 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1662 it, after it has been compiled. This means that any OP_RECURSE items within it
1663 that refer to the group itself or any contained groups have to have their
1664 offsets adjusted. That one of the jobs of this function. Before it is called,
1665 the partially compiled regex must be temporarily terminated with OP_END.
1666
1667 This function has been extended with the possibility of forward references for
1668 recursions and subroutine calls. It must also check the list of such references
1669 for the group we are dealing with. If it finds that one of the recursions in
1670 the current group is on this list, it adjusts the offset in the list, not the
1671 value in the reference (which is a group number).
1672
1673 Arguments:
1674 group points to the start of the group
1675 adjust the amount by which the group is to be moved
1676 utf8 TRUE in UTF-8 mode
1677 cd contains pointers to tables etc.
1678 save_hwm the hwm forward reference pointer at the start of the group
1679
1680 Returns: nothing
1681 */
1682
1683 static void
1684 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1685 uschar *save_hwm)
1686 {
1687 uschar *ptr = group;
1688 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1689 {
1690 int offset;
1691 uschar *hc;
1692
1693 /* See if this recursion is on the forward reference list. If so, adjust the
1694 reference. */
1695
1696 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1697 {
1698 offset = GET(hc, 0);
1699 if (cd->start_code + offset == ptr + 1)
1700 {
1701 PUT(hc, 0, offset + adjust);
1702 break;
1703 }
1704 }
1705
1706 /* Otherwise, adjust the recursion offset if it's after the start of this
1707 group. */
1708
1709 if (hc >= cd->hwm)
1710 {
1711 offset = GET(ptr, 1);
1712 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1713 }
1714
1715 ptr += 1 + LINK_SIZE;
1716 }
1717 }
1718
1719
1720
1721 /*************************************************
1722 * Insert an automatic callout point *
1723 *************************************************/
1724
1725 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1726 callout points before each pattern item.
1727
1728 Arguments:
1729 code current code pointer
1730 ptr current pattern pointer
1731 cd pointers to tables etc
1732
1733 Returns: new code pointer
1734 */
1735
1736 static uschar *
1737 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1738 {
1739 *code++ = OP_CALLOUT;
1740 *code++ = 255;
1741 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1742 PUT(code, LINK_SIZE, 0); /* Default length */
1743 return code + 2*LINK_SIZE;
1744 }
1745
1746
1747
1748 /*************************************************
1749 * Complete a callout item *
1750 *************************************************/
1751
1752 /* A callout item contains the length of the next item in the pattern, which
1753 we can't fill in till after we have reached the relevant point. This is used
1754 for both automatic and manual callouts.
1755
1756 Arguments:
1757 previous_callout points to previous callout item
1758 ptr current pattern pointer
1759 cd pointers to tables etc
1760
1761 Returns: nothing
1762 */
1763
1764 static void
1765 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1766 {
1767 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1768 PUT(previous_callout, 2 + LINK_SIZE, length);
1769 }
1770
1771
1772
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address.

A returned range begins with the other case of the first character that has
one, and is extended for as long as the other cases of the following characters
carry on consecutively.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has another case. */

while (c <= d && (first_other = _pcre_ucp_othercase(c)) == NOTACHAR) c++;
if (c > d) return FALSE;

*ocptr = first_other;
expected = first_other + 1;

/* Carry on while the other cases of successive characters are contiguous. */

c++;
while (c <= d && _pcre_ucp_othercase(c) == expected)
  {
  expected++;
  c++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1818
1819
1820
1821 /*************************************************
1822 * Check if auto-possessifying is possible *
1823 *************************************************/
1824
1825 /* This function is called for unlimited repeats of certain items, to see
1826 whether the next thing could possibly match the repeated item. If not, it makes
1827 sense to automatically possessify the repeated item.
1828
1829 Arguments:
1830 op_code the repeated op code
1831 this data for this item, depends on the opcode
1832 utf8 TRUE in UTF-8 mode
1833 utf8_char used for utf8 character bytes, NULL if not relevant
1834 ptr next character in pattern
1835 options options bits
1836 cd contains pointers to tables etc.
1837
1838 Returns: TRUE if possessifying is wanted
1839 */
1840
1841 static BOOL
1842 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1843 const uschar *ptr, int options, compile_data *cd)
1844 {
1845 int next;
1846
1847 /* Skip whitespace and comments in extended mode */
1848
1849 if ((options & PCRE_EXTENDED) != 0)
1850 {
1851 for (;;)
1852 {
1853 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1854 if (*ptr == '#')
1855 {
1856 while (*(++ptr) != 0)
1857 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1858 }
1859 else break;
1860 }
1861 }
1862
1863 /* If the next item is one that we can handle, get its value. A non-negative
1864 value is a character, a negative value is an escape value. */
1865
1866 if (*ptr == '\\')
1867 {
1868 int temperrorcode = 0;
1869 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1870 if (temperrorcode != 0) return FALSE;
1871 ptr++; /* Point after the escape sequence */
1872 }
1873
1874 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1875 {
1876 #ifdef SUPPORT_UTF8
1877 if (utf8) { GETCHARINC(next, ptr); } else
1878 #endif
1879 next = *ptr++;
1880 }
1881
1882 else return FALSE;
1883
1884 /* Skip whitespace and comments in extended mode */
1885
1886 if ((options & PCRE_EXTENDED) != 0)
1887 {
1888 for (;;)
1889 {
1890 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1891 if (*ptr == '#')
1892 {
1893 while (*(++ptr) != 0)
1894 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1895 }
1896 else break;
1897 }
1898 }
1899
1900 /* If the next thing is itself optional, we have to give up. */
1901
1902 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1903 return FALSE;
1904
1905 /* Now compare the next item with the previous opcode. If the previous is a
1906 positive single character match, "item" either contains the character or, if
1907 "item" is greater than 127 in utf8 mode, the character's bytes are in
1908 utf8_char. */
1909
1910
1911 /* Handle cases when the next item is a character. */
1912
1913 if (next >= 0) switch(op_code)
1914 {
1915 case OP_CHAR:
1916 #ifdef SUPPORT_UTF8
1917 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1918 #endif
1919 return item != next;
1920
1921 /* For CHARNC (caseless character) we must check the other case. If we have
1922 Unicode property support, we can use it to test the other case of
1923 high-valued characters. */
1924
1925 case OP_CHARNC:
1926 #ifdef SUPPORT_UTF8
1927 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1928 #endif
1929 if (item == next) return FALSE;
1930 #ifdef SUPPORT_UTF8
1931 if (utf8)
1932 {
1933 unsigned int othercase;
1934 if (next < 128) othercase = cd->fcc[next]; else
1935 #ifdef SUPPORT_UCP
1936 othercase = _pcre_ucp_othercase((unsigned int)next);
1937 #else
1938 othercase = NOTACHAR;
1939 #endif
1940 return (unsigned int)item != othercase;
1941 }
1942 else
1943 #endif /* SUPPORT_UTF8 */
1944 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1945
1946 /* For OP_NOT, "item" must be a single-byte character. */
1947
1948 case OP_NOT:
1949 if (next < 0) return FALSE; /* Not a character */
1950 if (item == next) return TRUE;
1951 if ((options & PCRE_CASELESS) == 0) return FALSE;
1952 #ifdef SUPPORT_UTF8
1953 if (utf8)
1954 {
1955 unsigned int othercase;
1956 if (next < 128) othercase = cd->fcc[next]; else
1957 #ifdef SUPPORT_UCP
1958 othercase = _pcre_ucp_othercase(next);
1959 #else
1960 othercase = NOTACHAR;
1961 #endif
1962 return (unsigned int)item == othercase;
1963 }
1964 else
1965 #endif /* SUPPORT_UTF8 */
1966 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1967
1968 case OP_DIGIT:
1969 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1970
1971 case OP_NOT_DIGIT:
1972 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1973
1974 case OP_WHITESPACE:
1975 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1976
1977 case OP_NOT_WHITESPACE:
1978 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1979
1980 case OP_WORDCHAR:
1981 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1982
1983 case OP_NOT_WORDCHAR:
1984 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1985
1986 case OP_HSPACE:
1987 case OP_NOT_HSPACE:
1988 switch(next)
1989 {
1990 case 0x09:
1991 case 0x20:
1992 case 0xa0:
1993 case 0x1680:
1994 case 0x180e:
1995 case 0x2000:
1996 case 0x2001:
1997 case 0x2002:
1998 case 0x2003:
1999 case 0x2004:
2000 case 0x2005:
2001 case 0x2006:
2002 case 0x2007:
2003 case 0x2008:
2004 case 0x2009:
2005 case 0x200A:
2006 case 0x202f:
2007 case 0x205f:
2008 case 0x3000:
2009 return op_code != OP_HSPACE;
2010 default:
2011 return op_code == OP_HSPACE;
2012 }
2013
2014 case OP_VSPACE:
2015 case OP_NOT_VSPACE:
2016 switch(next)
2017 {
2018 case 0x0a:
2019 case 0x0b:
2020 case 0x0c:
2021 case 0x0d:
2022 case 0x85:
2023 case 0x2028:
2024 case 0x2029:
2025 return op_code != OP_VSPACE;
2026 default:
2027 return op_code == OP_VSPACE;
2028 }
2029
2030 default:
2031 return FALSE;
2032 }
2033
2034
2035 /* Handle the case when the next item is \d, \s, etc. */
2036
2037 switch(op_code)
2038 {
2039 case OP_CHAR:
2040 case OP_CHARNC:
2041 #ifdef SUPPORT_UTF8
2042 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2043 #endif
2044 switch(-next)
2045 {
2046 case ESC_d:
2047 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2048
2049 case ESC_D:
2050 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2051
2052 case ESC_s:
2053 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2054
2055 case ESC_S:
2056 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2057
2058 case ESC_w:
2059 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2060
2061 case ESC_W:
2062 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2063
2064 case ESC_h:
2065 case ESC_H:
2066 switch(item)
2067 {
2068 case 0x09:
2069 case 0x20:
2070 case 0xa0:
2071 case 0x1680:
2072 case 0x180e:
2073 case 0x2000:
2074 case 0x2001:
2075 case 0x2002:
2076 case 0x2003:
2077 case 0x2004:
2078 case 0x2005:
2079 case 0x2006:
2080 case 0x2007:
2081 case 0x2008:
2082 case 0x2009:
2083 case 0x200A:
2084 case 0x202f:
2085 case 0x205f:
2086 case 0x3000:
2087 return -next != ESC_h;
2088 default:
2089 return -next == ESC_h;
2090 }
2091
2092 case ESC_v:
2093 case ESC_V:
2094 switch(item)
2095 {
2096 case 0x0a:
2097 case 0x0b:
2098 case 0x0c:
2099 case 0x0d:
2100 case 0x85:
2101 case 0x2028:
2102 case 0x2029:
2103 return -next != ESC_v;
2104 default:
2105 return -next == ESC_v;
2106 }
2107
2108 default:
2109 return FALSE;
2110 }
2111
2112 case OP_DIGIT:
2113 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2114 next == -ESC_h || next == -ESC_v;
2115
2116 case OP_NOT_DIGIT:
2117 return next == -ESC_d;
2118
2119 case OP_WHITESPACE:
2120 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2121
2122 case OP_NOT_WHITESPACE:
2123 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2124
2125 case OP_HSPACE:
2126 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2127
2128 case OP_NOT_HSPACE:
2129 return next == -ESC_h;
2130
2131 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2132 case OP_VSPACE:
2133 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2134
2135 case OP_NOT_VSPACE:
2136 return next == -ESC_v;
2137
2138 case OP_WORDCHAR:
2139 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2140
2141 case OP_NOT_WORDCHAR:
2142 return next == -ESC_w || next == -ESC_d;
2143
2144 default:
2145 return FALSE;
2146 }
2147
2148 /* Control does not reach here */
2149 }
2150
2151
2152
2153 /*************************************************
2154 * Compile one branch *
2155 *************************************************/
2156
2157 /* Scan the pattern, compiling it into a vector. If the options are
2158 changed during the branch, the pointer is used to change the external options
2159 bits. This function is used during the pre-compile phase when we are trying
2160 to find out the amount of memory needed, as well as during the real compile
2161 phase. The value of lengthptr distinguishes the two phases.
2162
2163 Arguments:
2164 optionsptr pointer to the option bits
2165 codeptr points to the pointer to the current code point
2166 ptrptr points to the current pattern pointer
2167 errorcodeptr points to error code variable
2168 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2169 reqbyteptr set to the last literal character required, else < 0
2170 bcptr points to current branch chain
2171 cd contains pointers to tables etc.
2172 lengthptr NULL during the real compile phase
2173 points to length accumulator during pre-compile phase
2174
2175 Returns: TRUE on success
2176 FALSE, with *errorcodeptr set non-zero on error
2177 */
2178
2179 static BOOL
2180 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2181 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2182 compile_data *cd, int *lengthptr)
2183 {
2184 int repeat_type, op_type;
2185 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2186 int bravalue = 0;
2187 int greedy_default, greedy_non_default;
2188 int firstbyte, reqbyte;
2189 int zeroreqbyte, zerofirstbyte;
2190 int req_caseopt, reqvary, tempreqvary;
2191 int options = *optionsptr;
2192 int after_manual_callout = 0;
2193 int length_prevgroup = 0;
2194 register int c;
2195 register uschar *code = *codeptr;
2196 uschar *last_code = code;
2197 uschar *orig_code = code;
2198 uschar *tempcode;
2199 BOOL inescq = FALSE;
2200 BOOL groupsetfirstbyte = FALSE;
2201 const uschar *ptr = *ptrptr;
2202 const uschar *tempptr;
2203 uschar *previous = NULL;
2204 uschar *previous_callout = NULL;
2205 uschar *save_hwm = NULL;
2206 uschar classbits[32];
2207
2208 #ifdef SUPPORT_UTF8
2209 BOOL class_utf8;
2210 BOOL utf8 = (options & PCRE_UTF8) != 0;
2211 uschar *class_utf8data;
2212 uschar utf8_char[6];
2213 #else
2214 BOOL utf8 = FALSE;
2215 uschar *utf8_char = NULL;
2216 #endif
2217
2218 #ifdef DEBUG
2219 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2220 #endif
2221
2222 /* Set up the default and non-default settings for greediness */
2223
2224 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2225 greedy_non_default = greedy_default ^ 1;
2226
2227 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2228 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2229 matches a non-fixed char first char; reqbyte just remains unset if we never
2230 find one.
2231
2232 When we hit a repeat whose minimum is zero, we may have to adjust these values
2233 to take the zero repeat into account. This is implemented by setting them to
2234 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2235 item types that can be repeated set these backoff variables appropriately. */
2236
2237 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2238
2239 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2240 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2241 value > 255. It is added into the firstbyte or reqbyte variables to record the
2242 case status of the value. This is used only for ASCII characters. */
2243
2244 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2245
2246 /* Switch on next character until the end of the branch */
2247
2248 for (;; ptr++)
2249 {
2250 BOOL negate_class;
2251 BOOL possessive_quantifier;
2252 BOOL is_quantifier;
2253 BOOL is_recurse;
2254 BOOL reset_bracount;
2255 int class_charcount;
2256 int class_lastchar;
2257 int newoptions;
2258 int recno;
2259 int refsign;
2260 int skipbytes;
2261 int subreqbyte;
2262 int subfirstbyte;
2263 int terminator;
2264 int mclength;
2265 uschar mcbuffer[8];
2266
2267 /* Get next byte in the pattern */
2268
2269 c = *ptr;
2270
2271 /* If we are in the pre-compile phase, accumulate the length used for the
2272 previous cycle of this loop. */
2273
2274 if (lengthptr != NULL)
2275 {
2276 #ifdef DEBUG
2277 if (code > cd->hwm) cd->hwm = code; /* High water info */
2278 #endif
2279 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2280 {
2281 *errorcodeptr = ERR52;
2282 goto FAILED;
2283 }
2284
2285 /* There is at least one situation where code goes backwards: this is the
2286 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2287 the class is simply eliminated. However, it is created first, so we have to
2288 allow memory for it. Therefore, don't ever reduce the length at this point.
2289 */
2290
2291 if (code < last_code) code = last_code;
2292
2293 /* Paranoid check for integer overflow */
2294
2295 if (OFLOW_MAX - *lengthptr < code - last_code)
2296 {
2297 *errorcodeptr = ERR20;
2298 goto FAILED;
2299 }
2300
2301 *lengthptr += code - last_code;
2302 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2303
2304 /* If "previous" is set and it is not at the start of the work space, move
2305 it back to there, in order to avoid filling up the work space. Otherwise,
2306 if "previous" is NULL, reset the current code pointer to the start. */
2307
2308 if (previous != NULL)
2309 {
2310 if (previous > orig_code)
2311 {
2312 memmove(orig_code, previous, code - previous);
2313 code -= previous - orig_code;
2314 previous = orig_code;
2315 }
2316 }
2317 else code = orig_code;
2318
2319 /* Remember where this code item starts so we can pick up the length
2320 next time round. */
2321
2322 last_code = code;
2323 }
2324
2325 /* In the real compile phase, just check the workspace used by the forward
2326 reference list. */
2327
2328 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2329 {
2330 *errorcodeptr = ERR52;
2331 goto FAILED;
2332 }
2333
2334 /* If in \Q...\E, check for the end; if not, we have a literal */
2335
2336 if (inescq && c != 0)
2337 {
2338 if (c == '\\' && ptr[1] == 'E')
2339 {
2340 inescq = FALSE;
2341 ptr++;
2342 continue;
2343 }
2344 else
2345 {
2346 if (previous_callout != NULL)
2347 {
2348 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2349 complete_callout(previous_callout, ptr, cd);
2350 previous_callout = NULL;
2351 }
2352 if ((options & PCRE_AUTO_CALLOUT) != 0)
2353 {
2354 previous_callout = code;
2355 code = auto_callout(code, ptr, cd);
2356 }
2357 goto NORMAL_CHAR;
2358 }
2359 }
2360
2361 /* Fill in length of a previous callout, except when the next thing is
2362 a quantifier. */
2363
2364 is_quantifier = c == '*' || c == '+' || c == '?' ||
2365 (c == '{' && is_counted_repeat(ptr+1));
2366
2367 if (!is_quantifier && previous_callout != NULL &&
2368 after_manual_callout-- <= 0)
2369 {
2370 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2371 complete_callout(previous_callout, ptr, cd);
2372 previous_callout = NULL;
2373 }
2374
2375 /* In extended mode, skip white space and comments */
2376
2377 if ((options & PCRE_EXTENDED) != 0)
2378 {
2379 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2380 if (c == '#')
2381 {
2382 while (*(++ptr) != 0)
2383 {
2384 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2385 }
2386 if (*ptr != 0) continue;
2387
2388 /* Else fall through to handle end of string */
2389 c = 0;
2390 }
2391 }
2392
2393 /* No auto callout for quantifiers. */
2394
2395 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2396 {
2397 previous_callout = code;
2398 code = auto_callout(code, ptr, cd);
2399 }
2400
2401 switch(c)
2402 {
2403 /* ===================================================================*/
2404 case 0: /* The branch terminates at string end */
2405 case '|': /* or | or ) */
2406 case ')':
2407 *firstbyteptr = firstbyte;
2408 *reqbyteptr = reqbyte;
2409 *codeptr = code;
2410 *ptrptr = ptr;
2411 if (lengthptr != NULL)
2412 {
2413 if (OFLOW_MAX - *lengthptr < code - last_code)
2414 {
2415 *errorcodeptr = ERR20;
2416 goto FAILED;
2417 }
2418 *lengthptr += code - last_code; /* To include callout length */
2419 DPRINTF((">> end branch\n"));
2420 }
2421 return TRUE;
2422
2423
2424 /* ===================================================================*/
2425 /* Handle single-character metacharacters. In multiline mode, ^ disables
2426 the setting of any following char as a first character. */
2427
2428 case '^':
2429 if ((options & PCRE_MULTILINE) != 0)
2430 {
2431 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2432 }
2433 previous = NULL;
2434 *code++ = OP_CIRC;
2435 break;
2436
2437 case '$':
2438 previous = NULL;
2439 *code++ = OP_DOLL;
2440 break;
2441
2442 /* There can never be a first char if '.' is first, whatever happens about
2443 repeats. The value of reqbyte doesn't change either. */
2444
2445 case '.':
2446 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2447 zerofirstbyte = firstbyte;
2448 zeroreqbyte = reqbyte;
2449 previous = code;
2450 *code++ = OP_ANY;
2451 break;
2452
2453
2454 /* ===================================================================*/
2455 /* Character classes. If the included characters are all < 256, we build a
2456 32-byte bitmap of the permitted characters, except in the special case
2457 where there is only one such character. For negated classes, we build the
2458 map as usual, then invert it at the end. However, we use a different opcode
2459 so that data characters > 255 can be handled correctly.
2460
2461 If the class contains characters outside the 0-255 range, a different
2462 opcode is compiled. It may optionally have a bit map for characters < 256,
2463 but those above are explicitly listed afterwards. A flag byte tells
2464 whether the bitmap is present, and whether this is a negated class or not.
2465 */
2466
2467 case '[':
2468 previous = code;
2469
2470 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2471 they are encountered at the top level, so we'll do that too. */
2472
2473 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2474 check_posix_syntax(ptr, &tempptr, cd))
2475 {
2476 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2477 goto FAILED;
2478 }
2479
2480 /* If the first character is '^', set the negation flag and skip it. Also,
2481 if the first few characters (either before or after ^) are \Q\E or \E we
2482 skip them too. This makes for compatibility with Perl. */
2483
2484 negate_class = FALSE;
2485 for (;;)
2486 {
2487 c = *(++ptr);
2488 if (c == '\\')
2489 {
2490 if (ptr[1] == 'E') ptr++;
2491 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2492 else break;
2493 }
2494 else if (!negate_class && c == '^')
2495 negate_class = TRUE;
2496 else break;
2497 }
2498
2499 /* Keep a count of chars with values < 256 so that we can optimize the case
2500 of just a single character (as long as it's < 256). However, for higher
2501 valued UTF-8 characters, we don't yet do any optimization. */
2502
2503 class_charcount = 0;
2504 class_lastchar = -1;
2505
2506 /* Initialize the 32-char bit map to all zeros. We build the map in a
2507 temporary bit of memory, in case the class contains only 1 character (less
2508 than 256), because in that case the compiled code doesn't use the bit map.
2509 */
2510
2511 memset(classbits, 0, 32 * sizeof(uschar));
2512
2513 #ifdef SUPPORT_UTF8
2514 class_utf8 = FALSE; /* No chars >= 256 */
2515 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2516 #endif
2517
2518 /* Process characters until ] is reached. By writing this as a "do" it
2519 means that an initial ] is taken as a data character. At the start of the
2520 loop, c contains the first byte of the character. */
2521
2522 if (c != 0) do
2523 {
2524 const uschar *oldptr;
2525
2526 #ifdef SUPPORT_UTF8
2527 if (utf8 && c > 127)
2528 { /* Braces are required because the */
2529 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2530 }
2531 #endif
2532
2533 /* Inside \Q...\E everything is literal except \E */
2534
2535 if (inescq)
2536 {
2537 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2538 {
2539 inescq = FALSE; /* Reset literal state */
2540 ptr++; /* Skip the 'E' */
2541 continue; /* Carry on with next */
2542 }
2543 goto CHECK_RANGE; /* Could be range if \E follows */
2544 }
2545
2546 /* Handle POSIX class names. Perl allows a negation extension of the
2547 form [:^name:]. A square bracket that doesn't match the syntax is
2548 treated as a literal. We also recognize the POSIX constructions
2549 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2550 5.6 and 5.8 do. */
2551
2552 if (c == '[' &&
2553 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2554 check_posix_syntax(ptr, &tempptr, cd))
2555 {
2556 BOOL local_negate = FALSE;
2557 int posix_class, taboffset, tabopt;
2558 register const uschar *cbits = cd->cbits;
2559 uschar pbits[32];
2560
2561 if (ptr[1] != ':')
2562 {
2563 *errorcodeptr = ERR31;
2564 goto FAILED;
2565 }
2566
2567 ptr += 2;
2568 if (*ptr == '^')
2569 {
2570 local_negate = TRUE;
2571 ptr++;
2572 }
2573
2574 posix_class = check_posix_name(ptr, tempptr - ptr);
2575 if (posix_class < 0)
2576 {
2577 *errorcodeptr = ERR30;
2578 goto FAILED;
2579 }
2580
2581 /* If matching is caseless, upper and lower are converted to
2582 alpha. This relies on the fact that the class table starts with
2583 alpha, lower, upper as the first 3 entries. */
2584
2585 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2586 posix_class = 0;
2587
2588 /* We build the bit map for the POSIX class in a chunk of local store
2589 because we may be adding and subtracting from it, and we don't want to
2590 subtract bits that may be in the main map already. At the end we or the
2591 result into the bit map that is being built. */
2592
2593 posix_class *= 3;
2594
2595 /* Copy in the first table (always present) */
2596
2597 memcpy(pbits, cbits + posix_class_maps[posix_class],
2598 32 * sizeof(uschar));
2599
2600 /* If there is a second table, add or remove it as required. */
2601
2602 taboffset = posix_class_maps[posix_class + 1];
2603 tabopt = posix_class_maps[posix_class + 2];
2604
2605 if (taboffset >= 0)
2606 {
2607 if (tabopt >= 0)
2608 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2609 else
2610 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2611 }
2612
2613 /* Now see if we need to remove any special characters. An option
2614 value of 1 removes vertical space and 2 removes underscore. */
2615
2616 if (tabopt < 0) tabopt = -tabopt;
2617 if (tabopt == 1) pbits[1] &= ~0x3c;
2618 else if (tabopt == 2) pbits[11] &= 0x7f;
2619
2620 /* Add the POSIX table or its complement into the main table that is
2621 being built and we are done. */
2622
2623 if (local_negate)
2624 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2625 else
2626 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2627
2628 ptr = tempptr + 1;
2629 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2630 continue; /* End of POSIX syntax handling */
2631 }
2632
2633 /* Backslash may introduce a single character, or it may introduce one
2634 of the specials, which just set a flag. The sequence \b is a special
2635 case. Inside a class (and only there) it is treated as backspace.
2636 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2637 to 'or' into the one we are building. We assume they have more than one
2638 character in them, so set class_charcount bigger than one. */
2639
2640 if (c == '\\')
2641 {
2642 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2643 if (*errorcodeptr != 0) goto FAILED;
2644
2645 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2646 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2647 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2648 else if (-c == ESC_Q) /* Handle start of quoted string */
2649 {
2650 if (ptr[1] == '\\' && ptr[2] == 'E')
2651 {
2652 ptr += 2; /* avoid empty string */
2653 }
2654 else inescq = TRUE;
2655 continue;
2656 }
2657
2658 if (c < 0)
2659 {
2660 register const uschar *cbits = cd->cbits;
2661 class_charcount += 2; /* Greater than 1 is what matters */
2662
2663 /* Save time by not doing this in the pre-compile phase. */
2664
2665 if (lengthptr == NULL) switch (-c)
2666 {
2667 case ESC_d:
2668 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2669 continue;
2670
2671 case ESC_D:
2672 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2673 continue;
2674
2675 case ESC_w:
2676 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2677 continue;
2678
2679 case ESC_W:
2680 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2681 continue;
2682
2683 case ESC_s:
2684 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2685 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2686 continue;
2687
2688 case ESC_S:
2689 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2690 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2691 continue;
2692
2693 case ESC_E: /* Perl ignores an orphan \E */
2694 continue;
2695
2696 default: /* Not recognized; fall through */
2697 break; /* Need "default" setting to stop compiler warning. */
2698 }
2699
2700 /* In the pre-compile phase, just do the recognition. */
2701
2702 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2703 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2704
2705 /* We need to deal with \H, \h, \V, and \v in both phases because
2706 they use extra memory. */
2707
2708 if (-c == ESC_h)
2709 {
2710 SETBIT(classbits, 0x09); /* VT */
2711 SETBIT(classbits, 0x20); /* SPACE */
2712 SETBIT(classbits, 0xa0); /* NSBP */
2713 #ifdef SUPPORT_UTF8
2714 if (utf8)
2715 {
2716 class_utf8 = TRUE;
2717 *class_utf8data++ = XCL_SINGLE;
2718 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2719 *class_utf8data++ = XCL_SINGLE;
2720 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2721 *class_utf8data++ = XCL_RANGE;
2722 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2723 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2724 *class_utf8data++ = XCL_SINGLE;
2725 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2726 *class_utf8data++ = XCL_SINGLE;
2727 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2728 *class_utf8data++ = XCL_SINGLE;
2729 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2730 }
2731 #endif
2732 continue;
2733 }
2734
2735 if (-c == ESC_H)
2736 {
2737 for (c = 0; c < 32; c++)
2738 {
2739 int x = 0xff;
2740 switch (c)
2741 {
2742 case 0x09/8: x ^= 1 << (0x09%8); break;
2743 case 0x20/8: x ^= 1 << (0x20%8); break;
2744 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2745 default: break;
2746 }
2747 classbits[c] |= x;
2748 }
2749
2750 #ifdef SUPPORT_UTF8
2751 if (utf8)
2752 {
2753 class_utf8 = TRUE;
2754 *class_utf8data++ = XCL_RANGE;
2755 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2756 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2757 *class_utf8data++ = XCL_RANGE;
2758 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2759 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2760 *class_utf8data++ = XCL_RANGE;
2761 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2762 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2763 *class_utf8data++ = XCL_RANGE;
2764 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2765 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2766 *class_utf8data++ = XCL_RANGE;
2767 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2768 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2769 *class_utf8data++ = XCL_RANGE;
2770 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2771 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2772 *class_utf8data++ = XCL_RANGE;
2773 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2774 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2775 }
2776 #endif
2777 continue;
2778 }
2779
2780 if (-c == ESC_v)
2781 {
2782 SETBIT(classbits, 0x0a); /* LF */
2783 SETBIT(classbits, 0x0b); /* VT */
2784 SETBIT(classbits, 0x0c); /* FF */
2785 SETBIT(classbits, 0x0d); /* CR */
2786 SETBIT(classbits, 0x85); /* NEL */
2787 #ifdef SUPPORT_UTF8
2788 if (utf8)
2789 {
2790 class_utf8 = TRUE;
2791 *class_utf8data++ = XCL_RANGE;
2792 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2793 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2794 }
2795 #endif
2796 continue;
2797 }
2798
2799 if (-c == ESC_V)
2800 {
2801 for (c = 0; c < 32; c++)
2802 {
2803 int x = 0xff;
2804 switch (c)
2805 {
2806 case 0x0a/8: x ^= 1 << (0x0a%8);
2807 x ^= 1 << (0x0b%8);
2808 x ^= 1 << (0x0c%8);
2809 x ^= 1 << (0x0d%8);
2810 break;
2811 case 0x85/8: x ^= 1 << (0x85%8); break;
2812 default: break;
2813 }
2814 classbits[c] |= x;
2815 }
2816
2817 #ifdef SUPPORT_UTF8
2818 if (utf8)
2819 {
2820 class_utf8 = TRUE;
2821 *class_utf8data++ = XCL_RANGE;
2822 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2823 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2824 *class_utf8data++ = XCL_RANGE;
2825 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2826 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2827 }
2828 #endif
2829 continue;
2830 }
2831
2832 /* We need to deal with \P and \p in both phases. */
2833
2834 #ifdef SUPPORT_UCP
2835 if (-c == ESC_p || -c == ESC_P)
2836 {
2837 BOOL negated;
2838 int pdata;
2839 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2840 if (ptype < 0) goto FAILED;
2841 class_utf8 = TRUE;
2842 *class_utf8data++ = ((-c == ESC_p) != negated)?
2843 XCL_PROP : XCL_NOTPROP;
2844 *class_utf8data++ = ptype;
2845 *class_utf8data++ = pdata;
2846 class_charcount -= 2; /* Not a < 256 character */
2847 continue;
2848 }
2849 #endif
2850 /* Unrecognized escapes are faulted if PCRE is running in its
2851 strict mode. By default, for compatibility with Perl, they are
2852 treated as literals. */
2853
2854 if ((options & PCRE_EXTRA) != 0)
2855 {
2856 *errorcodeptr = ERR7;
2857 goto FAILED;
2858 }
2859
2860 class_charcount -= 2; /* Undo the default count from above */
2861 c = *ptr; /* Get the final character and fall through */
2862 }
2863
2864 /* Fall through if we have a single character (c >= 0). This may be
2865 greater than 256 in UTF-8 mode. */
2866
2867 } /* End of backslash handling */
2868
2869 /* A single character may be followed by '-' to form a range. However,
2870 Perl does not permit ']' to be the end of the range. A '-' character
2871 at the end is treated as a literal. Perl ignores orphaned \E sequences
2872 entirely. The code for handling \Q and \E is messy. */
2873
2874 CHECK_RANGE:
2875 while (ptr[1] == '\\' && ptr[2] == 'E')
2876 {
2877 inescq = FALSE;
2878 ptr += 2;
2879 }
2880
2881 oldptr = ptr;
2882
2883 if (!inescq && ptr[1] == '-')
2884 {
2885 int d;
2886 ptr += 2;
2887 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2888
2889 /* If we hit \Q (not followed by \E) at this point, go into escaped
2890 mode. */
2891
2892 while (*ptr == '\\' && ptr[1] == 'Q')
2893 {
2894 ptr += 2;
2895 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2896 inescq = TRUE;
2897 break;
2898 }
2899
2900 if (*ptr == 0 || (!inescq && *ptr == ']'))
2901 {
2902 ptr = oldptr;
2903 goto LONE_SINGLE_CHARACTER;
2904 }
2905
2906 #ifdef SUPPORT_UTF8
2907 if (utf8)
2908 { /* Braces are required because the */
2909 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2910 }
2911 else
2912 #endif
2913 d = *ptr; /* Not UTF-8 mode */
2914
2915 /* The second part of a range can be a single-character escape, but
2916 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2917 in such circumstances. */
2918
2919 if (!inescq && d == '\\')
2920 {
2921 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2922 if (*errorcodeptr != 0) goto FAILED;
2923
2924 /* \b is backspace; \X is literal X; \R is literal R; any other
2925 special means the '-' was literal */
2926
2927 if (d < 0)
2928 {
2929 if (d == -ESC_b) d = '\b';
2930 else if (d == -ESC_X) d = 'X';
2931 else if (d == -ESC_R) d = 'R'; else
2932 {
2933 ptr = oldptr;
2934 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2935 }
2936 }
2937 }
2938
2939 /* Check that the two values are in the correct order. Optimize
2940 one-character ranges */
2941
2942 if (d < c)
2943 {
2944 *errorcodeptr = ERR8;
2945 goto FAILED;
2946 }
2947
2948 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2949
2950 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2951 matching, we have to use an XCLASS with extra data items. Caseless
2952 matching for characters > 127 is available only if UCP support is
2953 available. */
2954
2955 #ifdef SUPPORT_UTF8
2956 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2957 {
2958 class_utf8 = TRUE;
2959
2960 /* With UCP support, we can find the other case equivalents of
2961 the relevant characters. There may be several ranges. Optimize how
2962 they fit with the basic range. */
2963
2964 #ifdef SUPPORT_UCP
2965 if ((options & PCRE_CASELESS) != 0)
2966 {
2967 unsigned int occ, ocd;
2968 unsigned int cc = c;
2969 unsigned int origd = d;
2970 while (get_othercase_range(&cc, origd, &occ, &ocd))
2971 {
2972 if (occ >= (unsigned int)c &&
2973 ocd <= (unsigned int)d)
2974 continue; /* Skip embedded ranges */
2975
2976 if (occ < (unsigned int)c &&
2977 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2978 { /* if there is overlap, */
2979 c = occ; /* noting that if occ < c */
2980 continue; /* we can't have ocd > d */
2981 } /* because a subrange is */
2982 if (ocd > (unsigned int)d &&
2983 occ <= (unsigned int)d + 1) /* always shorter than */
2984 { /* the basic range. */
2985 d = ocd;
2986 continue;
2987 }
2988
2989 if (occ == ocd)
2990 {
2991 *class_utf8data++ = XCL_SINGLE;
2992 }
2993 else
2994 {
2995 *class_utf8data++ = XCL_RANGE;
2996 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2997 }
2998 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2999 }
3000 }
3001 #endif /* SUPPORT_UCP */
3002
3003 /* Now record the original range, possibly modified for UCP caseless
3004 overlapping ranges. */
3005
3006 *class_utf8data++ = XCL_RANGE;
3007 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3008 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3009
3010 /* With UCP support, we are done. Without UCP support, there is no
3011 caseless matching for UTF-8 characters > 127; we can use the bit map
3012 for the smaller ones. */
3013
3014 #ifdef SUPPORT_UCP
3015 continue; /* With next character in the class */
3016 #else
3017 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3018
3019 /* Adjust upper limit and fall through to set up the map */
3020
3021 d = 127;
3022
3023 #endif /* SUPPORT_UCP */
3024 }
3025 #endif /* SUPPORT_UTF8 */
3026
3027 /* We use the bit map for all cases when not in UTF-8 mode; else
3028 ranges that lie entirely within 0-127 when there is UCP support; else
3029 for partial ranges without UCP support. */
3030
3031 class_charcount += d - c + 1;
3032 class_lastchar = d;
3033
3034 /* We can save a bit of time by skipping this in the pre-compile. */
3035
3036 if (lengthptr == NULL) for (; c <= d; c++)
3037 {
3038 classbits[c/8] |= (1 << (c&7));
3039 if ((options & PCRE_CASELESS) != 0)
3040 {
3041 int uc = cd->fcc[c]; /* flip case */
3042 classbits[uc/8] |= (1 << (uc&7));
3043 }
3044 }
3045
3046 continue; /* Go get the next char in the class */
3047 }
3048
3049 /* Handle a lone single character - we can get here for a normal
3050 non-escape char, or after \ that introduces a single character or for an
3051 apparent range that isn't. */
3052
3053 LONE_SINGLE_CHARACTER:
3054
3055 /* Handle a character that cannot go in the bit map */
3056
3057 #ifdef SUPPORT_UTF8
3058 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3059 {
3060 class_utf8 = TRUE;
3061 *class_utf8data++ = XCL_SINGLE;
3062 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3063
3064 #ifdef SUPPORT_UCP
3065 if ((options & PCRE_CASELESS) != 0)
3066 {
3067 unsigned int othercase;
3068 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3069 {
3070 *class_utf8data++ = XCL_SINGLE;
3071 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3072 }
3073 }
3074 #endif /* SUPPORT_UCP */
3075
3076 }
3077 else
3078 #endif /* SUPPORT_UTF8 */
3079
3080 /* Handle a single-byte character */
3081 {
3082 classbits[c/8] |= (1 << (c&7));
3083 if ((options & PCRE_CASELESS) != 0)
3084 {
3085 c = cd->fcc[c]; /* flip case */
3086 classbits[c/8] |= (1 << (c&7));
3087 }
3088 class_charcount++;
3089 class_lastchar = c;
3090 }
3091 }
3092
3093 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3094
3095 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3096
3097 if (c == 0) /* Missing terminating ']' */
3098 {
3099 *errorcodeptr = ERR6;
3100 goto FAILED;
3101 }
3102
3103 /* If class_charcount is 1, we saw precisely one character whose value is
3104 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3105 can optimize the negative case only if there were no characters >= 128
3106 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3107 single-bytes only. This is an historical hangover. Maybe one day we can
3108 tidy these opcodes to handle multi-byte characters.
3109
3110 The optimization throws away the bit map. We turn the item into a
3111 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3112 that OP_NOT does not support multibyte characters. In the positive case, it
3113 can cause firstbyte to be set. Otherwise, there can be no first char if
3114 this item is first, whatever repeat count may follow. In the case of
3115 reqbyte, save the previous value for reinstating. */
3116
3117 #ifdef SUPPORT_UTF8
3118 if (class_charcount == 1 &&
3119 (!utf8 ||
3120 (!class_utf8 && (!negate_class || class_lastchar < 128))))
3121
3122 #else
3123 if (class_charcount == 1)
3124 #endif
3125 {
3126 zeroreqbyte = reqbyte;
3127
3128 /* The OP_NOT opcode works on one-byte characters only. */
3129
3130 if (negate_class)
3131 {
3132 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3133 zerofirstbyte = firstbyte;
3134 *code++ = OP_NOT;
3135 *code++ = class_lastchar;
3136 break;
3137 }
3138
3139 /* For a single, positive character, get the value into mcbuffer, and
3140 then we can handle this with the normal one-character code. */
3141
3142 #ifdef SUPPORT_UTF8
3143 if (utf8 && class_lastchar > 127)
3144 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3145 else
3146 #endif
3147 {
3148 mcbuffer[0] = class_lastchar;
3149 mclength = 1;
3150 }
3151 goto ONE_CHAR;
3152 } /* End of 1-char optimization */
3153
3154 /* The general case - not the one-char optimization. If this is the first
3155 thing in the branch, there can be no first char setting, whatever the
3156 repeat count. Any reqbyte setting must remain unchanged after any kind of
3157 repeat. */
3158
3159 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3160 zerofirstbyte = firstbyte;
3161 zeroreqbyte = reqbyte;
3162
3163 /* If there are characters with values > 255, we have to compile an
3164 extended class, with its own opcode. If there are no characters < 256,
3165 we can omit the bitmap in the actual compiled code. */
3166
3167 #ifdef SUPPORT_UTF8
3168 if (class_utf8)
3169 {
3170 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3171 *code++ = OP_XCLASS;
3172 code += LINK_SIZE;
3173 *code = negate_class? XCL_NOT : 0;
3174
3175 /* If the map is required, move up the extra data to make room for it;
3176 otherwise just move the code pointer to the end of the extra data. */
3177
3178 if (class_charcount > 0)
3179 {
3180 *code++ |= XCL_MAP;
3181 memmove(code + 32, code, class_utf8data - code);
3182 memcpy(code, classbits, 32);
3183 code = class_utf8data + 32;
3184 }
3185 else code = class_utf8data;
3186
3187 /* Now fill in the complete length of the item */
3188
3189 PUT(previous, 1, code - previous);
3190 break; /* End of class handling */
3191 }
3192 #endif
3193
3194 /* If there are no characters > 255, negate the 32-byte map if necessary,
3195 and copy it into the code vector. If this is the first thing in the branch,
3196 there can be no first char setting, whatever the repeat count. Any reqbyte
3197 setting must remain unchanged after any kind of repeat. */
3198
3199 if (negate_class)
3200 {
3201 *code++ = OP_NCLASS;
3202 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3203 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3204 }
3205 else
3206 {
3207 *code++ = OP_CLASS;
3208 memcpy(code, classbits, 32);
3209 }
3210 code += 32;
3211 break;
3212
3213
3214 /* ===================================================================*/
3215 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3216 has been tested above. */
3217
3218 case '{':
3219 if (!is_quantifier) goto NORMAL_CHAR;
3220 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3221 if (*errorcodeptr != 0) goto FAILED;
3222 goto REPEAT;
3223
3224 case '*':
3225 repeat_min = 0;
3226 repeat_max = -1;
3227 goto REPEAT;
3228
3229 case '+':
3230 repeat_min = 1;
3231 repeat_max = -1;
3232 goto REPEAT;
3233
3234 case '?':
3235 repeat_min = 0;
3236 repeat_max = 1;
3237
3238 REPEAT:
3239 if (previous == NULL)
3240 {
3241 *errorcodeptr = ERR9;
3242 goto FAILED;
3243 }
3244
3245 if (repeat_min == 0)
3246 {
3247 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3248 reqbyte = zeroreqbyte; /* Ditto */
3249 }
3250
3251 /* Remember whether this is a variable length repeat */
3252
3253 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3254
3255 op_type = 0; /* Default single-char op codes */
3256 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3257
3258 /* Save start of previous item, in case we have to move it up to make space
3259 for an inserted OP_ONCE for the additional '+' extension. */
3260
3261 tempcode = previous;
3262
3263 /* If the next character is '+', we have a possessive quantifier. This
3264 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3265 If the next character is '?' this is a minimizing repeat, by default,
3266 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3267 repeat type to the non-default. */
3268
3269 if (ptr[1] == '+')
3270 {
3271 repeat_type = 0; /* Force greedy */
3272 possessive_quantifier = TRUE;
3273 ptr++;
3274 }
3275 else if (ptr[1] == '?')
3276 {
3277 repeat_type = greedy_non_default;
3278 ptr++;
3279 }
3280 else repeat_type = greedy_default;
3281
3282 /* If previous was a character match, abolish the item and generate a
3283 repeat item instead. If a char item has a minimum of more than one, ensure
3284 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3285 the first thing in a branch because the x will have gone into firstbyte
3286 instead. */
3287
3288 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3289 {
3290 /* Deal with UTF-8 characters that take up more than one byte. It's
3291 easier to write this out separately than try to macrify it. Use c to
3292 hold the length of the character in bytes, plus 0x80 to flag that it's a
3293 length rather than a small character. */
3294
3295 #ifdef SUPPORT_UTF8
3296 if (utf8 && (code[-1] & 0x80) != 0)
3297 {
3298 uschar *lastchar = code - 1;
3299 while((*lastchar & 0xc0) == 0x80) lastchar--;
3300 c = code - lastchar; /* Length of UTF-8 character */
3301 memcpy(utf8_char, lastchar, c); /* Save the char */
3302 c |= 0x80; /* Flag c as a length */
3303 }
3304 else
3305 #endif
3306
3307 /* Handle the case of a single byte - either with no UTF8 support, or
3308 with UTF-8 disabled, or for a UTF-8 character < 128. */
3309
3310 {
3311 c = code[-1];
3312 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3313 }
3314
3315 /* If the repetition is unlimited, it pays to see if the next thing on
3316 the line is something that cannot possibly match this character. If so,
3317 automatically possessifying this item gains some performance in the case
3318 where the match fails. */
3319
3320 if (!possessive_quantifier &&
3321 repeat_max < 0 &&
3322 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3323 options, cd))
3324 {
3325 repeat_type = 0; /* Force greedy */
3326 possessive_quantifier = TRUE;
3327 }
3328
3329 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3330 }
3331
3332 /* If previous was a single negated character ([^a] or similar), we use
3333 one of the special opcodes, replacing it. The code is shared with single-
3334 character repeats by setting op_type to add a suitable offset into
3335 repeat_type. We can also test for auto-possessification. OP_NOT is
3336 currently used only for single-byte chars. */
3337
3338 else if (*previous == OP_NOT)
3339 {
3340 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3341 c = previous[1];
3342 if (!possessive_quantifier &&
3343 repeat_max < 0 &&
3344 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3345 {
3346 repeat_type = 0; /* Force greedy */
3347 possessive_quantifier = TRUE;
3348 }
3349 goto OUTPUT_SINGLE_REPEAT;
3350 }
3351
3352 /* If previous was a character type match (\d or similar), abolish it and
3353 create a suitable repeat item. The code is shared with single-character
3354 repeats by setting op_type to add a suitable offset into repeat_type. Note
3355 that the Unicode property types will be present only when SUPPORT_UCP is
3356 defined, but we don't wrap the little bits of code here because it just
3357 makes it horribly messy. */
3358
3359 else if (*previous < OP_EODN)
3360 {
3361 uschar *oldcode;
3362 int prop_type, prop_value;
3363 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3364 c = *previous;
3365
3366 if (!possessive_quantifier &&
3367 repeat_max < 0 &&
3368 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3369 {
3370 repeat_type = 0; /* Force greedy */
3371 possessive_quantifier = TRUE;
3372 }
3373
3374 OUTPUT_SINGLE_REPEAT:
3375 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3376 {
3377 prop_type = previous[1];
3378 prop_value = previous[2];
3379 }
3380 else prop_type = prop_value = -1;
3381
3382 oldcode = code;
3383 code = previous; /* Usually overwrite previous item */
3384
3385 /* If the maximum is zero then the minimum must also be zero; Perl allows
3386 this case, so we do too - by simply omitting the item altogether. */
3387
3388 if (repeat_max == 0) goto END_REPEAT;
3389
3390 /* All real repeats make it impossible to handle partial matching (maybe
3391 one day we will be able to remove this restriction). */
3392
3393 if (repeat_max != 1) cd->nopartial = TRUE;
3394
3395 /* Combine the op_type with the repeat_type */
3396
3397 repeat_type += op_type;
3398
3399 /* A minimum of zero is handled either as the special case * or ?, or as
3400 an UPTO, with the maximum given. */
3401
3402 if (repeat_min == 0)
3403 {
3404 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3405 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3406 else
3407 {
3408 *code++ = OP_UPTO + repeat_type;
3409 PUT2INC(code, 0, repeat_max);
3410 }
3411 }
3412
3413 /* A repeat minimum of 1 is optimized into some special cases. If the
3414 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3415 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3416 one less than the maximum. */
3417
3418 else if (repeat_min == 1)
3419 {
3420 if (repeat_max == -1)
3421 *code++ = OP_PLUS + repeat_type;
3422 else
3423 {
3424 code = oldcode; /* leave previous item in place */
3425 if (repeat_max == 1) goto END_REPEAT;
3426 *code++ = OP_UPTO + repeat_type;
3427 PUT2INC(code, 0, repeat_max - 1);
3428 }
3429 }
3430
3431 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3432 handled as an EXACT followed by an UPTO. */
3433
3434 else
3435 {
3436 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3437 PUT2INC(code, 0, repeat_min);
3438
3439 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3440 we have to insert the character for the previous code. For a repeated
3441 Unicode property match, there are two extra bytes that define the
3442 required property. In UTF-8 mode, long characters have their length in
3443 c, with the 0x80 bit as a flag. */
3444
3445 if (repeat_max < 0)
3446 {
3447 #ifdef SUPPORT_UTF8
3448 if (utf8 && c >= 128)
3449 {
3450 memcpy(code, utf8_char, c & 7);
3451 code += c & 7;
3452 }
3453 else
3454 #endif
3455 {
3456 *code++ = c;
3457 if (prop_type >= 0)
3458 {
3459 *code++ = prop_type;
3460 *code++ = prop_value;
3461 }
3462 }
3463 *code++ = OP_STAR + repeat_type;
3464 }
3465
3466 /* Else insert an UPTO if the max is greater than the min, again
3467 preceded by the character, for the previously inserted code. If the
3468 UPTO is just for 1 instance, we can use QUERY instead. */
3469
3470 else if (repeat_max != repeat_min)
3471 {
3472 #ifdef SUPPORT_UTF8
3473 if (utf8 && c >= 128)
3474 {
3475 memcpy(code, utf8_char, c & 7);
3476 code += c & 7;
3477 }
3478 else
3479 #endif
3480 *code++ = c;
3481 if (prop_type >= 0)
3482 {
3483 *code++ = prop_type;
3484 *code++ = prop_value;
3485 }
3486 repeat_max -= repeat_min;
3487
3488 if (repeat_max == 1)
3489 {
3490 *code++ = OP_QUERY + repeat_type;
3491 }
3492 else
3493 {
3494 *code++ = OP_UPTO + repeat_type;
3495 PUT2INC(code, 0, repeat_max);
3496 }
3497 }
3498 }
3499
3500 /* The character or character type itself comes last in all cases. */
3501
3502 #ifdef SUPPORT_UTF8
3503 if (utf8 && c >= 128)
3504 {
3505 memcpy(code, utf8_char, c & 7);
3506 code += c & 7;
3507 }
3508 else
3509 #endif
3510 *code++ = c;
3511
3512 /* For a repeated Unicode property match, there are two extra bytes that
3513 define the required property. */
3514
3515 #ifdef SUPPORT_UCP
3516 if (prop_type >= 0)
3517 {
3518 *code++ = prop_type;
3519 *code++ = prop_value;
3520 }
3521 #endif
3522 }
3523
3524 /* If previous was a character class or a back reference, we put the repeat
3525 stuff after it, but just skip the item if the repeat was {0,0}. */
3526
3527 else if (*previous == OP_CLASS ||
3528 *previous == OP_NCLASS ||
3529 #ifdef SUPPORT_UTF8
3530 *previous == OP_XCLASS ||
3531 #endif
3532 *previous == OP_REF)
3533 {
3534 if (repeat_max == 0)
3535 {
3536 code = previous;
3537 goto END_REPEAT;
3538 }
3539
3540 /* All real repeats make it impossible to handle partial matching (maybe
3541 one day we will be able to remove this restriction). */
3542
3543 if (repeat_max != 1) cd->nopartial = TRUE;
3544
3545 if (repeat_min == 0 && repeat_max == -1)
3546 *code++ = OP_CRSTAR + repeat_type;
3547 else if (repeat_min == 1 && repeat_max == -1)
3548 *code++ = OP_CRPLUS + repeat_type;
3549 else if (repeat_min == 0 && repeat_max == 1)
3550 *code++ = OP_CRQUERY + repeat_type;
3551 else
3552 {
3553 *code++ = OP_CRRANGE + repeat_type;
3554 PUT2INC(code, 0, repeat_min);
3555 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3556 PUT2INC(code, 0, repeat_max);
3557 }
3558 }
3559
3560 /* If previous was a bracket group, we may have to replicate it in certain
3561 cases. */
3562
3563 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3564 *previous == OP_ONCE || *previous == OP_COND)
3565 {
3566 register int i;
3567 int ketoffset = 0;
3568 int len = code - previous;
3569 uschar *bralink = NULL;
3570
3571 /* Repeating a DEFINE group is pointless */
3572
3573 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3574 {
3575 *errorcodeptr = ERR55;
3576 goto FAILED;
3577 }
3578
3579 /* If the maximum repeat count is unlimited, find the end of the bracket
3580 by scanning through from the start, and compute the offset back to it
3581 from the current code pointer. There may be an OP_OPT setting following
3582 the final KET, so we can't find the end just by going back from the code
3583 pointer. */
3584
3585 if (repeat_max == -1)
3586 {
3587 register uschar *ket = previous;
3588 do ket += GET(ket, 1); while (*ket != OP_KET);
3589 ketoffset = code - ket;
3590 }
3591
3592 /* The case of a zero minimum is special because of the need to stick
3593 OP_BRAZERO in front of it, and because the group appears once in the
3594 data, whereas in other cases it appears the minimum number of times. For
3595 this reason, it is simplest to treat this case separately, as otherwise
3596 the code gets far too messy. There are several special subcases when the
3597 minimum is zero. */
3598
3599 if (repeat_min == 0)
3600 {
3601 /* If the maximum is also zero, we just omit the group from the output
3602 altogether. */
3603
3604 if (repeat_max == 0)
3605 {
3606 code = previous;
3607 goto END_REPEAT;
3608 }
3609
3610 /* If the maximum is 1 or unlimited, we just have to stick in the
3611 BRAZERO and do no more at this point. However, we do need to adjust
3612 any OP_RECURSE calls inside the group that refer to the group itself or
3613 any internal or forward referenced group, because the offset is from
3614 the start of the whole regex. Temporarily terminate the pattern while
3615 doing this. */
3616
3617 if (repeat_max <= 1)
3618 {
3619 *code = OP_END;
3620 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3621 memmove(previous+1, previous, len);
3622 code++;
3623 *previous++ = OP_BRAZERO + repeat_type;
3624 }
3625
3626 /* If the maximum is greater than 1 and limited, we have to replicate
3627 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3628 The first one has to be handled carefully because it's the original
3629 copy, which has to be moved up. The remainder can be handled by code
3630 that is common with the non-zero minimum case below. We have to
3631 adjust the value of repeat_max, since one less copy is required. Once
3632 again, we may have to adjust any OP_RECURSE calls inside the group. */
3633
3634 else
3635 {
3636 int offset;
3637 *code = OP_END;
3638 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3639 memmove(previous + 2 + LINK_SIZE, previous, len);
3640 code += 2 + LINK_SIZE;
3641 *previous++ = OP_BRAZERO + repeat_type;
3642 *previous++ = OP_BRA;
3643
3644 /* We chain together the bracket offset fields that have to be
3645 filled in later when the ends of the brackets are reached. */
3646
3647 offset = (bralink == NULL)? 0 : previous - bralink;
3648 bralink = previous;
3649 PUTINC(previous, 0, offset);
3650 }
3651
3652 repeat_max--;
3653 }
3654
3655 /* If the minimum is greater than zero, replicate the group as many
3656 times as necessary, and adjust the maximum to the number of subsequent
3657 copies that we need. If we set a first char from the group, and didn't
3658 set a required char, copy the latter from the former. If there are any
3659 forward reference subroutine calls in the group, there will be entries on
3660 the workspace list; replicate these with an appropriate increment. */
3661
3662 else
3663 {
3664 if (repeat_min > 1)
3665 {
3666 /* In the pre-compile phase, we don't actually do the replication. We
3667 just adjust the length as if we had. Do some paranoid checks for
3668 potential integer overflow. */
3669
3670 if (lengthptr != NULL)
3671 {
3672 int delta = (repeat_min - 1)*length_prevgroup;
3673 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3674 (double)INT_MAX ||
3675 OFLOW_MAX - *lengthptr < delta)
3676 {
3677 *errorcodeptr = ERR20;
3678 goto FAILED;
3679 }
3680 *lengthptr += delta;
3681 }
3682
3683 /* This is compiling for real */
3684
3685 else
3686 {
3687 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3688 for (i = 1; i < repeat_min; i++)
3689 {
3690 uschar *hc;
3691 uschar *this_hwm = cd->hwm;
3692 memcpy(code, previous, len);
3693 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3694 {
3695 PUT(cd->hwm, 0, GET(hc, 0) + len);
3696 cd->hwm += LINK_SIZE;
3697 }
3698 save_hwm = this_hwm;
3699 code += len;
3700 }
3701 }
3702 }
3703
3704 if (repeat_max > 0) repeat_max -= repeat_min;
3705 }
3706
3707 /* This code is common to both the zero and non-zero minimum cases. If
3708 the maximum is limited, it replicates the group in a nested fashion,
3709 remembering the bracket starts on a stack. In the case of a zero minimum,
3710 the first one was set up above. In all cases the repeat_max now specifies
3711 the number of additional copies needed. Again, we must remember to
3712 replicate entries on the forward reference list. */
3713
3714 if (repeat_max >= 0)
3715 {
3716 /* In the pre-compile phase, we don't actually do the replication. We
3717 just adjust the length as if we had. For each repetition we must add 1
3718 to the length for BRAZERO and for all but the last repetition we must
3719 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3720 paranoid checks to avoid integer overflow. */
3721
3722 if (lengthptr != NULL && repeat_max > 0)
3723 {
3724 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3725 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3726 if ((double)repeat_max *
3727 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3728 > (double)INT_MAX ||
3729 OFLOW_MAX - *lengthptr < delta)
3730 {
3731 *errorcodeptr = ERR20;
3732 goto FAILED;
3733 }
3734 *lengthptr += delta;
3735 }
3736
3737 /* This is compiling for real */
3738
3739 else for (i = repeat_max - 1; i >= 0; i--)
3740 {
3741 uschar *hc;
3742 uschar *this_hwm = cd->hwm;
3743
3744 *code++ = OP_BRAZERO + repeat_type;
3745
3746 /* All but the final copy start a new nesting, maintaining the
3747 chain of brackets outstanding. */
3748
3749 if (i != 0)
3750 {
3751 int offset;
3752 *code++ = OP_BRA;
3753 offset = (bralink == NULL)? 0 : code - bralink;
3754 bralink = code;
3755 PUTINC(code, 0, offset);
3756 }
3757
3758 memcpy(code, previous, len);
3759 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3760 {
3761 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3762 cd->hwm += LINK_SIZE;
3763 }
3764 save_hwm = this_hwm;
3765 code += len;
3766 }
3767
3768 /* Now chain through the pending brackets, and fill in their length
3769 fields (which are holding the chain links pro tem). */
3770
3771 while (bralink != NULL)
3772 {
3773 int oldlinkoffset;
3774 int offset = code - bralink + 1;
3775 uschar *bra = code - offset;
3776 oldlinkoffset = GET(bra, 1);
3777 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3778 *code++ = OP_KET;
3779 PUTINC(code, 0, offset);
3780 PUT(bra, 1, offset);
3781 }
3782 }
3783
3784 /* If the maximum is unlimited, set a repeater in the final copy. We
3785 can't just offset backwards from the current code point, because we
3786 don't know if there's been an options resetting after the ket. The
3787 correct offset was computed above.
3788
3789 Then, when we are doing the actual compile phase, check to see whether
3790 this group is a non-atomic one that could match an empty string. If so,
3791 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3792 that runtime checking can be done. [This check is also applied to
3793 atomic groups at runtime, but in a different way.] */
3794
3795 else
3796 {
3797 uschar *ketcode = code - ketoffset;
3798 uschar *bracode = ketcode - GET(ketcode, 1);
3799 *ketcode = OP_KETRMAX + repeat_type;
3800 if (lengthptr == NULL && *bracode != OP_ONCE)
3801 {
3802 uschar *scode = bracode;
3803 do
3804 {
3805 if (could_be_empty_branch(scode, ketcode, utf8))
3806 {
3807 *bracode += OP_SBRA - OP_BRA;
3808 break;
3809 }
3810 scode += GET(scode, 1);
3811 }
3812 while (*scode == OP_ALT);
3813 }
3814 }
3815 }
3816
3817 /* Else there's some kind of shambles */
3818
3819 else
3820 {
3821 *errorcodeptr = ERR11;
3822 goto FAILED;
3823 }
3824
3825 /* If the character following a repeat is '+', or if certain optimization
3826 tests above succeeded, possessive_quantifier is TRUE. For some of the
3827 simpler opcodes, there is a special alternative opcode for this. For
3828 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3829 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3830 but the special opcodes can optimize it a bit. The repeated item starts at
3831 tempcode, not at previous, which might be the first part of a string whose
3832 (former) last char we repeated.
3833
3834 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3835 an 'upto' may follow. We skip over an 'exact' item, and then test the
3836 length of what remains before proceeding. */
3837
3838 if (possessive_quantifier)
3839 {
3840 int len;
3841 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3842 *tempcode == OP_NOTEXACT)
3843 tempcode += _pcre_OP_lengths[*tempcode];
3844 len = code - tempcode;
3845 if (len > 0) switch (*tempcode)
3846 {
3847 case OP_STAR: *tempcode = OP_POSSTAR; break;
3848 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3849 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3850 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3851
3852 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3853 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3854 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3855 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3856
3857 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3858 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3859 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3860 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3861
3862 default:
3863 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3864 code += 1 + LINK_SIZE;
3865 len += 1 + LINK_SIZE;
3866 tempcode[0] = OP_ONCE;
3867 *code++ = OP_KET;
3868 PUTINC(code, 0, len);
3869 PUT(tempcode, 1, len);
3870 break;
3871 }
3872 }
3873
3874 /* In all cases we no longer have a previous item. We also set the
3875 "follows varying string" flag for subsequently encountered reqbytes if
3876 it isn't already set and we have just passed a varying length item. */
3877
3878 END_REPEAT:
3879 previous = NULL;
3880 cd->req_varyopt |= reqvary;
3881 break;
3882
3883
3884 /* ===================================================================*/
3885 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3886 lookbehind or option setting or condition or all the other extended
3887 parenthesis forms. */
3888
3889 case '(':
3890 newoptions = options;
3891 skipbytes = 0;
3892 bravalue = OP_CBRA;
3893 save_hwm = cd->hwm;
3894 reset_bracount = FALSE;
3895
3896 /* First deal with various "verbs" that can be introduced by '*'. */
3897
3898 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3899 {
3900 int i, namelen;
3901 const uschar *name = ++ptr;
3902 previous = NULL;
3903 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3904 if (*ptr == ':')
3905 {
3906 *errorcodeptr = ERR59; /* Not supported */
3907 goto FAILED;
3908 }
3909 if (*ptr != ')')
3910 {
3911 *errorcodeptr = ERR60;
3912 goto FAILED;
3913 }
3914 namelen = ptr - name;
3915 for (i = 0; i < verbcount; i++)
3916 {
3917 if (namelen == verbs[i].len &&
3918 strncmp((char *)name, verbs[i].name, namelen) == 0)
3919 {
3920 *code = verbs[i].op;
3921 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3922 break;
3923 }
3924 }
3925 if (i < verbcount) continue;
3926 *errorcodeptr = ERR60;
3927 goto FAILED;
3928 }
3929
3930 /* Deal with the extended parentheses; all are introduced by '?', and the
3931 appearance of any of them means that this is not a capturing group. */
3932
3933 else if (*ptr == '?')
3934 {
3935 int i, set, unset, namelen;
3936 int *optset;
3937 const uschar *name;
3938 uschar *slot;
3939
3940 switch (*(++ptr))
3941 {
3942 case '#': /* Comment; skip to ket */
3943 ptr++;
3944 while (*ptr != 0 && *ptr != ')') ptr++;
3945 if (*ptr == 0)
3946 {
3947 *errorcodeptr = ERR18;
3948 goto FAILED;
3949 }
3950 continue;
3951
3952
3953 /* ------------------------------------------------------------ */
3954 case '|': /* Reset capture count for each branch */
3955 reset_bracount = TRUE;
3956 /* Fall through */
3957
3958 /* ------------------------------------------------------------ */
3959 case ':': /* Non-capturing bracket */
3960 bravalue = OP_BRA;
3961 ptr++;
3962 break;
3963
3964
3965 /* ------------------------------------------------------------ */
3966 case '(':
3967 bravalue = OP_COND; /* Conditional group */
3968
3969 /* A condition can be an assertion, a number (referring to a numbered
3970 group), a name (referring to a named group), or 'R', referring to
3971 recursion. R<digits> and R&name are also permitted for recursion tests.
3972
3973 There are several syntaxes for testing a named group: (?(name)) is used
3974 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3975
3976 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3977 be the recursive thing or the name 'R' (and similarly for 'R' followed
3978 by digits), and (b) a number could be a name that consists of digits.
3979 In both cases, we look for a name first; if not found, we try the other
3980 cases. */
3981
3982 /* For conditions that are assertions, check the syntax, and then exit
3983 the switch. This will take control down to where bracketed groups,
3984 including assertions, are processed. */
3985
3986 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3987 break;
3988
3989 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3990 below), and all need to skip 3 bytes at the start of the group. */
3991
3992 code[1+LINK_SIZE] = OP_CREF;
3993 skipbytes = 3;
3994 refsign = -1;
3995
3996 /* Check for a test for recursion in a named group. */
3997
3998 if (ptr[1] == 'R' && ptr[2] == '&')
3999 {
4000 terminator = -1;
4001 ptr += 2;
4002 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4003 }
4004
4005 /* Check for a test for a named group's having been set, using the Perl
4006 syntax (?(<name>) or (?('name') */
4007
4008 else if (ptr[1] == '<')
4009 {
4010 terminator = '>';
4011 ptr++;
4012 }
4013 else if (ptr[1] == '\'')
4014 {
4015 terminator = '\'';
4016 ptr++;
4017 }
4018 else
4019 {
4020 terminator = 0;
4021 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4022 }
4023
4024 /* We now expect to read a name; anything else is an error */
4025
4026 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4027 {
4028 ptr += 1; /* To get the right offset */
4029 *errorcodeptr = ERR28;
4030 goto FAILED;
4031 }
4032
4033 /* Read the name, but also get it as a number if it's all digits */
4034
4035 recno = 0;
4036 name = ++ptr;
4037 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4038 {
4039 if (recno >= 0)
4040 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4041 recno * 10 + *ptr - '0' : -1;
4042 ptr++;
4043 }
4044 namelen = ptr - name;
4045
4046 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4047 {
4048 ptr--; /* Error offset */
4049 *errorcodeptr = ERR26;
4050 goto FAILED;
4051 }
4052
4053 /* Do no further checking in the pre-compile phase. */
4054
4055 if (lengthptr != NULL) break;
4056
4057 /* In the real compile we do the work of looking for the actual
4058 reference. If the string started with "+" or "-" we require the rest to
4059 be digits, in which case recno will be set. */
4060
4061 if (refsign > 0)
4062 {
4063 if (recno <= 0)
4064 {
4065 *errorcodeptr = ERR58;
4066 goto FAILED;
4067 }
4068 if (refsign == '-')
4069 {
4070 recno = cd->bracount - recno + 1;
4071 if (recno <= 0)
4072 {
4073 *errorcodeptr = ERR15;
4074 goto FAILED;
4075 }
4076 }
4077 else recno += cd->bracount;
4078 PUT2(code, 2+LINK_SIZE, recno);
4079 break;
4080 }
4081
4082 /* Otherwise (did not start with "+" or "-"), start by looking for the
4083 name. */
4084
4085 slot = cd->name_table;
4086 for (i = 0; i < cd->names_found; i++)
4087 {
4088 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4089 slot += cd->name_entry_size;
4090 }
4091
4092 /* Found a previous named subpattern */
4093
4094 if (i < cd->names_found)
4095 {
4096 recno = GET2(slot, 0);
4097 PUT2(code, 2+LINK_SIZE, recno);
4098 }
4099
4100 /* Search the pattern for a forward reference */
4101
4102 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4103 (options & PCRE_EXTENDED) != 0)) > 0)
4104 {
4105 PUT2(code, 2+LINK_SIZE, i);
4106 }
4107
4108 /* If terminator == 0 it means that the name followed directly after
4109 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4110 some further alternatives to try. For the cases where terminator != 0
4111 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4112 now checked all the possibilities, so give an error. */
4113
4114 else if (terminator != 0)
4115 {
4116 *errorcodeptr = ERR15;
4117 goto FAILED;
4118 }
4119
4120 /* Check for (?(R) for recursion. Allow digits after R to specify a
4121 specific group number. */
4122
4123 else if (*name == 'R')
4124 {
4125 recno = 0;
4126 for (i = 1; i < namelen; i++)
4127 {
4128 if ((digitab[name[i]] & ctype_digit) == 0)
4129 {
4130 *errorcodeptr = ERR15;
4131 goto FAILED;
4132 }
4133 recno = recno * 10 + name[i] - '0';
4134 }
4135 if (recno == 0) recno = RREF_ANY;
4136 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4137 PUT2(code, 2+LINK_SIZE, recno);
4138 }
4139
4140 /* Similarly, check for the (?(DEFINE) "condition", which is always
4141 false. */
4142
4143 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4144 {
4145 code[1+LINK_SIZE] = OP_DEF;
4146 skipbytes = 1;
4147 }
4148
4149 /* Check for the "name" actually being a subpattern number. */
4150
4151 else if (recno > 0)
4152 {
4153 PUT2(code, 2+LINK_SIZE, recno);
4154 }
4155
4156 /* Either an unidentified subpattern, or a reference to (?(0) */
4157
4158 else
4159 {
4160 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4161 goto FAILED;
4162 }
4163 break;
4164
4165
4166 /* ------------------------------------------------------------ */
4167 case '=': /* Positive lookahead */
4168 bravalue = OP_ASSERT;
4169 ptr++;
4170 break;
4171
4172
4173 /* ------------------------------------------------------------ */
4174 case '!': /* Negative lookahead */
4175 ptr++;
4176 if (*ptr == ')') /* Optimize (?!) */
4177 {
4178 *code++ = OP_FAIL;
4179 previous = NULL;
4180 continue;
4181 }
4182 bravalue = OP_ASSERT_NOT;
4183 break;
4184
4185
4186 /* ------------------------------------------------------------ */
4187 case '<': /* Lookbehind or named define */
4188 switch (ptr[1])
4189 {
4190 case '=': /* Positive lookbehind */
4191 bravalue = OP_ASSERTBACK;
4192 ptr += 2;
4193 break;
4194
4195 case '!': /* Negative lookbehind */
4196 bravalue = OP_ASSERTBACK_NOT;
4197 ptr += 2;
4198 break;
4199
4200 default: /* Could be name define, else bad */
4201 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4202 ptr++; /* Correct offset for error */
4203 *errorcodeptr = ERR24;
4204 goto FAILED;
4205 }
4206 break;
4207
4208
4209 /* ------------------------------------------------------------ */
4210 case '>': /* One-time brackets */
4211 bravalue = OP_ONCE;
4212 ptr++;
4213 break;
4214
4215
4216 /* ------------------------------------------------------------ */
4217 case 'C': /* Callout - may be followed by digits; */
4218 previous_callout = code; /* Save for later completion */
4219 after_manual_callout = 1; /* Skip one item before completing */
4220 *code++ = OP_CALLOUT;
4221 {
4222 int n = 0;
4223 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4224 n = n * 10 + *ptr - '0';
4225 if (*ptr != ')')
4226 {
4227 *errorcodeptr = ERR39;
4228 goto FAILED;
4229 }
4230 if (n > 255)
4231 {
4232 *errorcodeptr = ERR38;
4233 goto FAILED;
4234 }
4235 *code++ = n;
4236 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4237 PUT(code, LINK_SIZE, 0); /* Default length */
4238 code += 2 * LINK_SIZE;
4239 }
4240 previous = NULL;
4241 continue;
4242
4243
4244 /* ------------------------------------------------------------ */
4245 case 'P': /* Python-style named subpattern handling */
4246 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4247 {
4248 is_recurse = *ptr == '>';
4249 terminator = ')';
4250 goto NAMED_REF_OR_RECURSE;
4251 }
4252 else if (*ptr != '<') /* Test for Python-style definition */
4253 {
4254 *errorcodeptr = ERR41;
4255 goto FAILED;
4256 }
4257 /* Fall through to handle (?P< as (?< is handled */
4258
4259
4260 /* ------------------------------------------------------------ */
4261 DEFINE_NAME: /* Come here from (?< handling */
4262 case '\'':
4263 {
4264 terminator = (*ptr == '<')? '>' : '\'';
4265 name = ++ptr;
4266
4267 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4268 namelen = ptr - name;
4269
4270 /* In the pre-compile phase, just do a syntax check. */
4271
4272 if (lengthptr != NULL)
4273 {
4274 if (*ptr != terminator)
4275 {
4276 *errorcodeptr = ERR42;
4277 goto FAILED;
4278 }
4279 if (cd->names_found >= MAX_NAME_COUNT)
4280 {
4281 *errorcodeptr = ERR49;
4282 goto FAILED;
4283 }
4284 if (namelen + 3 > cd->name_entry_size)
4285 {
4286 cd->name_entry_size = namelen + 3;
4287 if (namelen > MAX_NAME_SIZE)
4288 {
4289 *errorcodeptr = ERR48;
4290 goto FAILED;
4291 }
4292 }
4293 }
4294
4295 /* In the real compile, create the entry in the table */
4296
4297 else
4298 {
4299 slot = cd->name_table;
4300 for (i = 0; i < cd->names_found; i++)
4301 {
4302 int crc = memcmp(name, slot+2, namelen);
4303 if (crc == 0)
4304 {
4305 if (slot[2+namelen] == 0)
4306 {
4307 if ((options & PCRE_DUPNAMES) == 0)
4308 {
4309 *errorcodeptr = ERR43;
4310 goto FAILED;
4311 }
4312 }
4313 else crc = -1; /* Current name is substring */
4314 }
4315 if (crc < 0)
4316 {
4317 memmove(slot + cd->name_entry_size, slot,
4318 (cd->names_found - i) * cd->name_entry_size);
4319 break;
4320 }
4321 slot += cd->name_entry_size;
4322 }
4323
4324 PUT2(slot, 0, cd->bracount + 1);
4325 memcpy(slot + 2, name, namelen);
4326 slot[2+namelen] = 0;
4327 }
4328 }
4329
4330 /* In both cases, count the number of names we've encountered. */
4331
4332 ptr++; /* Move past > or ' */
4333 cd->names_found++;
4334 goto NUMBERED_GROUP;
4335
4336
4337 /* ------------------------------------------------------------ */
4338 case '&': /* Perl recursion/subroutine syntax */
4339 terminator = ')';
4340 is_recurse = TRUE;
4341 /* Fall through */
4342
4343 /* We come here from the Python syntax above that handles both
4344 references (?P=name) and recursion (?P>name), as well as falling
4345 through from the Perl recursion syntax (?&name). */
4346
4347 NAMED_REF_OR_RECURSE:
4348 name = ++ptr;
4349 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4350 namelen = ptr - name;
4351
4352 /* In the pre-compile phase, do a syntax check and set a dummy
4353 reference number. */
4354
4355 if (lengthptr != NULL)
4356 {
4357 if (*ptr != terminator)
4358 {
4359 *errorcodeptr = ERR42;
4360 goto FAILED;
4361 }
4362 if (namelen > MAX_NAME_SIZE)
4363 {
4364 *errorcodeptr = ERR48;
4365 goto FAILED;
4366 }
4367 recno = 0;
4368 }
4369
4370 /* In the real compile, seek the name in the table */
4371
4372 else
4373 {
4374 slot = cd->name_table;
4375 for (i = 0; i < cd->names_found; i++)
4376 {
4377 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4378 slot += cd->name_entry_size;
4379 }
4380
4381 if (i < cd->names_found) /* Back reference */
4382 {
4383 recno = GET2(slot, 0);
4384 }
4385 else if ((recno = /* Forward back reference */
4386 find_parens(ptr, cd->bracount, name, namelen,
4387 (options & PCRE_EXTENDED) != 0)) <= 0)
4388 {
4389 *errorcodeptr = ERR15;
4390 goto FAILED;
4391 }
4392 }
4393
4394 /* In both phases, we can now go to the code that handles numerical
4395 recursion or backreferences. */
4396
4397 if (is_recurse) goto HANDLE_RECURSION;
4398 else goto HANDLE_REFERENCE;
4399
4400
4401 /* ------------------------------------------------------------ */
4402 case 'R': /* Recursion */
4403 ptr++; /* Same as (?0) */
4404 /* Fall through */
4405
4406
4407 /* ------------------------------------------------------------ */
4408 case '-': case '+':
4409 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4410 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4411 {
4412 const uschar *called;
4413
4414 if ((refsign = *ptr) == '+') ptr++;
4415 else if (refsign == '-')
4416 {
4417 if ((digitab[ptr[1]] & ctype_digit) == 0)
4418 goto OTHER_CHAR_AFTER_QUERY;
4419 ptr++;
4420 }
4421
4422 recno = 0;
4423 while((digitab[*ptr] & ctype_digit) != 0)
4424 recno = recno * 10 + *ptr++ - '0';
4425
4426 if (*ptr != ')')
4427 {
4428 *errorcodeptr = ERR29;
4429 goto FAILED;
4430 }
4431
4432 if (refsign == '-')
4433 {
4434 if (recno == 0)
4435 {
4436 *errorcodeptr = ERR58;
4437 goto FAILED;
4438 }
4439 recno = cd->bracount - recno + 1;
4440 if (recno <= 0)
4441 {
4442 *errorcodeptr = ERR15;
4443 goto FAILED;
4444 }
4445 }
4446 else if (refsign == '+')
4447 {
4448 if (recno == 0)
4449 {
4450 *errorcodeptr = ERR58;
4451 goto FAILED;
4452 }
4453 recno += cd->bracount;
4454 }
4455
4456 /* Come here from code above that handles a named recursion */
4457
4458 HANDLE_RECURSION:
4459
4460 previous = code;
4461 called = cd->start_code;
4462
4463 /* When we are actually compiling, find the bracket that is being
4464 referenced. Temporarily end the regex in case it doesn't exist before
4465 this point. If we end up with a forward reference, first check that
4466 the bracket does occur later so we can give the error (and position)
4467 now. Then remember this forward reference in the workspace so it can
4468 be filled in at the end. */
4469
4470 if (lengthptr == NULL)
4471 {
4472 *code = OP_END;
4473 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4474
4475 /* Forward reference */
4476
4477 if (called == NULL)
4478 {
4479 if (find_parens(ptr, cd->bracount, NULL, recno,
4480 (options & PCRE_EXTENDED) != 0) < 0)
4481 {
4482 *errorcodeptr = ERR15;
4483 goto FAILED;
4484 }
4485 called = cd->start_code + recno;
4486 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4487 }
4488
4489 /* If not a forward reference, and the subpattern is still open,
4490 this is a recursive call. We check to see if this is a left
4491 recursion that could loop for ever, and diagnose that case. */
4492
4493 else if (GET(called, 1) == 0 &&
4494 could_be_empty(called, code, bcptr, utf8))
4495 {
4496 *errorcodeptr = ERR40;
4497 goto FAILED;
4498 }
4499 }
4500
4501 /* Insert the recursion/subroutine item, automatically wrapped inside
4502 "once" brackets. Set up a "previous group" length so that a
4503 subsequent quantifier will work. */
4504
4505 *code = OP_ONCE;
4506 PUT(code, 1, 2 + 2*LINK_SIZE);
4507 code += 1 + LINK_SIZE;
4508
4509 *code = OP_RECURSE;
4510 PUT(code, 1, called - cd->start_code);
4511 code += 1 + LINK_SIZE;
4512
4513 *code = OP_KET;
4514 PUT(code, 1, 2 + 2*LINK_SIZE);
4515 code += 1 + LINK_SIZE;
4516
4517 length_prevgroup = 3 + 3*LINK_SIZE;
4518 }
4519
4520 /* Can't determine a first byte now */
4521
4522 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4523 continue;
4524
4525
4526 /* ------------------------------------------------------------ */
4527 default: /* Other characters: check option setting */
4528 OTHER_CHAR_AFTER_QUERY:
4529 set = unset = 0;
4530 optset = &set;
4531
4532 while (*ptr != ')' && *ptr != ':')
4533 {
4534 switch (*ptr++)
4535 {
4536 case '-': optset = &unset; break;
4537
4538 case 'J': /* Record that it changed in the external options */
4539 *optset |= PCRE_DUPNAMES;
4540 cd->external_options |= PCRE_JCHANGED;
4541 break;
4542
4543 case 'i': *optset |= PCRE_CASELESS; break;
4544 case 'm': *optset |= PCRE_MULTILINE; break;
4545 case 's': *optset |= PCRE_DOTALL; break;
4546 case 'x': *optset |= PCRE_EXTENDED; break;
4547 case 'U': *optset |= PCRE_UNGREEDY; break;
4548 case 'X': *optset |= PCRE_EXTRA; break;
4549
4550 default: *errorcodeptr = ERR12;
4551 ptr--; /* Correct the offset */
4552 goto FAILED;
4553 }
4554 }
4555
4556 /* Set up the changed option bits, but don't change anything yet. */
4557
4558 newoptions = (options | set) & (~unset);
4559
4560 /* If the options ended with ')' this is not the start of a nested
4561 group with option changes, so the options change at this level. If this
4562 item is right at the start of the pattern, the options can be
4563 abstracted and made external in the pre-compile phase, and ignored in
4564 the compile phase. This can be helpful when matching -- for instance in
4565 caseless checking of required bytes.
4566
4567 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4568 definitely *not* at the start of the pattern because something has been
4569 compiled. In the pre-compile phase, however, the code pointer can have
4570 that value after the start, because it gets reset as code is discarded
4571 during the pre-compile. However, this can happen only at top level - if
4572 we are within parentheses, the starting BRA will still be present. At
4573 any parenthesis level, the length value can be used to test if anything
4574 has been compiled at that level. Thus, a test for both these conditions
4575 is necessary to ensure we correctly detect the start of the pattern in
4576 both phases.
4577
4578 If we are not at the pattern start, compile code to change the ims
4579 options if this setting actually changes any of them. We also pass the
4580 new setting back so that it can be put at the start of any following
4581 branches, and when this group ends (if we are in a group), a resetting
4582 item can be compiled. */
4583
4584 if (*ptr == ')')
4585 {
4586 if (code == cd->start_code + 1 + LINK_SIZE &&
4587 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4588 {
4589 cd->external_options = newoptions;
4590 options = newoptions;
4591 }
4592 else
4593 {
4594 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4595 {
4596 *code++ = OP_OPT;
4597 *code++ = newoptions & PCRE_IMS;
4598 }
4599
4600 /* Change options at this level, and pass them back for use
4601 in subsequent branches. Reset the greedy defaults and the case
4602 value for firstbyte and reqbyte. */
4603
4604 *optionsptr = options = newoptions;
4605 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4606 greedy_non_default = greedy_default ^ 1;
4607 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4608 }
4609
4610 previous = NULL; /* This item can't be repeated */
4611 continue; /* It is complete */
4612 }
4613
4614 /* If the options ended with ':' we are heading into a nested group
4615 with possible change of options. Such groups are non-capturing and are
4616 not assertions of any kind. All we need to do is skip over the ':';
4617 the newoptions value is handled below. */
4618
4619 bravalue = OP_BRA;
4620 ptr++;
4621 } /* End of switch for character following (? */
4622 } /* End of (? handling */
4623
4624 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4625 all unadorned brackets become non-capturing and behave like (?:...)
4626 brackets. */
4627
4628 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4629 {
4630 bravalue = OP_BRA;
4631 }
4632
4633 /* Else we have a capturing group. */
4634
4635 else
4636 {
4637 NUMBERED_GROUP:
4638 cd->bracount += 1;
4639 PUT2(code, 1+LINK_SIZE, cd->bracount);
4640 skipbytes = 2;
4641 }
4642
4643 /* Process nested bracketed regex. Assertions may not be repeated, but
4644 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4645 non-register variable in order to be able to pass its address because some
4646 compilers complain otherwise. Pass in a new setting for the ims options if
4647 they have changed. */
4648
4649 previous = (bravalue >= OP_ONCE)? code : NULL;
4650 *code = bravalue;
4651 tempcode = code;
4652 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4653 length_prevgroup = 0; /* Initialize for pre-compile phase */
4654
4655 if (!compile_regex(
4656 newoptions, /* The complete new option state */
4657 options & PCRE_IMS, /* The previous ims option state */
4658 &tempcode, /* Where to put code (updated) */
4659 &ptr, /* Input pointer (updated) */
4660 errorcodeptr, /* Where to put an error message */
4661 (bravalue == OP_ASSERTBACK ||
4662 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4663 reset_bracount, /* True if (?| group */
4664 skipbytes, /* Skip over bracket number */
4665 &subfirstbyte, /* For possible first char */
4666 &subreqbyte, /* For possible last char */
4667 bcptr, /* Current branch chain */
4668 cd, /* Tables block */
4669 (lengthptr == NULL)? NULL : /* Actual compile phase */
4670 &length_prevgroup /* Pre-compile phase */
4671 ))
4672 goto FAILED;
4673
4674 /* At the end of compiling, code is still pointing to the start of the
4675 group, while tempcode has been updated to point past the end of the group
4676 and any option resetting that may follow it. The pattern pointer (ptr)
4677 is on the bracket. */
4678
4679 /* If this is a conditional bracket, check that there are no more than
4680 two branches in the group, or just one if it's a DEFINE group. We do this
4681 in the real compile phase, not in the pre-pass, where the whole group may
4682 not be available. */
4683
4684 if (bravalue == OP_COND && lengthptr == NULL)
4685 {
4686 uschar *tc = code;
4687 int condcount = 0;
4688
4689 do {
4690 condcount++;
4691 tc += GET(tc,1);
4692 }
4693 while (*tc != OP_KET);
4694
4695 /* A DEFINE group is never obeyed inline (the "condition" is always
4696 false). It must have only one branch. */
4697
4698 if (code[LINK_SIZE+1] == OP_DEF)
4699 {
4700 if (condcount > 1)
4701 {
4702 *errorcodeptr = ERR54;
4703 goto FAILED;
4704 }
4705 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4706 }
4707
4708 /* A "normal" conditional group. If there is just one branch, we must not
4709 make use of its firstbyte or reqbyte, because this is equivalent to an
4710 empty second branch. */
4711
4712 else
4713 {
4714 if (condcount > 2)
4715 {
4716 *errorcodeptr = ERR27;
4717 goto FAILED;
4718 }
4719 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4720 }
4721 }
4722
4723 /* Error if hit end of pattern */
4724
4725 if (*ptr != ')')
4726 {
4727 *errorcodeptr = ERR14;
4728 goto FAILED;
4729 }
4730
4731 /* In the pre-compile phase, update the length by the length of the nested
4732 group, less the brackets at either end. Then reduce the compiled code to
4733 just the brackets so that it doesn't use much memory if it is duplicated by
4734 a quantifier. */
4735
4736 if (lengthptr != NULL)
4737 {
4738 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4739 {
4740 *errorcodeptr = ERR20;
4741 goto FAILED;
4742 }
4743 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4744 code++;
4745 PUTINC(code, 0, 1 + LINK_SIZE);
4746 *code++ = OP_KET;
4747 PUTINC(code, 0, 1 + LINK_SIZE);
4748 }
4749
4750 /* Otherwise update the main code pointer to the end of the group. */
4751
4752 else code = tempcode;
4753
4754 /* For a DEFINE group, required and first character settings are not
4755 relevant. */
4756
4757 if (bravalue == OP_DEF) break;
4758
4759 /* Handle updating of the required and first characters for other types of
4760 group. Update for normal brackets of all kinds, and conditions with two
4761 branches (see code above). If the bracket is followed by a quantifier with
4762 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4763 zerofirstbyte outside the main loop so that they can be accessed for the
4764 back off. */
4765
4766 zeroreqbyte = reqbyte;
4767 zerofirstbyte = firstbyte;
4768 groupsetfirstbyte = FALSE;
4769
4770 if (bravalue >= OP_ONCE)
4771 {
4772 /* If we have not yet set a firstbyte in this branch, take it from the
4773 subpattern, remembering that it was set here so that a repeat of more
4774 than one can replicate it as reqbyte if necessary. If the subpattern has
4775 no firstbyte, set "none" for the whole branch. In both cases, a zero
4776 repeat forces firstbyte to "none". */
4777
4778 if (firstbyte == REQ_UNSET)
4779 {
4780 if (subfirstbyte >= 0)
4781 {
4782 firstbyte = subfirstbyte;
4783 groupsetfirstbyte = TRUE;
4784 }
4785 else firstbyte = REQ_NONE;
4786 zerofirstbyte = REQ_NONE;
4787 }
4788
4789 /* If firstbyte was previously set, convert the subpattern's firstbyte
4790 into reqbyte if there wasn't one, using the vary flag that was in
4791 existence beforehand. */
4792
4793 else if (subfirstbyte >= 0 && subreqbyte < 0)
4794 subreqbyte = subfirstbyte | tempreqvary;
4795
4796 /* If the subpattern set a required byte (or set a first byte that isn't
4797 really the first byte - see above), set it. */
4798
4799 if (subreqbyte >= 0) reqbyte = subreqbyte;
4800 }
4801
4802 /* For a forward assertion, we take the reqbyte, if set. This can be
4803 helpful if the pattern that follows the assertion doesn't set a different
4804 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4805 for an assertion, however because it leads to incorrect effect for patterns
4806 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4807 of a firstbyte. This is overcome by a scan at the end if there's no
4808 firstbyte, looking for an asserted first char. */
4809
4810 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4811 break; /* End of processing '(' */
4812
4813
4814 /* ===================================================================*/
4815 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4816 are arranged to be the negation of the corresponding OP_values. For the
4817 back references, the values are ESC_REF plus the reference number. Only
4818 back references and those types that consume a character may be repeated.
4819 We can test for values between ESC_b and ESC_Z for the latter; this may
4820 have to change if any new ones are ever created. */
4821
4822 case '\\':
4823 tempptr = ptr;
4824 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4825 if (*errorcodeptr != 0) goto FAILED;
4826
4827 if (c < 0)
4828 {
4829 if (-c == ESC_Q) /* Handle start of quoted string */
4830 {
4831 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4832 else inescq = TRUE;
4833 continue;
4834 }
4835
4836 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4837
4838 /* For metasequences that actually match a character, we disable the
4839 setting of a first character if it hasn't already been set. */
4840
4841 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4842 firstbyte = REQ_NONE;
4843
4844 /* Set values to reset to if this is followed by a zero repeat. */
4845
4846 zerofirstbyte = firstbyte;
4847 zeroreqbyte = reqbyte;
4848
4849 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4850 We also support \k{name} (.NET syntax) */
4851
4852 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4853 {
4854 is_recurse = FALSE;
4855 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4856 goto NAMED_REF_OR_RECURSE;
4857 }
4858
4859 /* Back references are handled specially; must disable firstbyte if
4860 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4861 ':' later. */
4862
4863 if (-c >= ESC_REF)
4864 {
4865 recno = -c - ESC_REF;
4866
4867 HANDLE_REFERENCE: /* Come here from named backref handling */
4868 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4869 previous = code;
4870 *code++ = OP_REF;
4871 PUT2INC(code, 0, recno);
4872 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4873 if (recno > cd->top_backref) cd->top_backref = recno;
4874 }
4875
4876 /* So are Unicode property matches, if supported. */
4877
4878 #ifdef SUPPORT_UCP
4879 else if (-c == ESC_P || -c == ESC_p)
4880 {
4881 BOOL negated;
4882 int pdata;
4883 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4884 if (ptype < 0) goto FAILED;
4885 previous = code;
4886 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4887 *code++ = ptype;
4888 *code++ = pdata;
4889 }
4890 #else
4891
4892 /* If Unicode properties are not supported, \X, \P, and \p are not
4893 allowed. */
4894
4895 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4896 {
4897 *errorcodeptr = ERR45;
4898 goto FAILED;
4899 }
4900 #endif
4901
4902 /* For the rest (including \X when Unicode properties are supported), we
4903 can obtain the OP value by negating the escape value. */
4904
4905 else
4906 {
4907 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4908 *code++ = -c;
4909 }
4910 continue;
4911 }
4912
4913 /* We have a data character whose value is in c. In UTF-8 mode it may have
4914 a value > 127. We set its representation in the length/buffer, and then
4915 handle it as a data character. */
4916
4917 #ifdef SUPPORT_UTF8
4918 if (utf8 && c > 127)
4919 mclength = _pcre_ord2utf8(c, mcbuffer);
4920 else
4921 #endif
4922
4923 {
4924 mcbuffer[0] = c;
4925 mclength = 1;
4926 }
4927 goto ONE_CHAR;
4928
4929
4930 /* ===================================================================*/
4931 /* Handle a literal character. It is guaranteed not to be whitespace or #
4932 when the extended flag is set. If we are in UTF-8 mode, it may be a
4933 multi-byte literal character. */
4934
4935 default:
4936 NORMAL_CHAR:
4937 mclength = 1;
4938 mcbuffer[0] = c;
4939
4940 #ifdef SUPPORT_UTF8
4941 if (utf8 && c >= 0xc0)
4942 {
4943 while ((ptr[1] & 0xc0) == 0x80)
4944 mcbuffer[mclength++] = *(++ptr);
4945 }
4946 #endif
4947
4948 /* At this point we have the character's bytes in mcbuffer, and the length
4949 in mclength. When not in UTF-8 mode, the length is always 1. */
4950
4951 ONE_CHAR:
4952 previous = code;
4953 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4954 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4955
4956 /* Set the first and required bytes appropriately. If no previous first
4957 byte, set it from this character, but revert to none on a zero repeat.
4958 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4959 repeat. */
4960
4961 if (firstbyte == REQ_UNSET)
4962 {
4963 zerofirstbyte = REQ_NONE;
4964 zeroreqbyte = reqbyte;
4965
4966 /* If the character is more than one byte long, we can set firstbyte
4967 only if it is not to be matched caselessly. */
4968
4969 if (mclength == 1 || req_caseopt == 0)
4970 {
4971 firstbyte = mcbuffer[0] | req_caseopt;
4972 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4973 }
4974 else firstbyte = reqbyte = REQ_NONE;
4975 }
4976
4977 /* firstbyte was previously set; we can set reqbyte only if the length is
4978 1 or the matching is caseful. */
4979
4980 else
4981 {
4982 zerofirstbyte = firstbyte;
4983 zeroreqbyte = reqbyte;
4984 if (mclength == 1 || req_caseopt == 0)
4985 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4986 }
4987
4988 break; /* End of literal character handling */
4989 }
4990 } /* end of big loop */
4991
4992
4993 /* Control never reaches here by falling through, only by a goto for all the
4994 error states. Pass back the position in the pattern so that it can be displayed
4995 to the user for diagnosing the error. */
4996
4997 FAILED:
4998 *ptrptr = ptr;
4999 return FALSE;
5000 }
5001
5002
5003
5004
5005 /*************************************************
5006 * Compile sequence of alternatives *
5007 *************************************************/
5008
5009 /* On entry, ptr is pointing past the bracket character, but on return it
5010 points to the closing bracket, or vertical bar, or end of string. The code
5011 variable is pointing at the byte into which the BRA operator has been stored.
5012 If the ims options are changed at the start (for a (?ims: group) or during any
5013 branch, we need to insert an OP_OPT item at the start of every following branch
5014 to ensure they get set correctly at run time, and also pass the new options
5015 into every subsequent branch compile.
5016
5017 This function is used during the pre-compile phase when we are trying to find
5018 out the amount of memory needed, as well as during the real compile phase. The
5019 value of lengthptr distinguishes the two phases.
5020
5021 Arguments:
5022 options option bits, including any changes for this subpattern
5023 oldims previous settings of ims option bits
5024 codeptr -> the address of the current code pointer
5025 ptrptr -> the address of the current pattern pointer
5026 errorcodeptr -> pointer to error code variable
5027 lookbehind TRUE if this is a lookbehind assertion
5028 reset_bracount TRUE to reset the count for each branch
5029 skipbytes skip this many bytes at start (for brackets and OP_COND)
5030 firstbyteptr place to put the first required character, or a negative number
5031 reqbyteptr place to put the last required character, or a negative number
5032 bcptr pointer to the chain of currently open branches
5033 cd points to the data block with tables pointers etc.
5034 lengthptr NULL during the real compile phase
5035 points to length accumulator during pre-compile phase
5036
5037 Returns: TRUE on success
5038 */
5039
5040 static BOOL
5041 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5042 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5043 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5044 int *lengthptr)
5045 {
5046 const uschar *ptr = *ptrptr;
5047 uschar *code = *codeptr;
5048 uschar *last_branch = code;
5049 uschar *start_bracket = code;
5050 uschar *reverse_count = NULL;
5051 int firstbyte, reqbyte;
5052 int branchfirstbyte, branchreqbyte;
5053 int length;
5054 int orig_bracount;
5055 int max_bracount;
5056 branch_chain bc;
5057
5058 bc.outer = bcptr;
5059 bc.current = code;
5060
5061 firstbyte = reqbyte = REQ_UNSET;
5062
5063 /* Accumulate the length for use in the pre-compile phase. Start with the
5064 length of the BRA and KET and any extra bytes that are required at the
5065 beginning. We accumulate in a local variable to save frequent testing of
5066 lengthptr for NULL. We cannot do this by looking at the value of code at the
5067 start and end of each alternative, because compiled items are discarded during
5068 the pre-compile phase so that the work space is not exceeded. */
5069
5070 length = 2 + 2*LINK_SIZE + skipbytes;
5071
5072 /* WARNING: If the above line is changed for any reason, you must also change
5073 the code that abstracts option settings at the start of the pattern and makes
5074 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5075 pre-compile phase to find out whether anything has yet been compiled or not. */
5076
5077 /* Offset is set zero to mark that this bracket is still open */
5078
5079 PUT(code, 1, 0);
5080 code += 1 + LINK_SIZE + skipbytes;
5081
5082 /* Loop for each alternative branch */
5083
5084 orig_bracount = max_bracount = cd->bracount;
5085 for (;;)
5086 {
5087 /* For a (?| group, reset the capturing bracket count so that each branch
5088 uses the same numbers. */
5089
5090 if (reset_bracount) cd->bracount = orig_bracount;
5091
5092 /* Handle a change of ims options at the start of the branch */
5093
5094 if ((options & PCRE_IMS) != oldims)
5095 {
5096 *code++ = OP_OPT;
5097 *code++ = options & PCRE_IMS;
5098 length += 2;
5099 }
5100
5101 /* Set up dummy OP_REVERSE if lookbehind assertion */
5102
5103 if (lookbehind)
5104 {
5105 *code++ = OP_REVERSE;
5106 reverse_count = code;
5107 PUTINC(code, 0, 0);
5108 length += 1 + LINK_SIZE;
5109 }
5110
5111 /* Now compile the branch; in the pre-compile phase its length gets added
5112 into the length. */
5113
5114 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5115 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5116 {
5117 *ptrptr = ptr;
5118 return FALSE;
5119 }
5120
5121 /* Keep the highest bracket count in case (?| was used and some branch
5122 has fewer than the rest. */
5123
5124 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5125
5126 /* In the real compile phase, there is some post-processing to be done. */
5127
5128 if (lengthptr == NULL)
5129 {
5130 /* If this is the first branch, the firstbyte and reqbyte values for the
5131 branch become the values for the regex. */
5132
5133 if (*last_branch != OP_ALT)
5134 {
5135 firstbyte = branchfirstbyte;
5136 reqbyte = branchreqbyte;
5137 }
5138
5139 /* If this is not the first branch, the first char and reqbyte have to
5140 match the values from all the previous branches, except that if the
5141 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5142 and we set REQ_VARY for the regex. */
5143
5144 else
5145 {
5146 /* If we previously had a firstbyte, but it doesn't match the new branch,
5147 we have to abandon the firstbyte for the regex, but if there was
5148 previously no reqbyte, it takes on the value of the old firstbyte. */
5149
5150 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5151 {
5152 if (reqbyte < 0) reqbyte = firstbyte;
5153 firstbyte = REQ_NONE;
5154 }
5155
5156 /* If we (now or from before) have no firstbyte, a firstbyte from the
5157 branch becomes a reqbyte if there isn't a branch reqbyte. */
5158
5159 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5160 branchreqbyte = branchfirstbyte;
5161
5162 /* Now ensure that the reqbytes match */
5163
5164 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5165 reqbyte = REQ_NONE;
5166 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5167 }
5168
5169 /* If lookbehind, check that this branch matches a fixed-length string, and
5170 put the length into the OP_REVERSE item. Temporarily mark the end of the
5171 branch with OP_END. */
5172
5173 if (lookbehind)
5174 {
5175 int fixed_length;
5176 *code = OP_END;
5177 fixed_length = find_fixedlength(last_branch, options);
5178 DPRINTF(("fixed length = %d\n", fixed_length));
5179 if (fixed_length < 0)
5180 {
5181 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5182 *ptrptr = ptr;
5183 return FALSE;
5184 }
5185 PUT(reverse_count, 0, fixed_length);
5186 }
5187 }
5188
5189 /* Reached end of expression, either ')' or end of pattern. In the real
5190 compile phase, go back through the alternative branches and reverse the chain
5191 of offsets, with the field in the BRA item now becoming an offset to the
5192 first alternative. If there are no alternatives, it points to the end of the
5193 group. The length in the terminating ket is always the length of the whole
5194 bracketed item. If any of the ims options were changed inside the group,
5195 compile a resetting op-code following, except at the very end of the pattern.
5196 Return leaving the pointer at the terminating char. */
5197
5198 if (*ptr != '|')
5199 {
5200 if (lengthptr == NULL)
5201 {
5202 int branch_length = code - last_branch;
5203 do
5204 {
5205 int prev_length = GET(last_branch, 1);
5206 PUT(last_branch, 1, branch_length);
5207 branch_length = prev_length;
5208 last_branch -= branch_length;
5209 }
5210 while (branch_length > 0);
5211 }
5212
5213 /* Fill in the ket */
5214
5215 *code = OP_KET;
5216 PUT(code, 1, code - start_bracket);
5217 code += 1 + LINK_SIZE;
5218
5219 /* Resetting option if needed */
5220
5221 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5222 {
5223 *code++ = OP_OPT;
5224 *code++ = oldims;
5225 length += 2;
5226 }
5227
5228 /* Retain the highest bracket number, in case resetting was used. */
5229
5230 cd->bracount = max_bracount;
5231
5232 /* Set values to pass back */
5233
5234 *codeptr = code;
5235 *ptrptr = ptr;
5236 *firstbyteptr = firstbyte;
5237 *reqbyteptr = reqbyte;
5238 if (lengthptr != NULL)
5239 {
5240 if (OFLOW_MAX - *lengthptr < length)
5241 {
5242 *errorcodeptr = ERR20;
5243 return FALSE;
5244 }
5245 *lengthptr += length;
5246 }
5247 return TRUE;
5248 }
5249
5250 /* Another branch follows. In the pre-compile phase, we can move the code
5251 pointer back to where it was for the start of the first branch. (That is,
5252 pretend that each branch is the only one.)
5253
5254 In the real compile phase, insert an ALT node. Its length field points back
5255 to the previous branch while the bracket remains open. At the end the chain
5256 is reversed. It's done like this so that the start of the bracket has a
5257 zero offset until it is closed, making it possible to detect recursion. */
5258
5259 if (lengthptr != NULL)
5260 {
5261 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5262 length += 1 + LINK_SIZE;
5263 }
5264 else
5265 {
5266 *code = OP_ALT;
5267 PUT(code, 1, code - last_branch);
5268 bc.current = last_branch = code;
5269 code += 1 + LINK_SIZE;
5270 }
5271
5272 ptr++;
5273 }
5274 /* Control never reaches here */
5275 }
5276
5277
5278
5279
5280 /*************************************************
5281 * Check for anchored expression *
5282 *************************************************/
5283
5284 /* Try to find out if this is an anchored regular expression. Consider each
5285 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5286 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5287 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5288 counts, since OP_CIRC can match in the middle.
5289
5290 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5291 This is the code for \G, which means "match at start of match position, taking
5292 into account the match offset".
5293
5294 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5295 because that will try the rest of the pattern at all possible matching points,
5296 so there is no point trying again.... er ....
5297
5298 .... except when the .* appears inside capturing parentheses, and there is a
5299 subsequent back reference to those parentheses. We haven't enough information
5300 to catch that case precisely.
5301
5302 At first, the best we could do was to detect when .* was in capturing brackets
5303 and the highest back reference was greater than or equal to that level.
5304 However, by keeping a bitmap of the first 31 back references, we can catch some
5305 of the more common cases more precisely.
5306
5307 Arguments:
5308 code points to start of expression (the bracket)
5309 options points to the options setting
5310 bracket_map a bitmap of which brackets we are inside while testing; this
5311 handles up to substring 31; after that we just have to take
5312 the less precise approach
5313 backref_map the back reference bitmap
5314
5315 Returns: TRUE or FALSE
5316 */
5317
5318 static BOOL
5319 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5320 unsigned int backref_map)
5321 {
5322 do {
5323 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5324 options, PCRE_MULTILINE, FALSE);
5325 register int op = *scode;
5326
5327 /* Non-capturing brackets */
5328
5329 if (op == OP_BRA)
5330 {
5331 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5332 }
5333
5334 /* Capturing brackets */
5335
5336 else if (op == OP_CBRA)
5337 {
5338 int n = GET2(scode, 1+LINK_SIZE);
5339 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5340 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5341 }
5342
5343 /* Other brackets */
5344
5345 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5346 {
5347 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5348 }
5349
5350 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5351 are or may be referenced. */
5352
5353 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5354 op == OP_TYPEPOSSTAR) &&
5355 (*options & PCRE_DOTALL) != 0)
5356 {
5357 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5358 }
5359
5360 /* Check for explicit anchoring */
5361
5362 else if (op != OP_SOD && op != OP_SOM &&
5363 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5364 return FALSE;
5365 code += GET(code, 1);
5366 }
5367 while (*code == OP_ALT); /* Loop for each alternative */
5368 return TRUE;
5369 }
5370
5371
5372
5373 /*************************************************
5374 * Check for starting with ^ or .* *
5375 *************************************************/
5376
5377 /* This is called to find out if every branch starts with ^ or .* so that
5378 "first char" processing can be done to speed things up in multiline
5379 matching and for non-DOTALL patterns that start with .* (which must start at
5380 the beginning or after \n). As in the case of is_anchored() (see above), we
5381 have to take account of back references to capturing brackets that contain .*
5382 because in that case we can't make the assumption.
5383
5384 Arguments:
5385 code points to start of expression (the bracket)
5386 bracket_map a bitmap of which brackets we are inside while testing; this
5387 handles up to substring 31; after that we just have to take
5388 the less precise approach
5389 backref_map the back reference bitmap
5390
5391 Returns: TRUE or FALSE
5392 */
5393
5394 static BOOL
5395 is_startline(const uschar *code, unsigned int bracket_map,
5396 unsigned int backref_map)
5397 {
5398 do {
5399 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5400 NULL, 0, FALSE);
5401 register int op = *scode;
5402
5403 /* Non-capturing brackets */
5404
5405 if (op == OP_BRA)
5406 {
5407 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5408 }
5409
5410 /* Capturing brackets */
5411
5412 else if (op == OP_CBRA)
5413 {
5414 int n = GET2(scode, 1+LINK_SIZE);
5415 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5416 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5417 }
5418
5419 /* Other brackets */
5420
5421 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5422 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5423
5424 /* .* means "start at start or after \n" if it isn't in brackets that
5425 may be referenced. */
5426
5427 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5428 {
5429 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5430 }
5431
5432 /* Check for explicit circumflex */
5433
5434 else if (op != OP_CIRC) return FALSE;
5435
5436 /* Move on to the next alternative */
5437
5438 code += GET(code, 1);
5439 }
5440 while (*code == OP_ALT); /* Loop for each alternative */
5441 return TRUE;
5442 }
5443
5444
5445
5446 /*************************************************
5447 * Check for asserted fixed first char *
5448 *************************************************/
5449
5450 /* During compilation, the "first char" settings from forward assertions are
5451 discarded, because they can cause conflicts with actual literals that follow.
5452 However, if we end up without a first char setting for an unanchored pattern,
5453 it is worth scanning the regex to see if there is an initial asserted first
5454 char. If all branches start with the same asserted char, or with a bracket all
5455 of whose alternatives start with the same asserted char (recurse ad lib), then
5456 we return that char, otherwise -1.
5457
5458 Arguments:
5459 code points to start of expression (the bracket)
5460 options pointer to the options (used to check casing changes)
5461 inassert TRUE if in an assertion
5462
5463 Returns: -1 or the fixed first char
5464 */
5465
5466 static int
5467 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5468 {
5469 register int c = -1;
5470 do {
5471 int d;
5472 const uschar *scode =
5473 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5474 register int op = *scode;
5475
5476 switch(op)
5477 {
5478 default:
5479 return -1;
5480
5481 case OP_BRA:
5482 case OP_CBRA:
5483 case OP_ASSERT:
5484 case OP_ONCE:
5485 case OP_COND:
5486 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5487 return -1;
5488 if (c < 0) c = d; else if (c != d) return -1;
5489 break;
5490
5491 case OP_EXACT: /* Fall through */
5492 scode += 2;
5493
5494 case OP_CHAR:
5495 case OP_CHARNC:
5496 case OP_PLUS:
5497 case OP_MINPLUS:
5498 case OP_POSPLUS:
5499 if (!inassert) return -1;
5500 if (c < 0)
5501 {
5502 c = scode[1];
5503 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5504 }
5505 else if (c != scode[1]) return -1;
5506 break;
5507 }
5508
5509 code += GET(code, 1);
5510 }
5511 while (*code == OP_ALT);
5512 return c;
5513 }
5514
5515
5516
5517 /*************************************************
5518 * Compile a Regular Expression *
5519 *************************************************/
5520
5521 /* This function takes a string and returns a pointer to a block of store
5522 holding a compiled version of the expression. The original API for this
5523 function had no error code return variable; it is retained for backwards
5524 compatibility. The new function is given a new name.
5525
5526 Arguments:
5527 pattern the regular expression
5528 options various option bits
5529 errorcodeptr pointer to error code variable (pcre_compile2() only)
5530 can be NULL if you don't want a code value
5531 errorptr pointer to pointer to error text
5532 erroroffset ptr offset in pattern where error was detected
5533 tables pointer to character tables or NULL
5534
5535 Returns: pointer to compiled data block, or NULL on error,
5536 with errorptr and erroroffset set
5537 */
5538
5539 PCRE_EXP_DEFN pcre *
5540 pcre_compile(const char *pattern, int options, const char **errorptr,
5541 int *erroroffset, const unsigned char *tables)
5542 {
5543 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5544 }
5545
5546
5547 PCRE_EXP_DEFN pcre *
5548 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5549 const char **errorptr, int *erroroffset, const unsigned char *tables)
5550 {
5551 real_pcre *re;
5552 int length = 1; /* For final END opcode */
5553 int firstbyte, reqbyte, newline;
5554 int errorcode = 0;
5555 #ifdef SUPPORT_UTF8
5556 BOOL utf8;
5557 #endif
5558 size_t size;
5559 uschar *code;
5560 const uschar *codestart;
5561 const uschar *ptr;
5562 compile_data compile_block;
5563 compile_data *cd = &compile_block;
5564
5565 /* This space is used for "compiling" into during the first phase, when we are
5566 computing the amount of memory that is needed. Compiled items are thrown away
5567 as soon as possible, so that a fairly large buffer should be sufficient for
5568 this purpose. The same space is used in the second phase for remembering where
5569 to fill in forward references to subpatterns. */
5570
5571 uschar cworkspace[COMPILE_WORK_SIZE];
5572
5573
5574 /* Set this early so that early errors get offset 0. */
5575
5576 ptr = (const uschar *)pattern;
5577
5578 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5579 can do is just return NULL, but we can set a code value if there is a code
5580 pointer. */
5581
5582 if (errorptr == NULL)
5583 {
5584 if (errorcodeptr != NULL) *errorcodeptr = 99;
5585 return NULL;
5586 }
5587
5588 *errorptr = NULL;
5589 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5590
5591 /* However, we can give a message for this error */
5592
5593 if (erroroffset == NULL)
5594 {
5595 errorcode = ERR16;
5596 goto PCRE_EARLY_ERROR_RETURN2;
5597 }
5598
5599 *erroroffset = 0;
5600
5601 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5602
5603 #ifdef SUPPORT_UTF8
5604 utf8 = (options & PCRE_UTF8) != 0;
5605 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5606 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5607 {
5608 errorcode = ERR44;
5609 goto PCRE_EARLY_ERROR_RETURN2;
5610 }
5611 #else
5612 if ((options & PCRE_UTF8) != 0)
5613 {
5614 errorcode = ERR32;
5615 goto PCRE_EARLY_ERROR_RETURN;
5616 }
5617 #endif
5618
5619 if ((options & ~PUBLIC_OPTIONS) != 0)
5620 {
5621 errorcode = ERR17;
5622 goto PCRE_EARLY_ERROR_RETURN;
5623 }
5624
5625 /* Set up pointers to the individual character tables */
5626
5627 if (tables == NULL) tables = _pcre_default_tables;
5628 cd->lcc = tables + lcc_offset;
5629 cd->fcc = tables + fcc_offset;
5630 cd->cbits = tables + cbits_offset;
5631 cd->ctypes = tables + ctypes_offset;
5632
5633 /* Handle different types of newline. The three bits give seven cases. The
5634 current code allows for fixed one- or two-byte sequences, plus "any" and
5635 "anycrlf". */
5636
5637 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5638 {
5639 case 0: newline = NEWLINE; break; /* Compile-time default */
5640 case PCRE_NEWLINE_CR: newline = '\r'; break;
5641 case PCRE_NEWLINE_LF: newline = '\n'; break;
5642 case PCRE_NEWLINE_CR+
5643 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5644 case PCRE_NEWLINE_ANY: newline = -1; break;
5645 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5646 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5647 }
5648
5649 if (newline == -2)
5650 {
5651 cd->nltype = NLTYPE_ANYCRLF;
5652 }
5653 else if (newline < 0)
5654 {
5655 cd->nltype = NLTYPE_ANY;
5656 }
5657 else
5658 {
5659 cd->nltype = NLTYPE_FIXED;
5660 if (newline > 255)
5661 {
5662 cd->nllen = 2;
5663 cd->nl[0] = (newline >> 8) & 255;
5664 cd->nl[1] = newline & 255;
5665 }
5666 else
5667 {
5668 cd->nllen = 1;
5669 cd->nl[0] = newline;
5670 }
5671 }
5672
5673 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5674 references to help in deciding whether (.*) can be treated as anchored or not.
5675 */
5676
5677 cd->top_backref = 0;
5678 cd->backref_map = 0;
5679
5680 /* Reflect pattern for debugging output */
5681
5682 DPRINTF(("------------------------------------------------------------------\n"));
5683 DPRINTF(("%s\n", pattern));
5684
5685 /* Pretend to compile the pattern while actually just accumulating the length
5686 of memory required. This behaviour is triggered by passing a non-NULL final
5687 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5688 to compile parts of the pattern into; the compiled code is discarded when it is
5689 no longer needed, so hopefully this workspace will never overflow, though there
5690 is a test for its doing so. */
5691
5692 cd->bracount = 0;
5693 cd->names_found = 0;
5694 cd->name_entry_size = 0;
5695 cd->name_table = NULL;
5696 cd->start_workspace = cworkspace;
5697 cd->start_code = cworkspace;
5698 cd->hwm = cworkspace;
5699 cd->start_pattern = (const uschar *)pattern;
5700 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5701 cd->req_varyopt = 0;
5702 cd->nopartial = FALSE;
5703 cd->external_options = options;
5704
5705 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5706 don't need to look at the result of the function here. The initial options have
5707 been put into the cd block so that they can be changed if an option setting is
5708 found within the regex right at the beginning. Bringing initial option settings
5709 outside can help speed up starting point checks. */
5710
5711 code = cworkspace;
5712 *code = OP_BRA;
5713 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5714 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5715 &length);
5716 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5717
5718 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5719 cd->hwm - cworkspace));
5720
5721 if (length > MAX_PATTERN_SIZE)
5722 {
5723 errorcode = ERR20;
5724 goto PCRE_EARLY_ERROR_RETURN;
5725 }
5726
5727 /* Compute the size of data block needed and get it, either from malloc or
5728 externally provided function. Integer overflow should no longer be possible
5729 because nowadays we limit the maximum value of cd->names_found and
5730 cd->name_entry_size. */
5731
5732 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5733 re = (real_pcre *)(pcre_malloc)(size);
5734
5735 if (re == NULL)
5736 {
5737 errorcode = ERR21;
5738 goto PCRE_EARLY_ERROR_RETURN;
5739 }
5740
5741 /* Put in the magic number, and save the sizes, initial options, and character
5742 table pointer. NULL is used for the default character tables. The nullpad field
5743 is at the end; it's there to help in the case when a regex compiled on a system
5744 with 4-byte pointers is run on another with 8-byte pointers. */
5745
5746 re->magic_number = MAGIC_NUMBER;
5747 re->size = size;
5748 re->options = cd->external_options;
5749 re->dummy1 = 0;
5750 re->first_byte = 0;
5751 re->req_byte = 0;
5752 re->name_table_offset = sizeof(real_pcre);
5753 re->name_entry_size = cd->name_entry_size;
5754 re->name_count = cd->names_found;
5755 re->ref_count = 0;
5756 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5757 re->nullpad = NULL;
5758
5759 /* The starting points of the name/number translation table and of the code are
5760 passed around in the compile data block. The start/end pattern and initial
5761 options are already set from the pre-compile phase, as is the name_entry_size
5762 field. Reset the bracket count and the names_found field. Also reset the hwm
5763 field; this time it's used for remembering forward references to subpatterns.
5764 */
5765
5766 cd->bracount = 0;
5767 cd->names_found = 0;
5768 cd->name_table = (uschar *)re + re->name_table_offset;
5769 codestart = cd->name_table + re->name_entry_size * re->name_count;
5770 cd->start_code = codestart;
5771 cd->hwm = cworkspace;
5772 cd->req_varyopt = 0;
5773 cd->nopartial = FALSE;
5774 cd->had_accept = FALSE;
5775
5776 /* Set up a starting, non-extracting bracket, then compile the expression. On
5777 error, errorcode will be set non-zero, so we don't need to look at the result
5778 of the function here. */
5779
5780 ptr = (const uschar *)pattern;
5781 code = (uschar *)codestart;
5782 *code = OP_BRA;
5783 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5784 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5785 re->top_bracket = cd->bracount;
5786 re->top_backref = cd->top_backref;
5787
5788 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5789 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
5790
5791 /* If not reached end of pattern on success, there's an excess bracket. */
5792
5793 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5794
5795 /* Fill in the terminating state and check for disastrous overflow, but
5796 if debugging, leave the test till after things are printed out. */
5797
5798 *code++ = OP_END;
5799
5800 #ifndef DEBUG
5801 if (code - codestart > length) errorcode = ERR23;
5802 #endif
5803
5804 /* Fill in any forward references that are required. */
5805
5806 while (errorcode == 0 && cd->hwm > cworkspace)
5807 {
5808 int offset, recno;
5809 const uschar *groupptr;
5810 cd->hwm -= LINK_SIZE;
5811 offset = GET(cd->hwm, 0);
5812 recno = GET(codestart, offset);
5813 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5814 if (groupptr == NULL) errorcode = ERR53;
5815 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5816 }
5817
5818 /* Give an error if there's back reference to a non-existent capturing
5819 subpattern. */
5820
5821 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5822
5823 /* Failed to compile, or error while post-processing */
5824
5825 if (errorcode != 0)
5826 {
5827 (pcre_free)(re);
5828 PCRE_EARLY_ERROR_RETURN:
5829 *erroroffset = ptr - (const uschar *)pattern;
5830 PCRE_EARLY_ERROR_RETURN2:
5831 *errorptr = error_texts[errorcode];
5832 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5833 return NULL;
5834 }
5835
5836 /* If the anchored option was not passed, set the flag if we can determine that
5837 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5838 as starting with .* when DOTALL is set).
5839
5840 Otherwise, if we know what the first byte has to be, save it, because that
5841 speeds up unanchored matches no end. If not, see if we can set the
5842 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5843 start with ^. and also when all branches start with .* for non-DOTALL matches.
5844 */
5845
5846 if ((re->options & PCRE_ANCHORED) == 0)
5847 {
5848 int temp_options = re->options; /* May get changed during these scans */
5849 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5850 re->options |= PCRE_ANCHORED;
5851 else
5852 {
5853 if (firstbyte < 0)
5854 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5855 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5856 {
5857 int ch = firstbyte & 255;
5858 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5859 cd->fcc[ch] == ch)? ch : firstbyte;
5860 re->options |= PCRE_FIRSTSET;
5861 }
5862 else if (is_startline(codestart, 0, cd->backref_map))
5863 re->options |= PCRE_STARTLINE;
5864 }
5865 }
5866
5867 /* For an anchored pattern, we use the "required byte" only if it follows a
5868 variable length item in the regex. Remove the caseless flag for non-caseable
5869 bytes. */
5870
5871 if (reqbyte >= 0 &&
5872 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5873 {
5874 int ch = reqbyte & 255;
5875 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5876 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5877 re->options |= PCRE_REQCHSET;
5878 }
5879
5880 /* Print out the compiled data if debugging is enabled. This is never the
5881 case when building a production library. */
5882
5883 #ifdef DEBUG
5884
5885 printf("Length = %d top_bracket = %d top_backref = %d\n",
5886 length, re->top_bracket, re->top_backref);
5887
5888 if (re->options != 0)
5889 {
5890 printf("%s%s%s%s%s%s%s%s%s\n",
5891 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5892 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5893 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5894 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5895 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5896 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5897 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5898 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5899 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5900 }
5901
5902 if ((re->options & PCRE_FIRSTSET) != 0)
5903 {
5904 int ch = re->first_byte & 255;
5905 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5906 "" : " (caseless)";
5907 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5908 else printf("First char = \\x%02x%s\n", ch, caseless);
5909 }
5910
5911 if ((re->options & PCRE_REQCHSET) != 0)
5912 {
5913 int ch = re->req_byte & 255;
5914 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5915 "" : " (caseless)";
5916 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5917 else printf("Req char = \\x%02x%s\n", ch, caseless);
5918 }
5919
5920 pcre_printint(re, stdout, TRUE);
5921
5922 /* This check is done here in the debugging case so that the code that
5923 was compiled can be seen. */
5924
5925 if (code - codestart > length)
5926 {
5927 (pcre_free)(re);
5928 *errorptr = error_texts[ERR23];
5929 *erroroffset = ptr - (uschar *)pattern;
5930 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5931 return NULL;
5932 }
5933 #endif /* DEBUG */
5934
5935 return (pcre *)re;
5936 }
5937
5938 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12