/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 215 - (show annotations) (download)
Wed Aug 15 14:20:05 2007 UTC (7 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 187512 byte(s)
Fixed overrun for missing ] with a forward reference, e.g. /(?1)\c[/.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a, eight bits per byte with bit 0 the least significant. Both
arguments and the whole expansion are parenthesized so that an expression
argument such as SETBIT(map, c + 1) selects the intended byte and bit. Note
that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
/* ASCII version of the escape table. check_escape() consults it only for
characters in the range '0' to 'z' and indexes it with (c - '0'); the 75
entries below cover exactly that range. The trailing comment on each row names
the first and last character that the row describes. */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
/* EBCDIC version of the escape table. check_escape() indexes it with
(c - 0x48), so the first entry corresponds to code 0x48. The leading comment
on each row is the hexadecimal code of that row's first entry. */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE) */
144
145 typedef struct verbitem {
146 const char *name;
147 int len;
148 int op;
149 } verbitem;
150
151 static verbitem verbs[] = {
152 { "ACCEPT", 6, OP_ACCEPT },
153 { "COMMIT", 6, OP_COMMIT },
154 { "F", 1, OP_FAIL },
155 { "FAIL", 4, OP_FAIL },
156 { "PRUNE", 5, OP_PRUNE },
157 { "SKIP", 4, OP_SKIP },
158 { "THEN", 4, OP_THEN }
159 };
160
161 static int verbcount = sizeof(verbs)/sizeof(verbitem);
162
163
164 /* Tables of names of POSIX character classes and their lengths. The list is
165 terminated by a zero length entry. The first three must be alpha, lower, upper,
166 as this is assumed for handling case independence. */
167
/* posix_names[i] and posix_name_lengths[i] describe the same class. Only the
lengths table carries the terminating entry (its final 0); the names table has
14 entries and no matching sentinel. */
168 static const char *const posix_names[] = {
169 "alpha", "lower", "upper",
170 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
171 "print", "punct", "space", "word", "xdigit" };
172
173 static const uschar posix_name_lengths[] = {
174 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175
176 /* Table of class bit maps for each POSIX class. Each class is formed from a
177 base map, with an optional addition or removal of another map. Then, for some
178 classes, there is some additional tweaking: for [:blank:] the vertical space
179 characters are removed, and for [:alpha:] and [:alnum:] the underscore
180 character is removed. The triples in the table consist of the base map offset,
181 second map offset or -1 if no second map, and a non-negative value for map
182 addition or a negative value for map subtraction (if there are two maps). The
183 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184 remove vertical space characters, 2 => remove underscore. */
185
/* One triple of ints per class, listed in the same order as posix_names, so
the triple for the class at position n in that list occupies elements 3*n,
3*n+1 and 3*n+2. The trailing comment on each row names the class. */
186 static const int posix_class_maps[] = {
187 cbit_word, cbit_digit, -2, /* alpha */
188 cbit_lower, -1, 0, /* lower */
189 cbit_upper, -1, 0, /* upper */
190 cbit_word, -1, 2, /* alnum - word without underscore */
191 cbit_print, cbit_cntrl, 0, /* ascii */
192 cbit_space, -1, 1, /* blank - a GNU extension */
193 cbit_cntrl, -1, 0, /* cntrl */
194 cbit_digit, -1, 0, /* digit */
195 cbit_graph, -1, 0, /* graph */
196 cbit_print, -1, 0, /* print */
197 cbit_punct, -1, 0, /* punct */
198 cbit_space, -1, 0, /* space */
199 cbit_word, -1, 0, /* word - a Perl extension */
200 cbit_xdigit,-1, 0 /* xdigit */
201 };
202
203
/* Two-level stringizing: XSTRING(m) expands the macro m first and then turns
the result into a string literal, so numeric limits can be embedded in the
message texts below. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used.

The table is indexed by error number; the numbered comments mark every fifth
entry. Both the strings and the array of pointers are read-only, so the array
itself is declared const as well. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  "(*VERB) with an argument is not supported",
  /* 60 */
  "(*VERB) not recognized",
  "number is too big"
};
288
289
290 /* Table to identify digits and hex digits. This is used when compiling
291 patterns. Note that the tables in chartables are dependent on the locale, and
292 may mark arbitrary characters as digits - but the PCRE compiling code expects
293 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
294 a private table here. It costs 256 bytes, but it is a lot faster than doing
295 character value tests (at least in some simple cases I timed), and in some
296 applications one wants PCRE to compile efficiently as well as match
297 efficiently.
298
299 For convenience, we use the same bit definitions as in chartables:
300
301 0x04 decimal digit
302 0x08 hexadecimal digit
303
304 Then we can use ctype_digit and ctype_xdigit in the code. */
305
306 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* ASCII version: one entry per byte value 0-255. The entries for '0'-'9' are
0x0c (0x04 decimal digit plus 0x08 hexadecimal digit); the entries for 'A'-'F'
and 'a'-'f' are 0x08 (hexadecimal digit only); everything else is zero. */
307 static const unsigned char digitab[] =
308 {
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
315 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
316 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
317 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
321 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
341
342 #else /* This is the "abnormal" case, for EBCDIC systems */
/* EBCDIC version: same bit meanings as the ASCII table (0x0c for the decimal
digits, 0x08 for the hexadecimal letters), with one entry per byte value 0-255
laid out by EBCDIC code. The hexadecimal number at the end of every other row
comment is the code of that row's first entry. */
343 static const unsigned char digitab[] =
344 {
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
360 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
361 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
362 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
369 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
375 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
376 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
377
/* Character-type table for EBCDIC builds, one entry per byte value 0-255.
check_escape() tests an entry against the mask 0x0E to decide whether a
character is alphameric. The meaning of the individual bits is not defined in
this part of the file; the declaration's own comment describes the table as a
partial duplicate of the main character tables. */
378 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
379 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
380 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
381 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
383 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
387 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
388 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
390 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
392 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
395 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
396 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
397 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
398 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
399 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
400 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
401 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
402 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
403 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
404 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
405 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
406 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
407 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
408 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
409 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
410 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
411 #endif
412
413
414 /* Definition to allow mutual recursion */
415
416 static BOOL
417 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 int *, int *, branch_chain *, compile_data *, int *);
419
420
421
422 /*************************************************
423 * Handle escapes *
424 *************************************************/
425
426 /* This function is called when a \ has been encountered. It either returns a
427 positive value for a simple escape such as \n, or a negative value which
428 encodes one of the more complicated things such as \d. A backreference to group
429 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431 ptr is pointing at the \. On exit, it is on the final character of the escape
432 sequence.
433
434 Arguments:
435 ptrptr points to the pattern position pointer
436 errorcodeptr points to the errorcode variable
437 bracount number of previous extracting brackets
438 options the options bits
439 isclass TRUE if inside a character class
440
441 Returns: zero or positive => a data character
442 negative => a special escape sequence
443 on error, errorcodeptr is set
444 */
445
446 static int
447 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448 int options, BOOL isclass)
449 {
450 BOOL utf8 = (options & PCRE_UTF8) != 0;
451 const uschar *ptr = *ptrptr + 1;
452 int c, i;
453
454 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455 ptr--; /* Set pointer back to the last byte */
456
457 /* If backslash is at the end of the pattern, it's an error. */
458
459 if (c == 0) *errorcodeptr = ERR1;
460
461 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462 a table. A non-zero result is something that can be returned immediately.
463 Otherwise further processing may be required. */
464
465 #ifndef EBCDIC /* ASCII coding */
466 else if (c < '0' || c > 'z') {} /* Not alphameric */
467 else if ((i = escapes[c - '0']) != 0) c = i;
468
469 #else /* EBCDIC coding */
470 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471 else if ((i = escapes[c - 0x48]) != 0) c = i;
472 #endif
473
474 /* Escapes that need further processing, or are illegal. */
475
476 else
477 {
478 const uschar *oldptr;
479 BOOL braced, negated;
480
481 switch (c)
482 {
483 /* A number of Perl escapes are not handled by PCRE. We give an explicit
484 error. */
485
486 case 'l':
487 case 'L':
488 case 'N':
489 case 'u':
490 case 'U':
491 *errorcodeptr = ERR37;
492 break;
493
494 /* \g must be followed by a number, either plain or braced. If positive, it
495 is an absolute backreference. If negative, it is a relative backreference.
496 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497 reference to a named group. This is part of Perl's movement towards a
498 unified syntax for back references. As this is synonymous with \k{name}, we
499 fudge it up by pretending it really was \k. */
500
501 case 'g':
502 if (ptr[1] == '{')
503 {
504 const uschar *p;
505 for (p = ptr+2; *p != 0 && *p != '}'; p++)
506 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 if (*p != 0 && *p != '}')
508 {
509 c = -ESC_k;
510 break;
511 }
512 braced = TRUE;
513 ptr++;
514 }
515 else braced = FALSE;
516
517 if (ptr[1] == '-')
518 {
519 negated = TRUE;
520 ptr++;
521 }
522 else negated = FALSE;
523
524 c = 0;
525 while ((digitab[ptr[1]] & ctype_digit) != 0)
526 c = c * 10 + *(++ptr) - '0';
527
528 if (c < 0)
529 {
530 *errorcodeptr = ERR61;
531 break;
532 }
533
534 if (c == 0 || (braced && *(++ptr) != '}'))
535 {
536 *errorcodeptr = ERR57;
537 break;
538 }
539
540 if (negated)
541 {
542 if (c > bracount)
543 {
544 *errorcodeptr = ERR15;
545 break;
546 }
547 c = bracount - (c - 1);
548 }
549
550 c = -(ESC_REF + c);
551 break;
552
553 /* The handling of escape sequences consisting of a string of digits
554 starting with one that is not zero is not straightforward. By experiment,
555 the way Perl works seems to be as follows:
556
557 Outside a character class, the digits are read as a decimal number. If the
558 number is less than 10, or if there are that many previous extracting
559 left brackets, then it is a back reference. Otherwise, up to three octal
560 digits are read to form an escaped byte. Thus \123 is likely to be octal
561 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562 value is greater than 377, the least significant 8 bits are taken. Inside a
563 character class, \ followed by a digit is always an octal number. */
564
565 case '1': case '2': case '3': case '4': case '5':
566 case '6': case '7': case '8': case '9':
567
568 if (!isclass)
569 {
570 oldptr = ptr;
571 c -= '0';
572 while ((digitab[ptr[1]] & ctype_digit) != 0)
573 c = c * 10 + *(++ptr) - '0';
574 if (c < 0)
575 {
576 *errorcodeptr = ERR61;
577 break;
578 }
579 if (c < 10 || c <= bracount)
580 {
581 c = -(ESC_REF + c);
582 break;
583 }
584 ptr = oldptr; /* Put the pointer back and fall through */
585 }
586
587 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588 generates a binary zero byte and treats the digit as a following literal.
589 Thus we have to pull back the pointer by one. */
590
591 if ((c = *ptr) >= '8')
592 {
593 ptr--;
594 c = 0;
595 break;
596 }
597
598 /* \0 always starts an octal number, but we may drop through to here with a
599 larger first octal digit. The original code used just to take the least
600 significant 8 bits of octal numbers (I think this is what early Perls used
601 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602 than 3 octal digits. */
603
604 case '0':
605 c -= '0';
606 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607 c = c * 8 + *(++ptr) - '0';
608 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 break;
610
611 /* \x is complicated. \x{ddd} is a character number which can be greater
612 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613 treated as a data character. */
614
615 case 'x':
616 if (ptr[1] == '{')
617 {
618 const uschar *pt = ptr + 2;
619 int count = 0;
620
621 c = 0;
622 while ((digitab[*pt] & ctype_xdigit) != 0)
623 {
624 register int cc = *pt++;
625 if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 count++;
627
628 #ifndef EBCDIC /* ASCII coding */
629 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 #else /* EBCDIC coding */
632 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 #endif
635 }
636
637 if (*pt == '}')
638 {
639 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 ptr = pt;
641 break;
642 }
643
644 /* If the sequence of hex digits does not end with '}', then we don't
645 recognize this construct; fall through to the normal \x handling. */
646 }
647
648 /* Read just a single-byte hex-defined char */
649
650 c = 0;
651 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652 {
653 int cc; /* Some compilers don't like ++ */
654 cc = *(++ptr); /* in initializers */
655 #ifndef EBCDIC /* ASCII coding */
656 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 #else /* EBCDIC coding */
659 if (cc <= 'z') cc += 64; /* Convert to upper case */
660 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661 #endif
662 }
663 break;
664
665 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666 This coding is ASCII-specific, but then the whole concept of \cx is
667 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668
669 case 'c':
670 c = *(++ptr);
671 if (c == 0)
672 {
673 *errorcodeptr = ERR2;
674 break;
675 }
676
677 #ifndef EBCDIC /* ASCII coding */
678 if (c >= 'a' && c <= 'z') c -= 32;
679 c ^= 0x40;
680 #else /* EBCDIC coding */
681 if (c >= 'a' && c <= 'z') c += 64;
682 c ^= 0xC0;
683 #endif
684 break;
685
686 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688 for Perl compatibility, it is a literal. This code looks a bit odd, but
689 there used to be some cases other than the default, and there may be again
690 in future, so I haven't "optimized" it. */
691
692 default:
693 if ((options & PCRE_EXTRA) != 0) switch(c)
694 {
695 default:
696 *errorcodeptr = ERR3;
697 break;
698 }
699 break;
700 }
701 }
702
703 *ptrptr = ptr;
704 return c;
705 }
706
707
708
709 #ifdef SUPPORT_UCP
710 /*************************************************
711 * Handle \P and \p *
712 *************************************************/
713
714 /* This function is called after \P or \p has been encountered, provided that
715 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
716 pointing at the P or p. On exit, it is pointing at the final character of the
717 escape sequence.
718
719 Argument:
720 ptrptr points to the pattern position pointer
721 negptr points to a boolean that is set TRUE for negation else FALSE
722 dptr points to an int that is set to the detailed property value
723 errorcodeptr points to the error code variable
724
725 Returns: type value from ucp_type_table, or -1 for an invalid type
726 */
727
728 static int
729 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
730 {
731 int c, i, bot, top;
732 const uschar *ptr = *ptrptr;
733 char name[32];
734
735 c = *(++ptr);
736 if (c == 0) goto ERROR_RETURN;
737
738 *negptr = FALSE;
739
740 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
741 negation. */
742
743 if (c == '{')
744 {
745 if (ptr[1] == '^')
746 {
747 *negptr = TRUE;
748 ptr++;
749 }
750 for (i = 0; i < (int)sizeof(name) - 1; i++)
751 {
752 c = *(++ptr);
753 if (c == 0) goto ERROR_RETURN;
754 if (c == '}') break;
755 name[i] = c;
756 }
757 if (c !='}') goto ERROR_RETURN;
758 name[i] = 0;
759 }
760
761 /* Otherwise there is just one following character */
762
763 else
764 {
765 name[0] = c;
766 name[1] = 0;
767 }
768
769 *ptrptr = ptr;
770
771 /* Search for a recognized property name using binary chop */
772
773 bot = 0;
774 top = _pcre_utt_size;
775
776 while (bot < top)
777 {
778 i = (bot + top) >> 1;
779 c = strcmp(name, _pcre_utt[i].name);
780 if (c == 0)
781 {
782 *dptr = _pcre_utt[i].value;
783 return _pcre_utt[i].type;
784 }
785 if (c > 0) bot = i + 1; else top = i;
786 }
787
788 *errorcodeptr = ERR47;
789 *ptrptr = ptr;
790 return -1;
791
792 ERROR_RETURN:
793 *errorcodeptr = ERR46;
794 *ptrptr = ptr;
795 return -1;
796 }
797 #endif
798
799
800
801
802 /*************************************************
803 * Check for counted repeat *
804 *************************************************/
805
806 /* This function is called when a '{' is encountered in a place where it might
807 start a quantifier. It looks ahead to see if it really is a quantifier or not.
808 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809 where the ddds are digits.
810
811 Arguments:
812 p pointer to the first char after '{'
813
814 Returns: TRUE or FALSE
815 */
816
817 static BOOL
818 is_counted_repeat(const uschar *p)
819 {
820 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821 while ((digitab[*p] & ctype_digit) != 0) p++;
822 if (*p == '}') return TRUE;
823
824 if (*p++ != ',') return FALSE;
825 if (*p == '}') return TRUE;
826
827 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828 while ((digitab[*p] & ctype_digit) != 0) p++;
829
830 return (*p == '}');
831 }
832
833
834
835 /*************************************************
836 * Read repeat counts *
837 *************************************************/
838
839 /* Read an item of the form {n,m} and return the values. This is called only
840 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841 so the syntax is guaranteed to be correct, but we need to check the values.
842
843 Arguments:
844 p pointer to first char after '{'
845 minp pointer to int for min
846 maxp pointer to int for max
847 returned as -1 if no max
848 errorcodeptr points to error code variable
849
850 Returns: pointer to '}' on success;
851 current ptr on error, with errorcodeptr set non-zero
852 */
853
854 static const uschar *
855 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856 {
857 int min = 0;
858 int max = -1;
859
860 /* Read the minimum value and do a paranoid check: a negative value indicates
861 an integer overflow. */
862
863 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 if (min < 0 || min > 65535)
865 {
866 *errorcodeptr = ERR5;
867 return p;
868 }
869
870 /* Read the maximum value if there is one, and again do a paranoid on its size.
871 Also, max must not be less than min. */
872
873 if (*p == '}') max = min; else
874 {
875 if (*(++p) != '}')
876 {
877 max = 0;
878 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 if (max < 0 || max > 65535)
880 {
881 *errorcodeptr = ERR5;
882 return p;
883 }
884 if (max < min)
885 {
886 *errorcodeptr = ERR4;
887 return p;
888 }
889 }
890 }
891
892 /* Fill in the required variables, and pass back the pointer to the terminating
893 '}'. */
894
895 *minp = min;
896 *maxp = max;
897 return p;
898 }
899
900
901
902 /*************************************************
903 * Find forward referenced subpattern *
904 *************************************************/
905
906 /* This function scans along a pattern's text looking for capturing
907 subpatterns, and counting them. If it finds a named pattern that matches the
908 name it is given, it returns its number. Alternatively, if the name is NULL, it
909 returns when it reaches a given numbered subpattern. This is used for forward
910 references to subpatterns. We know that if (?P< is encountered, the name will
911 be terminated by '>' because that is checked in the first pass.
912
913 Arguments:
914 ptr current position in the pattern
915 count current count of capturing parens so far encountered
916 name name to seek, or NULL if seeking a numbered subpattern
917 lorn name length, or subpattern number if name is NULL
918 xmode TRUE if we are in /x mode
919
920 Returns: the number of the named subpattern, or -1 if not found
921 */
922
923 static int
924 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925 BOOL xmode)
926 {
927 const uschar *thisname;
928
929 for (; *ptr != 0; ptr++)
930 {
931 int term;
932
933 /* Skip over backslashed characters and also entire \Q...\E */
934
935 if (*ptr == '\\')
936 {
937 if (*(++ptr) == 0) return -1;
938 if (*ptr == 'Q') for (;;)
939 {
940 while (*(++ptr) != 0 && *ptr != '\\');
941 if (*ptr == 0) return -1;
942 if (*(++ptr) == 'E') break;
943 }
944 continue;
945 }
946
947 /* Skip over character classes */
948
949 if (*ptr == '[')
950 {
951 while (*(++ptr) != ']')
952 {
953 if (*ptr == 0) return -1;
954 if (*ptr == '\\')
955 {
956 if (*(++ptr) == 0) return -1;
957 if (*ptr == 'Q') for (;;)
958 {
959 while (*(++ptr) != 0 && *ptr != '\\');
960 if (*ptr == 0) return -1;
961 if (*(++ptr) == 'E') break;
962 }
963 continue;
964 }
965 }
966 continue;
967 }
968
969 /* Skip comments in /x mode */
970
971 if (xmode && *ptr == '#')
972 {
973 while (*(++ptr) != 0 && *ptr != '\n');
974 if (*ptr == 0) return -1;
975 continue;
976 }
977
978 /* An opening parens must now be a real metacharacter */
979
980 if (*ptr != '(') continue;
981 if (ptr[1] != '?' && ptr[1] != '*')
982 {
983 count++;
984 if (name == NULL && count == lorn) return count;
985 continue;
986 }
987
988 ptr += 2;
989 if (*ptr == 'P') ptr++; /* Allow optional P */
990
991 /* We have to disambiguate (?<! and (?<= from (?<name> */
992
993 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994 *ptr != '\'')
995 continue;
996
997 count++;
998
999 if (name == NULL && count == lorn) return count;
1000 term = *ptr++;
1001 if (term == '<') term = '>';
1002 thisname = ptr;
1003 while (*ptr != term) ptr++;
1004 if (name != NULL && lorn == ptr - thisname &&
1005 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006 return count;
1007 }
1008
1009 return -1;
1010 }
1011
1012
1013
1014 /*************************************************
1015 * Find first significant op code *
1016 *************************************************/
1017
1018 /* This is called by several functions that scan a compiled expression looking
1019 for a fixed first character, or an anchoring op code etc. It skips over things
1020 that do not influence this. For some calls, a change of option is important.
1021 For some calls, it makes sense to skip negative forward and all backward
1022 assertions, and also the \b assertion; for others it does not.
1023
1024 Arguments:
1025 code pointer to the start of the group
1026 options pointer to external options
1027 optbit the option bit whose changing is significant, or
1028 zero if none are
1029 skipassert TRUE if certain assertions are to be skipped
1030
1031 Returns: pointer to the first significant opcode
1032 */
1033
1034 static const uschar*
1035 first_significant_code(const uschar *code, int *options, int optbit,
1036 BOOL skipassert)
1037 {
1038 for (;;)
1039 {
1040 switch ((int)*code)
1041 {
1042 case OP_OPT:
1043 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1044 *options = (int)code[1];
1045 code += 2;
1046 break;
1047
1048 case OP_ASSERT_NOT:
1049 case OP_ASSERTBACK:
1050 case OP_ASSERTBACK_NOT:
1051 if (!skipassert) return code;
1052 do code += GET(code, 1); while (*code == OP_ALT);
1053 code += _pcre_OP_lengths[*code];
1054 break;
1055
1056 case OP_WORD_BOUNDARY:
1057 case OP_NOT_WORD_BOUNDARY:
1058 if (!skipassert) return code;
1059 /* Fall through */
1060
1061 case OP_CALLOUT:
1062 case OP_CREF:
1063 case OP_RREF:
1064 case OP_DEF:
1065 code += _pcre_OP_lengths[*code];
1066 break;
1067
1068 default:
1069 return code;
1070 }
1071 }
1072 /* Control never reaches here */
1073 }
1074
1075
1076
1077
1078 /*************************************************
1079 * Find the fixed length of a pattern *
1080 *************************************************/
1081
1082 /* Scan a pattern and compute the fixed length of subject that will match it,
1083 if the length is fixed. This is needed for dealing with backward assertions.
1084 In UTF8 mode, the result is in characters rather than bytes.
1085
1086 Arguments:
1087 code points to the start of the pattern (the bracket)
1088 options the compiling options
1089
1090 Returns: the fixed length, or -1 if there is no fixed length,
1091 or -2 if \C was encountered
1092 */
1093
1094 static int
1095 find_fixedlength(uschar *code, int options)
1096 {
1097 int length = -1;
1098
1099 register int branchlength = 0;
1100 register uschar *cc = code + 1 + LINK_SIZE;
1101
1102 /* Scan along the opcodes for this branch. If we get to the end of the
1103 branch, check the length against that of the other branches. */
1104
1105 for (;;)
1106 {
1107 int d;
1108 register int op = *cc;
1109
1110 switch (op)
1111 {
1112 case OP_CBRA:
1113 case OP_BRA:
1114 case OP_ONCE:
1115 case OP_COND:
1116 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1117 if (d < 0) return d;
1118 branchlength += d;
1119 do cc += GET(cc, 1); while (*cc == OP_ALT);
1120 cc += 1 + LINK_SIZE;
1121 break;
1122
1123 /* Reached end of a branch; if it's a ket it is the end of a nested
1124 call. If it's ALT it is an alternation in a nested call. If it is
1125 END it's the end of the outer call. All can be handled by the same code. */
1126
1127 case OP_ALT:
1128 case OP_KET:
1129 case OP_KETRMAX:
1130 case OP_KETRMIN:
1131 case OP_END:
1132 if (length < 0) length = branchlength;
1133 else if (length != branchlength) return -1;
1134 if (*cc != OP_ALT) return length;
1135 cc += 1 + LINK_SIZE;
1136 branchlength = 0;
1137 break;
1138
1139 /* Skip over assertive subpatterns */
1140
1141 case OP_ASSERT:
1142 case OP_ASSERT_NOT:
1143 case OP_ASSERTBACK:
1144 case OP_ASSERTBACK_NOT:
1145 do cc += GET(cc, 1); while (*cc == OP_ALT);
1146 /* Fall through */
1147
1148 /* Skip over things that don't match chars */
1149
1150 case OP_REVERSE:
1151 case OP_CREF:
1152 case OP_RREF:
1153 case OP_DEF:
1154 case OP_OPT:
1155 case OP_CALLOUT:
1156 case OP_SOD:
1157 case OP_SOM:
1158 case OP_EOD:
1159 case OP_EODN:
1160 case OP_CIRC:
1161 case OP_DOLL:
1162 case OP_NOT_WORD_BOUNDARY:
1163 case OP_WORD_BOUNDARY:
1164 cc += _pcre_OP_lengths[*cc];
1165 break;
1166
1167 /* Handle literal characters */
1168
1169 case OP_CHAR:
1170 case OP_CHARNC:
1171 case OP_NOT:
1172 branchlength++;
1173 cc += 2;
1174 #ifdef SUPPORT_UTF8
1175 if ((options & PCRE_UTF8) != 0)
1176 {
1177 while ((*cc & 0xc0) == 0x80) cc++;
1178 }
1179 #endif
1180 break;
1181
1182 /* Handle exact repetitions. The count is already in characters, but we
1183 need to skip over a multibyte character in UTF8 mode. */
1184
1185 case OP_EXACT:
1186 branchlength += GET2(cc,1);
1187 cc += 4;
1188 #ifdef SUPPORT_UTF8
1189 if ((options & PCRE_UTF8) != 0)
1190 {
1191 while((*cc & 0x80) == 0x80) cc++;
1192 }
1193 #endif
1194 break;
1195
1196 case OP_TYPEEXACT:
1197 branchlength += GET2(cc,1);
1198 cc += 4;
1199 break;
1200
1201 /* Handle single-char matchers */
1202
1203 case OP_PROP:
1204 case OP_NOTPROP:
1205 cc += 2;
1206 /* Fall through */
1207
1208 case OP_NOT_DIGIT:
1209 case OP_DIGIT:
1210 case OP_NOT_WHITESPACE:
1211 case OP_WHITESPACE:
1212 case OP_NOT_WORDCHAR:
1213 case OP_WORDCHAR:
1214 case OP_ANY:
1215 branchlength++;
1216 cc++;
1217 break;
1218
1219 /* The single-byte matcher isn't allowed */
1220
1221 case OP_ANYBYTE:
1222 return -2;
1223
1224 /* Check a class for variable quantification */
1225
1226 #ifdef SUPPORT_UTF8
1227 case OP_XCLASS:
1228 cc += GET(cc, 1) - 33;
1229 /* Fall through */
1230 #endif
1231
1232 case OP_CLASS:
1233 case OP_NCLASS:
1234 cc += 33;
1235
1236 switch (*cc)
1237 {
1238 case OP_CRSTAR:
1239 case OP_CRMINSTAR:
1240 case OP_CRQUERY:
1241 case OP_CRMINQUERY:
1242 return -1;
1243
1244 case OP_CRRANGE:
1245 case OP_CRMINRANGE:
1246 if (GET2(cc,1) != GET2(cc,3)) return -1;
1247 branchlength += GET2(cc,1);
1248 cc += 5;
1249 break;
1250
1251 default:
1252 branchlength++;
1253 }
1254 break;
1255
1256 /* Anything else is variable length */
1257
1258 default:
1259 return -1;
1260 }
1261 }
1262 /* Control never gets here */
1263 }
1264
1265
1266
1267
1268 /*************************************************
1269 * Scan compiled regex for numbered bracket *
1270 *************************************************/
1271
1272 /* This little function scans through a compiled pattern until it finds a
1273 capturing bracket with the given number.
1274
1275 Arguments:
1276 code points to start of expression
1277 utf8 TRUE in UTF-8 mode
1278 number the required bracket number
1279
1280 Returns: pointer to the opcode for the bracket, or NULL if not found
1281 */
1282
1283 static const uschar *
1284 find_bracket(const uschar *code, BOOL utf8, int number)
1285 {
1286 for (;;)
1287 {
1288 register int c = *code;
1289 if (c == OP_END) return NULL;
1290
1291 /* XCLASS is used for classes that cannot be represented just by a bit
1292 map. This includes negated single high-valued characters. The length in
1293 the table is zero; the actual length is stored in the compiled code. */
1294
1295 if (c == OP_XCLASS) code += GET(code, 1);
1296
1297 /* Handle capturing bracket */
1298
1299 else if (c == OP_CBRA)
1300 {
1301 int n = GET2(code, 1+LINK_SIZE);
1302 if (n == number) return (uschar *)code;
1303 code += _pcre_OP_lengths[c];
1304 }
1305
1306 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1307 a multi-byte character. The length in the table is a minimum, so we have to
1308 arrange to skip the extra bytes. */
1309
1310 else
1311 {
1312 code += _pcre_OP_lengths[c];
1313 #ifdef SUPPORT_UTF8
1314 if (utf8) switch(c)
1315 {
1316 case OP_CHAR:
1317 case OP_CHARNC:
1318 case OP_EXACT:
1319 case OP_UPTO:
1320 case OP_MINUPTO:
1321 case OP_POSUPTO:
1322 case OP_STAR:
1323 case OP_MINSTAR:
1324 case OP_POSSTAR:
1325 case OP_PLUS:
1326 case OP_MINPLUS:
1327 case OP_POSPLUS:
1328 case OP_QUERY:
1329 case OP_MINQUERY:
1330 case OP_POSQUERY:
1331 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1332 break;
1333 }
1334 #endif
1335 }
1336 }
1337 }
1338
1339
1340
1341 /*************************************************
1342 * Scan compiled regex for recursion reference *
1343 *************************************************/
1344
1345 /* This little function scans through a compiled pattern until it finds an
1346 instance of OP_RECURSE.
1347
1348 Arguments:
1349 code points to start of expression
1350 utf8 TRUE in UTF-8 mode
1351
1352 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1353 */
1354
1355 static const uschar *
1356 find_recurse(const uschar *code, BOOL utf8)
1357 {
1358 for (;;)
1359 {
1360 register int c = *code;
1361 if (c == OP_END) return NULL;
1362 if (c == OP_RECURSE) return code;
1363
1364 /* XCLASS is used for classes that cannot be represented just by a bit
1365 map. This includes negated single high-valued characters. The length in
1366 the table is zero; the actual length is stored in the compiled code. */
1367
1368 if (c == OP_XCLASS) code += GET(code, 1);
1369
1370 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1371 that are followed by a character may be followed by a multi-byte character.
1372 The length in the table is a minimum, so we have to arrange to skip the extra
1373 bytes. */
1374
1375 else
1376 {
1377 code += _pcre_OP_lengths[c];
1378 #ifdef SUPPORT_UTF8
1379 if (utf8) switch(c)
1380 {
1381 case OP_CHAR:
1382 case OP_CHARNC:
1383 case OP_EXACT:
1384 case OP_UPTO:
1385 case OP_MINUPTO:
1386 case OP_POSUPTO:
1387 case OP_STAR:
1388 case OP_MINSTAR:
1389 case OP_POSSTAR:
1390 case OP_PLUS:
1391 case OP_MINPLUS:
1392 case OP_POSPLUS:
1393 case OP_QUERY:
1394 case OP_MINQUERY:
1395 case OP_POSQUERY:
1396 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1397 break;
1398 }
1399 #endif
1400 }
1401 }
1402 }
1403
1404
1405
1406 /*************************************************
1407 * Scan compiled branch for non-emptiness *
1408 *************************************************/
1409
1410 /* This function scans through a branch of a compiled pattern to see whether it
1411 can match the empty string or not. It is called from could_be_empty()
1412 below and from compile_branch() when checking for an unlimited repeat of a
1413 group that can match nothing. Note that first_significant_code() skips over
1414 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1415 struck an inner bracket whose current branch will already have been scanned.
1416
1417 Arguments:
1418 code points to start of search
1419 endcode points to where to stop
1420 utf8 TRUE if in UTF8 mode
1421
1422 Returns: TRUE if what is matched could be empty
1423 */
1424
1425 static BOOL
1426 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1427 {
1428 register int c;
1429 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1430 code < endcode;
1431 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1432 {
1433 const uschar *ccode;
1434
1435 c = *code;
1436
1437 /* Groups with zero repeats can of course be empty; skip them. */
1438
1439 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1440 {
1441 code += _pcre_OP_lengths[c];
1442 do code += GET(code, 1); while (*code == OP_ALT);
1443 c = *code;
1444 continue;
1445 }
1446
1447 /* For other groups, scan the branches. */
1448
1449 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1450 {
1451 BOOL empty_branch;
1452 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1453
1454 /* Scan a closed bracket */
1455
1456 empty_branch = FALSE;
1457 do
1458 {
1459 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1460 empty_branch = TRUE;
1461 code += GET(code, 1);
1462 }
1463 while (*code == OP_ALT);
1464 if (!empty_branch) return FALSE; /* All branches are non-empty */
1465 c = *code;
1466 continue;
1467 }
1468
1469 /* Handle the other opcodes */
1470
1471 switch (c)
1472 {
1473 /* Check for quantifiers after a class */
1474
1475 #ifdef SUPPORT_UTF8
1476 case OP_XCLASS:
1477 ccode = code + GET(code, 1);
1478 goto CHECK_CLASS_REPEAT;
1479 #endif
1480
1481 case OP_CLASS:
1482 case OP_NCLASS:
1483 ccode = code + 33;
1484
1485 #ifdef SUPPORT_UTF8
1486 CHECK_CLASS_REPEAT:
1487 #endif
1488
1489 switch (*ccode)
1490 {
1491 case OP_CRSTAR: /* These could be empty; continue */
1492 case OP_CRMINSTAR:
1493 case OP_CRQUERY:
1494 case OP_CRMINQUERY:
1495 break;
1496
1497 default: /* Non-repeat => class must match */
1498 case OP_CRPLUS: /* These repeats aren't empty */
1499 case OP_CRMINPLUS:
1500 return FALSE;
1501
1502 case OP_CRRANGE:
1503 case OP_CRMINRANGE:
1504 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1505 break;
1506 }
1507 break;
1508
1509 /* Opcodes that must match a character */
1510
1511 case OP_PROP:
1512 case OP_NOTPROP:
1513 case OP_EXTUNI:
1514 case OP_NOT_DIGIT:
1515 case OP_DIGIT:
1516 case OP_NOT_WHITESPACE:
1517 case OP_WHITESPACE:
1518 case OP_NOT_WORDCHAR:
1519 case OP_WORDCHAR:
1520 case OP_ANY:
1521 case OP_ANYBYTE:
1522 case OP_CHAR:
1523 case OP_CHARNC:
1524 case OP_NOT:
1525 case OP_PLUS:
1526 case OP_MINPLUS:
1527 case OP_POSPLUS:
1528 case OP_EXACT:
1529 case OP_NOTPLUS:
1530 case OP_NOTMINPLUS:
1531 case OP_NOTPOSPLUS:
1532 case OP_NOTEXACT:
1533 case OP_TYPEPLUS:
1534 case OP_TYPEMINPLUS:
1535 case OP_TYPEPOSPLUS:
1536 case OP_TYPEEXACT:
1537 return FALSE;
1538
1539 /* End of branch */
1540
1541 case OP_KET:
1542 case OP_KETRMAX:
1543 case OP_KETRMIN:
1544 case OP_ALT:
1545 return TRUE;
1546
1547 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1548 MINUPTO, and POSUPTO may be followed by a multibyte character */
1549
1550 #ifdef SUPPORT_UTF8
1551 case OP_STAR:
1552 case OP_MINSTAR:
1553 case OP_POSSTAR:
1554 case OP_QUERY:
1555 case OP_MINQUERY:
1556 case OP_POSQUERY:
1557 case OP_UPTO:
1558 case OP_MINUPTO:
1559 case OP_POSUPTO:
1560 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1561 break;
1562 #endif
1563 }
1564 }
1565
1566 return TRUE;
1567 }
1568
1569
1570
1571 /*************************************************
1572 * Scan compiled regex for non-emptiness *
1573 *************************************************/
1574
1575 /* This function is called to check for left recursive calls. We want to check
1576 the current branch of the current pattern to see if it could match the empty
1577 string. If it could, we must look outwards for branches at other levels,
1578 stopping when we pass beyond the bracket which is the subject of the recursion.
1579
1580 Arguments:
1581 code points to start of the recursion
1582 endcode points to where to stop (current RECURSE item)
1583 bcptr points to the chain of current (unclosed) branch starts
1584 utf8 TRUE if in UTF-8 mode
1585
1586 Returns: TRUE if what is matched could be empty
1587 */
1588
1589 static BOOL
1590 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1591 BOOL utf8)
1592 {
1593 while (bcptr != NULL && bcptr->current >= code)
1594 {
1595 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1596 bcptr = bcptr->outer;
1597 }
1598 return TRUE;
1599 }
1600
1601
1602
1603 /*************************************************
1604 * Check for POSIX class syntax *
1605 *************************************************/
1606
1607 /* This function is called when the sequence "[:" or "[." or "[=" is
1608 encountered in a character class. It checks whether this is followed by an
1609 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1610 ".]" or "=]".
1611
1612 Argument:
1613 ptr pointer to the initial [
1614 endptr where to return the end pointer
1615 cd pointer to compile data
1616
1617 Returns: TRUE or FALSE
1618 */
1619
1620 static BOOL
1621 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1622 {
1623 int terminator; /* Don't combine these lines; the Solaris cc */
1624 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1625 if (*(++ptr) == '^') ptr++;
1626 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1627 if (*ptr == terminator && ptr[1] == ']')
1628 {
1629 *endptr = ptr;
1630 return TRUE;
1631 }
1632 return FALSE;
1633 }
1634
1635
1636
1637
1638 /*************************************************
1639 * Check POSIX class name *
1640 *************************************************/
1641
1642 /* This function is called to check the name given in a POSIX-style class entry
1643 such as [:alnum:].
1644
1645 Arguments:
1646 ptr points to the first letter
1647 len the length of the name
1648
1649 Returns: a value representing the name, or -1 if unknown
1650 */
1651
1652 static int
1653 check_posix_name(const uschar *ptr, int len)
1654 {
1655 register int yield = 0;
1656 while (posix_name_lengths[yield] != 0)
1657 {
1658 if (len == posix_name_lengths[yield] &&
1659 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1660 yield++;
1661 }
1662 return -1;
1663 }
1664
1665
1666 /*************************************************
1667 * Adjust OP_RECURSE items in repeated group *
1668 *************************************************/
1669
1670 /* OP_RECURSE items contain an offset from the start of the regex to the group
1671 that is referenced. This means that groups can be replicated for fixed
1672 repetition simply by copying (because the recursion is allowed to refer to
1673 earlier groups that are outside the current group). However, when a group is
1674 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1675 it, after it has been compiled. This means that any OP_RECURSE items within it
1676 that refer to the group itself or any contained groups have to have their
1677 offsets adjusted. That one of the jobs of this function. Before it is called,
1678 the partially compiled regex must be temporarily terminated with OP_END.
1679
1680 This function has been extended with the possibility of forward references for
1681 recursions and subroutine calls. It must also check the list of such references
1682 for the group we are dealing with. If it finds that one of the recursions in
1683 the current group is on this list, it adjusts the offset in the list, not the
1684 value in the reference (which is a group number).
1685
1686 Arguments:
1687 group points to the start of the group
1688 adjust the amount by which the group is to be moved
1689 utf8 TRUE in UTF-8 mode
1690 cd contains pointers to tables etc.
1691 save_hwm the hwm forward reference pointer at the start of the group
1692
1693 Returns: nothing
1694 */
1695
1696 static void
1697 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1698 uschar *save_hwm)
1699 {
1700 uschar *ptr = group;
1701 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1702 {
1703 int offset;
1704 uschar *hc;
1705
1706 /* See if this recursion is on the forward reference list. If so, adjust the
1707 reference. */
1708
1709 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1710 {
1711 offset = GET(hc, 0);
1712 if (cd->start_code + offset == ptr + 1)
1713 {
1714 PUT(hc, 0, offset + adjust);
1715 break;
1716 }
1717 }
1718
1719 /* Otherwise, adjust the recursion offset if it's after the start of this
1720 group. */
1721
1722 if (hc >= cd->hwm)
1723 {
1724 offset = GET(ptr, 1);
1725 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1726 }
1727
1728 ptr += 1 + LINK_SIZE;
1729 }
1730 }
1731
1732
1733
1734 /*************************************************
1735 * Insert an automatic callout point *
1736 *************************************************/
1737
1738 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1739 callout points before each pattern item.
1740
1741 Arguments:
1742 code current code pointer
1743 ptr current pattern pointer
1744 cd pointers to tables etc
1745
1746 Returns: new code pointer
1747 */
1748
1749 static uschar *
1750 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1751 {
1752 *code++ = OP_CALLOUT;
1753 *code++ = 255;
1754 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1755 PUT(code, LINK_SIZE, 0); /* Default length */
1756 return code + 2*LINK_SIZE;
1757 }
1758
1759
1760
1761 /*************************************************
1762 * Complete a callout item *
1763 *************************************************/
1764
1765 /* A callout item contains the length of the next item in the pattern, which
1766 we can't fill in till after we have reached the relevant point. This is used
1767 for both automatic and manual callouts.
1768
1769 Arguments:
1770 previous_callout points to previous callout item
1771 ptr current pattern pointer
1772 cd pointers to tables etc
1773
1774 Returns: nothing
1775 */
1776
1777 static void
1778 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1779 {
1780 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1781 PUT(previous_callout, 2 + LINK_SIZE, length);
1782 }
1783
1784
1785
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more (in which case
                nothing is written through any of the pointers)
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has an other case. */

while (ch <= d && (first_other = _pcre_ucp_othercase(ch)) == NOTACHAR) ch++;
if (ch > d) return FALSE;

*ocptr = first_other;

/* Extend the run for as long as the characters that follow have other cases
that are consecutive. */

expected = first_other + 1;
ch++;
while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  ch++;
  expected++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1831
1832
1833
1834 /*************************************************
1835 * Check if auto-possessifying is possible *
1836 *************************************************/
1837
1838 /* This function is called for unlimited repeats of certain items, to see
1839 whether the next thing could possibly match the repeated item. If not, it makes
1840 sense to automatically possessify the repeated item.
1841
1842 Arguments:
1843 op_code the repeated op code
1844 this data for this item, depends on the opcode
1845 utf8 TRUE in UTF-8 mode
1846 utf8_char used for utf8 character bytes, NULL if not relevant
1847 ptr next character in pattern
1848 options options bits
1849 cd contains pointers to tables etc.
1850
1851 Returns: TRUE if possessifying is wanted
1852 */
1853
1854 static BOOL
1855 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1856 const uschar *ptr, int options, compile_data *cd)
1857 {
1858 int next;
1859
1860 /* Skip whitespace and comments in extended mode */
1861
1862 if ((options & PCRE_EXTENDED) != 0)
1863 {
1864 for (;;)
1865 {
1866 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1867 if (*ptr == '#')
1868 {
1869 while (*(++ptr) != 0)
1870 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1871 }
1872 else break;
1873 }
1874 }
1875
1876 /* If the next item is one that we can handle, get its value. A non-negative
1877 value is a character, a negative value is an escape value. */
1878
1879 if (*ptr == '\\')
1880 {
1881 int temperrorcode = 0;
1882 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1883 if (temperrorcode != 0) return FALSE;
1884 ptr++; /* Point after the escape sequence */
1885 }
1886
1887 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1888 {
1889 #ifdef SUPPORT_UTF8
1890 if (utf8) { GETCHARINC(next, ptr); } else
1891 #endif
1892 next = *ptr++;
1893 }
1894
1895 else return FALSE;
1896
1897 /* Skip whitespace and comments in extended mode */
1898
1899 if ((options & PCRE_EXTENDED) != 0)
1900 {
1901 for (;;)
1902 {
1903 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1904 if (*ptr == '#')
1905 {
1906 while (*(++ptr) != 0)
1907 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1908 }
1909 else break;
1910 }
1911 }
1912
1913 /* If the next thing is itself optional, we have to give up. */
1914
1915 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1916 return FALSE;
1917
1918 /* Now compare the next item with the previous opcode. If the previous is a
1919 positive single character match, "item" either contains the character or, if
1920 "item" is greater than 127 in utf8 mode, the character's bytes are in
1921 utf8_char. */
1922
1923
1924 /* Handle cases when the next item is a character. */
1925
1926 if (next >= 0) switch(op_code)
1927 {
1928 case OP_CHAR:
1929 #ifdef SUPPORT_UTF8
1930 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1931 #endif
1932 return item != next;
1933
1934 /* For CHARNC (caseless character) we must check the other case. If we have
1935 Unicode property support, we can use it to test the other case of
1936 high-valued characters. */
1937
1938 case OP_CHARNC:
1939 #ifdef SUPPORT_UTF8
1940 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1941 #endif
1942 if (item == next) return FALSE;
1943 #ifdef SUPPORT_UTF8
1944 if (utf8)
1945 {
1946 unsigned int othercase;
1947 if (next < 128) othercase = cd->fcc[next]; else
1948 #ifdef SUPPORT_UCP
1949 othercase = _pcre_ucp_othercase((unsigned int)next);
1950 #else
1951 othercase = NOTACHAR;
1952 #endif
1953 return (unsigned int)item != othercase;
1954 }
1955 else
1956 #endif /* SUPPORT_UTF8 */
1957 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1958
1959 /* For OP_NOT, "item" must be a single-byte character. */
1960
1961 case OP_NOT:
1962 if (next < 0) return FALSE; /* Not a character */
1963 if (item == next) return TRUE;
1964 if ((options & PCRE_CASELESS) == 0) return FALSE;
1965 #ifdef SUPPORT_UTF8
1966 if (utf8)
1967 {
1968 unsigned int othercase;
1969 if (next < 128) othercase = cd->fcc[next]; else
1970 #ifdef SUPPORT_UCP
1971 othercase = _pcre_ucp_othercase(next);
1972 #else
1973 othercase = NOTACHAR;
1974 #endif
1975 return (unsigned int)item == othercase;
1976 }
1977 else
1978 #endif /* SUPPORT_UTF8 */
1979 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1980
1981 case OP_DIGIT:
1982 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1983
1984 case OP_NOT_DIGIT:
1985 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1986
1987 case OP_WHITESPACE:
1988 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1989
1990 case OP_NOT_WHITESPACE:
1991 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1992
1993 case OP_WORDCHAR:
1994 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1995
1996 case OP_NOT_WORDCHAR:
1997 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1998
1999 case OP_HSPACE:
2000 case OP_NOT_HSPACE:
2001 switch(next)
2002 {
2003 case 0x09:
2004 case 0x20:
2005 case 0xa0:
2006 case 0x1680:
2007 case 0x180e:
2008 case 0x2000:
2009 case 0x2001:
2010 case 0x2002:
2011 case 0x2003:
2012 case 0x2004:
2013 case 0x2005:
2014 case 0x2006:
2015 case 0x2007:
2016 case 0x2008:
2017 case 0x2009:
2018 case 0x200A:
2019 case 0x202f:
2020 case 0x205f:
2021 case 0x3000:
2022 return op_code != OP_HSPACE;
2023 default:
2024 return op_code == OP_HSPACE;
2025 }
2026
2027 case OP_VSPACE:
2028 case OP_NOT_VSPACE:
2029 switch(next)
2030 {
2031 case 0x0a:
2032 case 0x0b:
2033 case 0x0c:
2034 case 0x0d:
2035 case 0x85:
2036 case 0x2028:
2037 case 0x2029:
2038 return op_code != OP_VSPACE;
2039 default:
2040 return op_code == OP_VSPACE;
2041 }
2042
2043 default:
2044 return FALSE;
2045 }
2046
2047
2048 /* Handle the case when the next item is \d, \s, etc. */
2049
2050 switch(op_code)
2051 {
2052 case OP_CHAR:
2053 case OP_CHARNC:
2054 #ifdef SUPPORT_UTF8
2055 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2056 #endif
2057 switch(-next)
2058 {
2059 case ESC_d:
2060 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2061
2062 case ESC_D:
2063 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2064
2065 case ESC_s:
2066 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2067
2068 case ESC_S:
2069 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2070
2071 case ESC_w:
2072 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2073
2074 case ESC_W:
2075 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2076
2077 case ESC_h:
2078 case ESC_H:
2079 switch(item)
2080 {
2081 case 0x09:
2082 case 0x20:
2083 case 0xa0:
2084 case 0x1680:
2085 case 0x180e:
2086 case 0x2000:
2087 case 0x2001:
2088 case 0x2002:
2089 case 0x2003:
2090 case 0x2004:
2091 case 0x2005:
2092 case 0x2006:
2093 case 0x2007:
2094 case 0x2008:
2095 case 0x2009:
2096 case 0x200A:
2097 case 0x202f:
2098 case 0x205f:
2099 case 0x3000:
2100 return -next != ESC_h;
2101 default:
2102 return -next == ESC_h;
2103 }
2104
2105 case ESC_v:
2106 case ESC_V:
2107 switch(item)
2108 {
2109 case 0x0a:
2110 case 0x0b:
2111 case 0x0c:
2112 case 0x0d:
2113 case 0x85:
2114 case 0x2028:
2115 case 0x2029:
2116 return -next != ESC_v;
2117 default:
2118 return -next == ESC_v;
2119 }
2120
2121 default:
2122 return FALSE;
2123 }
2124
2125 case OP_DIGIT:
2126 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2127 next == -ESC_h || next == -ESC_v;
2128
2129 case OP_NOT_DIGIT:
2130 return next == -ESC_d;
2131
2132 case OP_WHITESPACE:
2133 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2134
2135 case OP_NOT_WHITESPACE:
2136 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2137
2138 case OP_HSPACE:
2139 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2140
2141 case OP_NOT_HSPACE:
2142 return next == -ESC_h;
2143
2144 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2145 case OP_VSPACE:
2146 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2147
2148 case OP_NOT_VSPACE:
2149 return next == -ESC_v;
2150
2151 case OP_WORDCHAR:
2152 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2153
2154 case OP_NOT_WORDCHAR:
2155 return next == -ESC_w || next == -ESC_d;
2156
2157 default:
2158 return FALSE;
2159 }
2160
2161 /* Control does not reach here */
2162 }
2163
2164
2165
2166 /*************************************************
2167 * Compile one branch *
2168 *************************************************/
2169
2170 /* Scan the pattern, compiling it into a vector. If the options are
2171 changed during the branch, the pointer is used to change the external options
2172 bits. This function is used during the pre-compile phase when we are trying
2173 to find out the amount of memory needed, as well as during the real compile
2174 phase. The value of lengthptr distinguishes the two phases.
2175
2176 Arguments:
2177 optionsptr pointer to the option bits
2178 codeptr points to the pointer to the current code point
2179 ptrptr points to the current pattern pointer
2180 errorcodeptr points to error code variable
2181 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2182 reqbyteptr set to the last literal character required, else < 0
2183 bcptr points to current branch chain
2184 cd contains pointers to tables etc.
2185 lengthptr NULL during the real compile phase
2186 points to length accumulator during pre-compile phase
2187
2188 Returns: TRUE on success
2189 FALSE, with *errorcodeptr set non-zero on error
2190 */
2191
2192 static BOOL
2193 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2194 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2195 compile_data *cd, int *lengthptr)
2196 {
2197 int repeat_type, op_type;
2198 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2199 int bravalue = 0;
2200 int greedy_default, greedy_non_default;
2201 int firstbyte, reqbyte;
2202 int zeroreqbyte, zerofirstbyte;
2203 int req_caseopt, reqvary, tempreqvary;
2204 int options = *optionsptr;
2205 int after_manual_callout = 0;
2206 int length_prevgroup = 0;
2207 register int c;
2208 register uschar *code = *codeptr;
2209 uschar *last_code = code;
2210 uschar *orig_code = code;
2211 uschar *tempcode;
2212 BOOL inescq = FALSE;
2213 BOOL groupsetfirstbyte = FALSE;
2214 const uschar *ptr = *ptrptr;
2215 const uschar *tempptr;
2216 uschar *previous = NULL;
2217 uschar *previous_callout = NULL;
2218 uschar *save_hwm = NULL;
2219 uschar classbits[32];
2220
2221 #ifdef SUPPORT_UTF8
2222 BOOL class_utf8;
2223 BOOL utf8 = (options & PCRE_UTF8) != 0;
2224 uschar *class_utf8data;
2225 uschar utf8_char[6];
2226 #else
2227 BOOL utf8 = FALSE;
2228 uschar *utf8_char = NULL;
2229 #endif
2230
2231 #ifdef DEBUG
2232 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2233 #endif
2234
2235 /* Set up the default and non-default settings for greediness */
2236
2237 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2238 greedy_non_default = greedy_default ^ 1;
2239
2240 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2241 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2242 matches a non-fixed char first char; reqbyte just remains unset if we never
2243 find one.
2244
2245 When we hit a repeat whose minimum is zero, we may have to adjust these values
2246 to take the zero repeat into account. This is implemented by setting them to
2247 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2248 item types that can be repeated set these backoff variables appropriately. */
2249
2250 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2251
2252 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2253 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2254 value > 255. It is added into the firstbyte or reqbyte variables to record the
2255 case status of the value. This is used only for ASCII characters. */
2256
2257 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2258
2259 /* Switch on next character until the end of the branch */
2260
2261 for (;; ptr++)
2262 {
2263 BOOL negate_class;
2264 BOOL possessive_quantifier;
2265 BOOL is_quantifier;
2266 BOOL is_recurse;
2267 BOOL reset_bracount;
2268 int class_charcount;
2269 int class_lastchar;
2270 int newoptions;
2271 int recno;
2272 int refsign;
2273 int skipbytes;
2274 int subreqbyte;
2275 int subfirstbyte;
2276 int terminator;
2277 int mclength;
2278 uschar mcbuffer[8];
2279
2280 /* Get next byte in the pattern */
2281
2282 c = *ptr;
2283
2284 /* If we are in the pre-compile phase, accumulate the length used for the
2285 previous cycle of this loop. */
2286
2287 if (lengthptr != NULL)
2288 {
2289 #ifdef DEBUG
2290 if (code > cd->hwm) cd->hwm = code; /* High water info */
2291 #endif
2292 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2293 {
2294 *errorcodeptr = ERR52;
2295 goto FAILED;
2296 }
2297
2298 /* There is at least one situation where code goes backwards: this is the
2299 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2300 the class is simply eliminated. However, it is created first, so we have to
2301 allow memory for it. Therefore, don't ever reduce the length at this point.
2302 */
2303
2304 if (code < last_code) code = last_code;
2305
2306 /* Paranoid check for integer overflow */
2307
2308 if (OFLOW_MAX - *lengthptr < code - last_code)
2309 {
2310 *errorcodeptr = ERR20;
2311 goto FAILED;
2312 }
2313
2314 *lengthptr += code - last_code;
2315 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2316
2317 /* If "previous" is set and it is not at the start of the work space, move
2318 it back to there, in order to avoid filling up the work space. Otherwise,
2319 if "previous" is NULL, reset the current code pointer to the start. */
2320
2321 if (previous != NULL)
2322 {
2323 if (previous > orig_code)
2324 {
2325 memmove(orig_code, previous, code - previous);
2326 code -= previous - orig_code;
2327 previous = orig_code;
2328 }
2329 }
2330 else code = orig_code;
2331
2332 /* Remember where this code item starts so we can pick up the length
2333 next time round. */
2334
2335 last_code = code;
2336 }
2337
2338 /* In the real compile phase, just check the workspace used by the forward
2339 reference list. */
2340
2341 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2342 {
2343 *errorcodeptr = ERR52;
2344 goto FAILED;
2345 }
2346
2347 /* If in \Q...\E, check for the end; if not, we have a literal */
2348
2349 if (inescq && c != 0)
2350 {
2351 if (c == '\\' && ptr[1] == 'E')
2352 {
2353 inescq = FALSE;
2354 ptr++;
2355 continue;
2356 }
2357 else
2358 {
2359 if (previous_callout != NULL)
2360 {
2361 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2362 complete_callout(previous_callout, ptr, cd);
2363 previous_callout = NULL;
2364 }
2365 if ((options & PCRE_AUTO_CALLOUT) != 0)
2366 {
2367 previous_callout = code;
2368 code = auto_callout(code, ptr, cd);
2369 }
2370 goto NORMAL_CHAR;
2371 }
2372 }
2373
2374 /* Fill in length of a previous callout, except when the next thing is
2375 a quantifier. */
2376
2377 is_quantifier = c == '*' || c == '+' || c == '?' ||
2378 (c == '{' && is_counted_repeat(ptr+1));
2379
2380 if (!is_quantifier && previous_callout != NULL &&
2381 after_manual_callout-- <= 0)
2382 {
2383 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2384 complete_callout(previous_callout, ptr, cd);
2385 previous_callout = NULL;
2386 }
2387
2388 /* In extended mode, skip white space and comments */
2389
2390 if ((options & PCRE_EXTENDED) != 0)
2391 {
2392 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2393 if (c == '#')
2394 {
2395 while (*(++ptr) != 0)
2396 {
2397 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2398 }
2399 if (*ptr != 0) continue;
2400
2401 /* Else fall through to handle end of string */
2402 c = 0;
2403 }
2404 }
2405
2406 /* No auto callout for quantifiers. */
2407
2408 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2409 {
2410 previous_callout = code;
2411 code = auto_callout(code, ptr, cd);
2412 }
2413
2414 switch(c)
2415 {
2416 /* ===================================================================*/
2417 case 0: /* The branch terminates at string end */
2418 case '|': /* or | or ) */
2419 case ')':
2420 *firstbyteptr = firstbyte;
2421 *reqbyteptr = reqbyte;
2422 *codeptr = code;
2423 *ptrptr = ptr;
2424 if (lengthptr != NULL)
2425 {
2426 if (OFLOW_MAX - *lengthptr < code - last_code)
2427 {
2428 *errorcodeptr = ERR20;
2429 goto FAILED;
2430 }
2431 *lengthptr += code - last_code; /* To include callout length */
2432 DPRINTF((">> end branch\n"));
2433 }
2434 return TRUE;
2435
2436
2437 /* ===================================================================*/
2438 /* Handle single-character metacharacters. In multiline mode, ^ disables
2439 the setting of any following char as a first character. */
2440
2441 case '^':
2442 if ((options & PCRE_MULTILINE) != 0)
2443 {
2444 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2445 }
2446 previous = NULL;
2447 *code++ = OP_CIRC;
2448 break;
2449
2450 case '$':
2451 previous = NULL;
2452 *code++ = OP_DOLL;
2453 break;
2454
2455 /* There can never be a first char if '.' is first, whatever happens about
2456 repeats. The value of reqbyte doesn't change either. */
2457
2458 case '.':
2459 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2460 zerofirstbyte = firstbyte;
2461 zeroreqbyte = reqbyte;
2462 previous = code;
2463 *code++ = OP_ANY;
2464 break;
2465
2466
2467 /* ===================================================================*/
2468 /* Character classes. If the included characters are all < 256, we build a
2469 32-byte bitmap of the permitted characters, except in the special case
2470 where there is only one such character. For negated classes, we build the
2471 map as usual, then invert it at the end. However, we use a different opcode
2472 so that data characters > 255 can be handled correctly.
2473
2474 If the class contains characters outside the 0-255 range, a different
2475 opcode is compiled. It may optionally have a bit map for characters < 256,
2476 but those above are explicitly listed afterwards. A flag byte tells
2477 whether the bitmap is present, and whether this is a negated class or not.
2478 */
2479
2480 case '[':
2481 previous = code;
2482
2483 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2484 they are encountered at the top level, so we'll do that too. */
2485
2486 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2487 check_posix_syntax(ptr, &tempptr, cd))
2488 {
2489 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2490 goto FAILED;
2491 }
2492
2493 /* If the first character is '^', set the negation flag and skip it. Also,
2494 if the first few characters (either before or after ^) are \Q\E or \E we
2495 skip them too. This makes for compatibility with Perl. */
2496
2497 negate_class = FALSE;
2498 for (;;)
2499 {
2500 c = *(++ptr);
2501 if (c == '\\')
2502 {
2503 if (ptr[1] == 'E') ptr++;
2504 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2505 else break;
2506 }
2507 else if (!negate_class && c == '^')
2508 negate_class = TRUE;
2509 else break;
2510 }
2511
2512 /* Keep a count of chars with values < 256 so that we can optimize the case
2513 of just a single character (as long as it's < 256). However, for higher
2514 valued UTF-8 characters, we don't yet do any optimization. */
2515
2516 class_charcount = 0;
2517 class_lastchar = -1;
2518
2519 /* Initialize the 32-char bit map to all zeros. We build the map in a
2520 temporary bit of memory, in case the class contains only 1 character (less
2521 than 256), because in that case the compiled code doesn't use the bit map.
2522 */
2523
2524 memset(classbits, 0, 32 * sizeof(uschar));
2525
2526 #ifdef SUPPORT_UTF8
2527 class_utf8 = FALSE; /* No chars >= 256 */
2528 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2529 #endif
2530
2531 /* Process characters until ] is reached. By writing this as a "do" it
2532 means that an initial ] is taken as a data character. At the start of the
2533 loop, c contains the first byte of the character. */
2534
2535 if (c != 0) do
2536 {
2537 const uschar *oldptr;
2538
2539 #ifdef SUPPORT_UTF8
2540 if (utf8 && c > 127)
2541 { /* Braces are required because the */
2542 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2543 }
2544 #endif
2545
2546 /* Inside \Q...\E everything is literal except \E */
2547
2548 if (inescq)
2549 {
2550 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2551 {
2552 inescq = FALSE; /* Reset literal state */
2553 ptr++; /* Skip the 'E' */
2554 continue; /* Carry on with next */
2555 }
2556 goto CHECK_RANGE; /* Could be range if \E follows */
2557 }
2558
2559 /* Handle POSIX class names. Perl allows a negation extension of the
2560 form [:^name:]. A square bracket that doesn't match the syntax is
2561 treated as a literal. We also recognize the POSIX constructions
2562 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2563 5.6 and 5.8 do. */
2564
2565 if (c == '[' &&
2566 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2567 check_posix_syntax(ptr, &tempptr, cd))
2568 {
2569 BOOL local_negate = FALSE;
2570 int posix_class, taboffset, tabopt;
2571 register const uschar *cbits = cd->cbits;
2572 uschar pbits[32];
2573
2574 if (ptr[1] != ':')
2575 {
2576 *errorcodeptr = ERR31;
2577 goto FAILED;
2578 }
2579
2580 ptr += 2;
2581 if (*ptr == '^')
2582 {
2583 local_negate = TRUE;
2584 ptr++;
2585 }
2586
2587 posix_class = check_posix_name(ptr, tempptr - ptr);
2588 if (posix_class < 0)
2589 {
2590 *errorcodeptr = ERR30;
2591 goto FAILED;
2592 }
2593
2594 /* If matching is caseless, upper and lower are converted to
2595 alpha. This relies on the fact that the class table starts with
2596 alpha, lower, upper as the first 3 entries. */
2597
2598 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2599 posix_class = 0;
2600
2601 /* We build the bit map for the POSIX class in a chunk of local store
2602 because we may be adding and subtracting from it, and we don't want to
2603 subtract bits that may be in the main map already. At the end we or the
2604 result into the bit map that is being built. */
2605
2606 posix_class *= 3;
2607
2608 /* Copy in the first table (always present) */
2609
2610 memcpy(pbits, cbits + posix_class_maps[posix_class],
2611 32 * sizeof(uschar));
2612
2613 /* If there is a second table, add or remove it as required. */
2614
2615 taboffset = posix_class_maps[posix_class + 1];
2616 tabopt = posix_class_maps[posix_class + 2];
2617
2618 if (taboffset >= 0)
2619 {
2620 if (tabopt >= 0)
2621 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2622 else
2623 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2624 }
2625
2626 /* Now see if we need to remove any special characters. An option
2627 value of 1 removes vertical space and 2 removes underscore. */
2628
2629 if (tabopt < 0) tabopt = -tabopt;
2630 if (tabopt == 1) pbits[1] &= ~0x3c;
2631 else if (tabopt == 2) pbits[11] &= 0x7f;
2632
2633 /* Add the POSIX table or its complement into the main table that is
2634 being built and we are done. */
2635
2636 if (local_negate)
2637 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2638 else
2639 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2640
2641 ptr = tempptr + 1;
2642 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2643 continue; /* End of POSIX syntax handling */
2644 }
2645
2646 /* Backslash may introduce a single character, or it may introduce one
2647 of the specials, which just set a flag. The sequence \b is a special
2648 case. Inside a class (and only there) it is treated as backspace.
2649 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2650 to 'or' into the one we are building. We assume they have more than one
2651 character in them, so set class_charcount bigger than one. */
2652
2653 if (c == '\\')
2654 {
2655 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2656 if (*errorcodeptr != 0) goto FAILED;
2657
2658 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2659 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2660 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2661 else if (-c == ESC_Q) /* Handle start of quoted string */
2662 {
2663 if (ptr[1] == '\\' && ptr[2] == 'E')
2664 {
2665 ptr += 2; /* avoid empty string */
2666 }
2667 else inescq = TRUE;
2668 continue;
2669 }
2670
2671 if (c < 0)
2672 {
2673 register const uschar *cbits = cd->cbits;
2674 class_charcount += 2; /* Greater than 1 is what matters */
2675
2676 /* Save time by not doing this in the pre-compile phase. */
2677
2678 if (lengthptr == NULL) switch (-c)
2679 {
2680 case ESC_d:
2681 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2682 continue;
2683
2684 case ESC_D:
2685 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2686 continue;
2687
2688 case ESC_w:
2689 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2690 continue;
2691
2692 case ESC_W:
2693 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2694 continue;
2695
2696 case ESC_s:
2697 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2698 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2699 continue;
2700
2701 case ESC_S:
2702 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2703 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2704 continue;
2705
2706 case ESC_E: /* Perl ignores an orphan \E */
2707 continue;
2708
2709 default: /* Not recognized; fall through */
2710 break; /* Need "default" setting to stop compiler warning. */
2711 }
2712
2713 /* In the pre-compile phase, just do the recognition. */
2714
2715 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2716 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2717
2718 /* We need to deal with \H, \h, \V, and \v in both phases because
2719 they use extra memory. */
2720
2721 if (-c == ESC_h)
2722 {
2723 SETBIT(classbits, 0x09); /* VT */
2724 SETBIT(classbits, 0x20); /* SPACE */
2725 SETBIT(classbits, 0xa0); /* NSBP */
2726 #ifdef SUPPORT_UTF8
2727 if (utf8)
2728 {
2729 class_utf8 = TRUE;
2730 *class_utf8data++ = XCL_SINGLE;
2731 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2732 *class_utf8data++ = XCL_SINGLE;
2733 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2734 *class_utf8data++ = XCL_RANGE;
2735 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2736 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2737 *class_utf8data++ = XCL_SINGLE;
2738 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2739 *class_utf8data++ = XCL_SINGLE;
2740 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2741 *class_utf8data++ = XCL_SINGLE;
2742 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2743 }
2744 #endif
2745 continue;
2746 }
2747
2748 if (-c == ESC_H)
2749 {
2750 for (c = 0; c < 32; c++)
2751 {
2752 int x = 0xff;
2753 switch (c)
2754 {
2755 case 0x09/8: x ^= 1 << (0x09%8); break;
2756 case 0x20/8: x ^= 1 << (0x20%8); break;
2757 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2758 default: break;
2759 }
2760 classbits[c] |= x;
2761 }
2762
2763 #ifdef SUPPORT_UTF8
2764 if (utf8)
2765 {
2766 class_utf8 = TRUE;
2767 *class_utf8data++ = XCL_RANGE;
2768 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2769 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2770 *class_utf8data++ = XCL_RANGE;
2771 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2772 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2773 *class_utf8data++ = XCL_RANGE;
2774 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2775 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2776 *class_utf8data++ = XCL_RANGE;
2777 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2778 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2779 *class_utf8data++ = XCL_RANGE;
2780 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2781 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2782 *class_utf8data++ = XCL_RANGE;
2783 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2784 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2785 *class_utf8data++ = XCL_RANGE;
2786 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2787 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2788 }
2789 #endif
2790 continue;
2791 }
2792
2793 if (-c == ESC_v)
2794 {
2795 SETBIT(classbits, 0x0a); /* LF */
2796 SETBIT(classbits, 0x0b); /* VT */
2797 SETBIT(classbits, 0x0c); /* FF */
2798 SETBIT(classbits, 0x0d); /* CR */
2799 SETBIT(classbits, 0x85); /* NEL */
2800 #ifdef SUPPORT_UTF8
2801 if (utf8)
2802 {
2803 class_utf8 = TRUE;
2804 *class_utf8data++ = XCL_RANGE;
2805 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2806 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2807 }
2808 #endif
2809 continue;
2810 }
2811
2812 if (-c == ESC_V)
2813 {
2814 for (c = 0; c < 32; c++)
2815 {
2816 int x = 0xff;
2817 switch (c)
2818 {
2819 case 0x0a/8: x ^= 1 << (0x0a%8);
2820 x ^= 1 << (0x0b%8);
2821 x ^= 1 << (0x0c%8);
2822 x ^= 1 << (0x0d%8);
2823 break;
2824 case 0x85/8: x ^= 1 << (0x85%8); break;
2825 default: break;
2826 }
2827 classbits[c] |= x;
2828 }
2829
2830 #ifdef SUPPORT_UTF8
2831 if (utf8)
2832 {
2833 class_utf8 = TRUE;
2834 *class_utf8data++ = XCL_RANGE;
2835 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2836 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2837 *class_utf8data++ = XCL_RANGE;
2838 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2839 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2840 }
2841 #endif
2842 continue;
2843 }
2844
2845 /* We need to deal with \P and \p in both phases. */
2846
2847 #ifdef SUPPORT_UCP
2848 if (-c == ESC_p || -c == ESC_P)
2849 {
2850 BOOL negated;
2851 int pdata;
2852 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2853 if (ptype < 0) goto FAILED;
2854 class_utf8 = TRUE;
2855 *class_utf8data++ = ((-c == ESC_p) != negated)?
2856 XCL_PROP : XCL_NOTPROP;
2857 *class_utf8data++ = ptype;
2858 *class_utf8data++ = pdata;
2859 class_charcount -= 2; /* Not a < 256 character */
2860 continue;
2861 }
2862 #endif
2863 /* Unrecognized escapes are faulted if PCRE is running in its
2864 strict mode. By default, for compatibility with Perl, they are
2865 treated as literals. */
2866
2867 if ((options & PCRE_EXTRA) != 0)
2868 {
2869 *errorcodeptr = ERR7;
2870 goto FAILED;
2871 }
2872
2873 class_charcount -= 2; /* Undo the default count from above */
2874 c = *ptr; /* Get the final character and fall through */
2875 }
2876
2877 /* Fall through if we have a single character (c >= 0). This may be
2878 greater than 256 in UTF-8 mode. */
2879
2880 } /* End of backslash handling */
2881
2882 /* A single character may be followed by '-' to form a range. However,
2883 Perl does not permit ']' to be the end of the range. A '-' character
2884 at the end is treated as a literal. Perl ignores orphaned \E sequences
2885 entirely. The code for handling \Q and \E is messy. */
2886
2887 CHECK_RANGE:
2888 while (ptr[1] == '\\' && ptr[2] == 'E')
2889 {
2890 inescq = FALSE;
2891 ptr += 2;
2892 }
2893
2894 oldptr = ptr;
2895
2896 if (!inescq && ptr[1] == '-')
2897 {
2898 int d;
2899 ptr += 2;
2900 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2901
2902 /* If we hit \Q (not followed by \E) at this point, go into escaped
2903 mode. */
2904
2905 while (*ptr == '\\' && ptr[1] == 'Q')
2906 {
2907 ptr += 2;
2908 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2909 inescq = TRUE;
2910 break;
2911 }
2912
2913 if (*ptr == 0 || (!inescq && *ptr == ']'))
2914 {
2915 ptr = oldptr;
2916 goto LONE_SINGLE_CHARACTER;
2917 }
2918
2919 #ifdef SUPPORT_UTF8
2920 if (utf8)
2921 { /* Braces are required because the */
2922 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2923 }
2924 else
2925 #endif
2926 d = *ptr; /* Not UTF-8 mode */
2927
2928 /* The second part of a range can be a single-character escape, but
2929 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2930 in such circumstances. */
2931
2932 if (!inescq && d == '\\')
2933 {
2934 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2935 if (*errorcodeptr != 0) goto FAILED;
2936
2937 /* \b is backspace; \X is literal X; \R is literal R; any other
2938 special means the '-' was literal */
2939
2940 if (d < 0)
2941 {
2942 if (d == -ESC_b) d = '\b';
2943 else if (d == -ESC_X) d = 'X';
2944 else if (d == -ESC_R) d = 'R'; else
2945 {
2946 ptr = oldptr;
2947 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2948 }
2949 }
2950 }
2951
2952 /* Check that the two values are in the correct order. Optimize
2953 one-character ranges */
2954
2955 if (d < c)
2956 {
2957 *errorcodeptr = ERR8;
2958 goto FAILED;
2959 }
2960
2961 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2962
2963 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2964 matching, we have to use an XCLASS with extra data items. Caseless
2965 matching for characters > 127 is available only if UCP support is
2966 available. */
2967
2968 #ifdef SUPPORT_UTF8
2969 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2970 {
2971 class_utf8 = TRUE;
2972
2973 /* With UCP support, we can find the other case equivalents of
2974 the relevant characters. There may be several ranges. Optimize how
2975 they fit with the basic range. */
2976
2977 #ifdef SUPPORT_UCP
2978 if ((options & PCRE_CASELESS) != 0)
2979 {
2980 unsigned int occ, ocd;
2981 unsigned int cc = c;
2982 unsigned int origd = d;
2983 while (get_othercase_range(&cc, origd, &occ, &ocd))
2984 {
2985 if (occ >= (unsigned int)c &&
2986 ocd <= (unsigned int)d)
2987 continue; /* Skip embedded ranges */
2988
2989 if (occ < (unsigned int)c &&
2990 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2991 { /* if there is overlap, */
2992 c = occ; /* noting that if occ < c */
2993 continue; /* we can't have ocd > d */
2994 } /* because a subrange is */
2995 if (ocd > (unsigned int)d &&
2996 occ <= (unsigned int)d + 1) /* always shorter than */
2997 { /* the basic range. */
2998 d = ocd;
2999 continue;
3000 }
3001
3002 if (occ == ocd)
3003 {
3004 *class_utf8data++ = XCL_SINGLE;
3005 }
3006 else
3007 {
3008 *class_utf8data++ = XCL_RANGE;
3009 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3010 }
3011 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3012 }
3013 }
3014 #endif /* SUPPORT_UCP */
3015
3016 /* Now record the original range, possibly modified for UCP caseless
3017 overlapping ranges. */
3018
3019 *class_utf8data++ = XCL_RANGE;
3020 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3021 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3022
3023 /* With UCP support, we are done. Without UCP support, there is no
3024 caseless matching for UTF-8 characters > 127; we can use the bit map
3025 for the smaller ones. */
3026
3027 #ifdef SUPPORT_UCP
3028 continue; /* With next character in the class */
3029 #else
3030 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3031
3032 /* Adjust upper limit and fall through to set up the map */
3033
3034 d = 127;
3035
3036 #endif /* SUPPORT_UCP */
3037 }
3038 #endif /* SUPPORT_UTF8 */
3039
3040 /* We use the bit map for all cases when not in UTF-8 mode; else
3041 ranges that lie entirely within 0-127 when there is UCP support; else
3042 for partial ranges without UCP support. */
3043
3044 class_charcount += d - c + 1;
3045 class_lastchar = d;
3046
3047 /* We can save a bit of time by skipping this in the pre-compile. */
3048
3049 if (lengthptr == NULL) for (; c <= d; c++)
3050 {
3051 classbits[c/8] |= (1 << (c&7));
3052 if ((options & PCRE_CASELESS) != 0)
3053 {
3054 int uc = cd->fcc[c]; /* flip case */
3055 classbits[uc/8] |= (1 << (uc&7));
3056 }
3057 }
3058
3059 continue; /* Go get the next char in the class */
3060 }
3061
3062 /* Handle a lone single character - we can get here for a normal
3063 non-escape char, or after \ that introduces a single character or for an
3064 apparent range that isn't. */
3065
3066 LONE_SINGLE_CHARACTER:
3067
3068 /* Handle a character that cannot go in the bit map */
3069
3070 #ifdef SUPPORT_UTF8
3071 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3072 {
3073 class_utf8 = TRUE;
3074 *class_utf8data++ = XCL_SINGLE;
3075 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3076
3077 #ifdef SUPPORT_UCP
3078 if ((options & PCRE_CASELESS) != 0)
3079 {
3080 unsigned int othercase;
3081 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3082 {
3083 *class_utf8data++ = XCL_SINGLE;
3084 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3085 }
3086 }
3087 #endif /* SUPPORT_UCP */
3088
3089 }
3090 else
3091 #endif /* SUPPORT_UTF8 */
3092
3093 /* Handle a single-byte character */
3094 {
3095 classbits[c/8] |= (1 << (c&7));
3096 if ((options & PCRE_CASELESS) != 0)
3097 {
3098 c = cd->fcc[c]; /* flip case */
3099 classbits[c/8] |= (1 << (c&7));
3100 }
3101 class_charcount++;
3102 class_lastchar = c;
3103 }
3104 }
3105
3106 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3107
3108 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3109
3110 if (c == 0) /* Missing terminating ']' */
3111 {
3112 *errorcodeptr = ERR6;
3113 goto FAILED;
3114 }
3115
3116 /* If class_charcount is 1, we saw precisely one character whose value is
3117 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3118 can optimize the negative case only if there were no characters >= 128
3119 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3120 single-bytes only. This is an historical hangover. Maybe one day we can
3121 tidy these opcodes to handle multi-byte characters.
3122
3123 The optimization throws away the bit map. We turn the item into a
3124 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3125 that OP_NOT does not support multibyte characters. In the positive case, it
3126 can cause firstbyte to be set. Otherwise, there can be no first char if
3127 this item is first, whatever repeat count may follow. In the case of
3128 reqbyte, save the previous value for reinstating. */
3129
3130 #ifdef SUPPORT_UTF8
3131 if (class_charcount == 1 &&
3132 (!utf8 ||
3133 (!class_utf8 && (!negate_class || class_lastchar < 128))))
3134
3135 #else
3136 if (class_charcount == 1)
3137 #endif
3138 {
3139 zeroreqbyte = reqbyte;
3140
3141 /* The OP_NOT opcode works on one-byte characters only. */
3142
3143 if (negate_class)
3144 {
3145 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3146 zerofirstbyte = firstbyte;
3147 *code++ = OP_NOT;
3148 *code++ = class_lastchar;
3149 break;
3150 }
3151
3152 /* For a single, positive character, get the value into mcbuffer, and
3153 then we can handle this with the normal one-character code. */
3154
3155 #ifdef SUPPORT_UTF8
3156 if (utf8 && class_lastchar > 127)
3157 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3158 else
3159 #endif
3160 {
3161 mcbuffer[0] = class_lastchar;
3162 mclength = 1;
3163 }
3164 goto ONE_CHAR;
3165 } /* End of 1-char optimization */
3166
3167 /* The general case - not the one-char optimization. If this is the first
3168 thing in the branch, there can be no first char setting, whatever the
3169 repeat count. Any reqbyte setting must remain unchanged after any kind of
3170 repeat. */
3171
3172 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3173 zerofirstbyte = firstbyte;
3174 zeroreqbyte = reqbyte;
3175
3176 /* If there are characters with values > 255, we have to compile an
3177 extended class, with its own opcode. If there are no characters < 256,
3178 we can omit the bitmap in the actual compiled code. */
3179
3180 #ifdef SUPPORT_UTF8
3181 if (class_utf8)
3182 {
3183 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3184 *code++ = OP_XCLASS;
3185 code += LINK_SIZE;
3186 *code = negate_class? XCL_NOT : 0;
3187
3188 /* If the map is required, move up the extra data to make room for it;
3189 otherwise just move the code pointer to the end of the extra data. */
3190
3191 if (class_charcount > 0)
3192 {
3193 *code++ |= XCL_MAP;
3194 memmove(code + 32, code, class_utf8data - code);
3195 memcpy(code, classbits, 32);
3196 code = class_utf8data + 32;
3197 }
3198 else code = class_utf8data;
3199
3200 /* Now fill in the complete length of the item */
3201
3202 PUT(previous, 1, code - previous);
3203 break; /* End of class handling */
3204 }
3205 #endif
3206
3207 /* If there are no characters > 255, negate the 32-byte map if necessary,
3208 and copy it into the code vector. If this is the first thing in the branch,
3209 there can be no first char setting, whatever the repeat count. Any reqbyte
3210 setting must remain unchanged after any kind of repeat. */
3211
3212 if (negate_class)
3213 {
3214 *code++ = OP_NCLASS;
3215 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3216 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3217 }
3218 else
3219 {
3220 *code++ = OP_CLASS;
3221 memcpy(code, classbits, 32);
3222 }
3223 code += 32;
3224 break;
3225
3226
3227 /* ===================================================================*/
3228 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3229 has been tested above. */
3230
3231 case '{':
3232 if (!is_quantifier) goto NORMAL_CHAR;
3233 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3234 if (*errorcodeptr != 0) goto FAILED;
3235 goto REPEAT;
3236
3237 case '*':
3238 repeat_min = 0;
3239 repeat_max = -1;
3240 goto REPEAT;
3241
3242 case '+':
3243 repeat_min = 1;
3244 repeat_max = -1;
3245 goto REPEAT;
3246
3247 case '?':
3248 repeat_min = 0;
3249 repeat_max = 1;
3250
3251 REPEAT:
3252 if (previous == NULL)
3253 {
3254 *errorcodeptr = ERR9;
3255 goto FAILED;
3256 }
3257
3258 if (repeat_min == 0)
3259 {
3260 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3261 reqbyte = zeroreqbyte; /* Ditto */
3262 }
3263
3264 /* Remember whether this is a variable length repeat */
3265
3266 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3267
3268 op_type = 0; /* Default single-char op codes */
3269 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3270
3271 /* Save start of previous item, in case we have to move it up to make space
3272 for an inserted OP_ONCE for the additional '+' extension. */
3273
3274 tempcode = previous;
3275
3276 /* If the next character is '+', we have a possessive quantifier. This
3277 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3278 If the next character is '?' this is a minimizing repeat, by default,
3279 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3280 repeat type to the non-default. */
3281
3282 if (ptr[1] == '+')
3283 {
3284 repeat_type = 0; /* Force greedy */
3285 possessive_quantifier = TRUE;
3286 ptr++;
3287 }
3288 else if (ptr[1] == '?')
3289 {
3290 repeat_type = greedy_non_default;
3291 ptr++;
3292 }
3293 else repeat_type = greedy_default;
3294
3295 /* If previous was a character match, abolish the item and generate a
3296 repeat item instead. If a char item has a minimum of more than one, ensure
3297 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3298 the first thing in a branch because the x will have gone into firstbyte
3299 instead. */
3300
3301 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3302 {
3303 /* Deal with UTF-8 characters that take up more than one byte. It's
3304 easier to write this out separately than try to macrify it. Use c to
3305 hold the length of the character in bytes, plus 0x80 to flag that it's a
3306 length rather than a small character. */
3307
3308 #ifdef SUPPORT_UTF8
3309 if (utf8 && (code[-1] & 0x80) != 0)
3310 {
3311 uschar *lastchar = code - 1;
3312 while((*lastchar & 0xc0) == 0x80) lastchar--;
3313 c = code - lastchar; /* Length of UTF-8 character */
3314 memcpy(utf8_char, lastchar, c); /* Save the char */
3315 c |= 0x80; /* Flag c as a length */
3316 }
3317 else
3318 #endif
3319
3320 /* Handle the case of a single byte - either with no UTF8 support, or
3321 with UTF-8 disabled, or for a UTF-8 character < 128. */
3322
3323 {
3324 c = code[-1];
3325 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3326 }
3327
3328 /* If the repetition is unlimited, it pays to see if the next thing on
3329 the line is something that cannot possibly match this character. If so,
3330 automatically possessifying this item gains some performance in the case
3331 where the match fails. */
3332
3333 if (!possessive_quantifier &&
3334 repeat_max < 0 &&
3335 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3336 options, cd))
3337 {
3338 repeat_type = 0; /* Force greedy */
3339 possessive_quantifier = TRUE;
3340 }
3341
3342 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3343 }
3344
3345 /* If previous was a single negated character ([^a] or similar), we use
3346 one of the special opcodes, replacing it. The code is shared with single-
3347 character repeats by setting op_type to add a suitable offset into
3348 repeat_type. We can also test for auto-possessification. OP_NOT is
3349 currently used only for single-byte chars. */
3350
3351 else if (*previous == OP_NOT)
3352 {
3353 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3354 c = previous[1];
3355 if (!possessive_quantifier &&
3356 repeat_max < 0 &&
3357 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3358 {
3359 repeat_type = 0; /* Force greedy */
3360 possessive_quantifier = TRUE;
3361 }
3362 goto OUTPUT_SINGLE_REPEAT;
3363 }
3364
3365 /* If previous was a character type match (\d or similar), abolish it and
3366 create a suitable repeat item. The code is shared with single-character
3367 repeats by setting op_type to add a suitable offset into repeat_type. Note
3368 that the Unicode property types will be present only when SUPPORT_UCP is
3369 defined, but we don't wrap the little bits of code here because it just
3370 makes it horribly messy. */
3371
3372 else if (*previous < OP_EODN)
3373 {
3374 uschar *oldcode;
3375 int prop_type, prop_value;
3376 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3377 c = *previous;
3378
3379 if (!possessive_quantifier &&
3380 repeat_max < 0 &&
3381 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3382 {
3383 repeat_type = 0; /* Force greedy */
3384 possessive_quantifier = TRUE;
3385 }
3386
3387 OUTPUT_SINGLE_REPEAT:
3388 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3389 {
3390 prop_type = previous[1];
3391 prop_value = previous[2];
3392 }
3393 else prop_type = prop_value = -1;
3394
3395 oldcode = code;
3396 code = previous; /* Usually overwrite previous item */
3397
3398 /* If the maximum is zero then the minimum must also be zero; Perl allows
3399 this case, so we do too - by simply omitting the item altogether. */
3400
3401 if (repeat_max == 0) goto END_REPEAT;
3402
3403 /* All real repeats make it impossible to handle partial matching (maybe
3404 one day we will be able to remove this restriction). */
3405
3406 if (repeat_max != 1) cd->nopartial = TRUE;
3407
3408 /* Combine the op_type with the repeat_type */
3409
3410 repeat_type += op_type;
3411
3412 /* A minimum of zero is handled either as the special case * or ?, or as
3413 an UPTO, with the maximum given. */
3414
3415 if (repeat_min == 0)
3416 {
3417 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3418 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3419 else
3420 {
3421 *code++ = OP_UPTO + repeat_type;
3422 PUT2INC(code, 0, repeat_max);
3423 }
3424 }
3425
3426 /* A repeat minimum of 1 is optimized into some special cases. If the
3427 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3428 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3429 one less than the maximum. */
3430
3431 else if (repeat_min == 1)
3432 {
3433 if (repeat_max == -1)
3434 *code++ = OP_PLUS + repeat_type;
3435 else
3436 {
3437 code = oldcode; /* leave previous item in place */
3438 if (repeat_max == 1) goto END_REPEAT;
3439 *code++ = OP_UPTO + repeat_type;
3440 PUT2INC(code, 0, repeat_max - 1);
3441 }
3442 }
3443
3444 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3445 handled as an EXACT followed by an UPTO. */
3446
3447 else
3448 {
3449 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3450 PUT2INC(code, 0, repeat_min);
3451
3452 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3453 we have to insert the character for the previous code. For a repeated
3454 Unicode property match, there are two extra bytes that define the
3455 required property. In UTF-8 mode, long characters have their length in
3456 c, with the 0x80 bit as a flag. */
3457
3458 if (repeat_max < 0)
3459 {
3460 #ifdef SUPPORT_UTF8
3461 if (utf8 && c >= 128)
3462 {
3463 memcpy(code, utf8_char, c & 7);
3464 code += c & 7;
3465 }
3466 else
3467 #endif
3468 {
3469 *code++ = c;
3470 if (prop_type >= 0)
3471 {
3472 *code++ = prop_type;
3473 *code++ = prop_value;
3474 }
3475 }
3476 *code++ = OP_STAR + repeat_type;
3477 }
3478
3479 /* Else insert an UPTO if the max is greater than the min, again
3480 preceded by the character, for the previously inserted code. If the
3481 UPTO is just for 1 instance, we can use QUERY instead. */
3482
3483 else if (repeat_max != repeat_min)
3484 {
3485 #ifdef SUPPORT_UTF8
3486 if (utf8 && c >= 128)
3487 {
3488 memcpy(code, utf8_char, c & 7);
3489 code += c & 7;
3490 }
3491 else
3492 #endif
3493 *code++ = c;
3494 if (prop_type >= 0)
3495 {
3496 *code++ = prop_type;
3497 *code++ = prop_value;
3498 }
3499 repeat_max -= repeat_min;
3500
3501 if (repeat_max == 1)
3502 {
3503 *code++ = OP_QUERY + repeat_type;
3504 }
3505 else
3506 {
3507 *code++ = OP_UPTO + repeat_type;
3508 PUT2INC(code, 0, repeat_max);
3509 }
3510 }
3511 }
3512
3513 /* The character or character type itself comes last in all cases. */
3514
3515 #ifdef SUPPORT_UTF8
3516 if (utf8 && c >= 128)
3517 {
3518 memcpy(code, utf8_char, c & 7);
3519 code += c & 7;
3520 }
3521 else
3522 #endif
3523 *code++ = c;
3524
3525 /* For a repeated Unicode property match, there are two extra bytes that
3526 define the required property. */
3527
3528 #ifdef SUPPORT_UCP
3529 if (prop_type >= 0)
3530 {
3531 *code++ = prop_type;
3532 *code++ = prop_value;
3533 }
3534 #endif
3535 }
3536
3537 /* If previous was a character class or a back reference, we put the repeat
3538 stuff after it, but just skip the item if the repeat was {0,0}. */
3539
3540 else if (*previous == OP_CLASS ||
3541 *previous == OP_NCLASS ||
3542 #ifdef SUPPORT_UTF8
3543 *previous == OP_XCLASS ||
3544 #endif
3545 *previous == OP_REF)
3546 {
3547 if (repeat_max == 0)
3548 {
3549 code = previous;
3550 goto END_REPEAT;
3551 }
3552
3553 /* All real repeats make it impossible to handle partial matching (maybe
3554 one day we will be able to remove this restriction). */
3555
3556 if (repeat_max != 1) cd->nopartial = TRUE;
3557
3558 if (repeat_min == 0 && repeat_max == -1)
3559 *code++ = OP_CRSTAR + repeat_type;
3560 else if (repeat_min == 1 && repeat_max == -1)
3561 *code++ = OP_CRPLUS + repeat_type;
3562 else if (repeat_min == 0 && repeat_max == 1)
3563 *code++ = OP_CRQUERY + repeat_type;
3564 else
3565 {
3566 *code++ = OP_CRRANGE + repeat_type;
3567 PUT2INC(code, 0, repeat_min);
3568 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3569 PUT2INC(code, 0, repeat_max);
3570 }
3571 }
3572
3573 /* If previous was a bracket group, we may have to replicate it in certain
3574 cases. */
3575
3576 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3577 *previous == OP_ONCE || *previous == OP_COND)
3578 {
3579 register int i;
3580 int ketoffset = 0;
3581 int len = code - previous;
3582 uschar *bralink = NULL;
3583
3584 /* Repeating a DEFINE group is pointless */
3585
3586 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3587 {
3588 *errorcodeptr = ERR55;
3589 goto FAILED;
3590 }
3591
3592 /* If the maximum repeat count is unlimited, find the end of the bracket
3593 by scanning through from the start, and compute the offset back to it
3594 from the current code pointer. There may be an OP_OPT setting following
3595 the final KET, so we can't find the end just by going back from the code
3596 pointer. */
3597
3598 if (repeat_max == -1)
3599 {
3600 register uschar *ket = previous;
3601 do ket += GET(ket, 1); while (*ket != OP_KET);
3602 ketoffset = code - ket;
3603 }
3604
3605 /* The case of a zero minimum is special because of the need to stick
3606 OP_BRAZERO in front of it, and because the group appears once in the
3607 data, whereas in other cases it appears the minimum number of times. For
3608 this reason, it is simplest to treat this case separately, as otherwise
3609 the code gets far too messy. There are several special subcases when the
3610 minimum is zero. */
3611
3612 if (repeat_min == 0)
3613 {
3614 /* If the maximum is also zero, we just omit the group from the output
3615 altogether. */
3616
3617 if (repeat_max == 0)
3618 {
3619 code = previous;
3620 goto END_REPEAT;
3621 }
3622
3623 /* If the maximum is 1 or unlimited, we just have to stick in the
3624 BRAZERO and do no more at this point. However, we do need to adjust
3625 any OP_RECURSE calls inside the group that refer to the group itself or
3626 any internal or forward referenced group, because the offset is from
3627 the start of the whole regex. Temporarily terminate the pattern while
3628 doing this. */
3629
3630 if (repeat_max <= 1)
3631 {
3632 *code = OP_END;
3633 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3634 memmove(previous+1, previous, len);
3635 code++;
3636 *previous++ = OP_BRAZERO + repeat_type;
3637 }
3638
3639 /* If the maximum is greater than 1 and limited, we have to replicate
3640 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3641 The first one has to be handled carefully because it's the original
3642 copy, which has to be moved up. The remainder can be handled by code
3643 that is common with the non-zero minimum case below. We have to
3644 adjust the value of repeat_max, since one less copy is required. Once
3645 again, we may have to adjust any OP_RECURSE calls inside the group. */
3646
3647 else
3648 {
3649 int offset;
3650 *code = OP_END;
3651 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3652 memmove(previous + 2 + LINK_SIZE, previous, len);
3653 code += 2 + LINK_SIZE;
3654 *previous++ = OP_BRAZERO + repeat_type;
3655 *previous++ = OP_BRA;
3656
3657 /* We chain together the bracket offset fields that have to be
3658 filled in later when the ends of the brackets are reached. */
3659
3660 offset = (bralink == NULL)? 0 : previous - bralink;
3661 bralink = previous;
3662 PUTINC(previous, 0, offset);
3663 }
3664
3665 repeat_max--;
3666 }
3667
3668 /* If the minimum is greater than zero, replicate the group as many
3669 times as necessary, and adjust the maximum to the number of subsequent
3670 copies that we need. If we set a first char from the group, and didn't
3671 set a required char, copy the latter from the former. If there are any
3672 forward reference subroutine calls in the group, there will be entries on
3673 the workspace list; replicate these with an appropriate increment. */
3674
3675 else
3676 {
3677 if (repeat_min > 1)
3678 {
3679 /* In the pre-compile phase, we don't actually do the replication. We
3680 just adjust the length as if we had. Do some paranoid checks for
3681 potential integer overflow. */
3682
3683 if (lengthptr != NULL)
3684 {
3685 int delta = (repeat_min - 1)*length_prevgroup;
3686 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3687 (double)INT_MAX ||
3688 OFLOW_MAX - *lengthptr < delta)
3689 {
3690 *errorcodeptr = ERR20;
3691 goto FAILED;
3692 }
3693 *lengthptr += delta;
3694 }
3695
3696 /* This is compiling for real */
3697
3698 else
3699 {
3700 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3701 for (i = 1; i < repeat_min; i++)
3702 {
3703 uschar *hc;
3704 uschar *this_hwm = cd->hwm;
3705 memcpy(code, previous, len);
3706 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3707 {
3708 PUT(cd->hwm, 0, GET(hc, 0) + len);
3709 cd->hwm += LINK_SIZE;
3710 }
3711 save_hwm = this_hwm;
3712 code += len;
3713 }
3714 }
3715 }
3716
3717 if (repeat_max > 0) repeat_max -= repeat_min;
3718 }
3719
3720 /* This code is common to both the zero and non-zero minimum cases. If
3721 the maximum is limited, it replicates the group in a nested fashion,
3722 remembering the bracket starts on a stack. In the case of a zero minimum,
3723 the first one was set up above. In all cases the repeat_max now specifies
3724 the number of additional copies needed. Again, we must remember to
3725 replicate entries on the forward reference list. */
3726
3727 if (repeat_max >= 0)
3728 {
3729 /* In the pre-compile phase, we don't actually do the replication. We
3730 just adjust the length as if we had. For each repetition we must add 1
3731 to the length for BRAZERO and for all but the last repetition we must
3732 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3733 paranoid checks to avoid integer overflow. */
3734
3735 if (lengthptr != NULL && repeat_max > 0)
3736 {
3737 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3738 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3739 if ((double)repeat_max *
3740 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3741 > (double)INT_MAX ||
3742 OFLOW_MAX - *lengthptr < delta)
3743 {
3744 *errorcodeptr = ERR20;
3745 goto FAILED;
3746 }
3747 *lengthptr += delta;
3748 }
3749
3750 /* This is compiling for real */
3751
3752 else for (i = repeat_max - 1; i >= 0; i--)
3753 {
3754 uschar *hc;
3755 uschar *this_hwm = cd->hwm;
3756
3757 *code++ = OP_BRAZERO + repeat_type;
3758
3759 /* All but the final copy start a new nesting, maintaining the
3760 chain of brackets outstanding. */
3761
3762 if (i != 0)
3763 {
3764 int offset;
3765 *code++ = OP_BRA;
3766 offset = (bralink == NULL)? 0 : code - bralink;
3767 bralink = code;
3768 PUTINC(code, 0, offset);
3769 }
3770
3771 memcpy(code, previous, len);
3772 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3773 {
3774 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3775 cd->hwm += LINK_SIZE;
3776 }
3777 save_hwm = this_hwm;
3778 code += len;
3779 }
3780
3781 /* Now chain through the pending brackets, and fill in their length
3782 fields (which are holding the chain links pro tem). */
3783
3784 while (bralink != NULL)
3785 {
3786 int oldlinkoffset;
3787 int offset = code - bralink + 1;
3788 uschar *bra = code - offset;
3789 oldlinkoffset = GET(bra, 1);
3790 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3791 *code++ = OP_KET;
3792 PUTINC(code, 0, offset);
3793 PUT(bra, 1, offset);
3794 }
3795 }
3796
3797 /* If the maximum is unlimited, set a repeater in the final copy. We
3798 can't just offset backwards from the current code point, because we
3799 don't know if there's been an options resetting after the ket. The
3800 correct offset was computed above.
3801
3802 Then, when we are doing the actual compile phase, check to see whether
3803 this group is a non-atomic one that could match an empty string. If so,
3804 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3805 that runtime checking can be done. [This check is also applied to
3806 atomic groups at runtime, but in a different way.] */
3807
3808 else
3809 {
3810 uschar *ketcode = code - ketoffset;
3811 uschar *bracode = ketcode - GET(ketcode, 1);
3812 *ketcode = OP_KETRMAX + repeat_type;
3813 if (lengthptr == NULL && *bracode != OP_ONCE)
3814 {
3815 uschar *scode = bracode;
3816 do
3817 {
3818 if (could_be_empty_branch(scode, ketcode, utf8))
3819 {
3820 *bracode += OP_SBRA - OP_BRA;
3821 break;
3822 }
3823 scode += GET(scode, 1);
3824 }
3825 while (*scode == OP_ALT);
3826 }
3827 }
3828 }
3829
3830 /* Else there's some kind of shambles */
3831
3832 else
3833 {
3834 *errorcodeptr = ERR11;
3835 goto FAILED;
3836 }
3837
3838 /* If the character following a repeat is '+', or if certain optimization
3839 tests above succeeded, possessive_quantifier is TRUE. For some of the
3840 simpler opcodes, there is a special alternative opcode for this. For
3841 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3842 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3843 but the special opcodes can optimize it a bit. The repeated item starts at
3844 tempcode, not at previous, which might be the first part of a string whose
3845 (former) last char we repeated.
3846
3847 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3848 an 'upto' may follow. We skip over an 'exact' item, and then test the
3849 length of what remains before proceeding. */
3850
3851 if (possessive_quantifier)
3852 {
3853 int len;
3854 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3855 *tempcode == OP_NOTEXACT)
3856 tempcode += _pcre_OP_lengths[*tempcode];
3857 len = code - tempcode;
3858 if (len > 0) switch (*tempcode)
3859 {
3860 case OP_STAR: *tempcode = OP_POSSTAR; break;
3861 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3862 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3863 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3864
3865 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3866 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3867 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3868 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3869
3870 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3871 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3872 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3873 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3874
3875 default:
3876 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3877 code += 1 + LINK_SIZE;
3878 len += 1 + LINK_SIZE;
3879 tempcode[0] = OP_ONCE;
3880 *code++ = OP_KET;
3881 PUTINC(code, 0, len);
3882 PUT(tempcode, 1, len);
3883 break;
3884 }
3885 }
3886
3887 /* In all cases we no longer have a previous item. We also set the
3888 "follows varying string" flag for subsequently encountered reqbytes if
3889 it isn't already set and we have just passed a varying length item. */
3890
3891 END_REPEAT:
3892 previous = NULL;
3893 cd->req_varyopt |= reqvary;
3894 break;
3895
3896
3897 /* ===================================================================*/
3898 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3899 lookbehind or option setting or condition or all the other extended
3900 parenthesis forms. */
3901
3902 case '(':
3903 newoptions = options;
3904 skipbytes = 0;
3905 bravalue = OP_CBRA;
3906 save_hwm = cd->hwm;
3907 reset_bracount = FALSE;
3908
3909 /* First deal with various "verbs" that can be introduced by '*'. */
3910
3911 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3912 {
3913 int i, namelen;
3914 const uschar *name = ++ptr;
3915 previous = NULL;
3916 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3917 if (*ptr == ':')
3918 {
3919 *errorcodeptr = ERR59; /* Not supported */
3920 goto FAILED;
3921 }
3922 if (*ptr != ')')
3923 {
3924 *errorcodeptr = ERR60;
3925 goto FAILED;
3926 }
3927 namelen = ptr - name;
3928 for (i = 0; i < verbcount; i++)
3929 {
3930 if (namelen == verbs[i].len &&
3931 strncmp((char *)name, verbs[i].name, namelen) == 0)
3932 {
3933 *code = verbs[i].op;
3934 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3935 break;
3936 }
3937 }
3938 if (i < verbcount) continue;
3939 *errorcodeptr = ERR60;
3940 goto FAILED;
3941 }
3942
3943 /* Deal with the extended parentheses; all are introduced by '?', and the
3944 appearance of any of them means that this is not a capturing group. */
3945
3946 else if (*ptr == '?')
3947 {
3948 int i, set, unset, namelen;
3949 int *optset;
3950 const uschar *name;
3951 uschar *slot;
3952
3953 switch (*(++ptr))
3954 {
3955 case '#': /* Comment; skip to ket */
3956 ptr++;
3957 while (*ptr != 0 && *ptr != ')') ptr++;
3958 if (*ptr == 0)
3959 {
3960 *errorcodeptr = ERR18;
3961 goto FAILED;
3962 }
3963 continue;
3964
3965
3966 /* ------------------------------------------------------------ */
3967 case '|': /* Reset capture count for each branch */
3968 reset_bracount = TRUE;
3969 /* Fall through */
3970
3971 /* ------------------------------------------------------------ */
3972 case ':': /* Non-capturing bracket */
3973 bravalue = OP_BRA;
3974 ptr++;
3975 break;
3976
3977
3978 /* ------------------------------------------------------------ */
3979 case '(':
3980 bravalue = OP_COND; /* Conditional group */
3981
3982 /* A condition can be an assertion, a number (referring to a numbered
3983 group), a name (referring to a named group), or 'R', referring to
3984 recursion. R<digits> and R&name are also permitted for recursion tests.
3985
3986 There are several syntaxes for testing a named group: (?(name)) is used
3987 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3988
3989 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3990 be the recursive thing or the name 'R' (and similarly for 'R' followed
3991 by digits), and (b) a number could be a name that consists of digits.
3992 In both cases, we look for a name first; if not found, we try the other
3993 cases. */
3994
3995 /* For conditions that are assertions, check the syntax, and then exit
3996 the switch. This will take control down to where bracketed groups,
3997 including assertions, are processed. */
3998
3999 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4000 break;
4001
4002 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4003 below), and all need to skip 3 bytes at the start of the group. */
4004
4005 code[1+LINK_SIZE] = OP_CREF;
4006 skipbytes = 3;
4007 refsign = -1;
4008
4009 /* Check for a test for recursion in a named group. */
4010
4011 if (ptr[1] == 'R' && ptr[2] == '&')
4012 {
4013 terminator = -1;
4014 ptr += 2;
4015 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4016 }
4017
4018 /* Check for a test for a named group's having been set, using the Perl
4019 syntax (?(<name>) or (?('name') */
4020
4021 else if (ptr[1] == '<')
4022 {
4023 terminator = '>';
4024 ptr++;
4025 }
4026 else if (ptr[1] == '\'')
4027 {
4028 terminator = '\'';
4029 ptr++;
4030 }
4031 else
4032 {
4033 terminator = 0;
4034 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4035 }
4036
4037 /* We now expect to read a name; anything else is an error */
4038
4039 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4040 {
4041 ptr += 1; /* To get the right offset */
4042 *errorcodeptr = ERR28;
4043 goto FAILED;
4044 }
4045
4046 /* Read the name, but also get it as a number if it's all digits */
4047
4048 recno = 0;
4049 name = ++ptr;
4050 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4051 {
4052 if (recno >= 0)
4053 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4054 recno * 10 + *ptr - '0' : -1;
4055 ptr++;
4056 }
4057 namelen = ptr - name;
4058
4059 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4060 {
4061 ptr--; /* Error offset */
4062 *errorcodeptr = ERR26;
4063 goto FAILED;
4064 }
4065
4066 /* Do no further checking in the pre-compile phase. */
4067
4068 if (lengthptr != NULL) break;
4069
4070 /* In the real compile we do the work of looking for the actual
4071 reference. If the string started with "+" or "-" we require the rest to
4072 be digits, in which case recno will be set. */
4073
4074 if (refsign > 0)
4075 {
4076 if (recno <= 0)
4077 {
4078 *errorcodeptr = ERR58;
4079 goto FAILED;
4080 }
4081 if (refsign == '-')
4082 {
4083 recno = cd->bracount - recno + 1;
4084 if (recno <= 0)
4085 {
4086 *errorcodeptr = ERR15;
4087 goto FAILED;
4088 }
4089 }
4090 else recno += cd->bracount;
4091 PUT2(code, 2+LINK_SIZE, recno);
4092 break;
4093 }
4094
4095 /* Otherwise (did not start with "+" or "-"), start by looking for the
4096 name. */
4097
4098 slot = cd->name_table;
4099 for (i = 0; i < cd->names_found; i++)
4100 {
4101 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4102 slot += cd->name_entry_size;
4103 }
4104
4105 /* Found a previous named subpattern */
4106
4107 if (i < cd->names_found)
4108 {
4109 recno = GET2(slot, 0);
4110 PUT2(code, 2+LINK_SIZE, recno);
4111 }
4112
4113 /* Search the pattern for a forward reference */
4114
4115 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4116 (options & PCRE_EXTENDED) != 0)) > 0)
4117 {
4118 PUT2(code, 2+LINK_SIZE, i);
4119 }
4120
4121 /* If terminator == 0 it means that the name followed directly after
4122 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4123 some further alternatives to try. For the cases where terminator != 0
4124 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4125 now checked all the possibilities, so give an error. */
4126
4127 else if (terminator != 0)
4128 {
4129 *errorcodeptr = ERR15;
4130 goto FAILED;
4131 }
4132
4133 /* Check for (?(R) for recursion. Allow digits after R to specify a
4134 specific group number. */
4135
4136 else if (*name == 'R')
4137 {
4138 recno = 0;
4139 for (i = 1; i < namelen; i++)
4140 {
4141 if ((digitab[name[i]] & ctype_digit) == 0)
4142 {
4143 *errorcodeptr = ERR15;
4144 goto FAILED;
4145 }
4146 recno = recno * 10 + name[i] - '0';
4147 }
4148 if (recno == 0) recno = RREF_ANY;
4149 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4150 PUT2(code, 2+LINK_SIZE, recno);
4151 }
4152
4153 /* Similarly, check for the (?(DEFINE) "condition", which is always
4154 false. */
4155
4156 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4157 {
4158 code[1+LINK_SIZE] = OP_DEF;
4159 skipbytes = 1;
4160 }
4161
4162 /* Check for the "name" actually being a subpattern number. */
4163
4164 else if (recno > 0)
4165 {
4166 PUT2(code, 2+LINK_SIZE, recno);
4167 }
4168
4169 /* Either an unidentified subpattern, or a reference to (?(0) */
4170
4171 else
4172 {
4173 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4174 goto FAILED;
4175 }
4176 break;
4177
4178
4179 /* ------------------------------------------------------------ */
4180 case '=': /* Positive lookahead */
4181 bravalue = OP_ASSERT;
4182 ptr++;
4183 break;
4184
4185
4186 /* ------------------------------------------------------------ */
4187 case '!': /* Negative lookahead */
4188 ptr++;
4189 if (*ptr == ')') /* Optimize (?!) */
4190 {
4191 *code++ = OP_FAIL;
4192 previous = NULL;
4193 continue;
4194 }
4195 bravalue = OP_ASSERT_NOT;
4196 break;
4197
4198
4199 /* ------------------------------------------------------------ */
4200 case '<': /* Lookbehind or named define */
4201 switch (ptr[1])
4202 {
4203 case '=': /* Positive lookbehind */
4204 bravalue = OP_ASSERTBACK;
4205 ptr += 2;
4206 break;
4207
4208 case '!': /* Negative lookbehind */
4209 bravalue = OP_ASSERTBACK_NOT;
4210 ptr += 2;
4211 break;
4212
4213 default: /* Could be name define, else bad */
4214 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4215 ptr++; /* Correct offset for error */
4216 *errorcodeptr = ERR24;
4217 goto FAILED;
4218 }
4219 break;
4220
4221
4222 /* ------------------------------------------------------------ */
4223 case '>': /* One-time brackets */
4224 bravalue = OP_ONCE;
4225 ptr++;
4226 break;
4227
4228
4229 /* ------------------------------------------------------------ */
4230 case 'C': /* Callout - may be followed by digits; */
4231 previous_callout = code; /* Save for later completion */
4232 after_manual_callout = 1; /* Skip one item before completing */
4233 *code++ = OP_CALLOUT;
4234 {
4235 int n = 0;
4236 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4237 n = n * 10 + *ptr - '0';
4238 if (*ptr != ')')
4239 {
4240 *errorcodeptr = ERR39;
4241 goto FAILED;
4242 }
4243 if (n > 255)
4244 {
4245 *errorcodeptr = ERR38;
4246 goto FAILED;
4247 }
4248 *code++ = n;
4249 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4250 PUT(code, LINK_SIZE, 0); /* Default length */
4251 code += 2 * LINK_SIZE;
4252 }
4253 previous = NULL;
4254 continue;
4255
4256
4257 /* ------------------------------------------------------------ */
4258 case 'P': /* Python-style named subpattern handling */
4259 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4260 {
4261 is_recurse = *ptr == '>';
4262 terminator = ')';
4263 goto NAMED_REF_OR_RECURSE;
4264 }
4265 else if (*ptr != '<') /* Test for Python-style definition */
4266 {
4267 *errorcodeptr = ERR41;
4268 goto FAILED;
4269 }
4270 /* Fall through to handle (?P< as (?< is handled */
4271
4272
4273 /* ------------------------------------------------------------ */
4274 DEFINE_NAME: /* Come here from (?< handling */
4275 case '\'':
4276 {
4277 terminator = (*ptr == '<')? '>' : '\'';
4278 name = ++ptr;
4279
4280 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4281 namelen = ptr - name;
4282
4283 /* In the pre-compile phase, just do a syntax check. */
4284
4285 if (lengthptr != NULL)
4286 {
4287 if (*ptr != terminator)
4288 {
4289 *errorcodeptr = ERR42;
4290 goto FAILED;
4291 }
4292 if (cd->names_found >= MAX_NAME_COUNT)
4293 {
4294 *errorcodeptr = ERR49;
4295 goto FAILED;
4296 }
4297 if (namelen + 3 > cd->name_entry_size)
4298 {
4299 cd->name_entry_size = namelen + 3;
4300 if (namelen > MAX_NAME_SIZE)
4301 {
4302 *errorcodeptr = ERR48;
4303 goto FAILED;
4304 }
4305 }
4306 }
4307
4308 /* In the real compile, create the entry in the table */
4309
4310 else
4311 {
4312 slot = cd->name_table;
4313 for (i = 0; i < cd->names_found; i++)
4314 {
4315 int crc = memcmp(name, slot+2, namelen);
4316 if (crc == 0)
4317 {
4318 if (slot[2+namelen] == 0)
4319 {
4320 if ((options & PCRE_DUPNAMES) == 0)
4321 {
4322 *errorcodeptr = ERR43;
4323 goto FAILED;
4324 }
4325 }
4326 else crc = -1; /* Current name is substring */
4327 }
4328 if (crc < 0)
4329 {
4330 memmove(slot + cd->name_entry_size, slot,
4331 (cd->names_found - i) * cd->name_entry_size);
4332 break;
4333 }
4334 slot += cd->name_entry_size;
4335 }
4336
4337 PUT2(slot, 0, cd->bracount + 1);
4338 memcpy(slot + 2, name, namelen);
4339 slot[2+namelen] = 0;
4340 }
4341 }
4342
4343 /* In both cases, count the number of names we've encountered. */
4344
4345 ptr++; /* Move past > or ' */
4346 cd->names_found++;
4347 goto NUMBERED_GROUP;
4348
4349
4350 /* ------------------------------------------------------------ */
4351 case '&': /* Perl recursion/subroutine syntax */
4352 terminator = ')';
4353 is_recurse = TRUE;
4354 /* Fall through */
4355
4356 /* We come here from the Python syntax above that handles both
4357 references (?P=name) and recursion (?P>name), as well as falling
4358 through from the Perl recursion syntax (?&name). */
4359
4360 NAMED_REF_OR_RECURSE:
4361 name = ++ptr;
4362 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4363 namelen = ptr - name;
4364
4365 /* In the pre-compile phase, do a syntax check and set a dummy
4366 reference number. */
4367
4368 if (lengthptr != NULL)
4369 {
4370 if (*ptr != terminator)
4371 {
4372 *errorcodeptr = ERR42;
4373 goto FAILED;
4374 }
4375 if (namelen > MAX_NAME_SIZE)
4376 {
4377 *errorcodeptr = ERR48;
4378 goto FAILED;
4379 }
4380 recno = 0;
4381 }
4382
4383 /* In the real compile, seek the name in the table */
4384
4385 else
4386 {
4387 slot = cd->name_table;
4388 for (i = 0; i < cd->names_found; i++)
4389 {
4390 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4391 slot += cd->name_entry_size;
4392 }
4393
4394 if (i < cd->names_found) /* Back reference */
4395 {
4396 recno = GET2(slot, 0);
4397 }
4398 else if ((recno = /* Forward back reference */
4399 find_parens(ptr, cd->bracount, name, namelen,
4400 (options & PCRE_EXTENDED) != 0)) <= 0)
4401 {
4402 *errorcodeptr = ERR15;
4403 goto FAILED;
4404 }
4405 }
4406
4407 /* In both phases, we can now go to the code that handles numerical
4408 recursion or backreferences. */
4409
4410 if (is_recurse) goto HANDLE_RECURSION;
4411 else goto HANDLE_REFERENCE;
4412
4413
4414 /* ------------------------------------------------------------ */
4415 case 'R': /* Recursion */
4416 ptr++; /* Same as (?0) */
4417 /* Fall through */
4418
4419
4420 /* ------------------------------------------------------------ */
4421 case '-': case '+':
4422 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4423 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4424 {
4425 const uschar *called;
4426
4427 if ((refsign = *ptr) == '+') ptr++;
4428 else if (refsign == '-')
4429 {
4430 if ((digitab[ptr[1]] & ctype_digit) == 0)
4431 goto OTHER_CHAR_AFTER_QUERY;
4432 ptr++;
4433 }
4434
4435 recno = 0;
4436 while((digitab[*ptr] & ctype_digit) != 0)
4437 recno = recno * 10 + *ptr++ - '0';
4438
4439 if (*ptr != ')')
4440 {
4441 *errorcodeptr = ERR29;
4442 goto FAILED;
4443 }
4444
4445 if (refsign == '-')
4446 {
4447 if (recno == 0)
4448 {
4449 *errorcodeptr = ERR58;
4450 goto FAILED;
4451 }
4452 recno = cd->bracount - recno + 1;
4453 if (recno <= 0)
4454 {
4455 *errorcodeptr = ERR15;
4456 goto FAILED;
4457 }
4458 }
4459 else if (refsign == '+')
4460 {
4461 if (recno == 0)
4462 {
4463 *errorcodeptr = ERR58;
4464 goto FAILED;
4465 }
4466 recno += cd->bracount;
4467 }
4468
4469 /* Come here from code above that handles a named recursion */
4470
4471 HANDLE_RECURSION:
4472
4473 previous = code;
4474 called = cd->start_code;
4475
4476 /* When we are actually compiling, find the bracket that is being
4477 referenced. Temporarily end the regex in case it doesn't exist before
4478 this point. If we end up with a forward reference, first check that
4479 the bracket does occur later so we can give the error (and position)
4480 now. Then remember this forward reference in the workspace so it can
4481 be filled in at the end. */
4482
4483 if (lengthptr == NULL)
4484 {
4485 *code = OP_END;
4486 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4487
4488 /* Forward reference */
4489
4490 if (called == NULL)
4491 {
4492 if (find_parens(ptr, cd->bracount, NULL, recno,
4493 (options & PCRE_EXTENDED) != 0) < 0)
4494 {
4495 *errorcodeptr = ERR15;
4496 goto FAILED;
4497 }
4498 called = cd->start_code + recno;
4499 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4500 }
4501
4502 /* If not a forward reference, and the subpattern is still open,
4503 this is a recursive call. We check to see if this is a left
4504 recursion that could loop for ever, and diagnose that case. */
4505
4506 else if (GET(called, 1) == 0 &&
4507 could_be_empty(called, code, bcptr, utf8))
4508 {
4509 *errorcodeptr = ERR40;
4510 goto FAILED;
4511 }
4512 }
4513
4514 /* Insert the recursion/subroutine item, automatically wrapped inside
4515 "once" brackets. Set up a "previous group" length so that a
4516 subsequent quantifier will work. */
4517
4518 *code = OP_ONCE;
4519 PUT(code, 1, 2 + 2*LINK_SIZE);
4520 code += 1 + LINK_SIZE;
4521
4522 *code = OP_RECURSE;
4523 PUT(code, 1, called - cd->start_code);
4524 code += 1 + LINK_SIZE;
4525
4526 *code = OP_KET;
4527 PUT(code, 1, 2 + 2*LINK_SIZE);
4528 code += 1 + LINK_SIZE;
4529
4530 length_prevgroup = 3 + 3*LINK_SIZE;
4531 }
4532
4533 /* Can't determine a first byte now */
4534
4535 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4536 continue;
4537
4538
4539 /* ------------------------------------------------------------ */
4540 default: /* Other characters: check option setting */
4541 OTHER_CHAR_AFTER_QUERY:
4542 set = unset = 0;
4543 optset = &set;
4544
4545 while (*ptr != ')' && *ptr != ':')
4546 {
4547 switch (*ptr++)
4548 {
4549 case '-': optset = &unset; break;
4550
4551 case 'J': /* Record that it changed in the external options */
4552 *optset |= PCRE_DUPNAMES;
4553 cd->external_options |= PCRE_JCHANGED;
4554 break;
4555
4556 case 'i': *optset |= PCRE_CASELESS; break;
4557 case 'm': *optset |= PCRE_MULTILINE; break;
4558 case 's': *optset |= PCRE_DOTALL; break;
4559 case 'x': *optset |= PCRE_EXTENDED; break;
4560 case 'U': *optset |= PCRE_UNGREEDY; break;
4561 case 'X': *optset |= PCRE_EXTRA; break;
4562
4563 default: *errorcodeptr = ERR12;
4564 ptr--; /* Correct the offset */
4565 goto FAILED;
4566 }
4567 }
4568
4569 /* Set up the changed option bits, but don't change anything yet. */
4570
4571 newoptions = (options | set) & (~unset);
4572
4573 /* If the options ended with ')' this is not the start of a nested
4574 group with option changes, so the options change at this level. If this
4575 item is right at the start of the pattern, the options can be
4576 abstracted and made external in the pre-compile phase, and ignored in
4577 the compile phase. This can be helpful when matching -- for instance in
4578 caseless checking of required bytes.
4579
4580 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4581 definitely *not* at the start of the pattern because something has been
4582 compiled. In the pre-compile phase, however, the code pointer can have
4583 that value after the start, because it gets reset as code is discarded
4584 during the pre-compile. However, this can happen only at top level - if
4585 we are within parentheses, the starting BRA will still be present. At
4586 any parenthesis level, the length value can be used to test if anything
4587 has been compiled at that level. Thus, a test for both these conditions
4588 is necessary to ensure we correctly detect the start of the pattern in
4589 both phases.
4590
4591 If we are not at the pattern start, compile code to change the ims
4592 options if this setting actually changes any of them. We also pass the
4593 new setting back so that it can be put at the start of any following
4594 branches, and when this group ends (if we are in a group), a resetting
4595 item can be compiled. */
4596
4597 if (*ptr == ')')
4598 {
4599 if (code == cd->start_code + 1 + LINK_SIZE &&
4600 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4601 {
4602 cd->external_options = newoptions;
4603 options = newoptions;
4604 }
4605 else
4606 {
4607 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4608 {
4609 *code++ = OP_OPT;
4610 *code++ = newoptions & PCRE_IMS;
4611 }
4612
4613 /* Change options at this level, and pass them back for use
4614 in subsequent branches. Reset the greedy defaults and the case
4615 value for firstbyte and reqbyte. */
4616
4617 *optionsptr = options = newoptions;
4618 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4619 greedy_non_default = greedy_default ^ 1;
4620 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4621 }
4622
4623 previous = NULL; /* This item can't be repeated */
4624 continue; /* It is complete */
4625 }
4626
4627 /* If the options ended with ':' we are heading into a nested group
4628 with possible change of options. Such groups are non-capturing and are
4629 not assertions of any kind. All we need to do is skip over the ':';
4630 the newoptions value is handled below. */
4631
4632 bravalue = OP_BRA;
4633 ptr++;
4634 } /* End of switch for character following (? */
4635 } /* End of (? handling */
4636
4637 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4638 all unadorned brackets become non-capturing and behave like (?:...)
4639 brackets. */
4640
4641 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4642 {
4643 bravalue = OP_BRA;
4644 }
4645
4646 /* Else we have a capturing group. */
4647
4648 else
4649 {
4650 NUMBERED_GROUP:
4651 cd->bracount += 1;
4652 PUT2(code, 1+LINK_SIZE, cd->bracount);
4653 skipbytes = 2;
4654 }
4655
4656 /* Process nested bracketed regex. Assertions may not be repeated, but
4657 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4658 non-register variable in order to be able to pass its address because some
4659 compilers complain otherwise. Pass in a new setting for the ims options if
4660 they have changed. */
4661
4662 previous = (bravalue >= OP_ONCE)? code : NULL;
4663 *code = bravalue;
4664 tempcode = code;
4665 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4666 length_prevgroup = 0; /* Initialize for pre-compile phase */
4667
4668 if (!compile_regex(
4669 newoptions, /* The complete new option state */
4670 options & PCRE_IMS, /* The previous ims option state */
4671 &tempcode, /* Where to put code (updated) */
4672 &ptr, /* Input pointer (updated) */
4673 errorcodeptr, /* Where to put an error message */
4674 (bravalue == OP_ASSERTBACK ||
4675 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4676 reset_bracount, /* True if (?| group */
4677 skipbytes, /* Skip over bracket number */
4678 &subfirstbyte, /* For possible first char */
4679 &subreqbyte, /* For possible last char */
4680 bcptr, /* Current branch chain */
4681 cd, /* Tables block */
4682 (lengthptr == NULL)? NULL : /* Actual compile phase */
4683 &length_prevgroup /* Pre-compile phase */
4684 ))
4685 goto FAILED;
4686
4687 /* At the end of compiling, code is still pointing to the start of the
4688 group, while tempcode has been updated to point past the end of the group
4689 and any option resetting that may follow it. The pattern pointer (ptr)
4690 is on the bracket. */
4691
4692 /* If this is a conditional bracket, check that there are no more than
4693 two branches in the group, or just one if it's a DEFINE group. We do this
4694 in the real compile phase, not in the pre-pass, where the whole group may
4695 not be available. */
4696
4697 if (bravalue == OP_COND && lengthptr == NULL)
4698 {
4699 uschar *tc = code;
4700 int condcount = 0;
4701
4702 do {
4703 condcount++;
4704 tc += GET(tc,1);
4705 }
4706 while (*tc != OP_KET);
4707
4708 /* A DEFINE group is never obeyed inline (the "condition" is always
4709 false). It must have only one branch. */
4710
4711 if (code[LINK_SIZE+1] == OP_DEF)
4712 {
4713 if (condcount > 1)
4714 {
4715 *errorcodeptr = ERR54;
4716 goto FAILED;
4717 }
4718 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4719 }
4720
4721 /* A "normal" conditional group. If there is just one branch, we must not
4722 make use of its firstbyte or reqbyte, because this is equivalent to an
4723 empty second branch. */
4724
4725 else
4726 {
4727 if (condcount > 2)
4728 {
4729 *errorcodeptr = ERR27;
4730 goto FAILED;
4731 }
4732 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4733 }
4734 }
4735
4736 /* Error if hit end of pattern */
4737
4738 if (*ptr != ')')
4739 {
4740 *errorcodeptr = ERR14;
4741 goto FAILED;
4742 }
4743
4744 /* In the pre-compile phase, update the length by the length of the nested
4745 group, less the brackets at either end. Then reduce the compiled code to
4746 just the brackets so that it doesn't use much memory if it is duplicated by
4747 a quantifier. */
4748
4749 if (lengthptr != NULL)
4750 {
4751 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4752 {
4753 *errorcodeptr = ERR20;
4754 goto FAILED;
4755 }
4756 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4757 code++;
4758 PUTINC(code, 0, 1 + LINK_SIZE);
4759 *code++ = OP_KET;
4760 PUTINC(code, 0, 1 + LINK_SIZE);
4761 }
4762
4763 /* Otherwise update the main code pointer to the end of the group. */
4764
4765 else code = tempcode;
4766
4767 /* For a DEFINE group, required and first character settings are not
4768 relevant. */
4769
4770 if (bravalue == OP_DEF) break;
4771
4772 /* Handle updating of the required and first characters for other types of
4773 group. Update for normal brackets of all kinds, and conditions with two
4774 branches (see code above). If the bracket is followed by a quantifier with
4775 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4776 zerofirstbyte outside the main loop so that they can be accessed for the
4777 back off. */
4778
4779 zeroreqbyte = reqbyte;
4780 zerofirstbyte = firstbyte;
4781 groupsetfirstbyte = FALSE;
4782
4783 if (bravalue >= OP_ONCE)
4784 {
4785 /* If we have not yet set a firstbyte in this branch, take it from the
4786 subpattern, remembering that it was set here so that a repeat of more
4787 than one can replicate it as reqbyte if necessary. If the subpattern has
4788 no firstbyte, set "none" for the whole branch. In both cases, a zero
4789 repeat forces firstbyte to "none". */
4790
4791 if (firstbyte == REQ_UNSET)
4792 {
4793 if (subfirstbyte >= 0)
4794 {
4795 firstbyte = subfirstbyte;
4796 groupsetfirstbyte = TRUE;
4797 }
4798 else firstbyte = REQ_NONE;
4799 zerofirstbyte = REQ_NONE;
4800 }
4801
4802 /* If firstbyte was previously set, convert the subpattern's firstbyte
4803 into reqbyte if there wasn't one, using the vary flag that was in
4804 existence beforehand. */
4805
4806 else if (subfirstbyte >= 0 && subreqbyte < 0)
4807 subreqbyte = subfirstbyte | tempreqvary;
4808
4809 /* If the subpattern set a required byte (or set a first byte that isn't
4810 really the first byte - see above), set it. */
4811
4812 if (subreqbyte >= 0) reqbyte = subreqbyte;
4813 }
4814
4815 /* For a forward assertion, we take the reqbyte, if set. This can be
4816 helpful if the pattern that follows the assertion doesn't set a different
4817 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4818 for an assertion, however because it leads to incorrect effect for patterns
4819 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4820 of a firstbyte. This is overcome by a scan at the end if there's no
4821 firstbyte, looking for an asserted first char. */
4822
4823 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4824 break; /* End of processing '(' */
4825
4826
4827 /* ===================================================================*/
4828 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4829 are arranged to be the negation of the corresponding OP_values. For the
4830 back references, the values are ESC_REF plus the reference number. Only
4831 back references and those types that consume a character may be repeated.
4832 We can test for values between ESC_b and ESC_Z for the latter; this may
4833 have to change if any new ones are ever created. */
4834
4835 case '\\':
4836 tempptr = ptr;
4837 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4838 if (*errorcodeptr != 0) goto FAILED;
4839
4840 if (c < 0)
4841 {
4842 if (-c == ESC_Q) /* Handle start of quoted string */
4843 {
4844 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4845 else inescq = TRUE;
4846 continue;
4847 }
4848
4849 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4850
4851 /* For metasequences that actually match a character, we disable the
4852 setting of a first character if it hasn't already been set. */
4853
4854 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4855 firstbyte = REQ_NONE;
4856
4857 /* Set values to reset to if this is followed by a zero repeat. */
4858
4859 zerofirstbyte = firstbyte;
4860 zeroreqbyte = reqbyte;
4861
4862 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4863 We also support \k{name} (.NET syntax) */
4864
4865 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4866 {
4867 is_recurse = FALSE;
4868 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4869 goto NAMED_REF_OR_RECURSE;
4870 }
4871
4872 /* Back references are handled specially; must disable firstbyte if
4873 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4874 ':' later. */
4875
4876 if (-c >= ESC_REF)
4877 {
4878 recno = -c - ESC_REF;
4879
4880 HANDLE_REFERENCE: /* Come here from named backref handling */
4881 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4882 previous = code;
4883 *code++ = OP_REF;
4884 PUT2INC(code, 0, recno);
4885 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4886 if (recno > cd->top_backref) cd->top_backref = recno;
4887 }
4888
4889 /* So are Unicode property matches, if supported. */
4890
4891 #ifdef SUPPORT_UCP
4892 else if (-c == ESC_P || -c == ESC_p)
4893 {
4894 BOOL negated;
4895 int pdata;
4896 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4897 if (ptype < 0) goto FAILED;
4898 previous = code;
4899 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4900 *code++ = ptype;
4901 *code++ = pdata;
4902 }
4903 #else
4904
4905 /* If Unicode properties are not supported, \X, \P, and \p are not
4906 allowed. */
4907
4908 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4909 {
4910 *errorcodeptr = ERR45;
4911 goto FAILED;
4912 }
4913 #endif
4914
4915 /* For the rest (including \X when Unicode properties are supported), we
4916 can obtain the OP value by negating the escape value. */
4917
4918 else
4919 {
4920 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4921 *code++ = -c;
4922 }
4923 continue;
4924 }
4925
4926 /* We have a data character whose value is in c. In UTF-8 mode it may have
4927 a value > 127. We set its representation in the length/buffer, and then
4928 handle it as a data character. */
4929
4930 #ifdef SUPPORT_UTF8
4931 if (utf8 && c > 127)
4932 mclength = _pcre_ord2utf8(c, mcbuffer);
4933 else
4934 #endif
4935
4936 {
4937 mcbuffer[0] = c;
4938 mclength = 1;
4939 }
4940 goto ONE_CHAR;
4941
4942
4943 /* ===================================================================*/
4944 /* Handle a literal character. It is guaranteed not to be whitespace or #
4945 when the extended flag is set. If we are in UTF-8 mode, it may be a
4946 multi-byte literal character. */
4947
4948 default:
4949 NORMAL_CHAR:
4950 mclength = 1;
4951 mcbuffer[0] = c;
4952
4953 #ifdef SUPPORT_UTF8
4954 if (utf8 && c >= 0xc0)
4955 {
4956 while ((ptr[1] & 0xc0) == 0x80)
4957 mcbuffer[mclength++] = *(++ptr);
4958 }
4959 #endif
4960
4961 /* At this point we have the character's bytes in mcbuffer, and the length
4962 in mclength. When not in UTF-8 mode, the length is always 1. */
4963
4964 ONE_CHAR:
4965 previous = code;
4966 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4967 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4968
4969 /* Set the first and required bytes appropriately. If no previous first
4970 byte, set it from this character, but revert to none on a zero repeat.
4971 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4972 repeat. */
4973
4974 if (firstbyte == REQ_UNSET)
4975 {
4976 zerofirstbyte = REQ_NONE;
4977 zeroreqbyte = reqbyte;
4978
4979 /* If the character is more than one byte long, we can set firstbyte
4980 only if it is not to be matched caselessly. */
4981
4982 if (mclength == 1 || req_caseopt == 0)
4983 {
4984 firstbyte = mcbuffer[0] | req_caseopt;
4985 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4986 }
4987 else firstbyte = reqbyte = REQ_NONE;
4988 }
4989
4990 /* firstbyte was previously set; we can set reqbyte only if the length is
4991 1 or the matching is caseful. */
4992
4993 else
4994 {
4995 zerofirstbyte = firstbyte;
4996 zeroreqbyte = reqbyte;
4997 if (mclength == 1 || req_caseopt == 0)
4998 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4999 }
5000
5001 break; /* End of literal character handling */
5002 }
5003 } /* end of big loop */
5004
5005
5006 /* Control never reaches here by falling through, only by a goto for all the
5007 error states. Pass back the position in the pattern so that it can be displayed
5008 to the user for diagnosing the error. */
5009
5010 FAILED:
5011 *ptrptr = ptr;
5012 return FALSE;
5013 }
5014
5015
5016
5017
5018 /*************************************************
5019 * Compile sequence of alternatives *
5020 *************************************************/
5021
5022 /* On entry, ptr is pointing past the bracket character, but on return it
5023 points to the closing bracket, or vertical bar, or end of string. The code
5024 variable is pointing at the byte into which the BRA operator has been stored.
5025 If the ims options are changed at the start (for a (?ims: group) or during any
5026 branch, we need to insert an OP_OPT item at the start of every following branch
5027 to ensure they get set correctly at run time, and also pass the new options
5028 into every subsequent branch compile.
5029
5030 This function is used during the pre-compile phase when we are trying to find
5031 out the amount of memory needed, as well as during the real compile phase. The
5032 value of lengthptr distinguishes the two phases.
5033
5034 Arguments:
5035 options option bits, including any changes for this subpattern
5036 oldims previous settings of ims option bits
5037 codeptr -> the address of the current code pointer
5038 ptrptr -> the address of the current pattern pointer
5039 errorcodeptr -> pointer to error code variable
5040 lookbehind TRUE if this is a lookbehind assertion
5041 reset_bracount TRUE to reset the count for each branch
5042 skipbytes skip this many bytes at start (for brackets and OP_COND)
5043 firstbyteptr place to put the first required character, or a negative number
5044 reqbyteptr place to put the last required character, or a negative number
5045 bcptr pointer to the chain of currently open branches
5046 cd points to the data block with tables pointers etc.
5047 lengthptr NULL during the real compile phase
5048 points to length accumulator during pre-compile phase
5049
5050 Returns: TRUE on success
5051 */
5052
5053 static BOOL
5054 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5055 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5056 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5057 int *lengthptr)
5058 {
5059 const uschar *ptr = *ptrptr;
5060 uschar *code = *codeptr;
5061 uschar *last_branch = code;
5062 uschar *start_bracket = code;
5063 uschar *reverse_count = NULL;
5064 int firstbyte, reqbyte;
5065 int branchfirstbyte, branchreqbyte;
5066 int length;
5067 int orig_bracount;
5068 int max_bracount;
5069 branch_chain bc;
5070
5071 bc.outer = bcptr;
5072 bc.current = code;
5073
5074 firstbyte = reqbyte = REQ_UNSET;
5075
5076 /* Accumulate the length for use in the pre-compile phase. Start with the
5077 length of the BRA and KET and any extra bytes that are required at the
5078 beginning. We accumulate in a local variable to save frequent testing of
5079 lengthptr for NULL. We cannot do this by looking at the value of code at the
5080 start and end of each alternative, because compiled items are discarded during
5081 the pre-compile phase so that the work space is not exceeded. */
5082
5083 length = 2 + 2*LINK_SIZE + skipbytes;
5084
5085 /* WARNING: If the above line is changed for any reason, you must also change
5086 the code that abstracts option settings at the start of the pattern and makes
5087 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5088 pre-compile phase to find out whether anything has yet been compiled or not. */
5089
5090 /* Offset is set zero to mark that this bracket is still open */
5091
5092 PUT(code, 1, 0);
5093 code += 1 + LINK_SIZE + skipbytes;
5094
5095 /* Loop for each alternative branch */
5096
5097 orig_bracount = max_bracount = cd->bracount;
5098 for (;;)
5099 {
5100 /* For a (?| group, reset the capturing bracket count so that each branch
5101 uses the same numbers. */
5102
5103 if (reset_bracount) cd->bracount = orig_bracount;
5104
5105 /* Handle a change of ims options at the start of the branch */
5106
5107 if ((options & PCRE_IMS) != oldims)
5108 {
5109 *code++ = OP_OPT;
5110 *code++ = options & PCRE_IMS;
5111 length += 2;
5112 }
5113
5114 /* Set up dummy OP_REVERSE if lookbehind assertion */
5115
5116 if (lookbehind)
5117 {
5118 *code++ = OP_REVERSE;
5119 reverse_count = code;
5120 PUTINC(code, 0, 0);
5121 length += 1 + LINK_SIZE;
5122 }
5123
5124 /* Now compile the branch; in the pre-compile phase its length gets added
5125 into the length. */
5126
5127 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5128 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5129 {
5130 *ptrptr = ptr;
5131 return FALSE;
5132 }
5133
5134 /* Keep the highest bracket count in case (?| was used and some branch
5135 has fewer than the rest. */
5136
5137 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5138
5139 /* In the real compile phase, there is some post-processing to be done. */
5140
5141 if (lengthptr == NULL)
5142 {
5143 /* If this is the first branch, the firstbyte and reqbyte values for the
5144 branch become the values for the regex. */
5145
5146 if (*last_branch != OP_ALT)
5147 {
5148 firstbyte = branchfirstbyte;
5149 reqbyte = branchreqbyte;
5150 }
5151
5152 /* If this is not the first branch, the first char and reqbyte have to
5153 match the values from all the previous branches, except that if the
5154 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5155 and we set REQ_VARY for the regex. */
5156
5157 else
5158 {
5159 /* If we previously had a firstbyte, but it doesn't match the new branch,
5160 we have to abandon the firstbyte for the regex, but if there was
5161 previously no reqbyte, it takes on the value of the old firstbyte. */
5162
5163 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5164 {
5165 if (reqbyte < 0) reqbyte = firstbyte;
5166 firstbyte = REQ_NONE;
5167 }
5168
5169 /* If we (now or from before) have no firstbyte, a firstbyte from the
5170 branch becomes a reqbyte if there isn't a branch reqbyte. */
5171
5172 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5173 branchreqbyte = branchfirstbyte;
5174
5175 /* Now ensure that the reqbytes match */
5176
5177 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5178 reqbyte = REQ_NONE;
5179 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5180 }
5181
5182 /* If lookbehind, check that this branch matches a fixed-length string, and
5183 put the length into the OP_REVERSE item. Temporarily mark the end of the
5184 branch with OP_END. */
5185
5186 if (lookbehind)
5187 {
5188 int fixed_length;
5189 *code = OP_END;
5190 fixed_length = find_fixedlength(last_branch, options);
5191 DPRINTF(("fixed length = %d\n", fixed_length));
5192 if (fixed_length < 0)
5193 {
5194 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5195 *ptrptr = ptr;
5196 return FALSE;
5197 }
5198 PUT(reverse_count, 0, fixed_length);
5199 }
5200 }
5201
5202 /* Reached end of expression, either ')' or end of pattern. In the real
5203 compile phase, go back through the alternative branches and reverse the chain
5204 of offsets, with the field in the BRA item now becoming an offset to the
5205 first alternative. If there are no alternatives, it points to the end of the
5206 group. The length in the terminating ket is always the length of the whole
5207 bracketed item. If any of the ims options were changed inside the group,
5208 compile a resetting op-code following, except at the very end of the pattern.
5209 Return leaving the pointer at the terminating char. */
5210
5211 if (*ptr != '|')
5212 {
5213 if (lengthptr == NULL)
5214 {
5215 int branch_length = code - last_branch;
5216 do
5217 {
5218 int prev_length = GET(last_branch, 1);
5219 PUT(last_branch, 1, branch_length);
5220 branch_length = prev_length;
5221 last_branch -= branch_length;
5222 }
5223 while (branch_length > 0);
5224 }
5225
5226 /* Fill in the ket */
5227
5228 *code = OP_KET;
5229 PUT(code, 1, code - start_bracket);
5230 code += 1 + LINK_SIZE;
5231
5232 /* Resetting option if needed */
5233
5234 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5235 {
5236 *code++ = OP_OPT;
5237 *code++ = oldims;
5238 length += 2;
5239 }
5240
5241 /* Retain the highest bracket number, in case resetting was used. */
5242
5243 cd->bracount = max_bracount;
5244
5245 /* Set values to pass back */
5246
5247 *codeptr = code;
5248 *ptrptr = ptr;
5249 *firstbyteptr = firstbyte;
5250 *reqbyteptr = reqbyte;
5251 if (lengthptr != NULL)
5252 {
5253 if (OFLOW_MAX - *lengthptr < length)
5254 {
5255 *errorcodeptr = ERR20;
5256 return FALSE;
5257 }
5258 *lengthptr += length;
5259 }
5260 return TRUE;
5261 }
5262
5263 /* Another branch follows. In the pre-compile phase, we can move the code
5264 pointer back to where it was for the start of the first branch. (That is,
5265 pretend that each branch is the only one.)
5266
5267 In the real compile phase, insert an ALT node. Its length field points back
5268 to the previous branch while the bracket remains open. At the end the chain
5269 is reversed. It's done like this so that the start of the bracket has a
5270 zero offset until it is closed, making it possible to detect recursion. */
5271
5272 if (lengthptr != NULL)
5273 {
5274 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5275 length += 1 + LINK_SIZE;
5276 }
5277 else
5278 {
5279 *code = OP_ALT;
5280 PUT(code, 1, code - last_branch);
5281 bc.current = last_branch = code;
5282 code += 1 + LINK_SIZE;
5283 }
5284
5285 ptr++;
5286 }
5287 /* Control never reaches here */
5288 }
5289
5290
5291
5292
5293 /*************************************************
5294 * Check for anchored expression *
5295 *************************************************/
5296
5297 /* Try to find out if this is an anchored regular expression. Consider each
5298 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5299 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5300 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5301 counts, since OP_CIRC can match in the middle.
5302
5303 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5304 This is the code for \G, which means "match at start of match position, taking
5305 into account the match offset".
5306
5307 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5308 because that will try the rest of the pattern at all possible matching points,
5309 so there is no point trying again.... er ....
5310
5311 .... except when the .* appears inside capturing parentheses, and there is a
5312 subsequent back reference to those parentheses. We haven't enough information
5313 to catch that case precisely.
5314
5315 At first, the best we could do was to detect when .* was in capturing brackets
5316 and the highest back reference was greater than or equal to that level.
5317 However, by keeping a bitmap of the first 31 back references, we can catch some
5318 of the more common cases more precisely.
5319
5320 Arguments:
5321 code points to start of expression (the bracket)
5322 options points to the options setting
5323 bracket_map a bitmap of which brackets we are inside while testing; this
5324 handles up to substring 31; after that we just have to take
5325 the less precise approach
5326 backref_map the back reference bitmap
5327
5328 Returns: TRUE or FALSE
5329 */
5330
5331 static BOOL
5332 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5333 unsigned int backref_map)
5334 {
5335 do {
5336 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5337 options, PCRE_MULTILINE, FALSE);
5338 register int op = *scode;
5339
5340 /* Non-capturing brackets */
5341
5342 if (op == OP_BRA)
5343 {
5344 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5345 }
5346
5347 /* Capturing brackets */
5348
5349 else if (op == OP_CBRA)
5350 {
5351 int n = GET2(scode, 1+LINK_SIZE);
5352 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5353 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5354 }
5355
5356 /* Other brackets */
5357
5358 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5359 {
5360 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5361 }
5362
5363 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5364 are or may be referenced. */
5365
5366 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5367 op == OP_TYPEPOSSTAR) &&
5368 (*options & PCRE_DOTALL) != 0)
5369 {
5370 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5371 }
5372
5373 /* Check for explicit anchoring */
5374
5375 else if (op != OP_SOD && op != OP_SOM &&
5376 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5377 return FALSE;
5378 code += GET(code, 1);
5379 }
5380 while (*code == OP_ALT); /* Loop for each alternative */
5381 return TRUE;
5382 }
5383
5384
5385
5386 /*************************************************
5387 * Check for starting with ^ or .* *
5388 *************************************************/
5389
5390 /* This is called to find out if every branch starts with ^ or .* so that
5391 "first char" processing can be done to speed things up in multiline
5392 matching and for non-DOTALL patterns that start with .* (which must start at
5393 the beginning or after \n). As in the case of is_anchored() (see above), we
5394 have to take account of back references to capturing brackets that contain .*
5395 because in that case we can't make the assumption.
5396
5397 Arguments:
5398 code points to start of expression (the bracket)
5399 bracket_map a bitmap of which brackets we are inside while testing; this
5400 handles up to substring 31; after that we just have to take
5401 the less precise approach
5402 backref_map the back reference bitmap
5403
5404 Returns: TRUE or FALSE
5405 */
5406
5407 static BOOL
5408 is_startline(const uschar *code, unsigned int bracket_map,
5409 unsigned int backref_map)
5410 {
5411 do {
5412 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5413 NULL, 0, FALSE);
5414 register int op = *scode;
5415
5416 /* Non-capturing brackets */
5417
5418 if (op == OP_BRA)
5419 {
5420 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5421 }
5422
5423 /* Capturing brackets */
5424
5425 else if (op == OP_CBRA)
5426 {
5427 int n = GET2(scode, 1+LINK_SIZE);
5428 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5429 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5430 }
5431
5432 /* Other brackets */
5433
5434 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5435 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5436
5437 /* .* means "start at start or after \n" if it isn't in brackets that
5438 may be referenced. */
5439
5440 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5441 {
5442 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5443 }
5444
5445 /* Check for explicit circumflex */
5446
5447 else if (op != OP_CIRC) return FALSE;
5448
5449 /* Move on to the next alternative */
5450
5451 code += GET(code, 1);
5452 }
5453 while (*code == OP_ALT); /* Loop for each alternative */
5454 return TRUE;
5455 }
5456
5457
5458
5459 /*************************************************
5460 * Check for asserted fixed first char *
5461 *************************************************/
5462
5463 /* During compilation, the "first char" settings from forward assertions are
5464 discarded, because they can cause conflicts with actual literals that follow.
5465 However, if we end up without a first char setting for an unanchored pattern,
5466 it is worth scanning the regex to see if there is an initial asserted first
5467 char. If all branches start with the same asserted char, or with a bracket all
5468 of whose alternatives start with the same asserted char (recurse ad lib), then
5469 we return that char, otherwise -1.
5470
5471 Arguments:
5472 code points to start of expression (the bracket)
5473 options pointer to the options (used to check casing changes)
5474 inassert TRUE if in an assertion
5475
5476 Returns: -1 or the fixed first char
5477 */
5478
5479 static int
5480 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5481 {
5482 register int c = -1;
5483 do {
5484 int d;
5485 const uschar *scode =
5486 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5487 register int op = *scode;
5488
5489 switch(op)
5490 {
5491 default:
5492 return -1;
5493
5494 case OP_BRA:
5495 case OP_CBRA:
5496 case OP_ASSERT:
5497 case OP_ONCE:
5498 case OP_COND:
5499 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5500 return -1;
5501 if (c < 0) c = d; else if (c != d) return -1;
5502 break;
5503
5504 case OP_EXACT: /* Fall through */
5505 scode += 2;
5506
5507 case OP_CHAR:
5508 case OP_CHARNC:
5509 case OP_PLUS:
5510 case OP_MINPLUS:
5511 case OP_POSPLUS:
5512 if (!inassert) return -1;
5513 if (c < 0)
5514 {
5515 c = scode[1];
5516 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5517 }
5518 else if (c != scode[1]) return -1;
5519 break;
5520 }
5521
5522 code += GET(code, 1);
5523 }
5524 while (*code == OP_ALT);
5525 return c;
5526 }
5527
5528
5529
5530 /*************************************************
5531 * Compile a Regular Expression *
5532 *************************************************/
5533
5534 /* This function takes a string and returns a pointer to a block of store
5535 holding a compiled version of the expression. The original API for this
5536 function had no error code return variable; it is retained for backwards
5537 compatibility. The new function is given a new name.
5538
5539 Arguments:
5540 pattern the regular expression
5541 options various option bits
5542 errorcodeptr pointer to error code variable (pcre_compile2() only)
5543 can be NULL if you don't want a code value
5544 errorptr pointer to pointer to error text
5545 erroroffset ptr offset in pattern where error was detected
5546 tables pointer to character tables or NULL
5547
5548 Returns: pointer to compiled data block, or NULL on error,
5549 with errorptr and erroroffset set
5550 */
5551
5552 PCRE_EXP_DEFN pcre *
5553 pcre_compile(const char *pattern, int options, const char **errorptr,
5554 int *erroroffset, const unsigned char *tables)
5555 {
5556 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5557 }
5558
5559
5560 PCRE_EXP_DEFN pcre *
5561 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5562 const char **errorptr, int *erroroffset, const unsigned char *tables)
5563 {
5564 real_pcre *re;
5565 int length = 1; /* For final END opcode */
5566 int firstbyte, reqbyte, newline;
5567 int errorcode = 0;
5568 #ifdef SUPPORT_UTF8
5569 BOOL utf8;
5570 #endif
5571 size_t size;
5572 uschar *code;
5573 const uschar *codestart;
5574 const uschar *ptr;
5575 compile_data compile_block;
5576 compile_data *cd = &compile_block;
5577
5578 /* This space is used for "compiling" into during the first phase, when we are
5579 computing the amount of memory that is needed. Compiled items are thrown away
5580 as soon as possible, so that a fairly large buffer should be sufficient for
5581 this purpose. The same space is used in the second phase for remembering where
5582 to fill in forward references to subpatterns. */
5583
5584 uschar cworkspace[COMPILE_WORK_SIZE];
5585
5586
5587 /* Set this early so that early errors get offset 0. */
5588
5589 ptr = (const uschar *)pattern;
5590
5591 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5592 can do is just return NULL, but we can set a code value if there is a code
5593 pointer. */
5594
5595 if (errorptr == NULL)
5596 {
5597 if (errorcodeptr != NULL) *errorcodeptr = 99;
5598 return NULL;
5599 }
5600
5601 *errorptr = NULL;
5602 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5603
5604 /* However, we can give a message for this error */
5605
5606 if (erroroffset == NULL)
5607 {
5608 errorcode = ERR16;
5609 goto PCRE_EARLY_ERROR_RETURN2;
5610 }
5611
5612 *erroroffset = 0;
5613
5614 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5615
5616 #ifdef SUPPORT_UTF8
5617 utf8 = (options & PCRE_UTF8) != 0;
5618 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5619 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5620 {
5621 errorcode = ERR44;
5622 goto PCRE_EARLY_ERROR_RETURN2;
5623 }
5624 #else
5625 if ((options & PCRE_UTF8) != 0)
5626 {
5627 errorcode = ERR32;
5628 goto PCRE_EARLY_ERROR_RETURN;
5629 }
5630 #endif
5631
5632 if ((options & ~PUBLIC_OPTIONS) != 0)
5633 {
5634 errorcode = ERR17;
5635 goto PCRE_EARLY_ERROR_RETURN;
5636 }
5637
5638 /* Set up pointers to the individual character tables */
5639
5640 if (tables == NULL) tables = _pcre_default_tables;
5641 cd->lcc = tables + lcc_offset;
5642 cd->fcc = tables + fcc_offset;
5643 cd->cbits = tables + cbits_offset;
5644 cd->ctypes = tables + ctypes_offset;
5645
5646 /* Handle different types of newline. The three bits give seven cases. The
5647 current code allows for fixed one- or two-byte sequences, plus "any" and
5648 "anycrlf". */
5649
5650 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5651 {
5652 case 0: newline = NEWLINE; break; /* Compile-time default */
5653 case PCRE_NEWLINE_CR: newline = '\r'; break;
5654 case PCRE_NEWLINE_LF: newline = '\n'; break;
5655 case PCRE_NEWLINE_CR+
5656 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5657 case PCRE_NEWLINE_ANY: newline = -1; break;
5658 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5659 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5660 }
5661
5662 if (newline == -2)
5663 {
5664 cd->nltype = NLTYPE_ANYCRLF;
5665 }
5666 else if (newline < 0)
5667 {
5668 cd->nltype = NLTYPE_ANY;
5669 }
5670 else
5671 {
5672 cd->nltype = NLTYPE_FIXED;
5673 if (newline > 255)
5674 {
5675 cd->nllen = 2;
5676 cd->nl[0] = (newline >> 8) & 255;
5677 cd->nl[1] = newline & 255;
5678 }
5679 else
5680 {
5681 cd->nllen = 1;
5682 cd->nl[0] = newline;
5683 }
5684 }
5685
5686 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5687 references to help in deciding whether (.*) can be treated as anchored or not.
5688 */
5689
5690 cd->top_backref = 0;
5691 cd->backref_map = 0;
5692
5693 /* Reflect pattern for debugging output */
5694
5695 DPRINTF(("------------------------------------------------------------------\n"));
5696 DPRINTF(("%s\n", pattern));
5697
5698 /* Pretend to compile the pattern while actually just accumulating the length
5699 of memory required. This behaviour is triggered by passing a non-NULL final
5700 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5701 to compile parts of the pattern into; the compiled code is discarded when it is
5702 no longer needed, so hopefully this workspace will never overflow, though there
5703 is a test for its doing so. */
5704
5705 cd->bracount = 0;
5706 cd->names_found = 0;
5707 cd->name_entry_size = 0;
5708 cd->name_table = NULL;
5709 cd->start_workspace = cworkspace;
5710 cd->start_code = cworkspace;
5711 cd->hwm = cworkspace;
5712 cd->start_pattern = (const uschar *)pattern;
5713 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5714 cd->req_varyopt = 0;
5715 cd->nopartial = FALSE;
5716 cd->external_options = options;
5717
5718 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5719 don't need to look at the result of the function here. The initial options have
5720 been put into the cd block so that they can be changed if an option setting is
5721 found within the regex right at the beginning. Bringing initial option settings
5722 outside can help speed up starting point checks. */
5723
5724 code = cworkspace;
5725 *code = OP_BRA;
5726 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5727 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5728 &length);
5729 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5730
5731 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5732 cd->hwm - cworkspace));
5733
5734 if (length > MAX_PATTERN_SIZE)
5735 {
5736 errorcode = ERR20;
5737 goto PCRE_EARLY_ERROR_RETURN;
5738 }
5739
5740 /* Compute the size of data block needed and get it, either from malloc or
5741 externally provided function. Integer overflow should no longer be possible
5742 because nowadays we limit the maximum value of cd->names_found and
5743 cd->name_entry_size. */
5744
5745 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5746 re = (real_pcre *)(pcre_malloc)(size);
5747
5748 if (re == NULL)
5749 {
5750 errorcode = ERR21;
5751 goto PCRE_EARLY_ERROR_RETURN;
5752 }
5753
5754 /* Put in the magic number, and save the sizes, initial options, and character
5755 table pointer. NULL is used for the default character tables. The nullpad field
5756 is at the end; it's there to help in the case when a regex compiled on a system
5757 with 4-byte pointers is run on another with 8-byte pointers. */
5758
5759 re->magic_number = MAGIC_NUMBER;
5760 re->size = size;
5761 re->options = cd->external_options;
5762 re->dummy1 = 0;
5763 re->first_byte = 0;
5764 re->req_byte = 0;
5765 re->name_table_offset = sizeof(real_pcre);
5766 re->name_entry_size = cd->name_entry_size;
5767 re->name_count = cd->names_found;
5768 re->ref_count = 0;
5769 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5770 re->nullpad = NULL;
5771
5772 /* The starting points of the name/number translation table and of the code are
5773 passed around in the compile data block. The start/end pattern and initial
5774 options are already set from the pre-compile phase, as is the name_entry_size
5775 field. Reset the bracket count and the names_found field. Also reset the hwm
5776 field; this time it's used for remembering forward references to subpatterns.
5777 */
5778
5779 cd->bracount = 0;
5780 cd->names_found = 0;
5781 cd->name_table = (uschar *)re + re->name_table_offset;
5782 codestart = cd->name_table + re->name_entry_size * re->name_count;
5783 cd->start_code = codestart;
5784 cd->hwm = cworkspace;
5785 cd->req_varyopt = 0;
5786 cd->nopartial = FALSE;
5787 cd->had_accept = FALSE;
5788
5789 /* Set up a starting, non-extracting bracket, then compile the expression. On
5790 error, errorcode will be set non-zero, so we don't need to look at the result
5791 of the function here. */
5792
5793 ptr = (const uschar *)pattern;
5794 code = (uschar *)codestart;
5795 *code = OP_BRA;
5796 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5797 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5798 re->top_bracket = cd->bracount;
5799 re->top_backref = cd->top_backref;
5800
5801 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5802 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
5803
5804 /* If not reached end of pattern on success, there's an excess bracket. */
5805
5806 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5807
5808 /* Fill in the terminating state and check for disastrous overflow, but
5809 if debugging, leave the test till after things are printed out. */
5810
5811 *code++ = OP_END;
5812
5813 #ifndef DEBUG
5814 if (code - codestart > length) errorcode = ERR23;
5815 #endif
5816
5817 /* Fill in any forward references that are required. */
5818
5819 while (errorcode == 0 && cd->hwm > cworkspace)
5820 {
5821 int offset, recno;
5822 const uschar *groupptr;
5823 cd->hwm -= LINK_SIZE;
5824 offset = GET(cd->hwm, 0);
5825 recno = GET(codestart, offset);
5826 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5827 if (groupptr == NULL) errorcode = ERR53;
5828 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5829 }
5830
5831 /* Give an error if there's back reference to a non-existent capturing
5832 subpattern. */
5833
5834 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5835
5836 /* Failed to compile, or error while post-processing */
5837
5838 if (errorcode != 0)
5839 {
5840 (pcre_free)(re);
5841 PCRE_EARLY_ERROR_RETURN:
5842 *erroroffset = ptr - (const uschar *)pattern;
5843 PCRE_EARLY_ERROR_RETURN2:
5844 *errorptr = error_texts[errorcode];
5845 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5846 return NULL;
5847 }
5848
5849 /* If the anchored option was not passed, set the flag if we can determine that
5850 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5851 as starting with .* when DOTALL is set).
5852
5853 Otherwise, if we know what the first byte has to be, save it, because that
5854 speeds up unanchored matches no end. If not, see if we can set the
5855 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5856 start with ^. and also when all branches start with .* for non-DOTALL matches.
5857 */
5858
5859 if ((re->options & PCRE_ANCHORED) == 0)
5860 {
5861 int temp_options = re->options; /* May get changed during these scans */
5862 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5863 re->options |= PCRE_ANCHORED;
5864 else
5865 {
5866 if (firstbyte < 0)
5867 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5868 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5869 {
5870 int ch = firstbyte & 255;
5871 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5872 cd->fcc[ch] == ch)? ch : firstbyte;
5873 re->options |= PCRE_FIRSTSET;
5874 }
5875 else if (is_startline(codestart, 0, cd->backref_map))
5876 re->options |= PCRE_STARTLINE;
5877 }
5878 }
5879
5880 /* For an anchored pattern, we use the "required byte" only if it follows a
5881 variable length item in the regex. Remove the caseless flag for non-caseable
5882 bytes. */
5883
5884 if (reqbyte >= 0 &&
5885 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5886 {
5887 int ch = reqbyte & 255;
5888 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5889 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5890 re->options |= PCRE_REQCHSET;
5891 }
5892
5893 /* Print out the compiled data if debugging is enabled. This is never the
5894 case when building a production library. */
5895
5896 #ifdef DEBUG
5897
5898 printf("Length = %d top_bracket = %d top_backref = %d\n",
5899 length, re->top_bracket, re->top_backref);
5900
5901 if (re->options != 0)
5902 {
5903 printf("%s%s%s%s%s%s%s%s%s\n",
5904 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5905 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5906 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5907 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5908 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5909 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5910 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5911 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5912 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5913 }
5914
5915 if ((re->options & PCRE_FIRSTSET) != 0)
5916 {
5917 int ch = re->first_byte & 255;
5918 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5919 "" : " (caseless)";
5920 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5921 else printf("First char = \\x%02x%s\n", ch, caseless);
5922 }
5923
5924 if ((re->options & PCRE_REQCHSET) != 0)
5925 {
5926 int ch = re->req_byte & 255;
5927 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5928 "" : " (caseless)";
5929 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5930 else printf("Req char = \\x%02x%s\n", ch, caseless);
5931 }
5932
5933 pcre_printint(re, stdout, TRUE);
5934
5935 /* This check is done here in the debugging case so that the code that
5936 was compiled can be seen. */
5937
5938 if (code - codestart > length)
5939 {
5940 (pcre_free)(re);
5941 *errorptr = error_texts[ERR23];
5942 *erroroffset = ptr - (uschar *)pattern;
5943 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5944 return NULL;
5945 }
5946 #endif /* DEBUG */
5947
5948 return (pcre *)re;
5949 }
5950
5951 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12