/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 223 - (show annotations) (download)
Mon Aug 20 11:07:53 2007 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 189555 byte(s)
Fix loop for classes containing \p or \P and just one ascii character.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the number of the bit to set. Both arguments and the
whole expansion are parenthesized so that an expression argument, for example
SETBIT(map, c + 1), indexes and shifts by the intended value. Note that "b" is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE) */
144
145 typedef struct verbitem {
146 const char *name;
147 int len;
148 int op;
149 } verbitem;
150
151 static verbitem verbs[] = {
152 { "ACCEPT", 6, OP_ACCEPT },
153 { "COMMIT", 6, OP_COMMIT },
154 { "F", 1, OP_FAIL },
155 { "FAIL", 4, OP_FAIL },
156 { "PRUNE", 5, OP_PRUNE },
157 { "SKIP", 4, OP_SKIP },
158 { "THEN", 4, OP_THEN }
159 };
160
161 static int verbcount = sizeof(verbs)/sizeof(verbitem);
162
163
164 /* Tables of names of POSIX character classes and their lengths. The list is
165 terminated by a zero length entry. The first three must be alpha, lower, upper,
166 as this is assumed for handling case independence. */
167
168 static const char *const posix_names[] = {
169 "alpha", "lower", "upper",
170 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
171 "print", "punct", "space", "word", "xdigit" };
172
173 static const uschar posix_name_lengths[] = {
174 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175
176 /* Table of class bit maps for each POSIX class. Each class is formed from a
177 base map, with an optional addition or removal of another map. Then, for some
178 classes, there is some additional tweaking: for [:blank:] the vertical space
179 characters are removed, and for [:alpha:] and [:alnum:] the underscore
180 character is removed. The triples in the table consist of the base map offset,
181 second map offset or -1 if no second map, and a non-negative value for map
182 addition or a negative value for map subtraction (if there are two maps). The
183 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184 remove vertical space characters, 2 => remove underscore. */
185
186 static const int posix_class_maps[] = {
187 cbit_word, cbit_digit, -2, /* alpha */
188 cbit_lower, -1, 0, /* lower */
189 cbit_upper, -1, 0, /* upper */
190 cbit_word, -1, 2, /* alnum - word without underscore */
191 cbit_print, cbit_cntrl, 0, /* ascii */
192 cbit_space, -1, 1, /* blank - a GNU extension */
193 cbit_cntrl, -1, 0, /* cntrl */
194 cbit_digit, -1, 0, /* digit */
195 cbit_graph, -1, 0, /* graph */
196 cbit_print, -1, 0, /* print */
197 cbit_punct, -1, 0, /* punct */
198 cbit_space, -1, 0, /* space */
199 cbit_word, -1, 0, /* word - a Perl extension */
200 cbit_xdigit,-1, 0 /* xdigit */
201 };
202
203
/* Two-level stringification: XSTRING(s) first expands the macro s and then
turns the result into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages, indexed by error number. The
strings are "char *" because they are passed to the outside world. The table of
pointers itself is never meant to be modified, so it is declared const as well.
Do not ever re-use any error number, because they are documented. Always add a
new error instead. Messages marked DEAD below are no longer used. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  "(*VERB) with an argument is not supported",
  /* 60 */
  "(*VERB) not recognized",
  "number is too big"
};
288
289
/* Private table for identifying decimal and hexadecimal digits while a pattern
is being compiled. The tables in chartables depend on the locale and may mark
arbitrary characters as digits, whereas the compiling code expects to handle
only 0-9, a-z, and A-Z as digits; hence this separate table. It costs 256
bytes, but it is a lot faster than doing character value tests (at least in
some simple cases that were timed), and some applications want PCRE to compile
efficiently as well as match efficiently.

The bit definitions are the same as those used in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

so that ctype_digit and ctype_xdigit can be used in the code. In the tables
below, the comment at the start of each row is the hexadecimal character code
of the first entry in that row. */

#ifndef EBCDIC  /* The "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 08 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* '0' to '7' */
  /* 38 */ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /* '8' and '9' */
  /* 40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 'A' to 'F' */
  /* 48 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 58 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 'a' to 'f' */
  /* 68 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 88 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 98 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };

#else           /* The "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 08 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 38 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 48 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 58 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 68 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 'a' to 'f' */
  /* 88 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 98 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 'A' to 'F' */
  /* C8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* '0' to '7' */
  /* F8 */ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00    /* '8' and '9' */
  };

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  /* 00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 08 */ 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 38 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 48 */ 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 58 */ 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 68 */ 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
  /* 88 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 98 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
  /* A8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B8 */ 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
  /* C8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  /* D8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
  /* E8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,
  /* F8 */ 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
  };
#endif
412
413
414 /* Definition to allow mutual recursion */
415
416 static BOOL
417 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 int *, int *, branch_chain *, compile_data *, int *);
419
420
421
422 /*************************************************
423 * Handle escapes *
424 *************************************************/
425
426 /* This function is called when a \ has been encountered. It either returns a
427 positive value for a simple escape such as \n, or a negative value which
428 encodes one of the more complicated things such as \d. A backreference to group
429 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431 ptr is pointing at the \. On exit, it is on the final character of the escape
432 sequence.
433
434 Arguments:
435 ptrptr points to the pattern position pointer
436 errorcodeptr points to the errorcode variable
437 bracount number of previous extracting brackets
438 options the options bits
439 isclass TRUE if inside a character class
440
441 Returns: zero or positive => a data character
442 negative => a special escape sequence
443 on error, errorcodeptr is set
444 */
445
446 static int
447 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448 int options, BOOL isclass)
449 {
450 BOOL utf8 = (options & PCRE_UTF8) != 0;
451 const uschar *ptr = *ptrptr + 1;
452 int c, i;
453
454 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455 ptr--; /* Set pointer back to the last byte */
456
457 /* If backslash is at the end of the pattern, it's an error. */
458
459 if (c == 0) *errorcodeptr = ERR1;
460
461 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462 a table. A non-zero result is something that can be returned immediately.
463 Otherwise further processing may be required. */
464
465 #ifndef EBCDIC /* ASCII coding */
466 else if (c < '0' || c > 'z') {} /* Not alphameric */
467 else if ((i = escapes[c - '0']) != 0) c = i;
468
469 #else /* EBCDIC coding */
470 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471 else if ((i = escapes[c - 0x48]) != 0) c = i;
472 #endif
473
474 /* Escapes that need further processing, or are illegal. */
475
476 else
477 {
478 const uschar *oldptr;
479 BOOL braced, negated;
480
481 switch (c)
482 {
483 /* A number of Perl escapes are not handled by PCRE. We give an explicit
484 error. */
485
486 case 'l':
487 case 'L':
488 case 'N':
489 case 'u':
490 case 'U':
491 *errorcodeptr = ERR37;
492 break;
493
494 /* \g must be followed by a number, either plain or braced. If positive, it
495 is an absolute backreference. If negative, it is a relative backreference.
496 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497 reference to a named group. This is part of Perl's movement towards a
498 unified syntax for back references. As this is synonymous with \k{name}, we
499 fudge it up by pretending it really was \k. */
500
501 case 'g':
502 if (ptr[1] == '{')
503 {
504 const uschar *p;
505 for (p = ptr+2; *p != 0 && *p != '}'; p++)
506 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 if (*p != 0 && *p != '}')
508 {
509 c = -ESC_k;
510 break;
511 }
512 braced = TRUE;
513 ptr++;
514 }
515 else braced = FALSE;
516
517 if (ptr[1] == '-')
518 {
519 negated = TRUE;
520 ptr++;
521 }
522 else negated = FALSE;
523
524 c = 0;
525 while ((digitab[ptr[1]] & ctype_digit) != 0)
526 c = c * 10 + *(++ptr) - '0';
527
528 if (c < 0)
529 {
530 *errorcodeptr = ERR61;
531 break;
532 }
533
534 if (c == 0 || (braced && *(++ptr) != '}'))
535 {
536 *errorcodeptr = ERR57;
537 break;
538 }
539
540 if (negated)
541 {
542 if (c > bracount)
543 {
544 *errorcodeptr = ERR15;
545 break;
546 }
547 c = bracount - (c - 1);
548 }
549
550 c = -(ESC_REF + c);
551 break;
552
553 /* The handling of escape sequences consisting of a string of digits
554 starting with one that is not zero is not straightforward. By experiment,
555 the way Perl works seems to be as follows:
556
557 Outside a character class, the digits are read as a decimal number. If the
558 number is less than 10, or if there are that many previous extracting
559 left brackets, then it is a back reference. Otherwise, up to three octal
560 digits are read to form an escaped byte. Thus \123 is likely to be octal
561 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562 value is greater than 377, the least significant 8 bits are taken. Inside a
563 character class, \ followed by a digit is always an octal number. */
564
565 case '1': case '2': case '3': case '4': case '5':
566 case '6': case '7': case '8': case '9':
567
568 if (!isclass)
569 {
570 oldptr = ptr;
571 c -= '0';
572 while ((digitab[ptr[1]] & ctype_digit) != 0)
573 c = c * 10 + *(++ptr) - '0';
574 if (c < 0)
575 {
576 *errorcodeptr = ERR61;
577 break;
578 }
579 if (c < 10 || c <= bracount)
580 {
581 c = -(ESC_REF + c);
582 break;
583 }
584 ptr = oldptr; /* Put the pointer back and fall through */
585 }
586
587 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588 generates a binary zero byte and treats the digit as a following literal.
589 Thus we have to pull back the pointer by one. */
590
591 if ((c = *ptr) >= '8')
592 {
593 ptr--;
594 c = 0;
595 break;
596 }
597
598 /* \0 always starts an octal number, but we may drop through to here with a
599 larger first octal digit. The original code used just to take the least
600 significant 8 bits of octal numbers (I think this is what early Perls used
601 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602 than 3 octal digits. */
603
604 case '0':
605 c -= '0';
606 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607 c = c * 8 + *(++ptr) - '0';
608 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 break;
610
611 /* \x is complicated. \x{ddd} is a character number which can be greater
612 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613 treated as a data character. */
614
615 case 'x':
616 if (ptr[1] == '{')
617 {
618 const uschar *pt = ptr + 2;
619 int count = 0;
620
621 c = 0;
622 while ((digitab[*pt] & ctype_xdigit) != 0)
623 {
624 register int cc = *pt++;
625 if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 count++;
627
628 #ifndef EBCDIC /* ASCII coding */
629 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 #else /* EBCDIC coding */
632 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 #endif
635 }
636
637 if (*pt == '}')
638 {
639 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 ptr = pt;
641 break;
642 }
643
644 /* If the sequence of hex digits does not end with '}', then we don't
645 recognize this construct; fall through to the normal \x handling. */
646 }
647
648 /* Read just a single-byte hex-defined char */
649
650 c = 0;
651 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652 {
653 int cc; /* Some compilers don't like ++ */
654 cc = *(++ptr); /* in initializers */
655 #ifndef EBCDIC /* ASCII coding */
656 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 #else /* EBCDIC coding */
659 if (cc <= 'z') cc += 64; /* Convert to upper case */
660 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661 #endif
662 }
663 break;
664
665 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666 This coding is ASCII-specific, but then the whole concept of \cx is
667 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668
669 case 'c':
670 c = *(++ptr);
671 if (c == 0)
672 {
673 *errorcodeptr = ERR2;
674 break;
675 }
676
677 #ifndef EBCDIC /* ASCII coding */
678 if (c >= 'a' && c <= 'z') c -= 32;
679 c ^= 0x40;
680 #else /* EBCDIC coding */
681 if (c >= 'a' && c <= 'z') c += 64;
682 c ^= 0xC0;
683 #endif
684 break;
685
686 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688 for Perl compatibility, it is a literal. This code looks a bit odd, but
689 there used to be some cases other than the default, and there may be again
690 in future, so I haven't "optimized" it. */
691
692 default:
693 if ((options & PCRE_EXTRA) != 0) switch(c)
694 {
695 default:
696 *errorcodeptr = ERR3;
697 break;
698 }
699 break;
700 }
701 }
702
703 *ptrptr = ptr;
704 return c;
705 }
706
707
708
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function deals with the property name that follows \P or \p. It is
compiled only when PCRE is built with support for Unicode properties. On entry,
the pattern pointer is at the P or p; on exit it is at the last character of
the escape sequence. The name is either a single character, or a string in
curly brackets which may start with ^ to indicate negation. The name is looked
up in the _pcre_utt table.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching table entry, or -1 when the
                   sequence is malformed (ERR46) or the name is not in the
                   table (ERR47)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *p = *ptrptr;
char propname[32];
int ch, len, lo, hi;

ch = *(++p);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The simple case: the name is just the one following character */

if (ch != '{')
  {
  propname[0] = ch;
  propname[1] = 0;
  }

/* Otherwise the name is in {}, optionally preceded by ^ for negation. Copy it
into the buffer; running out of pattern or out of buffer space before the
closing } has been seen is an error. */

else
  {
  if (p[1] == '^')
    {
    *negptr = TRUE;
    p++;
    }

  len = 0;
  while (len < (int)sizeof(propname) - 1)
    {
    ch = *(++p);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == '}') break;
    propname[len++] = ch;
    }

  if (ch != '}') goto ERROR_RETURN;
  propname[len] = 0;
  }

*ptrptr = p;

/* Binary chop search of the table of property names */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(propname, _pcre_utt[mid].name);

  if (cmp == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }

  if (cmp > 0) lo = mid + 1;
    else hi = mid;
  }

/* The name is not in the table */

*errorcodeptr = ERR47;
*ptrptr = p;
return -1;

/* The sequence itself is malformed */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = p;
return -1;
}
#endif
798
799
800
801
802 /*************************************************
803 * Check for counted repeat *
804 *************************************************/
805
806 /* This function is called when a '{' is encountered in a place where it might
807 start a quantifier. It looks ahead to see if it really is a quantifier or not.
808 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809 where the ddds are digits.
810
811 Arguments:
812 p pointer to the first char after '{'
813
814 Returns: TRUE or FALSE
815 */
816
817 static BOOL
818 is_counted_repeat(const uschar *p)
819 {
820 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821 while ((digitab[*p] & ctype_digit) != 0) p++;
822 if (*p == '}') return TRUE;
823
824 if (*p++ != ',') return FALSE;
825 if (*p == '}') return TRUE;
826
827 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828 while ((digitab[*p] & ctype_digit) != 0) p++;
829
830 return (*p == '}');
831 }
832
833
834
835 /*************************************************
836 * Read repeat counts *
837 *************************************************/
838
839 /* Read an item of the form {n,m} and return the values. This is called only
840 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841 so the syntax is guaranteed to be correct, but we need to check the values.
842
843 Arguments:
844 p pointer to first char after '{'
845 minp pointer to int for min
846 maxp pointer to int for max
847 returned as -1 if no max
848 errorcodeptr points to error code variable
849
850 Returns: pointer to '}' on success;
851 current ptr on error, with errorcodeptr set non-zero
852 */
853
854 static const uschar *
855 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856 {
857 int min = 0;
858 int max = -1;
859
860 /* Read the minimum value and do a paranoid check: a negative value indicates
861 an integer overflow. */
862
863 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 if (min < 0 || min > 65535)
865 {
866 *errorcodeptr = ERR5;
867 return p;
868 }
869
870 /* Read the maximum value if there is one, and again do a paranoid on its size.
871 Also, max must not be less than min. */
872
873 if (*p == '}') max = min; else
874 {
875 if (*(++p) != '}')
876 {
877 max = 0;
878 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 if (max < 0 || max > 65535)
880 {
881 *errorcodeptr = ERR5;
882 return p;
883 }
884 if (max < min)
885 {
886 *errorcodeptr = ERR4;
887 return p;
888 }
889 }
890 }
891
892 /* Fill in the required variables, and pass back the pointer to the terminating
893 '}'. */
894
895 *minp = min;
896 *maxp = max;
897 return p;
898 }
899
900
901
902 /*************************************************
903 * Find forward referenced subpattern *
904 *************************************************/
905
906 /* This function scans along a pattern's text looking for capturing
907 subpatterns, and counting them. If it finds a named pattern that matches the
908 name it is given, it returns its number. Alternatively, if the name is NULL, it
909 returns when it reaches a given numbered subpattern. This is used for forward
910 references to subpatterns. We know that if (?P< is encountered, the name will
911 be terminated by '>' because that is checked in the first pass.
912
913 Arguments:
914 ptr current position in the pattern
915 count current count of capturing parens so far encountered
916 name name to seek, or NULL if seeking a numbered subpattern
917 lorn name length, or subpattern number if name is NULL
918 xmode TRUE if we are in /x mode
919
920 Returns: the number of the named subpattern, or -1 if not found
921 */
922
923 static int
924 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925 BOOL xmode)
926 {
927 const uschar *thisname;
928
929 for (; *ptr != 0; ptr++)
930 {
931 int term;
932
933 /* Skip over backslashed characters and also entire \Q...\E */
934
935 if (*ptr == '\\')
936 {
937 if (*(++ptr) == 0) return -1;
938 if (*ptr == 'Q') for (;;)
939 {
940 while (*(++ptr) != 0 && *ptr != '\\');
941 if (*ptr == 0) return -1;
942 if (*(++ptr) == 'E') break;
943 }
944 continue;
945 }
946
947 /* Skip over character classes */
948
949 if (*ptr == '[')
950 {
951 while (*(++ptr) != ']')
952 {
953 if (*ptr == 0) return -1;
954 if (*ptr == '\\')
955 {
956 if (*(++ptr) == 0) return -1;
957 if (*ptr == 'Q') for (;;)
958 {
959 while (*(++ptr) != 0 && *ptr != '\\');
960 if (*ptr == 0) return -1;
961 if (*(++ptr) == 'E') break;
962 }
963 continue;
964 }
965 }
966 continue;
967 }
968
969 /* Skip comments in /x mode */
970
971 if (xmode && *ptr == '#')
972 {
973 while (*(++ptr) != 0 && *ptr != '\n');
974 if (*ptr == 0) return -1;
975 continue;
976 }
977
978 /* An opening parens must now be a real metacharacter */
979
980 if (*ptr != '(') continue;
981 if (ptr[1] != '?' && ptr[1] != '*')
982 {
983 count++;
984 if (name == NULL && count == lorn) return count;
985 continue;
986 }
987
988 ptr += 2;
989 if (*ptr == 'P') ptr++; /* Allow optional P */
990
991 /* We have to disambiguate (?<! and (?<= from (?<name> */
992
993 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994 *ptr != '\'')
995 continue;
996
997 count++;
998
999 if (name == NULL && count == lorn) return count;
1000 term = *ptr++;
1001 if (term == '<') term = '>';
1002 thisname = ptr;
1003 while (*ptr != term) ptr++;
1004 if (name != NULL && lorn == ptr - thisname &&
1005 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006 return count;
1007 }
1008
1009 return -1;
1010 }
1011
1012
1013
1014 /*************************************************
1015 * Find first significant op code *
1016 *************************************************/
1017
1018 /* This is called by several functions that scan a compiled expression looking
1019 for a fixed first character, or an anchoring op code etc. It skips over things
1020 that do not influence this. For some calls, a change of option is important.
1021 For some calls, it makes sense to skip negative forward and all backward
1022 assertions, and also the \b assertion; for others it does not.
1023
1024 Arguments:
1025 code pointer to the start of the group
1026 options pointer to external options
1027 optbit the option bit whose changing is significant, or
1028 zero if none are
1029 skipassert TRUE if certain assertions are to be skipped
1030
1031 Returns: pointer to the first significant opcode
1032 */
1033
1034 static const uschar*
1035 first_significant_code(const uschar *code, int *options, int optbit,
1036 BOOL skipassert)
1037 {
1038 for (;;)
1039 {
1040 switch ((int)*code)
1041 {
1042 case OP_OPT:
1043 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1044 *options = (int)code[1];
1045 code += 2;
1046 break;
1047
1048 case OP_ASSERT_NOT:
1049 case OP_ASSERTBACK:
1050 case OP_ASSERTBACK_NOT:
1051 if (!skipassert) return code;
1052 do code += GET(code, 1); while (*code == OP_ALT);
1053 code += _pcre_OP_lengths[*code];
1054 break;
1055
1056 case OP_WORD_BOUNDARY:
1057 case OP_NOT_WORD_BOUNDARY:
1058 if (!skipassert) return code;
1059 /* Fall through */
1060
1061 case OP_CALLOUT:
1062 case OP_CREF:
1063 case OP_RREF:
1064 case OP_DEF:
1065 code += _pcre_OP_lengths[*code];
1066 break;
1067
1068 default:
1069 return code;
1070 }
1071 }
1072 /* Control never reaches here */
1073 }
1074
1075
1076
1077
1078 /*************************************************
1079 * Find the fixed length of a pattern *
1080 *************************************************/
1081
1082 /* Scan a pattern and compute the fixed length of subject that will match it,
1083 if the length is fixed. This is needed for dealing with backward assertions.
1084 In UTF8 mode, the result is in characters rather than bytes.
1085
1086 Arguments:
1087 code points to the start of the pattern (the bracket)
1088 options the compiling options
1089
1090 Returns: the fixed length, or -1 if there is no fixed length,
1091 or -2 if \C was encountered
1092 */
1093
1094 static int
1095 find_fixedlength(uschar *code, int options)
1096 {
1097 int length = -1;
1098
1099 register int branchlength = 0;
1100 register uschar *cc = code + 1 + LINK_SIZE;
1101
1102 /* Scan along the opcodes for this branch. If we get to the end of the
1103 branch, check the length against that of the other branches. */
1104
1105 for (;;)
1106 {
1107 int d;
1108 register int op = *cc;
1109 switch (op)
1110 {
1111 case OP_CBRA:
1112 case OP_BRA:
1113 case OP_ONCE:
1114 case OP_COND:
1115 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1116 if (d < 0) return d;
1117 branchlength += d;
1118 do cc += GET(cc, 1); while (*cc == OP_ALT);
1119 cc += 1 + LINK_SIZE;
1120 break;
1121
1122 /* Reached end of a branch; if it's a ket it is the end of a nested
1123 call. If it's ALT it is an alternation in a nested call. If it is
1124 END it's the end of the outer call. All can be handled by the same code. */
1125
1126 case OP_ALT:
1127 case OP_KET:
1128 case OP_KETRMAX:
1129 case OP_KETRMIN:
1130 case OP_END:
1131 if (length < 0) length = branchlength;
1132 else if (length != branchlength) return -1;
1133 if (*cc != OP_ALT) return length;
1134 cc += 1 + LINK_SIZE;
1135 branchlength = 0;
1136 break;
1137
1138 /* Skip over assertive subpatterns */
1139
1140 case OP_ASSERT:
1141 case OP_ASSERT_NOT:
1142 case OP_ASSERTBACK:
1143 case OP_ASSERTBACK_NOT:
1144 do cc += GET(cc, 1); while (*cc == OP_ALT);
1145 /* Fall through */
1146
1147 /* Skip over things that don't match chars */
1148
1149 case OP_REVERSE:
1150 case OP_CREF:
1151 case OP_RREF:
1152 case OP_DEF:
1153 case OP_OPT:
1154 case OP_CALLOUT:
1155 case OP_SOD:
1156 case OP_SOM:
1157 case OP_EOD:
1158 case OP_EODN:
1159 case OP_CIRC:
1160 case OP_DOLL:
1161 case OP_NOT_WORD_BOUNDARY:
1162 case OP_WORD_BOUNDARY:
1163 cc += _pcre_OP_lengths[*cc];
1164 break;
1165
1166 /* Handle literal characters */
1167
1168 case OP_CHAR:
1169 case OP_CHARNC:
1170 case OP_NOT:
1171 branchlength++;
1172 cc += 2;
1173 #ifdef SUPPORT_UTF8
1174 if ((options & PCRE_UTF8) != 0)
1175 {
1176 while ((*cc & 0xc0) == 0x80) cc++;
1177 }
1178 #endif
1179 break;
1180
1181 /* Handle exact repetitions. The count is already in characters, but we
1182 need to skip over a multibyte character in UTF8 mode. */
1183
1184 case OP_EXACT:
1185 branchlength += GET2(cc,1);
1186 cc += 4;
1187 #ifdef SUPPORT_UTF8
1188 if ((options & PCRE_UTF8) != 0)
1189 {
1190 while((*cc & 0x80) == 0x80) cc++;
1191 }
1192 #endif
1193 break;
1194
1195 case OP_TYPEEXACT:
1196 branchlength += GET2(cc,1);
1197 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1198 cc += 4;
1199 break;
1200
1201 /* Handle single-char matchers */
1202
1203 case OP_PROP:
1204 case OP_NOTPROP:
1205 cc += 2;
1206 /* Fall through */
1207
1208 case OP_NOT_DIGIT:
1209 case OP_DIGIT:
1210 case OP_NOT_WHITESPACE:
1211 case OP_WHITESPACE:
1212 case OP_NOT_WORDCHAR:
1213 case OP_WORDCHAR:
1214 case OP_ANY:
1215 branchlength++;
1216 cc++;
1217 break;
1218
1219 /* The single-byte matcher isn't allowed */
1220
1221 case OP_ANYBYTE:
1222 return -2;
1223
1224 /* Check a class for variable quantification */
1225
1226 #ifdef SUPPORT_UTF8
1227 case OP_XCLASS:
1228 cc += GET(cc, 1) - 33;
1229 /* Fall through */
1230 #endif
1231
1232 case OP_CLASS:
1233 case OP_NCLASS:
1234 cc += 33;
1235
1236 switch (*cc)
1237 {
1238 case OP_CRSTAR:
1239 case OP_CRMINSTAR:
1240 case OP_CRQUERY:
1241 case OP_CRMINQUERY:
1242 return -1;
1243
1244 case OP_CRRANGE:
1245 case OP_CRMINRANGE:
1246 if (GET2(cc,1) != GET2(cc,3)) return -1;
1247 branchlength += GET2(cc,1);
1248 cc += 5;
1249 break;
1250
1251 default:
1252 branchlength++;
1253 }
1254 break;
1255
1256 /* Anything else is variable length */
1257
1258 default:
1259 return -1;
1260 }
1261 }
1262 /* Control never gets here */
1263 }
1264
1265
1266
1267
1268 /*************************************************
1269 * Scan compiled regex for numbered bracket *
1270 *************************************************/
1271
1272 /* This little function scans through a compiled pattern until it finds a
1273 capturing bracket with the given number.
1274
1275 Arguments:
1276 code points to start of expression
1277 utf8 TRUE in UTF-8 mode
1278 number the required bracket number
1279
1280 Returns: pointer to the opcode for the bracket, or NULL if not found
1281 */
1282
1283 static const uschar *
1284 find_bracket(const uschar *code, BOOL utf8, int number)
1285 {
1286 for (;;)
1287 {
1288 register int c = *code;
1289 if (c == OP_END) return NULL;
1290
1291 /* XCLASS is used for classes that cannot be represented just by a bit
1292 map. This includes negated single high-valued characters. The length in
1293 the table is zero; the actual length is stored in the compiled code. */
1294
1295 if (c == OP_XCLASS) code += GET(code, 1);
1296
1297 /* Handle capturing bracket */
1298
1299 else if (c == OP_CBRA)
1300 {
1301 int n = GET2(code, 1+LINK_SIZE);
1302 if (n == number) return (uschar *)code;
1303 code += _pcre_OP_lengths[c];
1304 }
1305
1306 /* Otherwise, we can get the item's length from the table, except that for
1307 repeated character types, we have to test for \p and \P, which have an extra
1308 two bytes of parameters. */
1309
1310 else
1311 {
1312 switch(c)
1313 {
1314 case OP_TYPESTAR:
1315 case OP_TYPEMINSTAR:
1316 case OP_TYPEPLUS:
1317 case OP_TYPEMINPLUS:
1318 case OP_TYPEQUERY:
1319 case OP_TYPEMINQUERY:
1320 case OP_TYPEPOSSTAR:
1321 case OP_TYPEPOSPLUS:
1322 case OP_TYPEPOSQUERY:
1323 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1324 break;
1325
1326 case OP_TYPEUPTO:
1327 case OP_TYPEMINUPTO:
1328 case OP_TYPEEXACT:
1329 case OP_TYPEPOSUPTO:
1330 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1331 break;
1332 }
1333
1334 /* Add in the fixed length from the table */
1335
1336 code += _pcre_OP_lengths[c];
1337
1338 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1339 a multi-byte character. The length in the table is a minimum, so we have to
1340 arrange to skip the extra bytes. */
1341
1342 #ifdef SUPPORT_UTF8
1343 if (utf8) switch(c)
1344 {
1345 case OP_CHAR:
1346 case OP_CHARNC:
1347 case OP_EXACT:
1348 case OP_UPTO:
1349 case OP_MINUPTO:
1350 case OP_POSUPTO:
1351 case OP_STAR:
1352 case OP_MINSTAR:
1353 case OP_POSSTAR:
1354 case OP_PLUS:
1355 case OP_MINPLUS:
1356 case OP_POSPLUS:
1357 case OP_QUERY:
1358 case OP_MINQUERY:
1359 case OP_POSQUERY:
1360 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1361 break;
1362 }
1363 #endif
1364 }
1365 }
1366 }
1367
1368
1369
1370 /*************************************************
1371 * Scan compiled regex for recursion reference *
1372 *************************************************/
1373
1374 /* This little function scans through a compiled pattern until it finds an
1375 instance of OP_RECURSE.
1376
1377 Arguments:
1378 code points to start of expression
1379 utf8 TRUE in UTF-8 mode
1380
1381 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1382 */
1383
1384 static const uschar *
1385 find_recurse(const uschar *code, BOOL utf8)
1386 {
1387 for (;;)
1388 {
1389 register int c = *code;
1390 if (c == OP_END) return NULL;
1391 if (c == OP_RECURSE) return code;
1392
1393 /* XCLASS is used for classes that cannot be represented just by a bit
1394 map. This includes negated single high-valued characters. The length in
1395 the table is zero; the actual length is stored in the compiled code. */
1396
1397 if (c == OP_XCLASS) code += GET(code, 1);
1398
1399 /* Otherwise, we can get the item's length from the table, except that for
1400 repeated character types, we have to test for \p and \P, which have an extra
1401 two bytes of parameters. */
1402
1403 else
1404 {
1405 switch(c)
1406 {
1407 case OP_TYPESTAR:
1408 case OP_TYPEMINSTAR:
1409 case OP_TYPEPLUS:
1410 case OP_TYPEMINPLUS:
1411 case OP_TYPEQUERY:
1412 case OP_TYPEMINQUERY:
1413 case OP_TYPEPOSSTAR:
1414 case OP_TYPEPOSPLUS:
1415 case OP_TYPEPOSQUERY:
1416 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1417 break;
1418
1419 case OP_TYPEPOSUPTO:
1420 case OP_TYPEUPTO:
1421 case OP_TYPEMINUPTO:
1422 case OP_TYPEEXACT:
1423 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1424 break;
1425 }
1426
1427 /* Add in the fixed length from the table */
1428
1429 code += _pcre_OP_lengths[c];
1430
1431 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1432 by a multi-byte character. The length in the table is a minimum, so we have
1433 to arrange to skip the extra bytes. */
1434
1435 #ifdef SUPPORT_UTF8
1436 if (utf8) switch(c)
1437 {
1438 case OP_CHAR:
1439 case OP_CHARNC:
1440 case OP_EXACT:
1441 case OP_UPTO:
1442 case OP_MINUPTO:
1443 case OP_POSUPTO:
1444 case OP_STAR:
1445 case OP_MINSTAR:
1446 case OP_POSSTAR:
1447 case OP_PLUS:
1448 case OP_MINPLUS:
1449 case OP_POSPLUS:
1450 case OP_QUERY:
1451 case OP_MINQUERY:
1452 case OP_POSQUERY:
1453 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1454 break;
1455 }
1456 #endif
1457 }
1458 }
1459 }
1460
1461
1462
1463 /*************************************************
1464 * Scan compiled branch for non-emptiness *
1465 *************************************************/
1466
1467 /* This function scans through a branch of a compiled pattern to see whether it
1468 can match the empty string or not. It is called from could_be_empty()
1469 below and from compile_branch() when checking for an unlimited repeat of a
1470 group that can match nothing. Note that first_significant_code() skips over
1471 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1472 struck an inner bracket whose current branch will already have been scanned.
1473
1474 Arguments:
1475 code points to start of search
1476 endcode points to where to stop
1477 utf8 TRUE if in UTF8 mode
1478
1479 Returns: TRUE if what is matched could be empty
1480 */
1481
1482 static BOOL
1483 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1484 {
1485 register int c;
1486 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1487 code < endcode;
1488 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1489 {
1490 const uschar *ccode;
1491
1492 c = *code;
1493
1494 /* Groups with zero repeats can of course be empty; skip them. */
1495
1496 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1497 {
1498 code += _pcre_OP_lengths[c];
1499 do code += GET(code, 1); while (*code == OP_ALT);
1500 c = *code;
1501 continue;
1502 }
1503
1504 /* For other groups, scan the branches. */
1505
1506 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1507 {
1508 BOOL empty_branch;
1509 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1510
1511 /* Scan a closed bracket */
1512
1513 empty_branch = FALSE;
1514 do
1515 {
1516 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1517 empty_branch = TRUE;
1518 code += GET(code, 1);
1519 }
1520 while (*code == OP_ALT);
1521 if (!empty_branch) return FALSE; /* All branches are non-empty */
1522 c = *code;
1523 continue;
1524 }
1525
1526 /* Handle the other opcodes */
1527
1528 switch (c)
1529 {
1530 /* Check for quantifiers after a class. XCLASS is used for classes that
1531 cannot be represented just by a bit map. This includes negated single
1532 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1533 actual length is stored in the compiled code, so we must update "code"
1534 here. */
1535
1536 #ifdef SUPPORT_UTF8
1537 case OP_XCLASS:
1538 ccode = code += GET(code, 1);
1539 goto CHECK_CLASS_REPEAT;
1540 #endif
1541
1542 case OP_CLASS:
1543 case OP_NCLASS:
1544 ccode = code + 33;
1545
1546 #ifdef SUPPORT_UTF8
1547 CHECK_CLASS_REPEAT:
1548 #endif
1549
1550 switch (*ccode)
1551 {
1552 case OP_CRSTAR: /* These could be empty; continue */
1553 case OP_CRMINSTAR:
1554 case OP_CRQUERY:
1555 case OP_CRMINQUERY:
1556 break;
1557
1558 default: /* Non-repeat => class must match */
1559 case OP_CRPLUS: /* These repeats aren't empty */
1560 case OP_CRMINPLUS:
1561 return FALSE;
1562
1563 case OP_CRRANGE:
1564 case OP_CRMINRANGE:
1565 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1566 break;
1567 }
1568 break;
1569
1570 /* Opcodes that must match a character */
1571
1572 case OP_PROP:
1573 case OP_NOTPROP:
1574 case OP_EXTUNI:
1575 case OP_NOT_DIGIT:
1576 case OP_DIGIT:
1577 case OP_NOT_WHITESPACE:
1578 case OP_WHITESPACE:
1579 case OP_NOT_WORDCHAR:
1580 case OP_WORDCHAR:
1581 case OP_ANY:
1582 case OP_ANYBYTE:
1583 case OP_CHAR:
1584 case OP_CHARNC:
1585 case OP_NOT:
1586 case OP_PLUS:
1587 case OP_MINPLUS:
1588 case OP_POSPLUS:
1589 case OP_EXACT:
1590 case OP_NOTPLUS:
1591 case OP_NOTMINPLUS:
1592 case OP_NOTPOSPLUS:
1593 case OP_NOTEXACT:
1594 case OP_TYPEPLUS:
1595 case OP_TYPEMINPLUS:
1596 case OP_TYPEPOSPLUS:
1597 case OP_TYPEEXACT:
1598 return FALSE;
1599
1600 /* End of branch */
1601
1602 case OP_KET:
1603 case OP_KETRMAX:
1604 case OP_KETRMIN:
1605 case OP_ALT:
1606 return TRUE;
1607
1608 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1609 MINUPTO, and POSUPTO may be followed by a multibyte character */
1610
1611 #ifdef SUPPORT_UTF8
1612 case OP_STAR:
1613 case OP_MINSTAR:
1614 case OP_POSSTAR:
1615 case OP_QUERY:
1616 case OP_MINQUERY:
1617 case OP_POSQUERY:
1618 case OP_UPTO:
1619 case OP_MINUPTO:
1620 case OP_POSUPTO:
1621 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1622 break;
1623 #endif
1624 }
1625 }
1626
1627 return TRUE;
1628 }
1629
1630
1631
1632 /*************************************************
1633 * Scan compiled regex for non-emptiness *
1634 *************************************************/
1635
1636 /* This function is called to check for left recursive calls. We want to check
1637 the current branch of the current pattern to see if it could match the empty
1638 string. If it could, we must look outwards for branches at other levels,
1639 stopping when we pass beyond the bracket which is the subject of the recursion.
1640
1641 Arguments:
1642 code points to start of the recursion
1643 endcode points to where to stop (current RECURSE item)
1644 bcptr points to the chain of current (unclosed) branch starts
1645 utf8 TRUE if in UTF-8 mode
1646
1647 Returns: TRUE if what is matched could be empty
1648 */
1649
1650 static BOOL
1651 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1652 BOOL utf8)
1653 {
1654 while (bcptr != NULL && bcptr->current >= code)
1655 {
1656 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1657 bcptr = bcptr->outer;
1658 }
1659 return TRUE;
1660 }
1661
1662
1663
1664 /*************************************************
1665 * Check for POSIX class syntax *
1666 *************************************************/
1667
1668 /* This function is called when the sequence "[:" or "[." or "[=" is
1669 encountered in a character class. It checks whether this is followed by an
1670 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1671 ".]" or "=]".
1672
1673 Argument:
1674 ptr pointer to the initial [
1675 endptr where to return the end pointer
1676 cd pointer to compile data
1677
1678 Returns: TRUE or FALSE
1679 */
1680
1681 static BOOL
1682 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1683 {
1684 int terminator; /* Don't combine these lines; the Solaris cc */
1685 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1686 if (*(++ptr) == '^') ptr++;
1687 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1688 if (*ptr == terminator && ptr[1] == ']')
1689 {
1690 *endptr = ptr;
1691 return TRUE;
1692 }
1693 return FALSE;
1694 }
1695
1696
1697
1698
1699 /*************************************************
1700 * Check POSIX class name *
1701 *************************************************/
1702
1703 /* This function is called to check the name given in a POSIX-style class entry
1704 such as [:alnum:].
1705
1706 Arguments:
1707 ptr points to the first letter
1708 len the length of the name
1709
1710 Returns: a value representing the name, or -1 if unknown
1711 */
1712
1713 static int
1714 check_posix_name(const uschar *ptr, int len)
1715 {
1716 register int yield = 0;
1717 while (posix_name_lengths[yield] != 0)
1718 {
1719 if (len == posix_name_lengths[yield] &&
1720 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1721 yield++;
1722 }
1723 return -1;
1724 }
1725
1726
1727 /*************************************************
1728 * Adjust OP_RECURSE items in repeated group *
1729 *************************************************/
1730
1731 /* OP_RECURSE items contain an offset from the start of the regex to the group
1732 that is referenced. This means that groups can be replicated for fixed
1733 repetition simply by copying (because the recursion is allowed to refer to
1734 earlier groups that are outside the current group). However, when a group is
1735 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1736 it, after it has been compiled. This means that any OP_RECURSE items within it
1737 that refer to the group itself or any contained groups have to have their
1738 offsets adjusted. That one of the jobs of this function. Before it is called,
1739 the partially compiled regex must be temporarily terminated with OP_END.
1740
1741 This function has been extended with the possibility of forward references for
1742 recursions and subroutine calls. It must also check the list of such references
1743 for the group we are dealing with. If it finds that one of the recursions in
1744 the current group is on this list, it adjusts the offset in the list, not the
1745 value in the reference (which is a group number).
1746
1747 Arguments:
1748 group points to the start of the group
1749 adjust the amount by which the group is to be moved
1750 utf8 TRUE in UTF-8 mode
1751 cd contains pointers to tables etc.
1752 save_hwm the hwm forward reference pointer at the start of the group
1753
1754 Returns: nothing
1755 */
1756
1757 static void
1758 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1759 uschar *save_hwm)
1760 {
1761 uschar *ptr = group;
1762 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1763 {
1764 int offset;
1765 uschar *hc;
1766
1767 /* See if this recursion is on the forward reference list. If so, adjust the
1768 reference. */
1769
1770 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1771 {
1772 offset = GET(hc, 0);
1773 if (cd->start_code + offset == ptr + 1)
1774 {
1775 PUT(hc, 0, offset + adjust);
1776 break;
1777 }
1778 }
1779
1780 /* Otherwise, adjust the recursion offset if it's after the start of this
1781 group. */
1782
1783 if (hc >= cd->hwm)
1784 {
1785 offset = GET(ptr, 1);
1786 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1787 }
1788
1789 ptr += 1 + LINK_SIZE;
1790 }
1791 }
1792
1793
1794
1795 /*************************************************
1796 * Insert an automatic callout point *
1797 *************************************************/
1798
1799 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1800 callout points before each pattern item.
1801
1802 Arguments:
1803 code current code pointer
1804 ptr current pattern pointer
1805 cd pointers to tables etc
1806
1807 Returns: new code pointer
1808 */
1809
1810 static uschar *
1811 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1812 {
1813 *code++ = OP_CALLOUT;
1814 *code++ = 255;
1815 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1816 PUT(code, LINK_SIZE, 0); /* Default length */
1817 return code + 2*LINK_SIZE;
1818 }
1819
1820
1821
1822 /*************************************************
1823 * Complete a callout item *
1824 *************************************************/
1825
1826 /* A callout item contains the length of the next item in the pattern, which
1827 we can't fill in till after we have reached the relevant point. This is used
1828 for both automatic and manual callouts.
1829
1830 Arguments:
1831 previous_callout points to previous callout item
1832 ptr current pattern pointer
1833 cd pointers to tables etc
1834
1835 Returns: nothing
1836 */
1837
1838 static void
1839 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1840 {
1841 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1842 PUT(previous_callout, 2 + LINK_SIZE, length);
1843 }
1844
1845
1846
1847 #ifdef SUPPORT_UCP
1848 /*************************************************
1849 * Get othercase range *
1850 *************************************************/
1851
1852 /* This function is passed the start and end of a class range, in UTF-8 mode
1853 with UCP support. It searches up the characters, looking for internal ranges of
1854 characters in the "other" case. Each call returns the next one, updating the
1855 start address.
1856
1857 Arguments:
1858 cptr points to starting character value; updated
1859 d end value
1860 ocptr where to put start of othercase range
1861 odptr where to put end of othercase range
1862
1863 Yield: TRUE when range returned; FALSE when no more
1864 */
1865
1866 static BOOL
1867 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1868 unsigned int *odptr)
1869 {
1870 unsigned int c, othercase, next;
1871
1872 for (c = *cptr; c <= d; c++)
1873 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1874
1875 if (c > d) return FALSE;
1876
1877 *ocptr = othercase;
1878 next = othercase + 1;
1879
1880 for (++c; c <= d; c++)
1881 {
1882 if (_pcre_ucp_othercase(c) != next) break;
1883 next++;
1884 }
1885
1886 *odptr = next - 1;
1887 *cptr = c;
1888
1889 return TRUE;
1890 }
1891 #endif /* SUPPORT_UCP */
1892
1893
1894
1895 /*************************************************
1896 * Check if auto-possessifying is possible *
1897 *************************************************/
1898
1899 /* This function is called for unlimited repeats of certain items, to see
1900 whether the next thing could possibly match the repeated item. If not, it makes
1901 sense to automatically possessify the repeated item.
1902
1903 Arguments:
1904 op_code the repeated op code
1905 item data for this item, depends on the opcode
1906 utf8 TRUE in UTF-8 mode
1907 utf8_char used for utf8 character bytes, NULL if not relevant
1908 ptr next character in pattern
1909 options options bits
1910 cd contains pointers to tables etc.
1911
1912 Returns: TRUE if possessifying is wanted
1913 */
1914
1915 static BOOL
1916 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1917 const uschar *ptr, int options, compile_data *cd)
1918 {
1919 int next;
1920
1921 /* Skip whitespace and comments in extended mode */
1922
1923 if ((options & PCRE_EXTENDED) != 0)
1924 {
1925 for (;;)
1926 {
1927 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1928 if (*ptr == '#')
1929 {
1930 while (*(++ptr) != 0)
1931 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1932 }
1933 else break;
1934 }
1935 }
1936
1937 /* If the next item is one that we can handle, get its value. A non-negative
1938 value is a character, a negative value is an escape value. */
1939
1940 if (*ptr == '\\')
1941 {
1942 int temperrorcode = 0;
1943 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1944 if (temperrorcode != 0) return FALSE;
1945 ptr++; /* Point after the escape sequence */
1946 }
1947
1948 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1949 {
1950 #ifdef SUPPORT_UTF8
1951 if (utf8) { GETCHARINC(next, ptr); } else
1952 #endif
1953 next = *ptr++;
1954 }
1955
1956 else return FALSE;
1957
1958 /* Skip whitespace and comments in extended mode */
1959
1960 if ((options & PCRE_EXTENDED) != 0)
1961 {
1962 for (;;)
1963 {
1964 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1965 if (*ptr == '#')
1966 {
1967 while (*(++ptr) != 0)
1968 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1969 }
1970 else break;
1971 }
1972 }
1973
1974 /* If the next thing is itself optional, we have to give up. */
1975
1976 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1977 return FALSE;
1978
1979 /* Now compare the next item with the previous opcode. If the previous is a
1980 positive single character match, "item" either contains the character or, if
1981 "item" is greater than 127 in utf8 mode, the character's bytes are in
1982 utf8_char. */
1983
1984
1985 /* Handle cases when the next item is a character. */
1986
1987 if (next >= 0) switch(op_code)
1988 {
1989 case OP_CHAR:
1990 #ifdef SUPPORT_UTF8
1991 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1992 #endif
1993 return item != next;
1994
1995 /* For CHARNC (caseless character) we must check the other case. If we have
1996 Unicode property support, we can use it to test the other case of
1997 high-valued characters. */
1998
1999 case OP_CHARNC:
2000 #ifdef SUPPORT_UTF8
2001 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2002 #endif
2003 if (item == next) return FALSE;
2004 #ifdef SUPPORT_UTF8
2005 if (utf8)
2006 {
2007 unsigned int othercase;
2008 if (next < 128) othercase = cd->fcc[next]; else
2009 #ifdef SUPPORT_UCP
2010 othercase = _pcre_ucp_othercase((unsigned int)next);
2011 #else
2012 othercase = NOTACHAR;
2013 #endif
2014 return (unsigned int)item != othercase;
2015 }
2016 else
2017 #endif /* SUPPORT_UTF8 */
2018 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2019
2020 /* For OP_NOT, "item" must be a single-byte character. */
2021
2022 case OP_NOT:
2023 if (next < 0) return FALSE; /* Not a character */
2024 if (item == next) return TRUE;
2025 if ((options & PCRE_CASELESS) == 0) return FALSE;
2026 #ifdef SUPPORT_UTF8
2027 if (utf8)
2028 {
2029 unsigned int othercase;
2030 if (next < 128) othercase = cd->fcc[next]; else
2031 #ifdef SUPPORT_UCP
2032 othercase = _pcre_ucp_othercase(next);
2033 #else
2034 othercase = NOTACHAR;
2035 #endif
2036 return (unsigned int)item == othercase;
2037 }
2038 else
2039 #endif /* SUPPORT_UTF8 */
2040 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2041
2042 case OP_DIGIT:
2043 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2044
2045 case OP_NOT_DIGIT:
2046 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2047
2048 case OP_WHITESPACE:
2049 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2050
2051 case OP_NOT_WHITESPACE:
2052 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2053
2054 case OP_WORDCHAR:
2055 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2056
2057 case OP_NOT_WORDCHAR:
2058 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2059
2060 case OP_HSPACE:
2061 case OP_NOT_HSPACE:
2062 switch(next)
2063 {
2064 case 0x09:
2065 case 0x20:
2066 case 0xa0:
2067 case 0x1680:
2068 case 0x180e:
2069 case 0x2000:
2070 case 0x2001:
2071 case 0x2002:
2072 case 0x2003:
2073 case 0x2004:
2074 case 0x2005:
2075 case 0x2006:
2076 case 0x2007:
2077 case 0x2008:
2078 case 0x2009:
2079 case 0x200A:
2080 case 0x202f:
2081 case 0x205f:
2082 case 0x3000:
2083 return op_code != OP_HSPACE;
2084 default:
2085 return op_code == OP_HSPACE;
2086 }
2087
2088 case OP_VSPACE:
2089 case OP_NOT_VSPACE:
2090 switch(next)
2091 {
2092 case 0x0a:
2093 case 0x0b:
2094 case 0x0c:
2095 case 0x0d:
2096 case 0x85:
2097 case 0x2028:
2098 case 0x2029:
2099 return op_code != OP_VSPACE;
2100 default:
2101 return op_code == OP_VSPACE;
2102 }
2103
2104 default:
2105 return FALSE;
2106 }
2107
2108
2109 /* Handle the case when the next item is \d, \s, etc. */
2110
2111 switch(op_code)
2112 {
2113 case OP_CHAR:
2114 case OP_CHARNC:
2115 #ifdef SUPPORT_UTF8
2116 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2117 #endif
2118 switch(-next)
2119 {
2120 case ESC_d:
2121 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2122
2123 case ESC_D:
2124 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2125
2126 case ESC_s:
2127 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2128
2129 case ESC_S:
2130 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2131
2132 case ESC_w:
2133 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2134
2135 case ESC_W:
2136 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2137
2138 case ESC_h:
2139 case ESC_H:
2140 switch(item)
2141 {
2142 case 0x09:
2143 case 0x20:
2144 case 0xa0:
2145 case 0x1680:
2146 case 0x180e:
2147 case 0x2000:
2148 case 0x2001:
2149 case 0x2002:
2150 case 0x2003:
2151 case 0x2004:
2152 case 0x2005:
2153 case 0x2006:
2154 case 0x2007:
2155 case 0x2008:
2156 case 0x2009:
2157 case 0x200A:
2158 case 0x202f:
2159 case 0x205f:
2160 case 0x3000:
2161 return -next != ESC_h;
2162 default:
2163 return -next == ESC_h;
2164 }
2165
2166 case ESC_v:
2167 case ESC_V:
2168 switch(item)
2169 {
2170 case 0x0a:
2171 case 0x0b:
2172 case 0x0c:
2173 case 0x0d:
2174 case 0x85:
2175 case 0x2028:
2176 case 0x2029:
2177 return -next != ESC_v;
2178 default:
2179 return -next == ESC_v;
2180 }
2181
2182 default:
2183 return FALSE;
2184 }
2185
2186 case OP_DIGIT:
2187 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2188 next == -ESC_h || next == -ESC_v;
2189
2190 case OP_NOT_DIGIT:
2191 return next == -ESC_d;
2192
2193 case OP_WHITESPACE:
2194 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2195
2196 case OP_NOT_WHITESPACE:
2197 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2198
2199 case OP_HSPACE:
2200 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2201
2202 case OP_NOT_HSPACE:
2203 return next == -ESC_h;
2204
2205 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2206 case OP_VSPACE:
2207 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2208
2209 case OP_NOT_VSPACE:
2210 return next == -ESC_v;
2211
2212 case OP_WORDCHAR:
2213 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2214
2215 case OP_NOT_WORDCHAR:
2216 return next == -ESC_w || next == -ESC_d;
2217
2218 default:
2219 return FALSE;
2220 }
2221
2222 /* Control does not reach here */
2223 }
2224
2225
2226
2227 /*************************************************
2228 * Compile one branch *
2229 *************************************************/
2230
2231 /* Scan the pattern, compiling it into a vector. If the options are
2232 changed during the branch, the pointer is used to change the external options
2233 bits. This function is used during the pre-compile phase when we are trying
2234 to find out the amount of memory needed, as well as during the real compile
2235 phase. The value of lengthptr distinguishes the two phases.
2236
2237 Arguments:
2238 optionsptr pointer to the option bits
2239 codeptr points to the pointer to the current code point
2240 ptrptr points to the current pattern pointer
2241 errorcodeptr points to error code variable
2242 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2243 reqbyteptr set to the last literal character required, else < 0
2244 bcptr points to current branch chain
2245 cd contains pointers to tables etc.
2246 lengthptr NULL during the real compile phase
2247 points to length accumulator during pre-compile phase
2248
2249 Returns: TRUE on success
2250 FALSE, with *errorcodeptr set non-zero on error
2251 */
2252
2253 static BOOL
2254 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2255 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2256 compile_data *cd, int *lengthptr)
2257 {
2258 int repeat_type, op_type;
2259 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2260 int bravalue = 0;
2261 int greedy_default, greedy_non_default;
2262 int firstbyte, reqbyte;
2263 int zeroreqbyte, zerofirstbyte;
2264 int req_caseopt, reqvary, tempreqvary;
2265 int options = *optionsptr;
2266 int after_manual_callout = 0;
2267 int length_prevgroup = 0;
2268 register int c;
2269 register uschar *code = *codeptr;
2270 uschar *last_code = code;
2271 uschar *orig_code = code;
2272 uschar *tempcode;
2273 BOOL inescq = FALSE;
2274 BOOL groupsetfirstbyte = FALSE;
2275 const uschar *ptr = *ptrptr;
2276 const uschar *tempptr;
2277 uschar *previous = NULL;
2278 uschar *previous_callout = NULL;
2279 uschar *save_hwm = NULL;
2280 uschar classbits[32];
2281
2282 #ifdef SUPPORT_UTF8
2283 BOOL class_utf8;
2284 BOOL utf8 = (options & PCRE_UTF8) != 0;
2285 uschar *class_utf8data;
2286 uschar utf8_char[6];
2287 #else
2288 BOOL utf8 = FALSE;
2289 uschar *utf8_char = NULL;
2290 #endif
2291
2292 #ifdef DEBUG
2293 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2294 #endif
2295
2296 /* Set up the default and non-default settings for greediness */
2297
2298 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2299 greedy_non_default = greedy_default ^ 1;
2300
2301 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2302 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2303 matches a non-fixed char first char; reqbyte just remains unset if we never
2304 find one.
2305
2306 When we hit a repeat whose minimum is zero, we may have to adjust these values
2307 to take the zero repeat into account. This is implemented by setting them to
2308 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2309 item types that can be repeated set these backoff variables appropriately. */
2310
2311 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2312
2313 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2314 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2315 value > 255. It is added into the firstbyte or reqbyte variables to record the
2316 case status of the value. This is used only for ASCII characters. */
2317
2318 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2319
2320 /* Switch on next character until the end of the branch */
2321
2322 for (;; ptr++)
2323 {
2324 BOOL negate_class;
2325 BOOL possessive_quantifier;
2326 BOOL is_quantifier;
2327 BOOL is_recurse;
2328 BOOL reset_bracount;
2329 int class_charcount;
2330 int class_lastchar;
2331 int newoptions;
2332 int recno;
2333 int refsign;
2334 int skipbytes;
2335 int subreqbyte;
2336 int subfirstbyte;
2337 int terminator;
2338 int mclength;
2339 uschar mcbuffer[8];
2340
2341 /* Get next byte in the pattern */
2342
2343 c = *ptr;
2344
2345 /* If we are in the pre-compile phase, accumulate the length used for the
2346 previous cycle of this loop. */
2347
2348 if (lengthptr != NULL)
2349 {
2350 #ifdef DEBUG
2351 if (code > cd->hwm) cd->hwm = code; /* High water info */
2352 #endif
2353 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2354 {
2355 *errorcodeptr = ERR52;
2356 goto FAILED;
2357 }
2358
2359 /* There is at least one situation where code goes backwards: this is the
2360 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2361 the class is simply eliminated. However, it is created first, so we have to
2362 allow memory for it. Therefore, don't ever reduce the length at this point.
2363 */
2364
2365 if (code < last_code) code = last_code;
2366
2367 /* Paranoid check for integer overflow */
2368
2369 if (OFLOW_MAX - *lengthptr < code - last_code)
2370 {
2371 *errorcodeptr = ERR20;
2372 goto FAILED;
2373 }
2374
2375 *lengthptr += code - last_code;
2376 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2377
2378 /* If "previous" is set and it is not at the start of the work space, move
2379 it back to there, in order to avoid filling up the work space. Otherwise,
2380 if "previous" is NULL, reset the current code pointer to the start. */
2381
2382 if (previous != NULL)
2383 {
2384 if (previous > orig_code)
2385 {
2386 memmove(orig_code, previous, code - previous);
2387 code -= previous - orig_code;
2388 previous = orig_code;
2389 }
2390 }
2391 else code = orig_code;
2392
2393 /* Remember where this code item starts so we can pick up the length
2394 next time round. */
2395
2396 last_code = code;
2397 }
2398
2399 /* In the real compile phase, just check the workspace used by the forward
2400 reference list. */
2401
2402 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2403 {
2404 *errorcodeptr = ERR52;
2405 goto FAILED;
2406 }
2407
2408 /* If in \Q...\E, check for the end; if not, we have a literal */
2409
2410 if (inescq && c != 0)
2411 {
2412 if (c == '\\' && ptr[1] == 'E')
2413 {
2414 inescq = FALSE;
2415 ptr++;
2416 continue;
2417 }
2418 else
2419 {
2420 if (previous_callout != NULL)
2421 {
2422 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2423 complete_callout(previous_callout, ptr, cd);
2424 previous_callout = NULL;
2425 }
2426 if ((options & PCRE_AUTO_CALLOUT) != 0)
2427 {
2428 previous_callout = code;
2429 code = auto_callout(code, ptr, cd);
2430 }
2431 goto NORMAL_CHAR;
2432 }
2433 }
2434
2435 /* Fill in length of a previous callout, except when the next thing is
2436 a quantifier. */
2437
2438 is_quantifier = c == '*' || c == '+' || c == '?' ||
2439 (c == '{' && is_counted_repeat(ptr+1));
2440
2441 if (!is_quantifier && previous_callout != NULL &&
2442 after_manual_callout-- <= 0)
2443 {
2444 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2445 complete_callout(previous_callout, ptr, cd);
2446 previous_callout = NULL;
2447 }
2448
2449 /* In extended mode, skip white space and comments */
2450
2451 if ((options & PCRE_EXTENDED) != 0)
2452 {
2453 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2454 if (c == '#')
2455 {
2456 while (*(++ptr) != 0)
2457 {
2458 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2459 }
2460 if (*ptr != 0) continue;
2461
2462 /* Else fall through to handle end of string */
2463 c = 0;
2464 }
2465 }
2466
2467 /* No auto callout for quantifiers. */
2468
2469 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2470 {
2471 previous_callout = code;
2472 code = auto_callout(code, ptr, cd);
2473 }
2474
2475 switch(c)
2476 {
2477 /* ===================================================================*/
2478 case 0: /* The branch terminates at string end */
2479 case '|': /* or | or ) */
2480 case ')':
2481 *firstbyteptr = firstbyte;
2482 *reqbyteptr = reqbyte;
2483 *codeptr = code;
2484 *ptrptr = ptr;
2485 if (lengthptr != NULL)
2486 {
2487 if (OFLOW_MAX - *lengthptr < code - last_code)
2488 {
2489 *errorcodeptr = ERR20;
2490 goto FAILED;
2491 }
2492 *lengthptr += code - last_code; /* To include callout length */
2493 DPRINTF((">> end branch\n"));
2494 }
2495 return TRUE;
2496
2497
2498 /* ===================================================================*/
2499 /* Handle single-character metacharacters. In multiline mode, ^ disables
2500 the setting of any following char as a first character. */
2501
2502 case '^':
2503 if ((options & PCRE_MULTILINE) != 0)
2504 {
2505 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2506 }
2507 previous = NULL;
2508 *code++ = OP_CIRC;
2509 break;
2510
2511 case '$':
2512 previous = NULL;
2513 *code++ = OP_DOLL;
2514 break;
2515
2516 /* There can never be a first char if '.' is first, whatever happens about
2517 repeats. The value of reqbyte doesn't change either. */
2518
2519 case '.':
2520 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2521 zerofirstbyte = firstbyte;
2522 zeroreqbyte = reqbyte;
2523 previous = code;
2524 *code++ = OP_ANY;
2525 break;
2526
2527
2528 /* ===================================================================*/
2529 /* Character classes. If the included characters are all < 256, we build a
2530 32-byte bitmap of the permitted characters, except in the special case
2531 where there is only one such character. For negated classes, we build the
2532 map as usual, then invert it at the end. However, we use a different opcode
2533 so that data characters > 255 can be handled correctly.
2534
2535 If the class contains characters outside the 0-255 range, a different
2536 opcode is compiled. It may optionally have a bit map for characters < 256,
2537 but those above are explicitly listed afterwards. A flag byte tells
2538 whether the bitmap is present, and whether this is a negated class or not.
2539 */
2540
2541 case '[':
2542 previous = code;
2543
2544 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2545 they are encountered at the top level, so we'll do that too. */
2546
2547 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2548 check_posix_syntax(ptr, &tempptr, cd))
2549 {
2550 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2551 goto FAILED;
2552 }
2553
2554 /* If the first character is '^', set the negation flag and skip it. Also,
2555 if the first few characters (either before or after ^) are \Q\E or \E we
2556 skip them too. This makes for compatibility with Perl. */
2557
2558 negate_class = FALSE;
2559 for (;;)
2560 {
2561 c = *(++ptr);
2562 if (c == '\\')
2563 {
2564 if (ptr[1] == 'E') ptr++;
2565 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2566 else break;
2567 }
2568 else if (!negate_class && c == '^')
2569 negate_class = TRUE;
2570 else break;
2571 }
2572
2573 /* Keep a count of chars with values < 256 so that we can optimize the case
2574 of just a single character (as long as it's < 256). However, for higher
2575 valued UTF-8 characters, we don't yet do any optimization. */
2576
2577 class_charcount = 0;
2578 class_lastchar = -1;
2579
2580 /* Initialize the 32-char bit map to all zeros. We build the map in a
2581 temporary bit of memory, in case the class contains only 1 character (less
2582 than 256), because in that case the compiled code doesn't use the bit map.
2583 */
2584
2585 memset(classbits, 0, 32 * sizeof(uschar));
2586
2587 #ifdef SUPPORT_UTF8
2588 class_utf8 = FALSE; /* No chars >= 256 */
2589 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2590 #endif
2591
2592 /* Process characters until ] is reached. By writing this as a "do" it
2593 means that an initial ] is taken as a data character. At the start of the
2594 loop, c contains the first byte of the character. */
2595
2596 if (c != 0) do
2597 {
2598 const uschar *oldptr;
2599
2600 #ifdef SUPPORT_UTF8
2601 if (utf8 && c > 127)
2602 { /* Braces are required because the */
2603 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2604 }
2605 #endif
2606
2607 /* Inside \Q...\E everything is literal except \E */
2608
2609 if (inescq)
2610 {
2611 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2612 {
2613 inescq = FALSE; /* Reset literal state */
2614 ptr++; /* Skip the 'E' */
2615 continue; /* Carry on with next */
2616 }
2617 goto CHECK_RANGE; /* Could be range if \E follows */
2618 }
2619
2620 /* Handle POSIX class names. Perl allows a negation extension of the
2621 form [:^name:]. A square bracket that doesn't match the syntax is
2622 treated as a literal. We also recognize the POSIX constructions
2623 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2624 5.6 and 5.8 do. */
2625
2626 if (c == '[' &&
2627 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2628 check_posix_syntax(ptr, &tempptr, cd))
2629 {
2630 BOOL local_negate = FALSE;
2631 int posix_class, taboffset, tabopt;
2632 register const uschar *cbits = cd->cbits;
2633 uschar pbits[32];
2634
2635 if (ptr[1] != ':')
2636 {
2637 *errorcodeptr = ERR31;
2638 goto FAILED;
2639 }
2640
2641 ptr += 2;
2642 if (*ptr == '^')
2643 {
2644 local_negate = TRUE;
2645 ptr++;
2646 }
2647
2648 posix_class = check_posix_name(ptr, tempptr - ptr);
2649 if (posix_class < 0)
2650 {
2651 *errorcodeptr = ERR30;
2652 goto FAILED;
2653 }
2654
2655 /* If matching is caseless, upper and lower are converted to
2656 alpha. This relies on the fact that the class table starts with
2657 alpha, lower, upper as the first 3 entries. */
2658
2659 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2660 posix_class = 0;
2661
2662 /* We build the bit map for the POSIX class in a chunk of local store
2663 because we may be adding and subtracting from it, and we don't want to
2664 subtract bits that may be in the main map already. At the end we or the
2665 result into the bit map that is being built. */
2666
2667 posix_class *= 3;
2668
2669 /* Copy in the first table (always present) */
2670
2671 memcpy(pbits, cbits + posix_class_maps[posix_class],
2672 32 * sizeof(uschar));
2673
2674 /* If there is a second table, add or remove it as required. */
2675
2676 taboffset = posix_class_maps[posix_class + 1];
2677 tabopt = posix_class_maps[posix_class + 2];
2678
2679 if (taboffset >= 0)
2680 {
2681 if (tabopt >= 0)
2682 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2683 else
2684 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2685 }
2686
2687 /* Now see if we need to remove any special characters. An option
2688 value of 1 removes vertical space and 2 removes underscore. */
2689
2690 if (tabopt < 0) tabopt = -tabopt;
2691 if (tabopt == 1) pbits[1] &= ~0x3c;
2692 else if (tabopt == 2) pbits[11] &= 0x7f;
2693
2694 /* Add the POSIX table or its complement into the main table that is
2695 being built and we are done. */
2696
2697 if (local_negate)
2698 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2699 else
2700 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2701
2702 ptr = tempptr + 1;
2703 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2704 continue; /* End of POSIX syntax handling */
2705 }
2706
2707 /* Backslash may introduce a single character, or it may introduce one
2708 of the specials, which just set a flag. The sequence \b is a special
2709 case. Inside a class (and only there) it is treated as backspace.
2710 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2711 to 'or' into the one we are building. We assume they have more than one
2712 character in them, so set class_charcount bigger than one. */
2713
2714 if (c == '\\')
2715 {
2716 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2717 if (*errorcodeptr != 0) goto FAILED;
2718
2719 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2720 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2721 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2722 else if (-c == ESC_Q) /* Handle start of quoted string */
2723 {
2724 if (ptr[1] == '\\' && ptr[2] == 'E')
2725 {
2726 ptr += 2; /* avoid empty string */
2727 }
2728 else inescq = TRUE;
2729 continue;
2730 }
2731 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2732
2733 if (c < 0)
2734 {
2735 register const uschar *cbits = cd->cbits;
2736 class_charcount += 2; /* Greater than 1 is what matters */
2737
2738 /* Save time by not doing this in the pre-compile phase. */
2739
2740 if (lengthptr == NULL) switch (-c)
2741 {
2742 case ESC_d:
2743 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2744 continue;
2745
2746 case ESC_D:
2747 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2748 continue;
2749
2750 case ESC_w:
2751 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2752 continue;
2753
2754 case ESC_W:
2755 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2756 continue;
2757
2758 case ESC_s:
2759 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2760 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2761 continue;
2762
2763 case ESC_S:
2764 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2765 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2766 continue;
2767
2768 case ESC_E: /* Perl ignores an orphan \E */
2769 continue;
2770
2771 default: /* Not recognized; fall through */
2772 break; /* Need "default" setting to stop compiler warning. */
2773 }
2774
2775 /* In the pre-compile phase, just do the recognition. */
2776
2777 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2778 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2779
2780 /* We need to deal with \H, \h, \V, and \v in both phases because
2781 they use extra memory. */
2782
2783 if (-c == ESC_h)
2784 {
2785 SETBIT(classbits, 0x09); /* HT */
2786 SETBIT(classbits, 0x20); /* SPACE */
2787 SETBIT(classbits, 0xa0); /* NBSP */
2788 #ifdef SUPPORT_UTF8
2789 if (utf8)
2790 {
2791 class_utf8 = TRUE;
2792 *class_utf8data++ = XCL_SINGLE;
2793 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2794 *class_utf8data++ = XCL_SINGLE;
2795 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2796 *class_utf8data++ = XCL_RANGE;
2797 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2798 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2799 *class_utf8data++ = XCL_SINGLE;
2800 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2801 *class_utf8data++ = XCL_SINGLE;
2802 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2803 *class_utf8data++ = XCL_SINGLE;
2804 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2805 }
2806 #endif
2807 continue;
2808 }
2809
2810 if (-c == ESC_H)
2811 {
2812 for (c = 0; c < 32; c++)
2813 {
2814 int x = 0xff;
2815 switch (c)
2816 {
2817 case 0x09/8: x ^= 1 << (0x09%8); break;
2818 case 0x20/8: x ^= 1 << (0x20%8); break;
2819 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2820 default: break;
2821 }
2822 classbits[c] |= x;
2823 }
2824
2825 #ifdef SUPPORT_UTF8
2826 if (utf8)
2827 {
2828 class_utf8 = TRUE;
2829 *class_utf8data++ = XCL_RANGE;
2830 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2831 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2832 *class_utf8data++ = XCL_RANGE;
2833 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2834 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2835 *class_utf8data++ = XCL_RANGE;
2836 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2837 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2838 *class_utf8data++ = XCL_RANGE;
2839 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2840 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2841 *class_utf8data++ = XCL_RANGE;
2842 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2843 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2844 *class_utf8data++ = XCL_RANGE;
2845 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2846 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2847 *class_utf8data++ = XCL_RANGE;
2848 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2849 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2850 }
2851 #endif
2852 continue;
2853 }
2854
2855 if (-c == ESC_v)
2856 {
2857 SETBIT(classbits, 0x0a); /* LF */
2858 SETBIT(classbits, 0x0b); /* VT */
2859 SETBIT(classbits, 0x0c); /* FF */
2860 SETBIT(classbits, 0x0d); /* CR */
2861 SETBIT(classbits, 0x85); /* NEL */
2862 #ifdef SUPPORT_UTF8
2863 if (utf8)
2864 {
2865 class_utf8 = TRUE;
2866 *class_utf8data++ = XCL_RANGE;
2867 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2868 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2869 }
2870 #endif
2871 continue;
2872 }
2873
2874 if (-c == ESC_V)
2875 {
2876 for (c = 0; c < 32; c++)
2877 {
2878 int x = 0xff;
2879 switch (c)
2880 {
2881 case 0x0a/8: x ^= 1 << (0x0a%8);
2882 x ^= 1 << (0x0b%8);
2883 x ^= 1 << (0x0c%8);
2884 x ^= 1 << (0x0d%8);
2885 break;
2886 case 0x85/8: x ^= 1 << (0x85%8); break;
2887 default: break;
2888 }
2889 classbits[c] |= x;
2890 }
2891
2892 #ifdef SUPPORT_UTF8
2893 if (utf8)
2894 {
2895 class_utf8 = TRUE;
2896 *class_utf8data++ = XCL_RANGE;
2897 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2898 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2899 *class_utf8data++ = XCL_RANGE;
2900 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2901 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2902 }
2903 #endif
2904 continue;
2905 }
2906
2907 /* We need to deal with \P and \p in both phases. */
2908
2909 #ifdef SUPPORT_UCP
2910 if (-c == ESC_p || -c == ESC_P)
2911 {
2912 BOOL negated;
2913 int pdata;
2914 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2915 if (ptype < 0) goto FAILED;
2916 class_utf8 = TRUE;
2917 *class_utf8data++ = ((-c == ESC_p) != negated)?
2918 XCL_PROP : XCL_NOTPROP;
2919 *class_utf8data++ = ptype;
2920 *class_utf8data++ = pdata;
2921 class_charcount -= 2; /* Not a < 256 character */
2922 continue;
2923 }
2924 #endif
2925 /* Unrecognized escapes are faulted if PCRE is running in its
2926 strict mode. By default, for compatibility with Perl, they are
2927 treated as literals. */
2928
2929 if ((options & PCRE_EXTRA) != 0)
2930 {
2931 *errorcodeptr = ERR7;
2932 goto FAILED;
2933 }
2934
2935 class_charcount -= 2; /* Undo the default count from above */
2936 c = *ptr; /* Get the final character and fall through */
2937 }
2938
2939 /* Fall through if we have a single character (c >= 0). This may be
2940 greater than 256 in UTF-8 mode. */
2941
2942 } /* End of backslash handling */
2943
2944 /* A single character may be followed by '-' to form a range. However,
2945 Perl does not permit ']' to be the end of the range. A '-' character
2946 at the end is treated as a literal. Perl ignores orphaned \E sequences
2947 entirely. The code for handling \Q and \E is messy. */
2948
2949 CHECK_RANGE:
2950 while (ptr[1] == '\\' && ptr[2] == 'E')
2951 {
2952 inescq = FALSE;
2953 ptr += 2;
2954 }
2955
2956 oldptr = ptr;
2957
2958 if (!inescq && ptr[1] == '-')
2959 {
2960 int d;
2961 ptr += 2;
2962 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2963
2964 /* If we hit \Q (not followed by \E) at this point, go into escaped
2965 mode. */
2966
2967 while (*ptr == '\\' && ptr[1] == 'Q')
2968 {
2969 ptr += 2;
2970 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2971 inescq = TRUE;
2972 break;
2973 }
2974
2975 if (*ptr == 0 || (!inescq && *ptr == ']'))
2976 {
2977 ptr = oldptr;
2978 goto LONE_SINGLE_CHARACTER;
2979 }
2980
2981 #ifdef SUPPORT_UTF8
2982 if (utf8)
2983 { /* Braces are required because the */
2984 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2985 }
2986 else
2987 #endif
2988 d = *ptr; /* Not UTF-8 mode */
2989
2990 /* The second part of a range can be a single-character escape, but
2991 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2992 in such circumstances. */
2993
2994 if (!inescq && d == '\\')
2995 {
2996 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2997 if (*errorcodeptr != 0) goto FAILED;
2998
2999 /* \b is backspace; \X is literal X; \R is literal R; any other
3000 special means the '-' was literal */
3001
3002 if (d < 0)
3003 {
3004 if (d == -ESC_b) d = '\b';
3005 else if (d == -ESC_X) d = 'X';
3006 else if (d == -ESC_R) d = 'R'; else
3007 {
3008 ptr = oldptr;
3009 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3010 }
3011 }
3012 }
3013
3014 /* Check that the two values are in the correct order. Optimize
3015 one-character ranges */
3016
3017 if (d < c)
3018 {
3019 *errorcodeptr = ERR8;
3020 goto FAILED;
3021 }
3022
3023 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3024
3025 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3026 matching, we have to use an XCLASS with extra data items. Caseless
3027 matching for characters > 127 is available only if UCP support is
3028 available. */
3029
3030 #ifdef SUPPORT_UTF8
3031 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3032 {
3033 class_utf8 = TRUE;
3034
3035 /* With UCP support, we can find the other case equivalents of
3036 the relevant characters. There may be several ranges. Optimize how
3037 they fit with the basic range. */
3038
3039 #ifdef SUPPORT_UCP
3040 if ((options & PCRE_CASELESS) != 0)
3041 {
3042 unsigned int occ, ocd;
3043 unsigned int cc = c;
3044 unsigned int origd = d;
3045 while (get_othercase_range(&cc, origd, &occ, &ocd))
3046 {
3047 if (occ >= (unsigned int)c &&
3048 ocd <= (unsigned int)d)
3049 continue; /* Skip embedded ranges */
3050
3051 if (occ < (unsigned int)c &&
3052 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3053 { /* if there is overlap, */
3054 c = occ; /* noting that if occ < c */
3055 continue; /* we can't have ocd > d */
3056 } /* because a subrange is */
3057 if (ocd > (unsigned int)d &&
3058 occ <= (unsigned int)d + 1) /* always shorter than */
3059 { /* the basic range. */
3060 d = ocd;
3061 continue;
3062 }
3063
3064 if (occ == ocd)
3065 {
3066 *class_utf8data++ = XCL_SINGLE;
3067 }
3068 else
3069 {
3070 *class_utf8data++ = XCL_RANGE;
3071 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3072 }
3073 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3074 }
3075 }
3076 #endif /* SUPPORT_UCP */
3077
3078 /* Now record the original range, possibly modified for UCP caseless
3079 overlapping ranges. */
3080
3081 *class_utf8data++ = XCL_RANGE;
3082 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3083 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3084
3085 /* With UCP support, we are done. Without UCP support, there is no
3086 caseless matching for UTF-8 characters > 127; we can use the bit map
3087 for the smaller ones. */
3088
3089 #ifdef SUPPORT_UCP
3090 continue; /* With next character in the class */
3091 #else
3092 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3093
3094 /* Adjust upper limit and fall through to set up the map */
3095
3096 d = 127;
3097
3098 #endif /* SUPPORT_UCP */
3099 }
3100 #endif /* SUPPORT_UTF8 */
3101
3102 /* We use the bit map for all cases when not in UTF-8 mode; else
3103 ranges that lie entirely within 0-127 when there is UCP support; else
3104 for partial ranges without UCP support. */
3105
3106 class_charcount += d - c + 1;
3107 class_lastchar = d;
3108
3109 /* We can save a bit of time by skipping this in the pre-compile. */
3110
3111 if (lengthptr == NULL) for (; c <= d; c++)
3112 {
3113 classbits[c/8] |= (1 << (c&7));
3114 if ((options & PCRE_CASELESS) != 0)
3115 {
3116 int uc = cd->fcc[c]; /* flip case */
3117 classbits[uc/8] |= (1 << (uc&7));
3118 }
3119 }
3120
3121 continue; /* Go get the next char in the class */
3122 }
3123
3124 /* Handle a lone single character - we can get here for a normal
3125 non-escape char, or after \ that introduces a single character or for an
3126 apparent range that isn't. */
3127
3128 LONE_SINGLE_CHARACTER:
3129
3130 /* Handle a character that cannot go in the bit map */
3131
3132 #ifdef SUPPORT_UTF8
3133 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3134 {
3135 class_utf8 = TRUE;
3136 *class_utf8data++ = XCL_SINGLE;
3137 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3138
3139 #ifdef SUPPORT_UCP
3140 if ((options & PCRE_CASELESS) != 0)
3141 {
3142 unsigned int othercase;
3143 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3144 {
3145 *class_utf8data++ = XCL_SINGLE;
3146 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3147 }
3148 }
3149 #endif /* SUPPORT_UCP */
3150
3151 }
3152 else
3153 #endif /* SUPPORT_UTF8 */
3154
3155 /* Handle a single-byte character */
3156 {
3157 classbits[c/8] |= (1 << (c&7));
3158 if ((options & PCRE_CASELESS) != 0)
3159 {
3160 c = cd->fcc[c]; /* flip case */
3161 classbits[c/8] |= (1 << (c&7));
3162 }
3163 class_charcount++;
3164 class_lastchar = c;
3165 }
3166 }
3167
3168 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3169
3170 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3171
3172 if (c == 0) /* Missing terminating ']' */
3173 {
3174 *errorcodeptr = ERR6;
3175 goto FAILED;
3176 }
3177
3178 /* If class_charcount is 1, we saw precisely one character whose value is
3179 less than 256. As long as there were no characters >= 128 and there was no
3180 use of \p or \P, in other words, no use of any XCLASS features, we can
3181 optimize.
3182
3183 In UTF-8 mode, we can optimize the negative case only if there were no
3184 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3185 operate on single-bytes only. This is an historical hangover. Maybe one day
3186 we can tidy these opcodes to handle multi-byte characters.
3187
3188 The optimization throws away the bit map. We turn the item into a
3189 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3190 that OP_NOT does not support multibyte characters. In the positive case, it
3191 can cause firstbyte to be set. Otherwise, there can be no first char if
3192 this item is first, whatever repeat count may follow. In the case of
3193 reqbyte, save the previous value for reinstating. */
3194
3195 #ifdef SUPPORT_UTF8
3196 if (class_charcount == 1 && !class_utf8 &&
3197 (!utf8 || !negate_class || class_lastchar < 128))
3198 #else
3199 if (class_charcount == 1)
3200 #endif
3201 {
3202 zeroreqbyte = reqbyte;
3203
3204 /* The OP_NOT opcode works on one-byte characters only. */
3205
3206 if (negate_class)
3207 {
3208 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3209 zerofirstbyte = firstbyte;
3210 *code++ = OP_NOT;
3211 *code++ = class_lastchar;
3212 break;
3213 }
3214
3215 /* For a single, positive character, get the value into mcbuffer, and
3216 then we can handle this with the normal one-character code. */
3217
3218 #ifdef SUPPORT_UTF8
3219 if (utf8 && class_lastchar > 127)
3220 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3221 else
3222 #endif
3223 {
3224 mcbuffer[0] = class_lastchar;
3225 mclength = 1;
3226 }
3227 goto ONE_CHAR;
3228 } /* End of 1-char optimization */
3229
3230 /* The general case - not the one-char optimization. If this is the first
3231 thing in the branch, there can be no first char setting, whatever the
3232 repeat count. Any reqbyte setting must remain unchanged after any kind of
3233 repeat. */
3234
3235 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3236 zerofirstbyte = firstbyte;
3237 zeroreqbyte = reqbyte;
3238
3239 /* If there are characters with values > 255, we have to compile an
3240 extended class, with its own opcode. If there are no characters < 256,
3241 we can omit the bitmap in the actual compiled code. */
3242
3243 #ifdef SUPPORT_UTF8
3244 if (class_utf8)
3245 {
3246 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3247 *code++ = OP_XCLASS;
3248 code += LINK_SIZE;
3249 *code = negate_class? XCL_NOT : 0;
3250
3251 /* If the map is required, move up the extra data to make room for it;
3252 otherwise just move the code pointer to the end of the extra data. */
3253
3254 if (class_charcount > 0)
3255 {
3256 *code++ |= XCL_MAP;
3257 memmove(code + 32, code, class_utf8data - code);
3258 memcpy(code, classbits, 32);
3259 code = class_utf8data + 32;
3260 }
3261 else code = class_utf8data;
3262
3263 /* Now fill in the complete length of the item */
3264
3265 PUT(previous, 1, code - previous);
3266 break; /* End of class handling */
3267 }
3268 #endif
3269
3270 /* If there are no characters > 255, negate the 32-byte map if necessary,
3271 and copy it into the code vector. If this is the first thing in the branch,
3272 there can be no first char setting, whatever the repeat count. Any reqbyte
3273 setting must remain unchanged after any kind of repeat. */
3274
3275 if (negate_class)
3276 {
3277 *code++ = OP_NCLASS;
3278 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3279 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3280 }
3281 else
3282 {
3283 *code++ = OP_CLASS;
3284 memcpy(code, classbits, 32);
3285 }
3286 code += 32;
3287 break;
3288
3289
3290 /* ===================================================================*/
3291 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3292 has been tested above. */
3293
3294 case '{':
3295 if (!is_quantifier) goto NORMAL_CHAR;
3296 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3297 if (*errorcodeptr != 0) goto FAILED;
3298 goto REPEAT;
3299
3300 case '*':
3301 repeat_min = 0;
3302 repeat_max = -1;
3303 goto REPEAT;
3304
3305 case '+':
3306 repeat_min = 1;
3307 repeat_max = -1;
3308 goto REPEAT;
3309
3310 case '?':
3311 repeat_min = 0;
3312 repeat_max = 1;
3313
3314 REPEAT:
3315 if (previous == NULL)
3316 {
3317 *errorcodeptr = ERR9;
3318 goto FAILED;
3319 }
3320
3321 if (repeat_min == 0)
3322 {
3323 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3324 reqbyte = zeroreqbyte; /* Ditto */
3325 }
3326
3327 /* Remember whether this is a variable length repeat */
3328
3329 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3330
3331 op_type = 0; /* Default single-char op codes */
3332 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3333
3334 /* Save start of previous item, in case we have to move it up to make space
3335 for an inserted OP_ONCE for the additional '+' extension. */
3336
3337 tempcode = previous;
3338
3339 /* If the next character is '+', we have a possessive quantifier. This
3340 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3341 If the next character is '?' this is a minimizing repeat, by default,
3342 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3343 repeat type to the non-default. */
3344
3345 if (ptr[1] == '+')
3346 {
3347 repeat_type = 0; /* Force greedy */
3348 possessive_quantifier = TRUE;
3349 ptr++;
3350 }
3351 else if (ptr[1] == '?')
3352 {
3353 repeat_type = greedy_non_default;
3354 ptr++;
3355 }
3356 else repeat_type = greedy_default;
3357
3358 /* If previous was a character match, abolish the item and generate a
 3359 repeat item instead. If a char item has a minimum of more than one, ensure
3360 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3361 the first thing in a branch because the x will have gone into firstbyte
3362 instead. */
3363
3364 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3365 {
3366 /* Deal with UTF-8 characters that take up more than one byte. It's
3367 easier to write this out separately than try to macrify it. Use c to
3368 hold the length of the character in bytes, plus 0x80 to flag that it's a
3369 length rather than a small character. */
3370
3371 #ifdef SUPPORT_UTF8
3372 if (utf8 && (code[-1] & 0x80) != 0)
3373 {
3374 uschar *lastchar = code - 1;
3375 while((*lastchar & 0xc0) == 0x80) lastchar--;
3376 c = code - lastchar; /* Length of UTF-8 character */
3377 memcpy(utf8_char, lastchar, c); /* Save the char */
3378 c |= 0x80; /* Flag c as a length */
3379 }
3380 else
3381 #endif
3382
3383 /* Handle the case of a single byte - either with no UTF8 support, or
3384 with UTF-8 disabled, or for a UTF-8 character < 128. */
3385
3386 {
3387 c = code[-1];
3388 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3389 }
3390
3391 /* If the repetition is unlimited, it pays to see if the next thing on
3392 the line is something that cannot possibly match this character. If so,
3393 automatically possessifying this item gains some performance in the case
3394 where the match fails. */
3395
3396 if (!possessive_quantifier &&
3397 repeat_max < 0 &&
3398 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3399 options, cd))
3400 {
3401 repeat_type = 0; /* Force greedy */
3402 possessive_quantifier = TRUE;
3403 }
3404
3405 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3406 }
3407
3408 /* If previous was a single negated character ([^a] or similar), we use
3409 one of the special opcodes, replacing it. The code is shared with single-
 3410 character repeats by setting op_type to add a suitable offset into
3411 repeat_type. We can also test for auto-possessification. OP_NOT is
3412 currently used only for single-byte chars. */
3413
3414 else if (*previous == OP_NOT)
3415 {
3416 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3417 c = previous[1];
3418 if (!possessive_quantifier &&
3419 repeat_max < 0 &&
3420 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3421 {
3422 repeat_type = 0; /* Force greedy */
3423 possessive_quantifier = TRUE;
3424 }
3425 goto OUTPUT_SINGLE_REPEAT;
3426 }
3427
3428 /* If previous was a character type match (\d or similar), abolish it and
3429 create a suitable repeat item. The code is shared with single-character
3430 repeats by setting op_type to add a suitable offset into repeat_type. Note
 3431 that the Unicode property types will be present only when SUPPORT_UCP is
3432 defined, but we don't wrap the little bits of code here because it just
3433 makes it horribly messy. */
3434
3435 else if (*previous < OP_EODN)
3436 {
3437 uschar *oldcode;
3438 int prop_type, prop_value;
3439 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3440 c = *previous;
3441
3442 if (!possessive_quantifier &&
3443 repeat_max < 0 &&
3444 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3445 {
3446 repeat_type = 0; /* Force greedy */
3447 possessive_quantifier = TRUE;
3448 }
3449
3450 OUTPUT_SINGLE_REPEAT:
3451 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3452 {
3453 prop_type = previous[1];
3454 prop_value = previous[2];
3455 }
3456 else prop_type = prop_value = -1;
3457
3458 oldcode = code;
3459 code = previous; /* Usually overwrite previous item */
3460
3461 /* If the maximum is zero then the minimum must also be zero; Perl allows
3462 this case, so we do too - by simply omitting the item altogether. */
3463
3464 if (repeat_max == 0) goto END_REPEAT;
3465
3466 /* All real repeats make it impossible to handle partial matching (maybe
3467 one day we will be able to remove this restriction). */
3468
3469 if (repeat_max != 1) cd->nopartial = TRUE;
3470
3471 /* Combine the op_type with the repeat_type */
3472
3473 repeat_type += op_type;
3474
3475 /* A minimum of zero is handled either as the special case * or ?, or as
3476 an UPTO, with the maximum given. */
3477
3478 if (repeat_min == 0)
3479 {
3480 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3481 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3482 else
3483 {
3484 *code++ = OP_UPTO + repeat_type;
3485 PUT2INC(code, 0, repeat_max);
3486 }
3487 }
3488
3489 /* A repeat minimum of 1 is optimized into some special cases. If the
3490 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3491 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3492 one less than the maximum. */
3493
3494 else if (repeat_min == 1)
3495 {
3496 if (repeat_max == -1)
3497 *code++ = OP_PLUS + repeat_type;
3498 else
3499 {
3500 code = oldcode; /* leave previous item in place */
3501 if (repeat_max == 1) goto END_REPEAT;
3502 *code++ = OP_UPTO + repeat_type;
3503 PUT2INC(code, 0, repeat_max - 1);
3504 }
3505 }
3506
3507 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3508 handled as an EXACT followed by an UPTO. */
3509
3510 else
3511 {
3512 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3513 PUT2INC(code, 0, repeat_min);
3514
3515 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3516 we have to insert the character for the previous code. For a repeated
3517 Unicode property match, there are two extra bytes that define the
3518 required property. In UTF-8 mode, long characters have their length in
3519 c, with the 0x80 bit as a flag. */
3520
3521 if (repeat_max < 0)
3522 {
3523 #ifdef SUPPORT_UTF8
3524 if (utf8 && c >= 128)
3525 {
3526 memcpy(code, utf8_char, c & 7);
3527 code += c & 7;
3528 }
3529 else
3530 #endif
3531 {
3532 *code++ = c;
3533 if (prop_type >= 0)
3534 {
3535 *code++ = prop_type;
3536 *code++ = prop_value;
3537 }
3538 }
3539 *code++ = OP_STAR + repeat_type;
3540 }
3541
3542 /* Else insert an UPTO if the max is greater than the min, again
3543 preceded by the character, for the previously inserted code. If the
3544 UPTO is just for 1 instance, we can use QUERY instead. */
3545
3546 else if (repeat_max != repeat_min)
3547 {
3548 #ifdef SUPPORT_UTF8
3549 if (utf8 && c >= 128)
3550 {
3551 memcpy(code, utf8_char, c & 7);
3552 code += c & 7;
3553 }
3554 else
3555 #endif
3556 *code++ = c;
3557 if (prop_type >= 0)
3558 {
3559 *code++ = prop_type;
3560 *code++ = prop_value;
3561 }
3562 repeat_max -= repeat_min;
3563
3564 if (repeat_max == 1)
3565 {
3566 *code++ = OP_QUERY + repeat_type;
3567 }
3568 else
3569 {
3570 *code++ = OP_UPTO + repeat_type;
3571 PUT2INC(code, 0, repeat_max);
3572 }
3573 }
3574 }
3575
3576 /* The character or character type itself comes last in all cases. */
3577
3578 #ifdef SUPPORT_UTF8
3579 if (utf8 && c >= 128)
3580 {
3581 memcpy(code, utf8_char, c & 7);
3582 code += c & 7;
3583 }
3584 else
3585 #endif
3586 *code++ = c;
3587
3588 /* For a repeated Unicode property match, there are two extra bytes that
3589 define the required property. */
3590
3591 #ifdef SUPPORT_UCP
3592 if (prop_type >= 0)
3593 {
3594 *code++ = prop_type;
3595 *code++ = prop_value;
3596 }
3597 #endif
3598 }
3599
3600 /* If previous was a character class or a back reference, we put the repeat
3601 stuff after it, but just skip the item if the repeat was {0,0}. */
3602
3603 else if (*previous == OP_CLASS ||
3604 *previous == OP_NCLASS ||
3605 #ifdef SUPPORT_UTF8
3606 *previous == OP_XCLASS ||
3607 #endif
3608 *previous == OP_REF)
3609 {
3610 if (repeat_max == 0)
3611 {
3612 code = previous;
3613 goto END_REPEAT;
3614 }
3615
3616 /* All real repeats make it impossible to handle partial matching (maybe
3617 one day we will be able to remove this restriction). */
3618
3619 if (repeat_max != 1) cd->nopartial = TRUE;
3620
3621 if (repeat_min == 0 && repeat_max == -1)
3622 *code++ = OP_CRSTAR + repeat_type;
3623 else if (repeat_min == 1 && repeat_max == -1)
3624 *code++ = OP_CRPLUS + repeat_type;
3625 else if (repeat_min == 0 && repeat_max == 1)
3626 *code++ = OP_CRQUERY + repeat_type;
3627 else
3628 {
3629 *code++ = OP_CRRANGE + repeat_type;
3630 PUT2INC(code, 0, repeat_min);
3631 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3632 PUT2INC(code, 0, repeat_max);
3633 }
3634 }
3635
3636 /* If previous was a bracket group, we may have to replicate it in certain
3637 cases. */
3638
3639 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3640 *previous == OP_ONCE || *previous == OP_COND)
3641 {
3642 register int i;
3643 int ketoffset = 0;
3644 int len = code - previous;
3645 uschar *bralink = NULL;
3646
3647 /* Repeating a DEFINE group is pointless */
3648
3649 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3650 {
3651 *errorcodeptr = ERR55;
3652 goto FAILED;
3653 }
3654
3655 /* If the maximum repeat count is unlimited, find the end of the bracket
3656 by scanning through from the start, and compute the offset back to it
3657 from the current code pointer. There may be an OP_OPT setting following
3658 the final KET, so we can't find the end just by going back from the code
3659 pointer. */
3660
3661 if (repeat_max == -1)
3662 {
3663 register uschar *ket = previous;
3664 do ket += GET(ket, 1); while (*ket != OP_KET);
3665 ketoffset = code - ket;
3666 }
3667
3668 /* The case of a zero minimum is special because of the need to stick
3669 OP_BRAZERO in front of it, and because the group appears once in the
3670 data, whereas in other cases it appears the minimum number of times. For
3671 this reason, it is simplest to treat this case separately, as otherwise
3672 the code gets far too messy. There are several special subcases when the
3673 minimum is zero. */
3674
3675 if (repeat_min == 0)
3676 {
3677 /* If the maximum is also zero, we just omit the group from the output
3678 altogether. */
3679
3680 if (repeat_max == 0)
3681 {
3682 code = previous;
3683 goto END_REPEAT;
3684 }
3685
3686 /* If the maximum is 1 or unlimited, we just have to stick in the
3687 BRAZERO and do no more at this point. However, we do need to adjust
3688 any OP_RECURSE calls inside the group that refer to the group itself or
3689 any internal or forward referenced group, because the offset is from
3690 the start of the whole regex. Temporarily terminate the pattern while
3691 doing this. */
3692
3693 if (repeat_max <= 1)
3694 {
3695 *code = OP_END;
3696 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3697 memmove(previous+1, previous, len);
3698 code++;
3699 *previous++ = OP_BRAZERO + repeat_type;
3700 }
3701
3702 /* If the maximum is greater than 1 and limited, we have to replicate
3703 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3704 The first one has to be handled carefully because it's the original
3705 copy, which has to be moved up. The remainder can be handled by code
3706 that is common with the non-zero minimum case below. We have to
 3707 adjust the value of repeat_max, since one less copy is required. Once
3708 again, we may have to adjust any OP_RECURSE calls inside the group. */
3709
3710 else
3711 {
3712 int offset;
3713 *code = OP_END;
3714 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3715 memmove(previous + 2 + LINK_SIZE, previous, len);
3716 code += 2 + LINK_SIZE;
3717 *previous++ = OP_BRAZERO + repeat_type;
3718 *previous++ = OP_BRA;
3719
3720 /* We chain together the bracket offset fields that have to be
3721 filled in later when the ends of the brackets are reached. */
3722
3723 offset = (bralink == NULL)? 0 : previous - bralink;
3724 bralink = previous;
3725 PUTINC(previous, 0, offset);
3726 }
3727
3728 repeat_max--;
3729 }
3730
3731 /* If the minimum is greater than zero, replicate the group as many
3732 times as necessary, and adjust the maximum to the number of subsequent
3733 copies that we need. If we set a first char from the group, and didn't
3734 set a required char, copy the latter from the former. If there are any
3735 forward reference subroutine calls in the group, there will be entries on
3736 the workspace list; replicate these with an appropriate increment. */
3737
3738 else
3739 {
3740 if (repeat_min > 1)
3741 {
3742 /* In the pre-compile phase, we don't actually do the replication. We
3743 just adjust the length as if we had. Do some paranoid checks for
3744 potential integer overflow. */
3745
3746 if (lengthptr != NULL)
3747 {
3748 int delta = (repeat_min - 1)*length_prevgroup;
3749 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3750 (double)INT_MAX ||
3751 OFLOW_MAX - *lengthptr < delta)
3752 {
3753 *errorcodeptr = ERR20;
3754 goto FAILED;
3755 }
3756 *lengthptr += delta;
3757 }
3758
3759 /* This is compiling for real */
3760
3761 else
3762 {
3763 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3764 for (i = 1; i < repeat_min; i++)
3765 {
3766 uschar *hc;
3767 uschar *this_hwm = cd->hwm;
3768 memcpy(code, previous, len);
3769 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3770 {
3771 PUT(cd->hwm, 0, GET(hc, 0) + len);
3772 cd->hwm += LINK_SIZE;
3773 }
3774 save_hwm = this_hwm;
3775 code += len;
3776 }
3777 }
3778 }
3779
3780 if (repeat_max > 0) repeat_max -= repeat_min;
3781 }
3782
3783 /* This code is common to both the zero and non-zero minimum cases. If
3784 the maximum is limited, it replicates the group in a nested fashion,
3785 remembering the bracket starts on a stack. In the case of a zero minimum,
3786 the first one was set up above. In all cases the repeat_max now specifies
3787 the number of additional copies needed. Again, we must remember to
3788 replicate entries on the forward reference list. */
3789
3790 if (repeat_max >= 0)
3791 {
3792 /* In the pre-compile phase, we don't actually do the replication. We
3793 just adjust the length as if we had. For each repetition we must add 1
3794 to the length for BRAZERO and for all but the last repetition we must
 3795 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3796 paranoid checks to avoid integer overflow. */
3797
3798 if (lengthptr != NULL && repeat_max > 0)
3799 {
3800 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3801 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3802 if ((double)repeat_max *
3803 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3804 > (double)INT_MAX ||
3805 OFLOW_MAX - *lengthptr < delta)
3806 {
3807 *errorcodeptr = ERR20;
3808 goto FAILED;
3809 }
3810 *lengthptr += delta;
3811 }
3812
3813 /* This is compiling for real */
3814
3815 else for (i = repeat_max - 1; i >= 0; i--)
3816 {
3817 uschar *hc;
3818 uschar *this_hwm = cd->hwm;
3819
3820 *code++ = OP_BRAZERO + repeat_type;
3821
3822 /* All but the final copy start a new nesting, maintaining the
3823 chain of brackets outstanding. */
3824
3825 if (i != 0)
3826 {
3827 int offset;
3828 *code++ = OP_BRA;
3829 offset = (bralink == NULL)? 0 : code - bralink;
3830 bralink = code;
3831 PUTINC(code, 0, offset);
3832 }
3833
3834 memcpy(code, previous, len);
3835 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3836 {
3837 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3838 cd->hwm += LINK_SIZE;
3839 }
3840 save_hwm = this_hwm;
3841 code += len;
3842 }
3843
3844 /* Now chain through the pending brackets, and fill in their length
3845 fields (which are holding the chain links pro tem). */
3846
3847 while (bralink != NULL)
3848 {
3849 int oldlinkoffset;
3850 int offset = code - bralink + 1;
3851 uschar *bra = code - offset;
3852 oldlinkoffset = GET(bra, 1);
3853 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3854 *code++ = OP_KET;
3855 PUTINC(code, 0, offset);
3856 PUT(bra, 1, offset);
3857 }
3858 }
3859
3860 /* If the maximum is unlimited, set a repeater in the final copy. We
3861 can't just offset backwards from the current code point, because we
3862 don't know if there's been an options resetting after the ket. The
3863 correct offset was computed above.
3864
3865 Then, when we are doing the actual compile phase, check to see whether
3866 this group is a non-atomic one that could match an empty string. If so,
3867 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3868 that runtime checking can be done. [This check is also applied to
3869 atomic groups at runtime, but in a different way.] */
3870
3871 else
3872 {
3873 uschar *ketcode = code - ketoffset;
3874 uschar *bracode = ketcode - GET(ketcode, 1);
3875 *ketcode = OP_KETRMAX + repeat_type;
3876 if (lengthptr == NULL && *bracode != OP_ONCE)
3877 {
3878 uschar *scode = bracode;
3879 do
3880 {
3881 if (could_be_empty_branch(scode, ketcode, utf8))
3882 {
3883 *bracode += OP_SBRA - OP_BRA;
3884 break;
3885 }
3886 scode += GET(scode, 1);
3887 }
3888 while (*scode == OP_ALT);
3889 }
3890 }
3891 }
3892
3893 /* Else there's some kind of shambles */
3894
3895 else
3896 {
3897 *errorcodeptr = ERR11;
3898 goto FAILED;
3899 }
3900
3901 /* If the character following a repeat is '+', or if certain optimization
3902 tests above succeeded, possessive_quantifier is TRUE. For some of the
 3903 simpler opcodes, there is a special alternative opcode for this. For
3904 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3905 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3906 but the special opcodes can optimize it a bit. The repeated item starts at
3907 tempcode, not at previous, which might be the first part of a string whose
3908 (former) last char we repeated.
3909
3910 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3911 an 'upto' may follow. We skip over an 'exact' item, and then test the
3912 length of what remains before proceeding. */
3913
3914 if (possessive_quantifier)
3915 {
3916 int len;
3917 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3918 *tempcode == OP_NOTEXACT)
3919 tempcode += _pcre_OP_lengths[*tempcode];
3920 len = code - tempcode;
3921 if (len > 0) switch (*tempcode)
3922 {
3923 case OP_STAR: *tempcode = OP_POSSTAR; break;
3924 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3925 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3926 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3927
3928 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3929 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3930 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3931 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3932
3933 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3934 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3935 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3936 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3937
3938 default:
3939 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3940 code += 1 + LINK_SIZE;
3941 len += 1 + LINK_SIZE;
3942 tempcode[0] = OP_ONCE;
3943 *code++ = OP_KET;
3944 PUTINC(code, 0, len);
3945 PUT(tempcode, 1, len);
3946 break;
3947 }
3948 }
3949
 3950 /* In all cases we no longer have a previous item. We also set the
3951 "follows varying string" flag for subsequently encountered reqbytes if
3952 it isn't already set and we have just passed a varying length item. */
3953
3954 END_REPEAT:
3955 previous = NULL;
3956 cd->req_varyopt |= reqvary;
3957 break;
3958
3959
3960 /* ===================================================================*/
3961 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3962 lookbehind or option setting or condition or all the other extended
3963 parenthesis forms. */
3964
3965 case '(':
3966 newoptions = options;
3967 skipbytes = 0;
3968 bravalue = OP_CBRA;
3969 save_hwm = cd->hwm;
3970 reset_bracount = FALSE;
3971
3972 /* First deal with various "verbs" that can be introduced by '*'. */
3973
3974 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3975 {
3976 int i, namelen;
3977 const uschar *name = ++ptr;
3978 previous = NULL;
3979 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3980 if (*ptr == ':')
3981 {
3982 *errorcodeptr = ERR59; /* Not supported */
3983 goto FAILED;
3984 }
3985 if (*ptr != ')')
3986 {
3987 *errorcodeptr = ERR60;
3988 goto FAILED;
3989 }
3990 namelen = ptr - name;
3991 for (i = 0; i < verbcount; i++)
3992 {
3993 if (namelen == verbs[i].len &&
3994 strncmp((char *)name, verbs[i].name, namelen) == 0)
3995 {
3996 *code = verbs[i].op;
3997 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3998 break;
3999 }
4000 }
4001 if (i < verbcount) continue;
4002 *errorcodeptr = ERR60;
4003 goto FAILED;
4004 }
4005
4006 /* Deal with the extended parentheses; all are introduced by '?', and the
4007 appearance of any of them means that this is not a capturing group. */
4008
4009 else if (*ptr == '?')
4010 {
4011 int i, set, unset, namelen;
4012 int *optset;
4013 const uschar *name;
4014 uschar *slot;
4015
4016 switch (*(++ptr))
4017 {
4018 case '#': /* Comment; skip to ket */
4019 ptr++;
4020 while (*ptr != 0 && *ptr != ')') ptr++;
4021 if (*ptr == 0)
4022 {
4023 *errorcodeptr = ERR18;
4024 goto FAILED;
4025 }
4026 continue;
4027
4028
4029 /* ------------------------------------------------------------ */
4030 case '|': /* Reset capture count for each branch */
4031 reset_bracount = TRUE;
4032 /* Fall through */
4033
4034 /* ------------------------------------------------------------ */
4035 case ':': /* Non-capturing bracket */
4036 bravalue = OP_BRA;
4037 ptr++;
4038 break;
4039
4040
4041 /* ------------------------------------------------------------ */
4042 case '(':
4043 bravalue = OP_COND; /* Conditional group */
4044
4045 /* A condition can be an assertion, a number (referring to a numbered
4046 group), a name (referring to a named group), or 'R', referring to
4047 recursion. R<digits> and R&name are also permitted for recursion tests.
4048
4049 There are several syntaxes for testing a named group: (?(name)) is used
4050 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4051
4052 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4053 be the recursive thing or the name 'R' (and similarly for 'R' followed
4054 by digits), and (b) a number could be a name that consists of digits.
4055 In both cases, we look for a name first; if not found, we try the other
4056 cases. */
4057
4058 /* For conditions that are assertions, check the syntax, and then exit
4059 the switch. This will take control down to where bracketed groups,
4060 including assertions, are processed. */
4061
4062 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4063 break;
4064
4065 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4066 below), and all need to skip 3 bytes at the start of the group. */
4067
4068 code[1+LINK_SIZE] = OP_CREF;
4069 skipbytes = 3;
4070 refsign = -1;
4071
4072 /* Check for a test for recursion in a named group. */
4073
4074 if (ptr[1] == 'R' && ptr[2] == '&')
4075 {
4076 terminator = -1;
4077 ptr += 2;
4078 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4079 }
4080
4081 /* Check for a test for a named group's having been set, using the Perl
4082 syntax (?(<name>) or (?('name') */
4083
4084 else if (ptr[1] == '<')
4085 {
4086 terminator = '>';
4087 ptr++;
4088 }
4089 else if (ptr[1] == '\'')
4090 {
4091 terminator = '\'';
4092 ptr++;
4093 }
4094 else
4095 {
4096 terminator = 0;
4097 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4098 }
4099
 4100 /* We now expect to read a name; anything else is an error */
4101
4102 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4103 {
4104 ptr += 1; /* To get the right offset */
4105 *errorcodeptr = ERR28;
4106 goto FAILED;
4107 }
4108
4109 /* Read the name, but also get it as a number if it's all digits */
4110
4111 recno = 0;
4112 name = ++ptr;
4113 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4114 {
4115 if (recno >= 0)
4116 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4117 recno * 10 + *ptr - '0' : -1;
4118 ptr++;
4119 }
4120 namelen = ptr - name;
4121
4122 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4123 {
4124 ptr--; /* Error offset */
4125 *errorcodeptr = ERR26;
4126 goto FAILED;
4127 }
4128
4129 /* Do no further checking in the pre-compile phase. */
4130
4131 if (lengthptr != NULL) break;
4132
4133 /* In the real compile we do the work of looking for the actual
4134 reference. If the string started with "+" or "-" we require the rest to
4135 be digits, in which case recno will be set. */
4136
4137 if (refsign > 0)
4138 {
4139 if (recno <= 0)
4140 {
4141 *errorcodeptr = ERR58;
4142 goto FAILED;
4143 }
4144 if (refsign == '-')
4145 {
4146 recno = cd->bracount - recno + 1;
4147 if (recno <= 0)
4148 {
4149 *errorcodeptr = ERR15;
4150 goto FAILED;
4151 }
4152 }
4153 else recno += cd->bracount;
4154 PUT2(code, 2+LINK_SIZE, recno);
4155 break;
4156 }
4157
4158 /* Otherwise (did not start with "+" or "-"), start by looking for the
4159 name. */
4160
4161 slot = cd->name_table;
4162 for (i = 0; i < cd->names_found; i++)
4163 {
4164 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4165 slot += cd->name_entry_size;
4166 }
4167
4168 /* Found a previous named subpattern */
4169
4170 if (i < cd->names_found)
4171 {
4172 recno = GET2(slot, 0);
4173 PUT2(code, 2+LINK_SIZE, recno);
4174 }
4175
4176 /* Search the pattern for a forward reference */
4177
4178 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4179 (options & PCRE_EXTENDED) != 0)) > 0)
4180 {
4181 PUT2(code, 2+LINK_SIZE, i);
4182 }
4183
4184 /* If terminator == 0 it means that the name followed directly after
4185 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4186 some further alternatives to try. For the cases where terminator != 0
4187 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4188 now checked all the possibilities, so give an error. */
4189
4190 else if (terminator != 0)
4191 {
4192 *errorcodeptr = ERR15;
4193 goto FAILED;
4194 }
4195
4196 /* Check for (?(R) for recursion. Allow digits after R to specify a
4197 specific group number. */
4198
4199 else if (*name == 'R')
4200 {
4201 recno = 0;
4202 for (i = 1; i < namelen; i++)
4203 {
4204 if ((digitab[name[i]] & ctype_digit) == 0)
4205 {
4206 *errorcodeptr = ERR15;
4207 goto FAILED;
4208 }
4209 recno = recno * 10 + name[i] - '0';
4210 }
4211 if (recno == 0) recno = RREF_ANY;
4212 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4213 PUT2(code, 2+LINK_SIZE, recno);
4214 }
4215
4216 /* Similarly, check for the (?(DEFINE) "condition", which is always
4217 false. */
4218
4219 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4220 {
4221 code[1+LINK_SIZE] = OP_DEF;
4222 skipbytes = 1;
4223 }
4224
4225 /* Check for the "name" actually being a subpattern number. */
4226
4227 else if (recno > 0)
4228 {
4229 PUT2(code, 2+LINK_SIZE, recno);
4230 }
4231
4232 /* Either an unidentified subpattern, or a reference to (?(0) */
4233
4234 else
4235 {
4236 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4237 goto FAILED;
4238 }
4239 break;
4240
4241
4242 /* ------------------------------------------------------------ */
4243 case '=': /* Positive lookahead */
4244 bravalue = OP_ASSERT;
4245 ptr++;
4246 break;
4247
4248
4249 /* ------------------------------------------------------------ */
4250 case '!': /* Negative lookahead */
4251 ptr++;
4252 if (*ptr == ')') /* Optimize (?!) */
4253 {
4254 *code++ = OP_FAIL;
4255 previous = NULL;
4256 continue;
4257 }
4258 bravalue = OP_ASSERT_NOT;
4259 break;
4260
4261
4262 /* ------------------------------------------------------------ */
4263 case '<': /* Lookbehind or named define */
4264 switch (ptr[1])
4265 {
4266 case '=': /* Positive lookbehind */
4267 bravalue = OP_ASSERTBACK;
4268 ptr += 2;
4269 break;
4270
4271 case '!': /* Negative lookbehind */
4272 bravalue = OP_ASSERTBACK_NOT;
4273 ptr += 2;
4274 break;
4275
4276 default: /* Could be name define, else bad */
4277 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4278 ptr++; /* Correct offset for error */
4279 *errorcodeptr = ERR24;
4280 goto FAILED;
4281 }
4282 break;
4283
4284
4285 /* ------------------------------------------------------------ */
4286 case '>': /* One-time brackets */
4287 bravalue = OP_ONCE;
4288 ptr++;
4289 break;
4290
4291
4292 /* ------------------------------------------------------------ */
4293 case 'C': /* Callout - may be followed by digits; */
4294 previous_callout = code; /* Save for later completion */
4295 after_manual_callout = 1; /* Skip one item before completing */
4296 *code++ = OP_CALLOUT;
4297 {
4298 int n = 0;
4299 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4300 n = n * 10 + *ptr - '0';
4301 if (*ptr != ')')
4302 {
4303 *errorcodeptr = ERR39;
4304 goto FAILED;
4305 }
4306 if (n > 255)
4307 {
4308 *errorcodeptr = ERR38;
4309 goto FAILED;
4310 }
4311 *code++ = n;
4312 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4313 PUT(code, LINK_SIZE, 0); /* Default length */
4314 code += 2 * LINK_SIZE;
4315 }
4316 previous = NULL;
4317 continue;
4318
4319
4320 /* ------------------------------------------------------------ */
4321 case 'P': /* Python-style named subpattern handling */
4322 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4323 {
4324 is_recurse = *ptr == '>';
4325 terminator = ')';
4326 goto NAMED_REF_OR_RECURSE;
4327 }
4328 else if (*ptr != '<') /* Test for Python-style definition */
4329 {
4330 *errorcodeptr = ERR41;
4331 goto FAILED;
4332 }
4333 /* Fall through to handle (?P< as (?< is handled */
4334
4335
4336 /* ------------------------------------------------------------ */
4337 DEFINE_NAME: /* Come here from (?< handling */
4338 case '\'':
4339 {
4340 terminator = (*ptr == '<')? '>' : '\'';
4341 name = ++ptr;
4342
4343 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4344 namelen = ptr - name;
4345
4346 /* In the pre-compile phase, just do a syntax check. */
4347
4348 if (lengthptr != NULL)
4349 {
4350 if (*ptr != terminator)
4351 {
4352 *errorcodeptr = ERR42;
4353 goto FAILED;
4354 }
4355 if (cd->names_found >= MAX_NAME_COUNT)
4356 {
4357 *errorcodeptr = ERR49;
4358 goto FAILED;
4359 }
4360 if (namelen + 3 > cd->name_entry_size)
4361 {
4362 cd->name_entry_size = namelen + 3;
4363 if (namelen > MAX_NAME_SIZE)
4364 {
4365 *errorcodeptr = ERR48;
4366 goto FAILED;
4367 }
4368 }
4369 }
4370
4371 /* In the real compile, create the entry in the table */
4372
4373 else
4374 {
4375 slot = cd->name_table;
4376 for (i = 0; i < cd->names_found; i++)
4377 {
4378 int crc = memcmp(name, slot+2, namelen);
4379 if (crc == 0)
4380 {
4381 if (slot[2+namelen] == 0)
4382 {
4383 if ((options & PCRE_DUPNAMES) == 0)
4384 {
4385 *errorcodeptr = ERR43;
4386 goto FAILED;
4387 }
4388 }
4389 else crc = -1; /* Current name is substring */
4390 }
4391 if (crc < 0)
4392 {
4393 memmove(slot + cd->name_entry_size, slot,
4394 (cd->names_found - i) * cd->name_entry_size);
4395 break;
4396 }
4397 slot += cd->name_entry_size;
4398 }
4399
4400 PUT2(slot, 0, cd->bracount + 1);
4401 memcpy(slot + 2, name, namelen);
4402 slot[2+namelen] = 0;
4403 }
4404 }
4405
4406 /* In both cases, count the number of names we've encountered. */
4407
4408 ptr++; /* Move past > or ' */
4409 cd->names_found++;
4410 goto NUMBERED_GROUP;
4411
4412
4413 /* ------------------------------------------------------------ */
4414 case '&': /* Perl recursion/subroutine syntax */
4415 terminator = ')';
4416 is_recurse = TRUE;
4417 /* Fall through */
4418
4419 /* We come here from the Python syntax above that handles both
4420 references (?P=name) and recursion (?P>name), as well as falling
4421 through from the Perl recursion syntax (?&name). */
4422
4423 NAMED_REF_OR_RECURSE:
4424 name = ++ptr;
4425 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4426 namelen = ptr - name;
4427
4428 /* In the pre-compile phase, do a syntax check and set a dummy
4429 reference number. */
4430
4431 if (lengthptr != NULL)
4432 {
4433 if (*ptr != terminator)
4434 {
4435 *errorcodeptr = ERR42;
4436 goto FAILED;
4437 }
4438 if (namelen > MAX_NAME_SIZE)
4439 {
4440 *errorcodeptr = ERR48;
4441 goto FAILED;
4442 }
4443 recno = 0;
4444 }
4445
4446 /* In the real compile, seek the name in the table */
4447
4448 else
4449 {
4450 slot = cd->name_table;
4451 for (i = 0; i < cd->names_found; i++)
4452 {
4453 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4454 slot += cd->name_entry_size;
4455 }
4456
4457 if (i < cd->names_found) /* Back reference */
4458 {
4459 recno = GET2(slot, 0);
4460 }
4461 else if ((recno = /* Forward back reference */
4462 find_parens(ptr, cd->bracount, name, namelen,
4463 (options & PCRE_EXTENDED) != 0)) <= 0)
4464 {
4465 *errorcodeptr = ERR15;
4466 goto FAILED;
4467 }
4468 }
4469
4470 /* In both phases, we can now go to the code that handles numerical
4471 recursion or backreferences. */
4472
4473 if (is_recurse) goto HANDLE_RECURSION;
4474 else goto HANDLE_REFERENCE;
4475
4476
4477 /* ------------------------------------------------------------ */
4478 case 'R': /* Recursion */
4479 ptr++; /* Same as (?0) */
4480 /* Fall through */
4481
4482
4483 /* ------------------------------------------------------------ */
4484 case '-': case '+':
4485 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4486 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4487 {
4488 const uschar *called;
4489
4490 if ((refsign = *ptr) == '+') ptr++;
4491 else if (refsign == '-')
4492 {
4493 if ((digitab[ptr[1]] & ctype_digit) == 0)
4494 goto OTHER_CHAR_AFTER_QUERY;
4495 ptr++;
4496 }
4497
4498 recno = 0;
4499 while((digitab[*ptr] & ctype_digit) != 0)
4500 recno = recno * 10 + *ptr++ - '0';
4501
4502 if (*ptr != ')')
4503 {
4504 *errorcodeptr = ERR29;
4505 goto FAILED;
4506 }
4507
4508 if (refsign == '-')
4509 {
4510 if (recno == 0)
4511 {
4512 *errorcodeptr = ERR58;
4513 goto FAILED;
4514 }
4515 recno = cd->bracount - recno + 1;
4516 if (recno <= 0)
4517 {
4518 *errorcodeptr = ERR15;
4519 goto FAILED;
4520 }
4521 }
4522 else if (refsign == '+')
4523 {
4524 if (recno == 0)
4525 {
4526 *errorcodeptr = ERR58;
4527 goto FAILED;
4528 }
4529 recno += cd->bracount;
4530 }
4531
4532 /* Come here from code above that handles a named recursion */
4533
4534 HANDLE_RECURSION:
4535
4536 previous = code;
4537 called = cd->start_code;
4538
4539 /* When we are actually compiling, find the bracket that is being
4540 referenced. Temporarily end the regex in case it doesn't exist before
4541 this point. If we end up with a forward reference, first check that
4542 the bracket does occur later so we can give the error (and position)
4543 now. Then remember this forward reference in the workspace so it can
4544 be filled in at the end. */
4545
4546 if (lengthptr == NULL)
4547 {
4548 *code = OP_END;
4549 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4550
4551 /* Forward reference */
4552
4553 if (called == NULL)
4554 {
4555 if (find_parens(ptr, cd->bracount, NULL, recno,
4556 (options & PCRE_EXTENDED) != 0) < 0)
4557 {
4558 *errorcodeptr = ERR15;
4559 goto FAILED;
4560 }
4561 called = cd->start_code + recno;
4562 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4563 }
4564
4565 /* If not a forward reference, and the subpattern is still open,
4566 this is a recursive call. We check to see if this is a left
4567 recursion that could loop for ever, and diagnose that case. */
4568
4569 else if (GET(called, 1) == 0 &&
4570 could_be_empty(called, code, bcptr, utf8))
4571 {
4572 *errorcodeptr = ERR40;
4573 goto FAILED;
4574 }
4575 }
4576
4577 /* Insert the recursion/subroutine item, automatically wrapped inside
4578 "once" brackets. Set up a "previous group" length so that a
4579 subsequent quantifier will work. */
4580
4581 *code = OP_ONCE;
4582 PUT(code, 1, 2 + 2*LINK_SIZE);
4583 code += 1 + LINK_SIZE;
4584
4585 *code = OP_RECURSE;
4586 PUT(code, 1, called - cd->start_code);
4587 code += 1 + LINK_SIZE;
4588
4589 *code = OP_KET;
4590 PUT(code, 1, 2 + 2*LINK_SIZE);
4591 code += 1 + LINK_SIZE;
4592
4593 length_prevgroup = 3 + 3*LINK_SIZE;
4594 }
4595
4596 /* Can't determine a first byte now */
4597
4598 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4599 continue;
4600
4601
4602 /* ------------------------------------------------------------ */
4603 default: /* Other characters: check option setting */
4604 OTHER_CHAR_AFTER_QUERY:
4605 set = unset = 0;
4606 optset = &set;
4607
4608 while (*ptr != ')' && *ptr != ':')
4609 {
4610 switch (*ptr++)
4611 {
4612 case '-': optset = &unset; break;
4613
4614 case 'J': /* Record that it changed in the external options */
4615 *optset |= PCRE_DUPNAMES;
4616 cd->external_options |= PCRE_JCHANGED;
4617 break;
4618
4619 case 'i': *optset |= PCRE_CASELESS; break;
4620 case 'm': *optset |= PCRE_MULTILINE; break;
4621 case 's': *optset |= PCRE_DOTALL; break;
4622 case 'x': *optset |= PCRE_EXTENDED; break;
4623 case 'U': *optset |= PCRE_UNGREEDY; break;
4624 case 'X': *optset |= PCRE_EXTRA; break;
4625
4626 default: *errorcodeptr = ERR12;
4627 ptr--; /* Correct the offset */
4628 goto FAILED;
4629 }
4630 }
4631
4632 /* Set up the changed option bits, but don't change anything yet. */
4633
4634 newoptions = (options | set) & (~unset);
4635
4636 /* If the options ended with ')' this is not the start of a nested
4637 group with option changes, so the options change at this level. If this
4638 item is right at the start of the pattern, the options can be
4639 abstracted and made external in the pre-compile phase, and ignored in
4640 the compile phase. This can be helpful when matching -- for instance in
4641 caseless checking of required bytes.
4642
4643 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4644 definitely *not* at the start of the pattern because something has been
4645 compiled. In the pre-compile phase, however, the code pointer can have
4646 that value after the start, because it gets reset as code is discarded
4647 during the pre-compile. However, this can happen only at top level - if
4648 we are within parentheses, the starting BRA will still be present. At
4649 any parenthesis level, the length value can be used to test if anything
4650 has been compiled at that level. Thus, a test for both these conditions
4651 is necessary to ensure we correctly detect the start of the pattern in
4652 both phases.
4653
4654 If we are not at the pattern start, compile code to change the ims
4655 options if this setting actually changes any of them. We also pass the
4656 new setting back so that it can be put at the start of any following
4657 branches, and when this group ends (if we are in a group), a resetting
4658 item can be compiled. */
4659
4660 if (*ptr == ')')
4661 {
4662 if (code == cd->start_code + 1 + LINK_SIZE &&
4663 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4664 {
4665 cd->external_options = newoptions;
4666 options = newoptions;
4667 }
4668 else
4669 {
4670 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4671 {
4672 *code++ = OP_OPT;
4673 *code++ = newoptions & PCRE_IMS;
4674 }
4675
4676 /* Change options at this level, and pass them back for use
4677 in subsequent branches. Reset the greedy defaults and the case
4678 value for firstbyte and reqbyte. */
4679
4680 *optionsptr = options = newoptions;
4681 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4682 greedy_non_default = greedy_default ^ 1;
4683 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4684 }
4685
4686 previous = NULL; /* This item can't be repeated */
4687 continue; /* It is complete */
4688 }
4689
4690 /* If the options ended with ':' we are heading into a nested group
4691 with possible change of options. Such groups are non-capturing and are
4692 not assertions of any kind. All we need to do is skip over the ':';
4693 the newoptions value is handled below. */
4694
4695 bravalue = OP_BRA;
4696 ptr++;
4697 } /* End of switch for character following (? */
4698 } /* End of (? handling */
4699
4700 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4701 all unadorned brackets become non-capturing and behave like (?:...)
4702 brackets. */
4703
4704 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4705 {
4706 bravalue = OP_BRA;
4707 }
4708
4709 /* Else we have a capturing group. */
4710
4711 else
4712 {
4713 NUMBERED_GROUP:
4714 cd->bracount += 1;
4715 PUT2(code, 1+LINK_SIZE, cd->bracount);
4716 skipbytes = 2;
4717 }
4718
4719 /* Process nested bracketed regex. Assertions may not be repeated, but
4720 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4721 non-register variable in order to be able to pass its address because some
4722 compilers complain otherwise. Pass in a new setting for the ims options if
4723 they have changed. */
4724
4725 previous = (bravalue >= OP_ONCE)? code : NULL;
4726 *code = bravalue;
4727 tempcode = code;
4728 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4729 length_prevgroup = 0; /* Initialize for pre-compile phase */
4730
4731 if (!compile_regex(
4732 newoptions, /* The complete new option state */
4733 options & PCRE_IMS, /* The previous ims option state */
4734 &tempcode, /* Where to put code (updated) */
4735 &ptr, /* Input pointer (updated) */
4736 errorcodeptr, /* Where to put an error message */
4737 (bravalue == OP_ASSERTBACK ||
4738 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4739 reset_bracount, /* True if (?| group */
4740 skipbytes, /* Skip over bracket number */
4741 &subfirstbyte, /* For possible first char */
4742 &subreqbyte, /* For possible last char */
4743 bcptr, /* Current branch chain */
4744 cd, /* Tables block */
4745 (lengthptr == NULL)? NULL : /* Actual compile phase */
4746 &length_prevgroup /* Pre-compile phase */
4747 ))
4748 goto FAILED;
4749
4750 /* At the end of compiling, code is still pointing to the start of the
4751 group, while tempcode has been updated to point past the end of the group
4752 and any option resetting that may follow it. The pattern pointer (ptr)
4753 is on the bracket. */
4754
4755 /* If this is a conditional bracket, check that there are no more than
4756 two branches in the group, or just one if it's a DEFINE group. We do this
4757 in the real compile phase, not in the pre-pass, where the whole group may
4758 not be available. */
4759
4760 if (bravalue == OP_COND && lengthptr == NULL)
4761 {
4762 uschar *tc = code;
4763 int condcount = 0;
4764
4765 do {
4766 condcount++;
4767 tc += GET(tc,1);
4768 }
4769 while (*tc != OP_KET);
4770
4771 /* A DEFINE group is never obeyed inline (the "condition" is always
4772 false). It must have only one branch. */
4773
4774 if (code[LINK_SIZE+1] == OP_DEF)
4775 {
4776 if (condcount > 1)
4777 {
4778 *errorcodeptr = ERR54;
4779 goto FAILED;
4780 }
4781 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4782 }
4783
4784 /* A "normal" conditional group. If there is just one branch, we must not
4785 make use of its firstbyte or reqbyte, because this is equivalent to an
4786 empty second branch. */
4787
4788 else
4789 {
4790 if (condcount > 2)
4791 {
4792 *errorcodeptr = ERR27;
4793 goto FAILED;
4794 }
4795 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4796 }
4797 }
4798
4799 /* Error if hit end of pattern */
4800
4801 if (*ptr != ')')
4802 {
4803 *errorcodeptr = ERR14;
4804 goto FAILED;
4805 }
4806
4807 /* In the pre-compile phase, update the length by the length of the group,
4808 less the brackets at either end. Then reduce the compiled code to just a
4809 set of non-capturing brackets so that it doesn't use much memory if it is
4810 duplicated by a quantifier.*/
4811
4812 if (lengthptr != NULL)
4813 {
4814 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4815 {
4816 *errorcodeptr = ERR20;
4817 goto FAILED;
4818 }
4819 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4820 *code++ = OP_BRA;
4821 PUTINC(code, 0, 1 + LINK_SIZE);
4822 *code++ = OP_KET;
4823 PUTINC(code, 0, 1 + LINK_SIZE);
4824 break; /* No need to waste time with special character handling */
4825 }
4826
4827 /* Otherwise update the main code pointer to the end of the group. */
4828
4829 code = tempcode;
4830
4831 /* For a DEFINE group, required and first character settings are not
4832 relevant. */
4833
4834 if (bravalue == OP_DEF) break;
4835
4836 /* Handle updating of the required and first characters for other types of
4837 group. Update for normal brackets of all kinds, and conditions with two
4838 branches (see code above). If the bracket is followed by a quantifier with
4839 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4840 zerofirstbyte outside the main loop so that they can be accessed for the
4841 back off. */
4842
4843 zeroreqbyte = reqbyte;
4844 zerofirstbyte = firstbyte;
4845 groupsetfirstbyte = FALSE;
4846
4847 if (bravalue >= OP_ONCE)
4848 {
4849 /* If we have not yet set a firstbyte in this branch, take it from the
4850 subpattern, remembering that it was set here so that a repeat of more
4851 than one can replicate it as reqbyte if necessary. If the subpattern has
4852 no firstbyte, set "none" for the whole branch. In both cases, a zero
4853 repeat forces firstbyte to "none". */
4854
4855 if (firstbyte == REQ_UNSET)
4856 {
4857 if (subfirstbyte >= 0)
4858 {
4859 firstbyte = subfirstbyte;
4860 groupsetfirstbyte = TRUE;
4861 }
4862 else firstbyte = REQ_NONE;
4863 zerofirstbyte = REQ_NONE;
4864 }
4865
4866 /* If firstbyte was previously set, convert the subpattern's firstbyte
4867 into reqbyte if there wasn't one, using the vary flag that was in
4868 existence beforehand. */
4869
4870 else if (subfirstbyte >= 0 && subreqbyte < 0)
4871 subreqbyte = subfirstbyte | tempreqvary;
4872
4873 /* If the subpattern set a required byte (or set a first byte that isn't
4874 really the first byte - see above), set it. */
4875
4876 if (subreqbyte >= 0) reqbyte = subreqbyte;
4877 }
4878
4879 /* For a forward assertion, we take the reqbyte, if set. This can be
4880 helpful if the pattern that follows the assertion doesn't set a different
4881 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4882 for an assertion, however, because it leads to incorrect effects for patterns
4883 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4884 of a firstbyte. This is overcome by a scan at the end if there's no
4885 firstbyte, looking for an asserted first char. */
4886
4887 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4888 break; /* End of processing '(' */
4889
4890
4891 /* ===================================================================*/
4892 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4893 are arranged to be the negation of the corresponding OP_values. For the
4894 back references, the values are ESC_REF plus the reference number. Only
4895 back references and those types that consume a character may be repeated.
4896 We can test for values between ESC_b and ESC_Z for the latter; this may
4897 have to change if any new ones are ever created. */
4898
4899 case '\\':
4900 tempptr = ptr;
4901 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4902 if (*errorcodeptr != 0) goto FAILED;
4903
4904 if (c < 0)
4905 {
4906 if (-c == ESC_Q) /* Handle start of quoted string */
4907 {
4908 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4909 else inescq = TRUE;
4910 continue;
4911 }
4912
4913 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4914
4915 /* For metasequences that actually match a character, we disable the
4916 setting of a first character if it hasn't already been set. */
4917
4918 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4919 firstbyte = REQ_NONE;
4920
4921 /* Set values to reset to if this is followed by a zero repeat. */
4922
4923 zerofirstbyte = firstbyte;
4924 zeroreqbyte = reqbyte;
4925
4926 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4927 We also support \k{name} (.NET syntax) */
4928
4929 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4930 {
4931 is_recurse = FALSE;
4932 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4933 goto NAMED_REF_OR_RECURSE;
4934 }
4935
4936 /* Back references are handled specially; must disable firstbyte if
4937 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4938 ':' later. */
4939
4940 if (-c >= ESC_REF)
4941 {
4942 recno = -c - ESC_REF;
4943
4944 HANDLE_REFERENCE: /* Come here from named backref handling */
4945 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4946 previous = code;
4947 *code++ = OP_REF;
4948 PUT2INC(code, 0, recno);
4949 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4950 if (recno > cd->top_backref) cd->top_backref = recno;
4951 }
4952
4953 /* So are Unicode property matches, if supported. */
4954
4955 #ifdef SUPPORT_UCP
4956 else if (-c == ESC_P || -c == ESC_p)
4957 {
4958 BOOL negated;
4959 int pdata;
4960 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4961 if (ptype < 0) goto FAILED;
4962 previous = code;
4963 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4964 *code++ = ptype;
4965 *code++ = pdata;
4966 }
4967 #else
4968
4969 /* If Unicode properties are not supported, \X, \P, and \p are not
4970 allowed. */
4971
4972 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4973 {
4974 *errorcodeptr = ERR45;
4975 goto FAILED;
4976 }
4977 #endif
4978
4979 /* For the rest (including \X when Unicode properties are supported), we
4980 can obtain the OP value by negating the escape value. */
4981
4982 else
4983 {
4984 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4985 *code++ = -c;
4986 }
4987 continue;
4988 }
4989
4990 /* We have a data character whose value is in c. In UTF-8 mode it may have
4991 a value > 127. We set its representation in the length/buffer, and then
4992 handle it as a data character. */
4993
4994 #ifdef SUPPORT_UTF8
4995 if (utf8 && c > 127)
4996 mclength = _pcre_ord2utf8(c, mcbuffer);
4997 else
4998 #endif
4999
5000 {
5001 mcbuffer[0] = c;
5002 mclength = 1;
5003 }
5004 goto ONE_CHAR;
5005
5006
5007 /* ===================================================================*/
5008 /* Handle a literal character. It is guaranteed not to be whitespace or #
5009 when the extended flag is set. If we are in UTF-8 mode, it may be a
5010 multi-byte literal character. */
5011
5012 default:
5013 NORMAL_CHAR:
5014 mclength = 1;
5015 mcbuffer[0] = c;
5016
5017 #ifdef SUPPORT_UTF8
5018 if (utf8 && c >= 0xc0)
5019 {
5020 while ((ptr[1] & 0xc0) == 0x80)
5021 mcbuffer[mclength++] = *(++ptr);
5022 }
5023 #endif
5024
5025 /* At this point we have the character's bytes in mcbuffer, and the length
5026 in mclength. When not in UTF-8 mode, the length is always 1. */
5027
5028 ONE_CHAR:
5029 previous = code;
5030 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5031 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5032
5033 /* Set the first and required bytes appropriately. If no previous first
5034 byte, set it from this character, but revert to none on a zero repeat.
5035 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5036 repeat. */
5037
5038 if (firstbyte == REQ_UNSET)
5039 {
5040 zerofirstbyte = REQ_NONE;
5041 zeroreqbyte = reqbyte;
5042
5043 /* If the character is more than one byte long, we can set firstbyte
5044 only if it is not to be matched caselessly. */
5045
5046 if (mclength == 1 || req_caseopt == 0)
5047 {
5048 firstbyte = mcbuffer[0] | req_caseopt;
5049 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5050 }
5051 else firstbyte = reqbyte = REQ_NONE;
5052 }
5053
5054 /* firstbyte was previously set; we can set reqbyte only if the length is
5055 1 or the matching is caseful. */
5056
5057 else
5058 {
5059 zerofirstbyte = firstbyte;
5060 zeroreqbyte = reqbyte;
5061 if (mclength == 1 || req_caseopt == 0)
5062 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5063 }
5064
5065 break; /* End of literal character handling */
5066 }
5067 } /* end of big loop */
5068
5069
5070 /* Control never reaches here by falling through, only by a goto for all the
5071 error states. Pass back the position in the pattern so that it can be displayed
5072 to the user for diagnosing the error. */
5073
5074 FAILED:
5075 *ptrptr = ptr;
5076 return FALSE;
5077 }
5078
5079
5080
5081
5082 /*************************************************
5083 * Compile sequence of alternatives *
5084 *************************************************/
5085
5086 /* On entry, ptr is pointing past the bracket character, but on return it
5087 points to the closing bracket, or vertical bar, or end of string. The code
5088 variable is pointing at the byte into which the BRA operator has been stored.
5089 If the ims options are changed at the start (for a (?ims: group) or during any
5090 branch, we need to insert an OP_OPT item at the start of every following branch
5091 to ensure they get set correctly at run time, and also pass the new options
5092 into every subsequent branch compile.
5093
5094 This function is used during the pre-compile phase when we are trying to find
5095 out the amount of memory needed, as well as during the real compile phase. The
5096 value of lengthptr distinguishes the two phases.
5097
5098 Arguments:
5099 options option bits, including any changes for this subpattern
5100 oldims previous settings of ims option bits
5101 codeptr -> the address of the current code pointer
5102 ptrptr -> the address of the current pattern pointer
5103 errorcodeptr -> pointer to error code variable
5104 lookbehind TRUE if this is a lookbehind assertion
5105 reset_bracount TRUE to reset the count for each branch
5106 skipbytes skip this many bytes at start (for brackets and OP_COND)
5107 firstbyteptr place to put the first required character, or a negative number
5108 reqbyteptr place to put the last required character, or a negative number
5109 bcptr pointer to the chain of currently open branches
5110 cd points to the data block with tables pointers etc.
5111 lengthptr NULL during the real compile phase
5112 points to length accumulator during pre-compile phase
5113
5114 Returns: TRUE on success
5115 */
5116
5117 static BOOL
5118 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5119 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5120 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5121 int *lengthptr)
5122 {
5123 const uschar *ptr = *ptrptr;
5124 uschar *code = *codeptr;
5125 uschar *last_branch = code;
5126 uschar *start_bracket = code;
5127 uschar *reverse_count = NULL;
5128 int firstbyte, reqbyte;
5129 int branchfirstbyte, branchreqbyte;
5130 int length;
5131 int orig_bracount;
5132 int max_bracount;
5133 branch_chain bc;
5134
5135 bc.outer = bcptr;
5136 bc.current = code;
5137
5138 firstbyte = reqbyte = REQ_UNSET;
5139
5140 /* Accumulate the length for use in the pre-compile phase. Start with the
5141 length of the BRA and KET and any extra bytes that are required at the
5142 beginning. We accumulate in a local variable to save frequent testing of
5143 lengthptr for NULL. We cannot do this by looking at the value of code at the
5144 start and end of each alternative, because compiled items are discarded during
5145 the pre-compile phase so that the work space is not exceeded. */
5146
5147 length = 2 + 2*LINK_SIZE + skipbytes;
5148
5149 /* WARNING: If the above line is changed for any reason, you must also change
5150 the code that abstracts option settings at the start of the pattern and makes
5151 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5152 pre-compile phase to find out whether anything has yet been compiled or not. */
5153
5154 /* Offset is set zero to mark that this bracket is still open */
5155
5156 PUT(code, 1, 0);
5157 code += 1 + LINK_SIZE + skipbytes;
5158
5159 /* Loop for each alternative branch */
5160
5161 orig_bracount = max_bracount = cd->bracount;
5162 for (;;)
5163 {
5164 /* For a (?| group, reset the capturing bracket count so that each branch
5165 uses the same numbers. */
5166
5167 if (reset_bracount) cd->bracount = orig_bracount;
5168
5169 /* Handle a change of ims options at the start of the branch */
5170
5171 if ((options & PCRE_IMS) != oldims)
5172 {
5173 *code++ = OP_OPT;
5174 *code++ = options & PCRE_IMS;
5175 length += 2;
5176 }
5177
5178 /* Set up dummy OP_REVERSE if lookbehind assertion */
5179
5180 if (lookbehind)
5181 {
5182 *code++ = OP_REVERSE;
5183 reverse_count = code;
5184 PUTINC(code, 0, 0);
5185 length += 1 + LINK_SIZE;
5186 }
5187
5188 /* Now compile the branch; in the pre-compile phase its length gets added
5189 into the length. */
5190
5191 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5192 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5193 {
5194 *ptrptr = ptr;
5195 return FALSE;
5196 }
5197
5198 /* Keep the highest bracket count in case (?| was used and some branch
5199 has fewer than the rest. */
5200
5201 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5202
5203 /* In the real compile phase, there is some post-processing to be done. */
5204
5205 if (lengthptr == NULL)
5206 {
5207 /* If this is the first branch, the firstbyte and reqbyte values for the
5208 branch become the values for the regex. */
5209
5210 if (*last_branch != OP_ALT)
5211 {
5212 firstbyte = branchfirstbyte;
5213 reqbyte = branchreqbyte;
5214 }
5215
5216 /* If this is not the first branch, the first char and reqbyte have to
5217 match the values from all the previous branches, except that if the
5218 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5219 and we set REQ_VARY for the regex. */
5220
5221 else
5222 {
5223 /* If we previously had a firstbyte, but it doesn't match the new branch,
5224 we have to abandon the firstbyte for the regex, but if there was
5225 previously no reqbyte, it takes on the value of the old firstbyte. */
5226
5227 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5228 {
5229 if (reqbyte < 0) reqbyte = firstbyte;
5230 firstbyte = REQ_NONE;
5231 }
5232
5233 /* If we (now or from before) have no firstbyte, a firstbyte from the
5234 branch becomes a reqbyte if there isn't a branch reqbyte. */
5235
5236 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5237 branchreqbyte = branchfirstbyte;
5238
5239 /* Now ensure that the reqbytes match */
5240
5241 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5242 reqbyte = REQ_NONE;
5243 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5244 }
5245
5246 /* If lookbehind, check that this branch matches a fixed-length string, and
5247 put the length into the OP_REVERSE item. Temporarily mark the end of the
5248 branch with OP_END. */
5249
5250 if (lookbehind)
5251 {
5252 int fixed_length;
5253 *code = OP_END;
5254 fixed_length = find_fixedlength(last_branch, options);
5255 DPRINTF(("fixed length = %d\n", fixed_length));
5256 if (fixed_length < 0)
5257 {
5258 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5259 *ptrptr = ptr;
5260 return FALSE;
5261 }
5262 PUT(reverse_count, 0, fixed_length);
5263 }
5264 }
5265
5266 /* Reached end of expression, either ')' or end of pattern. In the real
5267 compile phase, go back through the alternative branches and reverse the chain
5268 of offsets, with the field in the BRA item now becoming an offset to the
5269 first alternative. If there are no alternatives, it points to the end of the
5270 group. The length in the terminating ket is always the length of the whole
5271 bracketed item. If any of the ims options were changed inside the group,
5272 compile a resetting op-code following, except at the very end of the pattern.
5273 Return leaving the pointer at the terminating char. */
5274
5275 if (*ptr != '|')
5276 {
5277 if (lengthptr == NULL)
5278 {
5279 int branch_length = code - last_branch;
5280 do
5281 {
5282 int prev_length = GET(last_branch, 1);
5283 PUT(last_branch, 1, branch_length);
5284 branch_length = prev_length;
5285 last_branch -= branch_length;
5286 }
5287 while (branch_length > 0);
5288 }
5289
5290 /* Fill in the ket */
5291
5292 *code = OP_KET;
5293 PUT(code, 1, code - start_bracket);
5294 code += 1 + LINK_SIZE;
5295
5296 /* Resetting option if needed */
5297
5298 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5299 {
5300 *code++ = OP_OPT;
5301 *code++ = oldims;
5302 length += 2;
5303 }
5304
5305 /* Retain the highest bracket number, in case resetting was used. */
5306
5307 cd->bracount = max_bracount;
5308
5309 /* Set values to pass back */
5310
5311 *codeptr = code;
5312 *ptrptr = ptr;
5313 *firstbyteptr = firstbyte;
5314 *reqbyteptr = reqbyte;
5315 if (lengthptr != NULL)
5316 {
5317 if (OFLOW_MAX - *lengthptr < length)
5318 {
5319 *errorcodeptr = ERR20;
5320 return FALSE;
5321 }
5322 *lengthptr += length;
5323 }
5324 return TRUE;
5325 }
5326
5327 /* Another branch follows. In the pre-compile phase, we can move the code
5328 pointer back to where it was for the start of the first branch. (That is,
5329 pretend that each branch is the only one.)
5330
5331 In the real compile phase, insert an ALT node. Its length field points back
5332 to the previous branch while the bracket remains open. At the end the chain
5333 is reversed. It's done like this so that the start of the bracket has a
5334 zero offset until it is closed, making it possible to detect recursion. */
5335
5336 if (lengthptr != NULL)
5337 {
5338 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5339 length += 1 + LINK_SIZE;
5340 }
5341 else
5342 {
5343 *code = OP_ALT;
5344 PUT(code, 1, code - last_branch);
5345 bc.current = last_branch = code;
5346 code += 1 + LINK_SIZE;
5347 }
5348
5349 ptr++;
5350 }
5351 /* Control never reaches here */
5352 }
5353
5354
5355
5356
5357 /*************************************************
5358 * Check for anchored expression *
5359 *************************************************/
5360
5361 /* Try to find out if this is an anchored regular expression. Consider each
5362 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5363 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5364 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5365 counts, since OP_CIRC can match in the middle.
5366
5367 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5368 This is the code for \G, which means "match at start of match position, taking
5369 into account the match offset".
5370
5371 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5372 because that will try the rest of the pattern at all possible matching points,
5373 so there is no point trying again.... er ....
5374
5375 .... except when the .* appears inside capturing parentheses, and there is a
5376 subsequent back reference to those parentheses. We haven't enough information
5377 to catch that case precisely.
5378
5379 At first, the best we could do was to detect when .* was in capturing brackets
5380 and the highest back reference was greater than or equal to that level.
5381 However, by keeping a bitmap of the first 31 back references, we can catch some
5382 of the more common cases more precisely.
5383
5384 Arguments:
5385 code points to start of expression (the bracket)
5386 options points to the options setting
5387 bracket_map a bitmap of which brackets we are inside while testing; this
5388 handles up to substring 31; after that we just have to take
5389 the less precise approach
5390 backref_map the back reference bitmap
5391
5392 Returns: TRUE or FALSE
5393 */
5394
5395 static BOOL
5396 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5397 unsigned int backref_map)
5398 {
5399 do {
5400 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5401 options, PCRE_MULTILINE, FALSE);
5402 register int op = *scode;
5403
5404 /* Non-capturing brackets */
5405
5406 if (op == OP_BRA)
5407 {
5408 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5409 }
5410
5411 /* Capturing brackets */
5412
5413 else if (op == OP_CBRA)
5414 {
5415 int n = GET2(scode, 1+LINK_SIZE);
5416 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5417 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5418 }
5419
5420 /* Other brackets */
5421
5422 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5423 {
5424 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5425 }
5426
5427 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5428 are or may be referenced. */
5429
5430 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5431 op == OP_TYPEPOSSTAR) &&
5432 (*options & PCRE_DOTALL) != 0)
5433 {
5434 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5435 }
5436
5437 /* Check for explicit anchoring */
5438
5439 else if (op != OP_SOD && op != OP_SOM &&
5440 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5441 return FALSE;
5442 code += GET(code, 1);
5443 }
5444 while (*code == OP_ALT); /* Loop for each alternative */
5445 return TRUE;
5446 }
5447
5448
5449
5450 /*************************************************
5451 * Check for starting with ^ or .* *
5452 *************************************************/
5453
5454 /* This is called to find out if every branch starts with ^ or .* so that
5455 "first char" processing can be done to speed things up in multiline
5456 matching and for non-DOTALL patterns that start with .* (which must start at
5457 the beginning or after \n). As in the case of is_anchored() (see above), we
5458 have to take account of back references to capturing brackets that contain .*
5459 because in that case we can't make the assumption.
5460
5461 Arguments:
5462 code points to start of expression (the bracket)
5463 bracket_map a bitmap of which brackets we are inside while testing; this
5464 handles up to substring 31; after that we just have to take
5465 the less precise approach
5466 backref_map the back reference bitmap
5467
5468 Returns: TRUE or FALSE
5469 */
5470
5471 static BOOL
5472 is_startline(const uschar *code, unsigned int bracket_map,
5473 unsigned int backref_map)
5474 {
5475 do {
5476 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5477 NULL, 0, FALSE);
5478 register int op = *scode;
5479
5480 /* Non-capturing brackets */
5481
5482 if (op == OP_BRA)
5483 {
5484 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5485 }
5486
5487 /* Capturing brackets */
5488
5489 else if (op == OP_CBRA)
5490 {
5491 int n = GET2(scode, 1+LINK_SIZE);
5492 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5493 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5494 }
5495
5496 /* Other brackets */
5497
5498 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5499 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5500
5501 /* .* means "start at start or after \n" if it isn't in brackets that
5502 may be referenced. */
5503
5504 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5505 {
5506 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5507 }
5508
5509 /* Check for explicit circumflex */
5510
5511 else if (op != OP_CIRC) return FALSE;
5512
5513 /* Move on to the next alternative */
5514
5515 code += GET(code, 1);
5516 }
5517 while (*code == OP_ALT); /* Loop for each alternative */
5518 return TRUE;
5519 }
5520
5521
5522
5523 /*************************************************
5524 * Check for asserted fixed first char *
5525 *************************************************/
5526
5527 /* During compilation, the "first char" settings from forward assertions are
5528 discarded, because they can cause conflicts with actual literals that follow.
5529 However, if we end up without a first char setting for an unanchored pattern,
5530 it is worth scanning the regex to see if there is an initial asserted first
5531 char. If all branches start with the same asserted char, or with a bracket all
5532 of whose alternatives start with the same asserted char (recurse ad lib), then
5533 we return that char, otherwise -1.
5534
5535 Arguments:
5536 code points to start of expression (the bracket)
5537 options pointer to the options (used to check casing changes)
5538 inassert TRUE if in an assertion
5539
5540 Returns: -1 or the fixed first char
5541 */
5542
5543 static int
5544 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5545 {
5546 register int c = -1;
5547 do {
5548 int d;
5549 const uschar *scode =
5550 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5551 register int op = *scode;
5552
5553 switch(op)
5554 {
5555 default:
5556 return -1;
5557
5558 case OP_BRA:
5559 case OP_CBRA:
5560 case OP_ASSERT:
5561 case OP_ONCE:
5562 case OP_COND:
5563 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5564 return -1;
5565 if (c < 0) c = d; else if (c != d) return -1;
5566 break;
5567
5568 case OP_EXACT: /* Fall through */
5569 scode += 2;
5570
5571 case OP_CHAR:
5572 case OP_CHARNC:
5573 case OP_PLUS:
5574 case OP_MINPLUS:
5575 case OP_POSPLUS:
5576 if (!inassert) return -1;
5577 if (c < 0)
5578 {
5579 c = scode[1];
5580 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5581 }
5582 else if (c != scode[1]) return -1;
5583 break;
5584 }
5585
5586 code += GET(code, 1);
5587 }
5588 while (*code == OP_ALT);
5589 return c;
5590 }
5591
5592
5593
5594 /*************************************************
5595 * Compile a Regular Expression *
5596 *************************************************/
5597
5598 /* This function takes a string and returns a pointer to a block of store
5599 holding a compiled version of the expression. The original API for this
5600 function had no error code return variable; it is retained for backwards
5601 compatibility. The new function is given a new name.
5602
5603 Arguments:
5604 pattern the regular expression
5605 options various option bits
5606 errorcodeptr pointer to error code variable (pcre_compile2() only)
5607 can be NULL if you don't want a code value
5608 errorptr pointer to pointer to error text
5609 erroroffset ptr offset in pattern where error was detected
5610 tables pointer to character tables or NULL
5611
5612 Returns: pointer to compiled data block, or NULL on error,
5613 with errorptr and erroroffset set
5614 */
5615
5616 PCRE_EXP_DEFN pcre *
5617 pcre_compile(const char *pattern, int options, const char **errorptr,
5618 int *erroroffset, const unsigned char *tables)
5619 {
5620 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5621 }
5622
5623
5624 PCRE_EXP_DEFN pcre *
5625 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5626 const char **errorptr, int *erroroffset, const unsigned char *tables)
5627 {
5628 real_pcre *re;
5629 int length = 1; /* For final END opcode */
5630 int firstbyte, reqbyte, newline;
5631 int errorcode = 0;
5632 #ifdef SUPPORT_UTF8
5633 BOOL utf8;
5634 #endif
5635 size_t size;
5636 uschar *code;
5637 const uschar *codestart;
5638 const uschar *ptr;
5639 compile_data compile_block;
5640 compile_data *cd = &compile_block;
5641
5642 /* This space is used for "compiling" into during the first phase, when we are
5643 computing the amount of memory that is needed. Compiled items are thrown away
5644 as soon as possible, so that a fairly large buffer should be sufficient for
5645 this purpose. The same space is used in the second phase for remembering where
5646 to fill in forward references to subpatterns. */
5647
5648 uschar cworkspace[COMPILE_WORK_SIZE];
5649
5650
5651 /* Set this early so that early errors get offset 0. */
5652
5653 ptr = (const uschar *)pattern;
5654
5655 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5656 can do is just return NULL, but we can set a code value if there is a code
5657 pointer. */
5658
5659 if (errorptr == NULL)
5660 {
5661 if (errorcodeptr != NULL) *errorcodeptr = 99;
5662 return NULL;
5663 }
5664
5665 *errorptr = NULL;
5666 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5667
5668 /* However, we can give a message for this error */
5669
5670 if (erroroffset == NULL)
5671 {
5672 errorcode = ERR16;
5673 goto PCRE_EARLY_ERROR_RETURN2;
5674 }
5675
5676 *erroroffset = 0;
5677
5678 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5679
5680 #ifdef SUPPORT_UTF8
5681 utf8 = (options & PCRE_UTF8) != 0;
5682 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5683 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5684 {
5685 errorcode = ERR44;
5686 goto PCRE_EARLY_ERROR_RETURN2;
5687 }
5688 #else
5689 if ((options & PCRE_UTF8) != 0)
5690 {
5691 errorcode = ERR32;
5692 goto PCRE_EARLY_ERROR_RETURN;
5693 }
5694 #endif
5695
5696 if ((options & ~PUBLIC_OPTIONS) != 0)
5697 {
5698 errorcode = ERR17;
5699 goto PCRE_EARLY_ERROR_RETURN;
5700 }
5701
5702 /* Set up pointers to the individual character tables */
5703
5704 if (tables == NULL) tables = _pcre_default_tables;
5705 cd->lcc = tables + lcc_offset;
5706 cd->fcc = tables + fcc_offset;
5707 cd->cbits = tables + cbits_offset;
5708 cd->ctypes = tables + ctypes_offset;
5709
5710 /* Handle different types of newline. The three bits give seven cases. The
5711 current code allows for fixed one- or two-byte sequences, plus "any" and
5712 "anycrlf". */
5713
5714 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5715 {
5716 case 0: newline = NEWLINE; break; /* Compile-time default */
5717 case PCRE_NEWLINE_CR: newline = '\r'; break;
5718 case PCRE_NEWLINE_LF: newline = '\n'; break;
5719 case PCRE_NEWLINE_CR+
5720 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5721 case PCRE_NEWLINE_ANY: newline = -1; break;
5722 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5723 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5724 }
5725
5726 if (newline == -2)
5727 {
5728 cd->nltype = NLTYPE_ANYCRLF;
5729 }
5730 else if (newline < 0)
5731 {
5732 cd->nltype = NLTYPE_ANY;
5733 }
5734 else
5735 {
5736 cd->nltype = NLTYPE_FIXED;
5737 if (newline > 255)
5738 {
5739 cd->nllen = 2;
5740 cd->nl[0] = (newline >> 8) & 255;
5741 cd->nl[1] = newline & 255;
5742 }
5743 else
5744 {
5745 cd->nllen = 1;
5746 cd->nl[0] = newline;
5747 }
5748 }
5749
5750 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5751 references to help in deciding whether (.*) can be treated as anchored or not.
5752 */
5753
5754 cd->top_backref = 0;
5755 cd->backref_map = 0;
5756
5757 /* Reflect pattern for debugging output */
5758
5759 DPRINTF(("------------------------------------------------------------------\n"));
5760 DPRINTF(("%s\n", pattern));
5761
5762 /* Pretend to compile the pattern while actually just accumulating the length
5763 of memory required. This behaviour is triggered by passing a non-NULL final
5764 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5765 to compile parts of the pattern into; the compiled code is discarded when it is
5766 no longer needed, so hopefully this workspace will never overflow, though there
5767 is a test for its doing so. */
5768
5769 cd->bracount = 0;
5770 cd->names_found = 0;
5771 cd->name_entry_size = 0;
5772 cd->name_table = NULL;
5773 cd->start_workspace = cworkspace;
5774 cd->start_code = cworkspace;
5775 cd->hwm = cworkspace;
5776 cd->start_pattern = (const uschar *)pattern;
5777 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5778 cd->req_varyopt = 0;
5779 cd->nopartial = FALSE;
5780 cd->external_options = options;
5781
5782 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5783 don't need to look at the result of the function here. The initial options have
5784 been put into the cd block so that they can be changed if an option setting is
5785 found within the regex right at the beginning. Bringing initial option settings
5786 outside can help speed up starting point checks. */
5787
5788 code = cworkspace;
5789 *code = OP_BRA;
5790 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5791 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5792 &length);
5793 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5794
5795 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5796 cd->hwm - cworkspace));
5797
5798 if (length > MAX_PATTERN_SIZE)
5799 {
5800 errorcode = ERR20;
5801 goto PCRE_EARLY_ERROR_RETURN;
5802 }
5803
5804 /* Compute the size of data block needed and get it, either from malloc or
5805 externally provided function. Integer overflow should no longer be possible
5806 because nowadays we limit the maximum value of cd->names_found and
5807 cd->name_entry_size. */
5808
5809 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5810 re = (real_pcre *)(pcre_malloc)(size);
5811
5812 if (re == NULL)
5813 {
5814 errorcode = ERR21;
5815 goto PCRE_EARLY_ERROR_RETURN;
5816 }
5817
5818 /* Put in the magic number, and save the sizes, initial options, and character
5819 table pointer. NULL is used for the default character tables. The nullpad field
5820 is at the end; it's there to help in the case when a regex compiled on a system
5821 with 4-byte pointers is run on another with 8-byte pointers. */
5822
5823 re->magic_number = MAGIC_NUMBER;
5824 re->size = size;
5825 re->options = cd->external_options;
5826 re->dummy1 = 0;
5827 re->first_byte = 0;
5828 re->req_byte = 0;
5829 re->name_table_offset = sizeof(real_pcre);
5830 re->name_entry_size = cd->name_entry_size;
5831 re->name_count = cd->names_found;
5832 re->ref_count = 0;
5833 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5834 re->nullpad = NULL;
5835
5836 /* The starting points of the name/number translation table and of the code are
5837 passed around in the compile data block. The start/end pattern and initial
5838 options are already set from the pre-compile phase, as is the name_entry_size
5839 field. Reset the bracket count and the names_found field. Also reset the hwm
5840 field; this time it's used for remembering forward references to subpatterns.
5841 */
5842
5843 cd->bracount = 0;
5844 cd->names_found = 0;
5845 cd->name_table = (uschar *)re + re->name_table_offset;
5846 codestart = cd->name_table + re->name_entry_size * re->name_count;
5847 cd->start_code = codestart;
5848 cd->hwm = cworkspace;
5849 cd->req_varyopt = 0;
5850 cd->nopartial = FALSE;
5851 cd->had_accept = FALSE;
5852
5853 /* Set up a starting, non-extracting bracket, then compile the expression. On
5854 error, errorcode will be set non-zero, so we don't need to look at the result
5855 of the function here. */
5856
5857 ptr = (const uschar *)pattern;
5858 code = (uschar *)codestart;
5859 *code = OP_BRA;
5860 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5861 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5862 re->top_bracket = cd->bracount;
5863 re->top_backref = cd->top_backref;
5864
5865 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5866 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
5867
5868 /* If not reached end of pattern on success, there's an excess bracket. */
5869
5870 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5871
5872 /* Fill in the terminating state and check for disastrous overflow, but
5873 if debugging, leave the test till after things are printed out. */
5874
5875 *code++ = OP_END;
5876
5877 #ifndef DEBUG
5878 if (code - codestart > length) errorcode = ERR23;
5879 #endif
5880
5881 /* Fill in any forward references that are required. */
5882
5883 while (errorcode == 0 && cd->hwm > cworkspace)
5884 {
5885 int offset, recno;
5886 const uschar *groupptr;
5887 cd->hwm -= LINK_SIZE;
5888 offset = GET(cd->hwm, 0);
5889 recno = GET(codestart, offset);
5890 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5891 if (groupptr == NULL) errorcode = ERR53;
5892 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5893 }
5894
5895 /* Give an error if there's back reference to a non-existent capturing
5896 subpattern. */
5897
5898 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5899
5900 /* Failed to compile, or error while post-processing */
5901
5902 if (errorcode != 0)
5903 {
5904 (pcre_free)(re);
5905 PCRE_EARLY_ERROR_RETURN:
5906 *erroroffset = ptr - (const uschar *)pattern;
5907 PCRE_EARLY_ERROR_RETURN2:
5908 *errorptr = error_texts[errorcode];
5909 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5910 return NULL;
5911 }
5912
5913 /* If the anchored option was not passed, set the flag if we can determine that
5914 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5915 as starting with .* when DOTALL is set).
5916
5917 Otherwise, if we know what the first byte has to be, save it, because that
5918 speeds up unanchored matches no end. If not, see if we can set the
5919 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5920 start with ^. and also when all branches start with .* for non-DOTALL matches.
5921 */
5922
5923 if ((re->options & PCRE_ANCHORED) == 0)
5924 {
5925 int temp_options = re->options; /* May get changed during these scans */
5926 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5927 re->options |= PCRE_ANCHORED;
5928 else
5929 {
5930 if (firstbyte < 0)
5931 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5932 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5933 {
5934 int ch = firstbyte & 255;
5935 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5936 cd->fcc[ch] == ch)? ch : firstbyte;
5937 re->options |= PCRE_FIRSTSET;
5938 }
5939 else if (is_startline(codestart, 0, cd->backref_map))
5940 re->options |= PCRE_STARTLINE;
5941 }
5942 }
5943
5944 /* For an anchored pattern, we use the "required byte" only if it follows a
5945 variable length item in the regex. Remove the caseless flag for non-caseable
5946 bytes. */
5947
5948 if (reqbyte >= 0 &&
5949 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5950 {
5951 int ch = reqbyte & 255;
5952 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5953 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5954 re->options |= PCRE_REQCHSET;
5955 }
5956
5957 /* Print out the compiled data if debugging is enabled. This is never the
5958 case when building a production library. */
5959
5960 #ifdef DEBUG
5961
5962 printf("Length = %d top_bracket = %d top_backref = %d\n",
5963 length, re->top_bracket, re->top_backref);
5964
5965 if (re->options != 0)
5966 {
5967 printf("%s%s%s%s%s%s%s%s%s\n",
5968 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5969 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5970 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5971 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5972 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5973 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5974 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5975 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5976 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5977 }
5978
5979 if ((re->options & PCRE_FIRSTSET) != 0)
5980 {
5981 int ch = re->first_byte & 255;
5982 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5983 "" : " (caseless)";
5984 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5985 else printf("First char = \\x%02x%s\n", ch, caseless);
5986 }
5987
5988 if ((re->options & PCRE_REQCHSET) != 0)
5989 {
5990 int ch = re->req_byte & 255;
5991 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5992 "" : " (caseless)";
5993 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5994 else printf("Req char = \\x%02x%s\n", ch, caseless);
5995 }
5996
5997 pcre_printint(re, stdout, TRUE);
5998
5999 /* This check is done here in the debugging case so that the code that
6000 was compiled can be seen. */
6001
6002 if (code - codestart > length)
6003 {
6004 (pcre_free)(re);
6005 *errorptr = error_texts[ERR23];
6006 *erroroffset = ptr - (uschar *)pattern;
6007 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6008 return NULL;
6009 }
6010 #endif /* DEBUG */
6011
6012 return (pcre *)re;
6013 }
6014
6015 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12