/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 220 - (show annotations) (download)
Thu Aug 16 13:29:39 2007 UTC (7 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 189301 byte(s)
Tidies for 7.3-RC5 prerelease

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: sets bit number b in
the byte array a. Both parameters are fully parenthesised so that expression
arguments such as SETBIT(map, c + 1) select the intended byte and bit. Note
that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems. It is
indexed by (c - '0') in check_escape(), so the first entry is for '0' and the
last entry is for 'z'. */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems. It is indexed
by (c - 0x48) in check_escape(); the hex value in the comment at the start of
each row is the character code of that row's first entry, and each row holds
eight entries. */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE) */
144
145 typedef struct verbitem {
146 const char *name;
147 int len;
148 int op;
149 } verbitem;
150
151 static verbitem verbs[] = {
152 { "ACCEPT", 6, OP_ACCEPT },
153 { "COMMIT", 6, OP_COMMIT },
154 { "F", 1, OP_FAIL },
155 { "FAIL", 4, OP_FAIL },
156 { "PRUNE", 5, OP_PRUNE },
157 { "SKIP", 4, OP_SKIP },
158 { "THEN", 4, OP_THEN }
159 };
160
161 static int verbcount = sizeof(verbs)/sizeof(verbitem);
162
163
164 /* Tables of names of POSIX character classes and their lengths. The list is
165 terminated by a zero length entry. The first three must be alpha, lower, upper,
166 as this is assumed for handling case independence. */
167
168 static const char *const posix_names[] = {
169 "alpha", "lower", "upper",
170 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
171 "print", "punct", "space", "word", "xdigit" };
172
173 static const uschar posix_name_lengths[] = {
174 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175
176 /* Table of class bit maps for each POSIX class. Each class is formed from a
177 base map, with an optional addition or removal of another map. Then, for some
178 classes, there is some additional tweaking: for [:blank:] the vertical space
179 characters are removed, and for [:alpha:] and [:alnum:] the underscore
180 character is removed. The triples in the table consist of the base map offset,
181 second map offset or -1 if no second map, and a non-negative value for map
182 addition or a negative value for map subtraction (if there are two maps). The
183 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184 remove vertical space characters, 2 => remove underscore. */
185
186 static const int posix_class_maps[] = {
187 cbit_word, cbit_digit, -2, /* alpha - word minus digit, then remove underscore */
188 cbit_lower, -1, 0, /* lower - single map, no tweaking */
189 cbit_upper, -1, 0, /* upper - single map, no tweaking */
190 cbit_word, -1, 2, /* alnum - word without underscore */
191 cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */
192 cbit_space, -1, 1, /* blank - a GNU extension; space without vertical space */
193 cbit_cntrl, -1, 0, /* cntrl */
194 cbit_digit, -1, 0, /* digit */
195 cbit_graph, -1, 0, /* graph */
196 cbit_print, -1, 0, /* print */
197 cbit_punct, -1, 0, /* punct */
198 cbit_space, -1, 0, /* space */
199 cbit_word, -1, 0, /* word - a Perl extension */
200 cbit_xdigit,-1, 0 /* xdigit */
201 };
202
203
/* Two-level stringification, so that a macro argument is expanded before it
is turned into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used. The array of pointers is itself const, as well as the
strings it points to, so that no entry can be overwritten. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  "(*VERB) with an argument is not supported",
  /* 60 */
  "(*VERB) not recognized",
  "number is too big"
};
288
289
290 /* Table to identify digits and hex digits. This is used when compiling
291 patterns. Note that the tables in chartables are dependent on the locale, and
292 may mark arbitrary characters as digits - but the PCRE compiling code expects
293 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
294 a private table here. It costs 256 bytes, but it is a lot faster than doing
295 character value tests (at least in some simple cases I timed), and in some
296 applications one wants PCRE to compile efficiently as well as match
297 efficiently.
298
299 For convenience, we use the same bit definitions as in chartables:
300
301 0x04 decimal digit
302 0x08 hexadecimal digit
303
304 Then we can use ctype_digit and ctype_xdigit in the code. */
305
306 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems. Entries
for 0-9 hold 0x0c (decimal digit and hexadecimal digit); entries for A-F and
a-f hold 0x08 (hexadecimal digit only); every other entry is zero. */
307 static const unsigned char digitab[] =
308 {
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- ' */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
315 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
316 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
317 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
321 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
341
342 #else /* This is the "abnormal" case, for EBCDIC systems. The same bit
values are used as in the ASCII table. The hex number that ends the comment on
every second row is the character code of that row's first entry. */
343 static const unsigned char digitab[] =
344 {
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 64- 71 40 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
360 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
361 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
362 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
369 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
375 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
376 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
377
378 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup;
check_escape() treats a character as alphameric when (entry & 0x0E) is
non-zero */
379 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
380 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
381 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
383 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
387 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 64- 71 */
388 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
390 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
392 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
395 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
396 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
397 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
398 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
399 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
400 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
401 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
402 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
403 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
404 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
405 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
406 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
407 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
408 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
409 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
410 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
411 #endif
412
413
414 /* Definition to allow mutual recursion */
415
416 static BOOL
417 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 int *, int *, branch_chain *, compile_data *, int *);
419
420
421
422 /*************************************************
423 * Handle escapes *
424 *************************************************/
425
426 /* This function is called when a \ has been encountered. It either returns a
427 positive value for a simple escape such as \n, or a negative value which
428 encodes one of the more complicated things such as \d. A backreference to group
429 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431 ptr is pointing at the \. On exit, it is on the final character of the escape
432 sequence.
433
434 Arguments:
435 ptrptr points to the pattern position pointer
436 errorcodeptr points to the errorcode variable
437 bracount number of previous extracting brackets
438 options the options bits
439 isclass TRUE if inside a character class
440
441 Returns: zero or positive => a data character
442 negative => a special escape sequence
443 on error, errorcodeptr is set
444 */
445
446 static int
447 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448 int options, BOOL isclass)
449 {
450 BOOL utf8 = (options & PCRE_UTF8) != 0;
451 const uschar *ptr = *ptrptr + 1;
452 int c, i;
453
454 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455 ptr--; /* Set pointer back to the last byte */
456
457 /* If backslash is at the end of the pattern, it's an error. */
458
459 if (c == 0) *errorcodeptr = ERR1;
460
461 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462 a table. A non-zero result is something that can be returned immediately.
463 Otherwise further processing may be required. */
464
465 #ifndef EBCDIC /* ASCII coding */
466 else if (c < '0' || c > 'z') {} /* Not alphameric */
467 else if ((i = escapes[c - '0']) != 0) c = i;
468
469 #else /* EBCDIC coding */
470 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471 else if ((i = escapes[c - 0x48]) != 0) c = i;
472 #endif
473
474 /* Escapes that need further processing, or are illegal. */
475
476 else
477 {
478 const uschar *oldptr;
479 BOOL braced, negated;
480
481 switch (c)
482 {
483 /* A number of Perl escapes are not handled by PCRE. We give an explicit
484 error. */
485
486 case 'l':
487 case 'L':
488 case 'N':
489 case 'u':
490 case 'U':
491 *errorcodeptr = ERR37;
492 break;
493
494 /* \g must be followed by a number, either plain or braced. If positive, it
495 is an absolute backreference. If negative, it is a relative backreference.
496 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497 reference to a named group. This is part of Perl's movement towards a
498 unified syntax for back references. As this is synonymous with \k{name}, we
499 fudge it up by pretending it really was \k. */
500
501 case 'g':
502 if (ptr[1] == '{')
503 {
504 const uschar *p;
505 for (p = ptr+2; *p != 0 && *p != '}'; p++)
506 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 if (*p != 0 && *p != '}')
508 {
509 c = -ESC_k;
510 break;
511 }
512 braced = TRUE;
513 ptr++;
514 }
515 else braced = FALSE;
516
517 if (ptr[1] == '-')
518 {
519 negated = TRUE;
520 ptr++;
521 }
522 else negated = FALSE;
523
524 c = 0;
525 while ((digitab[ptr[1]] & ctype_digit) != 0)
526 c = c * 10 + *(++ptr) - '0';
527
528 if (c < 0)
529 {
530 *errorcodeptr = ERR61;
531 break;
532 }
533
534 if (c == 0 || (braced && *(++ptr) != '}'))
535 {
536 *errorcodeptr = ERR57;
537 break;
538 }
539
540 if (negated)
541 {
542 if (c > bracount)
543 {
544 *errorcodeptr = ERR15;
545 break;
546 }
547 c = bracount - (c - 1);
548 }
549
550 c = -(ESC_REF + c);
551 break;
552
553 /* The handling of escape sequences consisting of a string of digits
554 starting with one that is not zero is not straightforward. By experiment,
555 the way Perl works seems to be as follows:
556
557 Outside a character class, the digits are read as a decimal number. If the
558 number is less than 10, or if there are that many previous extracting
559 left brackets, then it is a back reference. Otherwise, up to three octal
560 digits are read to form an escaped byte. Thus \123 is likely to be octal
561 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562 value is greater than 377, the least significant 8 bits are taken. Inside a
563 character class, \ followed by a digit is always an octal number. */
564
565 case '1': case '2': case '3': case '4': case '5':
566 case '6': case '7': case '8': case '9':
567
568 if (!isclass)
569 {
570 oldptr = ptr;
571 c -= '0';
572 while ((digitab[ptr[1]] & ctype_digit) != 0)
573 c = c * 10 + *(++ptr) - '0';
574 if (c < 0)
575 {
576 *errorcodeptr = ERR61;
577 break;
578 }
579 if (c < 10 || c <= bracount)
580 {
581 c = -(ESC_REF + c);
582 break;
583 }
584 ptr = oldptr; /* Put the pointer back and fall through */
585 }
586
587 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588 generates a binary zero byte and treats the digit as a following literal.
589 Thus we have to pull back the pointer by one. */
590
591 if ((c = *ptr) >= '8')
592 {
593 ptr--;
594 c = 0;
595 break;
596 }
597
598 /* \0 always starts an octal number, but we may drop through to here with a
599 larger first octal digit. The original code used just to take the least
600 significant 8 bits of octal numbers (I think this is what early Perls used
601 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602 than 3 octal digits. */
603
604 case '0':
605 c -= '0';
606 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607 c = c * 8 + *(++ptr) - '0';
608 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 break;
610
611 /* \x is complicated. \x{ddd} is a character number which can be greater
612 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613 treated as a data character. */
614
615 case 'x':
616 if (ptr[1] == '{')
617 {
618 const uschar *pt = ptr + 2;
619 int count = 0;
620
621 c = 0;
622 while ((digitab[*pt] & ctype_xdigit) != 0)
623 {
624 register int cc = *pt++;
625 if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 count++;
627
628 #ifndef EBCDIC /* ASCII coding */
629 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 #else /* EBCDIC coding */
632 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 #endif
635 }
636
637 if (*pt == '}')
638 {
639 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 ptr = pt;
641 break;
642 }
643
644 /* If the sequence of hex digits does not end with '}', then we don't
645 recognize this construct; fall through to the normal \x handling. */
646 }
647
648 /* Read just a single-byte hex-defined char */
649
650 c = 0;
651 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652 {
653 int cc; /* Some compilers don't like ++ */
654 cc = *(++ptr); /* in initializers */
655 #ifndef EBCDIC /* ASCII coding */
656 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 #else /* EBCDIC coding */
659 if (cc <= 'z') cc += 64; /* Convert to upper case */
660 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661 #endif
662 }
663 break;
664
665 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666 This coding is ASCII-specific, but then the whole concept of \cx is
667 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668
669 case 'c':
670 c = *(++ptr);
671 if (c == 0)
672 {
673 *errorcodeptr = ERR2;
674 break;
675 }
676
677 #ifndef EBCDIC /* ASCII coding */
678 if (c >= 'a' && c <= 'z') c -= 32;
679 c ^= 0x40;
680 #else /* EBCDIC coding */
681 if (c >= 'a' && c <= 'z') c += 64;
682 c ^= 0xC0;
683 #endif
684 break;
685
686 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688 for Perl compatibility, it is a literal. This code looks a bit odd, but
689 there used to be some cases other than the default, and there may be again
690 in future, so I haven't "optimized" it. */
691
692 default:
693 if ((options & PCRE_EXTRA) != 0) switch(c)
694 {
695 default:
696 *errorcodeptr = ERR3;
697 break;
698 }
699 break;
700 }
701 }
702
703 *ptrptr = ptr;
704 return c;
705 }
706
707
708
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence (or, after an error, at the character where parsing stopped).

The property name is either a single character, or a name of up to 31
characters enclosed in {}, optionally preceded by ^ for negation. The name is
looked up in _pcre_utt by binary chop, which relies on that table being in
strcmp() order of its name fields.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is always set, including when an error is returned
  dptr           points to an int that is set to the detailed property value
                   (not set if the name is not recognized)
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence or ERR47 for an unknown name

Returns:         type value from ucp_type_table, or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Give the caller's flag a defined value before anything can fail. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy the name. The loop stops early at '}'; running off the end of the
  pattern, or filling the buffer without seeing '}', is an error. */

  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c != '}') goto ERROR_RETURN;
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

/* The name was well formed but is not in the table. *ptrptr has already been
updated above. */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
798
799
800
801
802 /*************************************************
803 * Check for counted repeat *
804 *************************************************/
805
806 /* This function is called when a '{' is encountered in a place where it might
807 start a quantifier. It looks ahead to see if it really is a quantifier or not.
808 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809 where the ddds are digits.
810
811 Arguments:
812 p pointer to the first char after '{'
813
814 Returns: TRUE or FALSE
815 */
816
817 static BOOL
818 is_counted_repeat(const uschar *p)
819 {
820 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821 while ((digitab[*p] & ctype_digit) != 0) p++;
822 if (*p == '}') return TRUE;
823
824 if (*p++ != ',') return FALSE;
825 if (*p == '}') return TRUE;
826
827 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828 while ((digitab[*p] & ctype_digit) != 0) p++;
829
830 return (*p == '}');
831 }
832
833
834
835 /*************************************************
836 * Read repeat counts *
837 *************************************************/
838
839 /* Read an item of the form {n,m} and return the values. This is called only
840 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841 so the syntax is guaranteed to be correct, but we need to check the values.
842
843 Arguments:
844 p pointer to first char after '{'
845 minp pointer to int for min
846 maxp pointer to int for max
847 returned as -1 if no max
848 errorcodeptr points to error code variable
849
850 Returns: pointer to '}' on success;
851 current ptr on error, with errorcodeptr set non-zero
852 */
853
854 static const uschar *
855 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856 {
857 int min = 0;
858 int max = -1;
859
860 /* Read the minimum value and do a paranoid check: a negative value indicates
861 an integer overflow. */
862
863 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 if (min < 0 || min > 65535)
865 {
866 *errorcodeptr = ERR5;
867 return p;
868 }
869
870 /* Read the maximum value if there is one, and again do a paranoid on its size.
871 Also, max must not be less than min. */
872
873 if (*p == '}') max = min; else
874 {
875 if (*(++p) != '}')
876 {
877 max = 0;
878 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 if (max < 0 || max > 65535)
880 {
881 *errorcodeptr = ERR5;
882 return p;
883 }
884 if (max < min)
885 {
886 *errorcodeptr = ERR4;
887 return p;
888 }
889 }
890 }
891
892 /* Fill in the required variables, and pass back the pointer to the terminating
893 '}'. */
894
895 *minp = min;
896 *maxp = max;
897 return p;
898 }
899
900
901
902 /*************************************************
903 * Find forward referenced subpattern *
904 *************************************************/
905
906 /* This function scans along a pattern's text looking for capturing
907 subpatterns, and counting them. If it finds a named pattern that matches the
908 name it is given, it returns its number. Alternatively, if the name is NULL, it
909 returns when it reaches a given numbered subpattern. This is used for forward
910 references to subpatterns. We know that if (?P< is encountered, the name will
911 be terminated by '>' because that is checked in the first pass.
912
913 Arguments:
914 ptr current position in the pattern
915 count current count of capturing parens so far encountered
916 name name to seek, or NULL if seeking a numbered subpattern
917 lorn name length, or subpattern number if name is NULL
918 xmode TRUE if we are in /x mode
919
920 Returns: the number of the named subpattern, or -1 if not found
921 */
922
923 static int
924 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925 BOOL xmode)
926 {
927 const uschar *thisname;
928
929 for (; *ptr != 0; ptr++)
930 {
931 int term;
932
933 /* Skip over backslashed characters and also entire \Q...\E */
934
935 if (*ptr == '\\')
936 {
937 if (*(++ptr) == 0) return -1;
938 if (*ptr == 'Q') for (;;)
939 {
940 while (*(++ptr) != 0 && *ptr != '\\');
941 if (*ptr == 0) return -1;
942 if (*(++ptr) == 'E') break;
943 }
944 continue;
945 }
946
947 /* Skip over character classes */
948
949 if (*ptr == '[')
950 {
951 while (*(++ptr) != ']')
952 {
953 if (*ptr == 0) return -1;
954 if (*ptr == '\\')
955 {
956 if (*(++ptr) == 0) return -1;
957 if (*ptr == 'Q') for (;;)
958 {
959 while (*(++ptr) != 0 && *ptr != '\\');
960 if (*ptr == 0) return -1;
961 if (*(++ptr) == 'E') break;
962 }
963 continue;
964 }
965 }
966 continue;
967 }
968
969 /* Skip comments in /x mode */
970
971 if (xmode && *ptr == '#')
972 {
973 while (*(++ptr) != 0 && *ptr != '\n');
974 if (*ptr == 0) return -1;
975 continue;
976 }
977
978 /* An opening parens must now be a real metacharacter */
979
980 if (*ptr != '(') continue;
981 if (ptr[1] != '?' && ptr[1] != '*')
982 {
983 count++;
984 if (name == NULL && count == lorn) return count;
985 continue;
986 }
987
988 ptr += 2;
989 if (*ptr == 'P') ptr++; /* Allow optional P */
990
991 /* We have to disambiguate (?<! and (?<= from (?<name> */
992
993 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994 *ptr != '\'')
995 continue;
996
997 count++;
998
999 if (name == NULL && count == lorn) return count;
1000 term = *ptr++;
1001 if (term == '<') term = '>';
1002 thisname = ptr;
1003 while (*ptr != term) ptr++;
1004 if (name != NULL && lorn == ptr - thisname &&
1005 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006 return count;
1007 }
1008
1009 return -1;
1010 }
1011
1012
1013
1014 /*************************************************
1015 * Find first significant op code *
1016 *************************************************/
1017
1018 /* This is called by several functions that scan a compiled expression looking
1019 for a fixed first character, or an anchoring op code etc. It skips over things
1020 that do not influence this. For some calls, a change of option is important.
1021 For some calls, it makes sense to skip negative forward and all backward
1022 assertions, and also the \b assertion; for others it does not.
1023
1024 Arguments:
1025 code pointer to the start of the group
1026 options pointer to external options
1027 optbit the option bit whose changing is significant, or
1028 zero if none are
1029 skipassert TRUE if certain assertions are to be skipped
1030
1031 Returns: pointer to the first significant opcode
1032 */
1033
1034 static const uschar*
1035 first_significant_code(const uschar *code, int *options, int optbit,
1036 BOOL skipassert)
1037 {
1038 for (;;)
1039 {
1040 switch ((int)*code)
1041 {
1042 case OP_OPT:
1043 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1044 *options = (int)code[1];
1045 code += 2;
1046 break;
1047
1048 case OP_ASSERT_NOT:
1049 case OP_ASSERTBACK:
1050 case OP_ASSERTBACK_NOT:
1051 if (!skipassert) return code;
1052 do code += GET(code, 1); while (*code == OP_ALT);
1053 code += _pcre_OP_lengths[*code];
1054 break;
1055
1056 case OP_WORD_BOUNDARY:
1057 case OP_NOT_WORD_BOUNDARY:
1058 if (!skipassert) return code;
1059 /* Fall through */
1060
1061 case OP_CALLOUT:
1062 case OP_CREF:
1063 case OP_RREF:
1064 case OP_DEF:
1065 code += _pcre_OP_lengths[*code];
1066 break;
1067
1068 default:
1069 return code;
1070 }
1071 }
1072 /* Control never reaches here */
1073 }
1074
1075
1076
1077
1078 /*************************************************
1079 * Find the fixed length of a pattern *
1080 *************************************************/
1081
1082 /* Scan a pattern and compute the fixed length of subject that will match it,
1083 if the length is fixed. This is needed for dealing with backward assertions.
1084 In UTF8 mode, the result is in characters rather than bytes.
1085
1086 Arguments:
1087 code points to the start of the pattern (the bracket)
1088 options the compiling options
1089
1090 Returns: the fixed length, or -1 if there is no fixed length,
1091 or -2 if \C was encountered
1092 */
1093
1094 static int
1095 find_fixedlength(uschar *code, int options)
1096 {
1097 int length = -1;
1098
1099 register int branchlength = 0;
1100 register uschar *cc = code + 1 + LINK_SIZE;
1101
1102 /* Scan along the opcodes for this branch. If we get to the end of the
1103 branch, check the length against that of the other branches. */
1104
1105 for (;;)
1106 {
1107 int d;
1108 register int op = *cc;
1109 switch (op)
1110 {
1111 case OP_CBRA:
1112 case OP_BRA:
1113 case OP_ONCE:
1114 case OP_COND:
1115 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1116 if (d < 0) return d;
1117 branchlength += d;
1118 do cc += GET(cc, 1); while (*cc == OP_ALT);
1119 cc += 1 + LINK_SIZE;
1120 break;
1121
1122 /* Reached end of a branch; if it's a ket it is the end of a nested
1123 call. If it's ALT it is an alternation in a nested call. If it is
1124 END it's the end of the outer call. All can be handled by the same code. */
1125
1126 case OP_ALT:
1127 case OP_KET:
1128 case OP_KETRMAX:
1129 case OP_KETRMIN:
1130 case OP_END:
1131 if (length < 0) length = branchlength;
1132 else if (length != branchlength) return -1;
1133 if (*cc != OP_ALT) return length;
1134 cc += 1 + LINK_SIZE;
1135 branchlength = 0;
1136 break;
1137
1138 /* Skip over assertive subpatterns */
1139
1140 case OP_ASSERT:
1141 case OP_ASSERT_NOT:
1142 case OP_ASSERTBACK:
1143 case OP_ASSERTBACK_NOT:
1144 do cc += GET(cc, 1); while (*cc == OP_ALT);
1145 /* Fall through */
1146
1147 /* Skip over things that don't match chars */
1148
1149 case OP_REVERSE:
1150 case OP_CREF:
1151 case OP_RREF:
1152 case OP_DEF:
1153 case OP_OPT:
1154 case OP_CALLOUT:
1155 case OP_SOD:
1156 case OP_SOM:
1157 case OP_EOD:
1158 case OP_EODN:
1159 case OP_CIRC:
1160 case OP_DOLL:
1161 case OP_NOT_WORD_BOUNDARY:
1162 case OP_WORD_BOUNDARY:
1163 cc += _pcre_OP_lengths[*cc];
1164 break;
1165
1166 /* Handle literal characters */
1167
1168 case OP_CHAR:
1169 case OP_CHARNC:
1170 case OP_NOT:
1171 branchlength++;
1172 cc += 2;
1173 #ifdef SUPPORT_UTF8
1174 if ((options & PCRE_UTF8) != 0)
1175 {
1176 while ((*cc & 0xc0) == 0x80) cc++;
1177 }
1178 #endif
1179 break;
1180
1181 /* Handle exact repetitions. The count is already in characters, but we
1182 need to skip over a multibyte character in UTF8 mode. */
1183
1184 case OP_EXACT:
1185 branchlength += GET2(cc,1);
1186 cc += 4;
1187 #ifdef SUPPORT_UTF8
1188 if ((options & PCRE_UTF8) != 0)
1189 {
1190 while((*cc & 0x80) == 0x80) cc++;
1191 }
1192 #endif
1193 break;
1194
1195 case OP_TYPEEXACT:
1196 branchlength += GET2(cc,1);
1197 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1198 cc += 4;
1199 break;
1200
1201 /* Handle single-char matchers */
1202
1203 case OP_PROP:
1204 case OP_NOTPROP:
1205 cc += 2;
1206 /* Fall through */
1207
1208 case OP_NOT_DIGIT:
1209 case OP_DIGIT:
1210 case OP_NOT_WHITESPACE:
1211 case OP_WHITESPACE:
1212 case OP_NOT_WORDCHAR:
1213 case OP_WORDCHAR:
1214 case OP_ANY:
1215 branchlength++;
1216 cc++;
1217 break;
1218
1219 /* The single-byte matcher isn't allowed */
1220
1221 case OP_ANYBYTE:
1222 return -2;
1223
1224 /* Check a class for variable quantification */
1225
1226 #ifdef SUPPORT_UTF8
1227 case OP_XCLASS:
1228 cc += GET(cc, 1) - 33;
1229 /* Fall through */
1230 #endif
1231
1232 case OP_CLASS:
1233 case OP_NCLASS:
1234 cc += 33;
1235
1236 switch (*cc)
1237 {
1238 case OP_CRSTAR:
1239 case OP_CRMINSTAR:
1240 case OP_CRQUERY:
1241 case OP_CRMINQUERY:
1242 return -1;
1243
1244 case OP_CRRANGE:
1245 case OP_CRMINRANGE:
1246 if (GET2(cc,1) != GET2(cc,3)) return -1;
1247 branchlength += GET2(cc,1);
1248 cc += 5;
1249 break;
1250
1251 default:
1252 branchlength++;
1253 }
1254 break;
1255
1256 /* Anything else is variable length */
1257
1258 default:
1259 return -1;
1260 }
1261 }
1262 /* Control never gets here */
1263 }
1264
1265
1266
1267
1268 /*************************************************
1269 * Scan compiled regex for numbered bracket *
1270 *************************************************/
1271
1272 /* This little function scans through a compiled pattern until it finds a
1273 capturing bracket with the given number.
1274
1275 Arguments:
1276 code points to start of expression
1277 utf8 TRUE in UTF-8 mode
1278 number the required bracket number
1279
1280 Returns: pointer to the opcode for the bracket, or NULL if not found
1281 */
1282
1283 static const uschar *
1284 find_bracket(const uschar *code, BOOL utf8, int number)
1285 {
1286 for (;;)
1287 {
1288 register int c = *code;
1289 if (c == OP_END) return NULL;
1290
1291 /* XCLASS is used for classes that cannot be represented just by a bit
1292 map. This includes negated single high-valued characters. The length in
1293 the table is zero; the actual length is stored in the compiled code. */
1294
1295 if (c == OP_XCLASS) code += GET(code, 1);
1296
1297 /* Handle capturing bracket */
1298
1299 else if (c == OP_CBRA)
1300 {
1301 int n = GET2(code, 1+LINK_SIZE);
1302 if (n == number) return (uschar *)code;
1303 code += _pcre_OP_lengths[c];
1304 }
1305
1306 /* Otherwise, we can get the item's length from the table, except that for
1307 repeated character types, we have to test for \p and \P, which have an extra
1308 two bytes of parameters. */
1309
1310 else
1311 {
1312 switch(c)
1313 {
1314 case OP_TYPESTAR:
1315 case OP_TYPEMINSTAR:
1316 case OP_TYPEPLUS:
1317 case OP_TYPEMINPLUS:
1318 case OP_TYPEQUERY:
1319 case OP_TYPEMINQUERY:
1320 case OP_TYPEUPTO:
1321 case OP_TYPEMINUPTO:
1322 case OP_TYPEEXACT:
1323 case OP_TYPEPOSSTAR:
1324 case OP_TYPEPOSPLUS:
1325 case OP_TYPEPOSQUERY:
1326 case OP_TYPEPOSUPTO:
1327 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1328 break;
1329 }
1330
1331 /* Add in the fixed length from the table */
1332
1333 code += _pcre_OP_lengths[c];
1334
1335 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1336 a multi-byte character. The length in the table is a minimum, so we have to
1337 arrange to skip the extra bytes. */
1338
1339 #ifdef SUPPORT_UTF8
1340 if (utf8) switch(c)
1341 {
1342 case OP_CHAR:
1343 case OP_CHARNC:
1344 case OP_EXACT:
1345 case OP_UPTO:
1346 case OP_MINUPTO:
1347 case OP_POSUPTO:
1348 case OP_STAR:
1349 case OP_MINSTAR:
1350 case OP_POSSTAR:
1351 case OP_PLUS:
1352 case OP_MINPLUS:
1353 case OP_POSPLUS:
1354 case OP_QUERY:
1355 case OP_MINQUERY:
1356 case OP_POSQUERY:
1357 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1358 break;
1359 }
1360 #endif
1361 }
1362 }
1363 }
1364
1365
1366
1367 /*************************************************
1368 * Scan compiled regex for recursion reference *
1369 *************************************************/
1370
1371 /* This little function scans through a compiled pattern until it finds an
1372 instance of OP_RECURSE.
1373
1374 Arguments:
1375 code points to start of expression
1376 utf8 TRUE in UTF-8 mode
1377
1378 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1379 */
1380
1381 static const uschar *
1382 find_recurse(const uschar *code, BOOL utf8)
1383 {
1384 for (;;)
1385 {
1386 register int c = *code;
1387 if (c == OP_END) return NULL;
1388 if (c == OP_RECURSE) return code;
1389
1390 /* XCLASS is used for classes that cannot be represented just by a bit
1391 map. This includes negated single high-valued characters. The length in
1392 the table is zero; the actual length is stored in the compiled code. */
1393
1394 if (c == OP_XCLASS) code += GET(code, 1);
1395
1396 /* Otherwise, we can get the item's length from the table, except that for
1397 repeated character types, we have to test for \p and \P, which have an extra
1398 two bytes of parameters. */
1399
1400 else
1401 {
1402 switch(c)
1403 {
1404 case OP_TYPESTAR:
1405 case OP_TYPEMINSTAR:
1406 case OP_TYPEPLUS:
1407 case OP_TYPEMINPLUS:
1408 case OP_TYPEQUERY:
1409 case OP_TYPEMINQUERY:
1410 case OP_TYPEUPTO:
1411 case OP_TYPEMINUPTO:
1412 case OP_TYPEEXACT:
1413 case OP_TYPEPOSSTAR:
1414 case OP_TYPEPOSPLUS:
1415 case OP_TYPEPOSQUERY:
1416 case OP_TYPEPOSUPTO:
1417 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1418 break;
1419 }
1420
1421 /* Add in the fixed length from the table */
1422
1423 code += _pcre_OP_lengths[c];
1424
1425 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1426 by a multi-byte character. The length in the table is a minimum, so we have
1427 to arrange to skip the extra bytes. */
1428
1429 #ifdef SUPPORT_UTF8
1430 if (utf8) switch(c)
1431 {
1432 case OP_CHAR:
1433 case OP_CHARNC:
1434 case OP_EXACT:
1435 case OP_UPTO:
1436 case OP_MINUPTO:
1437 case OP_POSUPTO:
1438 case OP_STAR:
1439 case OP_MINSTAR:
1440 case OP_POSSTAR:
1441 case OP_PLUS:
1442 case OP_MINPLUS:
1443 case OP_POSPLUS:
1444 case OP_QUERY:
1445 case OP_MINQUERY:
1446 case OP_POSQUERY:
1447 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1448 break;
1449 }
1450 #endif
1451 }
1452 }
1453 }
1454
1455
1456
1457 /*************************************************
1458 * Scan compiled branch for non-emptiness *
1459 *************************************************/
1460
1461 /* This function scans through a branch of a compiled pattern to see whether it
1462 can match the empty string or not. It is called from could_be_empty()
1463 below and from compile_branch() when checking for an unlimited repeat of a
1464 group that can match nothing. Note that first_significant_code() skips over
1465 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1466 struck an inner bracket whose current branch will already have been scanned.
1467
1468 Arguments:
1469 code points to start of search
1470 endcode points to where to stop
1471 utf8 TRUE if in UTF8 mode
1472
1473 Returns: TRUE if what is matched could be empty
1474 */
1475
1476 static BOOL
1477 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1478 {
1479 register int c;
1480 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1481 code < endcode;
1482 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1483 {
1484 const uschar *ccode;
1485
1486 c = *code;
1487
1488 /* Groups with zero repeats can of course be empty; skip them. */
1489
1490 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1491 {
1492 code += _pcre_OP_lengths[c];
1493 do code += GET(code, 1); while (*code == OP_ALT);
1494 c = *code;
1495 continue;
1496 }
1497
1498 /* For other groups, scan the branches. */
1499
1500 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1501 {
1502 BOOL empty_branch;
1503 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1504
1505 /* Scan a closed bracket */
1506
1507 empty_branch = FALSE;
1508 do
1509 {
1510 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1511 empty_branch = TRUE;
1512 code += GET(code, 1);
1513 }
1514 while (*code == OP_ALT);
1515 if (!empty_branch) return FALSE; /* All branches are non-empty */
1516 c = *code;
1517 continue;
1518 }
1519
1520 /* Handle the other opcodes */
1521
1522 switch (c)
1523 {
1524 /* Check for quantifiers after a class. XCLASS is used for classes that
1525 cannot be represented just by a bit map. This includes negated single
1526 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1527 actual length is stored in the compiled code, so we must update "code"
1528 here. */
1529
1530 #ifdef SUPPORT_UTF8
1531 case OP_XCLASS:
1532 ccode = code += GET(code, 1);
1533 goto CHECK_CLASS_REPEAT;
1534 #endif
1535
1536 case OP_CLASS:
1537 case OP_NCLASS:
1538 ccode = code + 33;
1539
1540 #ifdef SUPPORT_UTF8
1541 CHECK_CLASS_REPEAT:
1542 #endif
1543
1544 switch (*ccode)
1545 {
1546 case OP_CRSTAR: /* These could be empty; continue */
1547 case OP_CRMINSTAR:
1548 case OP_CRQUERY:
1549 case OP_CRMINQUERY:
1550 break;
1551
1552 default: /* Non-repeat => class must match */
1553 case OP_CRPLUS: /* These repeats aren't empty */
1554 case OP_CRMINPLUS:
1555 return FALSE;
1556
1557 case OP_CRRANGE:
1558 case OP_CRMINRANGE:
1559 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1560 break;
1561 }
1562 break;
1563
1564 /* Opcodes that must match a character */
1565
1566 case OP_PROP:
1567 case OP_NOTPROP:
1568 case OP_EXTUNI:
1569 case OP_NOT_DIGIT:
1570 case OP_DIGIT:
1571 case OP_NOT_WHITESPACE:
1572 case OP_WHITESPACE:
1573 case OP_NOT_WORDCHAR:
1574 case OP_WORDCHAR:
1575 case OP_ANY:
1576 case OP_ANYBYTE:
1577 case OP_CHAR:
1578 case OP_CHARNC:
1579 case OP_NOT:
1580 case OP_PLUS:
1581 case OP_MINPLUS:
1582 case OP_POSPLUS:
1583 case OP_EXACT:
1584 case OP_NOTPLUS:
1585 case OP_NOTMINPLUS:
1586 case OP_NOTPOSPLUS:
1587 case OP_NOTEXACT:
1588 case OP_TYPEPLUS:
1589 case OP_TYPEMINPLUS:
1590 case OP_TYPEPOSPLUS:
1591 case OP_TYPEEXACT:
1592 return FALSE;
1593
1594 /* End of branch */
1595
1596 case OP_KET:
1597 case OP_KETRMAX:
1598 case OP_KETRMIN:
1599 case OP_ALT:
1600 return TRUE;
1601
1602 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1603 MINUPTO, and POSUPTO may be followed by a multibyte character */
1604
1605 #ifdef SUPPORT_UTF8
1606 case OP_STAR:
1607 case OP_MINSTAR:
1608 case OP_POSSTAR:
1609 case OP_QUERY:
1610 case OP_MINQUERY:
1611 case OP_POSQUERY:
1612 case OP_UPTO:
1613 case OP_MINUPTO:
1614 case OP_POSUPTO:
1615 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1616 break;
1617 #endif
1618 }
1619 }
1620
1621 return TRUE;
1622 }
1623
1624
1625
1626 /*************************************************
1627 * Scan compiled regex for non-emptiness *
1628 *************************************************/
1629
1630 /* This function is called to check for left recursive calls. We want to check
1631 the current branch of the current pattern to see if it could match the empty
1632 string. If it could, we must look outwards for branches at other levels,
1633 stopping when we pass beyond the bracket which is the subject of the recursion.
1634
1635 Arguments:
1636 code points to start of the recursion
1637 endcode points to where to stop (current RECURSE item)
1638 bcptr points to the chain of current (unclosed) branch starts
1639 utf8 TRUE if in UTF-8 mode
1640
1641 Returns: TRUE if what is matched could be empty
1642 */
1643
1644 static BOOL
1645 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1646 BOOL utf8)
1647 {
1648 while (bcptr != NULL && bcptr->current >= code)
1649 {
1650 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1651 bcptr = bcptr->outer;
1652 }
1653 return TRUE;
1654 }
1655
1656
1657
1658 /*************************************************
1659 * Check for POSIX class syntax *
1660 *************************************************/
1661
1662 /* This function is called when the sequence "[:" or "[." or "[=" is
1663 encountered in a character class. It checks whether this is followed by an
1664 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1665 ".]" or "=]".
1666
1667 Argument:
1668 ptr pointer to the initial [
1669 endptr where to return the end pointer
1670 cd pointer to compile data
1671
1672 Returns: TRUE or FALSE
1673 */
1674
1675 static BOOL
1676 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1677 {
1678 int terminator; /* Don't combine these lines; the Solaris cc */
1679 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1680 if (*(++ptr) == '^') ptr++;
1681 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1682 if (*ptr == terminator && ptr[1] == ']')
1683 {
1684 *endptr = ptr;
1685 return TRUE;
1686 }
1687 return FALSE;
1688 }
1689
1690
1691
1692
1693 /*************************************************
1694 * Check POSIX class name *
1695 *************************************************/
1696
1697 /* This function is called to check the name given in a POSIX-style class entry
1698 such as [:alnum:].
1699
1700 Arguments:
1701 ptr points to the first letter
1702 len the length of the name
1703
1704 Returns: a value representing the name, or -1 if unknown
1705 */
1706
1707 static int
1708 check_posix_name(const uschar *ptr, int len)
1709 {
1710 register int yield = 0;
1711 while (posix_name_lengths[yield] != 0)
1712 {
1713 if (len == posix_name_lengths[yield] &&
1714 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1715 yield++;
1716 }
1717 return -1;
1718 }
1719
1720
1721 /*************************************************
1722 * Adjust OP_RECURSE items in repeated group *
1723 *************************************************/
1724
1725 /* OP_RECURSE items contain an offset from the start of the regex to the group
1726 that is referenced. This means that groups can be replicated for fixed
1727 repetition simply by copying (because the recursion is allowed to refer to
1728 earlier groups that are outside the current group). However, when a group is
1729 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1730 it, after it has been compiled. This means that any OP_RECURSE items within it
1731 that refer to the group itself or any contained groups have to have their
1732 offsets adjusted. That is one of the jobs of this function. Before it is called,
1733 the partially compiled regex must be temporarily terminated with OP_END.
1734
1735 This function has been extended with the possibility of forward references for
1736 recursions and subroutine calls. It must also check the list of such references
1737 for the group we are dealing with. If it finds that one of the recursions in
1738 the current group is on this list, it adjusts the offset in the list, not the
1739 value in the reference (which is a group number).
1740
1741 Arguments:
1742 group points to the start of the group
1743 adjust the amount by which the group is to be moved
1744 utf8 TRUE in UTF-8 mode
1745 cd contains pointers to tables etc.
1746 save_hwm the hwm forward reference pointer at the start of the group
1747
1748 Returns: nothing
1749 */
1750
1751 static void
1752 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1753 uschar *save_hwm)
1754 {
1755 uschar *ptr = group;
1756 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1757 {
1758 int offset;
1759 uschar *hc;
1760
1761 /* See if this recursion is on the forward reference list. If so, adjust the
1762 reference. */
1763
1764 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1765 {
1766 offset = GET(hc, 0);
1767 if (cd->start_code + offset == ptr + 1)
1768 {
1769 PUT(hc, 0, offset + adjust);
1770 break;
1771 }
1772 }
1773
1774 /* Otherwise, adjust the recursion offset if it's after the start of this
1775 group. */
1776
1777 if (hc >= cd->hwm)
1778 {
1779 offset = GET(ptr, 1);
1780 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1781 }
1782
1783 ptr += 1 + LINK_SIZE;
1784 }
1785 }
1786
1787
1788
1789 /*************************************************
1790 * Insert an automatic callout point *
1791 *************************************************/
1792
1793 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1794 callout points before each pattern item.
1795
1796 Arguments:
1797 code current code pointer
1798 ptr current pattern pointer
1799 cd pointers to tables etc
1800
1801 Returns: new code pointer
1802 */
1803
1804 static uschar *
1805 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1806 {
1807 *code++ = OP_CALLOUT;
1808 *code++ = 255;
1809 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1810 PUT(code, LINK_SIZE, 0); /* Default length */
1811 return code + 2*LINK_SIZE;
1812 }
1813
1814
1815
1816 /*************************************************
1817 * Complete a callout item *
1818 *************************************************/
1819
1820 /* A callout item contains the length of the next item in the pattern, which
1821 we can't fill in till after we have reached the relevant point. This is used
1822 for both automatic and manual callouts.
1823
1824 Arguments:
1825 previous_callout points to previous callout item
1826 ptr current pattern pointer
1827 cd pointers to tables etc
1828
1829 Returns: nothing
1830 */
1831
1832 static void
1833 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1834 {
1835 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1836 PUT(previous_callout, 2 + LINK_SIZE, length);
1837 }
1838
1839
1840
1841 #ifdef SUPPORT_UCP
1842 /*************************************************
1843 * Get othercase range *
1844 *************************************************/
1845
1846 /* This function is passed the start and end of a class range, in UTF-8 mode
1847 with UCP support. It searches up the characters, looking for internal ranges of
1848 characters in the "other" case. Each call returns the next one, updating the
1849 start address.
1850
1851 Arguments:
1852 cptr points to starting character value; updated
1853 d end value
1854 ocptr where to put start of othercase range
1855 odptr where to put end of othercase range
1856
1857 Yield: TRUE when range returned; FALSE when no more
1858 */
1859
1860 static BOOL
1861 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1862 unsigned int *odptr)
1863 {
1864 unsigned int c, othercase, next;
1865
1866 for (c = *cptr; c <= d; c++)
1867 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1868
1869 if (c > d) return FALSE;
1870
1871 *ocptr = othercase;
1872 next = othercase + 1;
1873
1874 for (++c; c <= d; c++)
1875 {
1876 if (_pcre_ucp_othercase(c) != next) break;
1877 next++;
1878 }
1879
1880 *odptr = next - 1;
1881 *cptr = c;
1882
1883 return TRUE;
1884 }
1885 #endif /* SUPPORT_UCP */
1886
1887
1888
1889 /*************************************************
1890 * Check if auto-possessifying is possible *
1891 *************************************************/
1892
1893 /* This function is called for unlimited repeats of certain items, to see
1894 whether the next thing could possibly match the repeated item. If not, it makes
1895 sense to automatically possessify the repeated item.
1896
1897 Arguments:
1898 op_code the repeated op code
1899 item data for this item, depends on the opcode
1900 utf8 TRUE in UTF-8 mode
1901 utf8_char used for utf8 character bytes, NULL if not relevant
1902 ptr next character in pattern
1903 options options bits
1904 cd contains pointers to tables etc.
1905
1906 Returns: TRUE if possessifying is wanted
1907 */
1908
1909 static BOOL
1910 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1911 const uschar *ptr, int options, compile_data *cd)
1912 {
1913 int next;
1914
1915 /* Skip whitespace and comments in extended mode */
1916
1917 if ((options & PCRE_EXTENDED) != 0)
1918 {
1919 for (;;)
1920 {
1921 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1922 if (*ptr == '#')
1923 {
1924 while (*(++ptr) != 0)
1925 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1926 }
1927 else break;
1928 }
1929 }
1930
1931 /* If the next item is one that we can handle, get its value. A non-negative
1932 value is a character, a negative value is an escape value. */
1933
1934 if (*ptr == '\\')
1935 {
1936 int temperrorcode = 0;
1937 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1938 if (temperrorcode != 0) return FALSE;
1939 ptr++; /* Point after the escape sequence */
1940 }
1941
1942 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1943 {
1944 #ifdef SUPPORT_UTF8
1945 if (utf8) { GETCHARINC(next, ptr); } else
1946 #endif
1947 next = *ptr++;
1948 }
1949
1950 else return FALSE;
1951
1952 /* Skip whitespace and comments in extended mode */
1953
1954 if ((options & PCRE_EXTENDED) != 0)
1955 {
1956 for (;;)
1957 {
1958 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1959 if (*ptr == '#')
1960 {
1961 while (*(++ptr) != 0)
1962 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1963 }
1964 else break;
1965 }
1966 }
1967
1968 /* If the next thing is itself optional, we have to give up. */
1969
1970 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1971 return FALSE;
1972
1973 /* Now compare the next item with the previous opcode. If the previous is a
1974 positive single character match, "item" either contains the character or, if
1975 "item" is greater than 127 in utf8 mode, the character's bytes are in
1976 utf8_char. */
1977
1978
1979 /* Handle cases when the next item is a character. */
1980
1981 if (next >= 0) switch(op_code)
1982 {
1983 case OP_CHAR:
1984 #ifdef SUPPORT_UTF8
1985 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1986 #endif
1987 return item != next;
1988
1989 /* For CHARNC (caseless character) we must check the other case. If we have
1990 Unicode property support, we can use it to test the other case of
1991 high-valued characters. */
1992
1993 case OP_CHARNC:
1994 #ifdef SUPPORT_UTF8
1995 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1996 #endif
1997 if (item == next) return FALSE;
1998 #ifdef SUPPORT_UTF8
1999 if (utf8)
2000 {
2001 unsigned int othercase;
2002 if (next < 128) othercase = cd->fcc[next]; else
2003 #ifdef SUPPORT_UCP
2004 othercase = _pcre_ucp_othercase((unsigned int)next);
2005 #else
2006 othercase = NOTACHAR;
2007 #endif
2008 return (unsigned int)item != othercase;
2009 }
2010 else
2011 #endif /* SUPPORT_UTF8 */
2012 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2013
2014 /* For OP_NOT, "item" must be a single-byte character. */
2015
2016 case OP_NOT:
2017 if (next < 0) return FALSE; /* Not a character */
2018 if (item == next) return TRUE;
2019 if ((options & PCRE_CASELESS) == 0) return FALSE;
2020 #ifdef SUPPORT_UTF8
2021 if (utf8)
2022 {
2023 unsigned int othercase;
2024 if (next < 128) othercase = cd->fcc[next]; else
2025 #ifdef SUPPORT_UCP
2026 othercase = _pcre_ucp_othercase(next);
2027 #else
2028 othercase = NOTACHAR;
2029 #endif
2030 return (unsigned int)item == othercase;
2031 }
2032 else
2033 #endif /* SUPPORT_UTF8 */
2034 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2035
2036 case OP_DIGIT:
2037 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2038
2039 case OP_NOT_DIGIT:
2040 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2041
2042 case OP_WHITESPACE:
2043 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2044
2045 case OP_NOT_WHITESPACE:
2046 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2047
2048 case OP_WORDCHAR:
2049 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2050
2051 case OP_NOT_WORDCHAR:
2052 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2053
2054 case OP_HSPACE:
2055 case OP_NOT_HSPACE:
2056 switch(next)
2057 {
2058 case 0x09:
2059 case 0x20:
2060 case 0xa0:
2061 case 0x1680:
2062 case 0x180e:
2063 case 0x2000:
2064 case 0x2001:
2065 case 0x2002:
2066 case 0x2003:
2067 case 0x2004:
2068 case 0x2005:
2069 case 0x2006:
2070 case 0x2007:
2071 case 0x2008:
2072 case 0x2009:
2073 case 0x200A:
2074 case 0x202f:
2075 case 0x205f:
2076 case 0x3000:
2077 return op_code != OP_HSPACE;
2078 default:
2079 return op_code == OP_HSPACE;
2080 }
2081
2082 case OP_VSPACE:
2083 case OP_NOT_VSPACE:
2084 switch(next)
2085 {
2086 case 0x0a:
2087 case 0x0b:
2088 case 0x0c:
2089 case 0x0d:
2090 case 0x85:
2091 case 0x2028:
2092 case 0x2029:
2093 return op_code != OP_VSPACE;
2094 default:
2095 return op_code == OP_VSPACE;
2096 }
2097
2098 default:
2099 return FALSE;
2100 }
2101
2102
2103 /* Handle the case when the next item is \d, \s, etc. */
2104
2105 switch(op_code)
2106 {
2107 case OP_CHAR:
2108 case OP_CHARNC:
2109 #ifdef SUPPORT_UTF8
2110 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2111 #endif
2112 switch(-next)
2113 {
2114 case ESC_d:
2115 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2116
2117 case ESC_D:
2118 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2119
2120 case ESC_s:
2121 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2122
2123 case ESC_S:
2124 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2125
2126 case ESC_w:
2127 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2128
2129 case ESC_W:
2130 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2131
2132 case ESC_h:
2133 case ESC_H:
2134 switch(item)
2135 {
2136 case 0x09:
2137 case 0x20:
2138 case 0xa0:
2139 case 0x1680:
2140 case 0x180e:
2141 case 0x2000:
2142 case 0x2001:
2143 case 0x2002:
2144 case 0x2003:
2145 case 0x2004:
2146 case 0x2005:
2147 case 0x2006:
2148 case 0x2007:
2149 case 0x2008:
2150 case 0x2009:
2151 case 0x200A:
2152 case 0x202f:
2153 case 0x205f:
2154 case 0x3000:
2155 return -next != ESC_h;
2156 default:
2157 return -next == ESC_h;
2158 }
2159
2160 case ESC_v:
2161 case ESC_V:
2162 switch(item)
2163 {
2164 case 0x0a:
2165 case 0x0b:
2166 case 0x0c:
2167 case 0x0d:
2168 case 0x85:
2169 case 0x2028:
2170 case 0x2029:
2171 return -next != ESC_v;
2172 default:
2173 return -next == ESC_v;
2174 }
2175
2176 default:
2177 return FALSE;
2178 }
2179
2180 case OP_DIGIT:
2181 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2182 next == -ESC_h || next == -ESC_v;
2183
2184 case OP_NOT_DIGIT:
2185 return next == -ESC_d;
2186
2187 case OP_WHITESPACE:
2188 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2189
2190 case OP_NOT_WHITESPACE:
2191 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2192
2193 case OP_HSPACE:
2194 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2195
2196 case OP_NOT_HSPACE:
2197 return next == -ESC_h;
2198
2199 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2200 case OP_VSPACE:
2201 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2202
2203 case OP_NOT_VSPACE:
2204 return next == -ESC_v;
2205
2206 case OP_WORDCHAR:
2207 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2208
2209 case OP_NOT_WORDCHAR:
2210 return next == -ESC_w || next == -ESC_d;
2211
2212 default:
2213 return FALSE;
2214 }
2215
2216 /* Control does not reach here */
2217 }
2218
2219
2220
2221 /*************************************************
2222 * Compile one branch *
2223 *************************************************/
2224
2225 /* Scan the pattern, compiling it into a vector. If the options are
2226 changed during the branch, the pointer is used to change the external options
2227 bits. This function is used during the pre-compile phase when we are trying
2228 to find out the amount of memory needed, as well as during the real compile
2229 phase. The value of lengthptr distinguishes the two phases.
2230
2231 Arguments:
2232 optionsptr pointer to the option bits
2233 codeptr points to the pointer to the current code point
2234 ptrptr points to the current pattern pointer
2235 errorcodeptr points to error code variable
2236 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2237 reqbyteptr set to the last literal character required, else < 0
2238 bcptr points to current branch chain
2239 cd contains pointers to tables etc.
2240 lengthptr NULL during the real compile phase
2241 points to length accumulator during pre-compile phase
2242
2243 Returns: TRUE on success
2244 FALSE, with *errorcodeptr set non-zero on error
2245 */
2246
2247 static BOOL
2248 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2249 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2250 compile_data *cd, int *lengthptr)
2251 {
2252 int repeat_type, op_type;
2253 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2254 int bravalue = 0;
2255 int greedy_default, greedy_non_default;
2256 int firstbyte, reqbyte;
2257 int zeroreqbyte, zerofirstbyte;
2258 int req_caseopt, reqvary, tempreqvary;
2259 int options = *optionsptr;
2260 int after_manual_callout = 0;
2261 int length_prevgroup = 0;
2262 register int c;
2263 register uschar *code = *codeptr;
2264 uschar *last_code = code;
2265 uschar *orig_code = code;
2266 uschar *tempcode;
2267 BOOL inescq = FALSE;
2268 BOOL groupsetfirstbyte = FALSE;
2269 const uschar *ptr = *ptrptr;
2270 const uschar *tempptr;
2271 uschar *previous = NULL;
2272 uschar *previous_callout = NULL;
2273 uschar *save_hwm = NULL;
2274 uschar classbits[32];
2275
2276 #ifdef SUPPORT_UTF8
2277 BOOL class_utf8;
2278 BOOL utf8 = (options & PCRE_UTF8) != 0;
2279 uschar *class_utf8data;
2280 uschar utf8_char[6];
2281 #else
2282 BOOL utf8 = FALSE;
2283 uschar *utf8_char = NULL;
2284 #endif
2285
2286 #ifdef DEBUG
2287 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2288 #endif
2289
2290 /* Set up the default and non-default settings for greediness */
2291
2292 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2293 greedy_non_default = greedy_default ^ 1;
2294
2295 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2296 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2297 matches a non-fixed char first char; reqbyte just remains unset if we never
2298 find one.
2299
2300 When we hit a repeat whose minimum is zero, we may have to adjust these values
2301 to take the zero repeat into account. This is implemented by setting them to
2302 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2303 item types that can be repeated set these backoff variables appropriately. */
2304
2305 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2306
2307 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2308 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2309 value > 255. It is added into the firstbyte or reqbyte variables to record the
2310 case status of the value. This is used only for ASCII characters. */
2311
2312 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2313
2314 /* Switch on next character until the end of the branch */
2315
2316 for (;; ptr++)
2317 {
2318 BOOL negate_class;
2319 BOOL possessive_quantifier;
2320 BOOL is_quantifier;
2321 BOOL is_recurse;
2322 BOOL reset_bracount;
2323 int class_charcount;
2324 int class_lastchar;
2325 int newoptions;
2326 int recno;
2327 int refsign;
2328 int skipbytes;
2329 int subreqbyte;
2330 int subfirstbyte;
2331 int terminator;
2332 int mclength;
2333 uschar mcbuffer[8];
2334
2335 /* Get next byte in the pattern */
2336
2337 c = *ptr;
2338
2339 /* If we are in the pre-compile phase, accumulate the length used for the
2340 previous cycle of this loop. */
2341
2342 if (lengthptr != NULL)
2343 {
2344 #ifdef DEBUG
2345 if (code > cd->hwm) cd->hwm = code; /* High water info */
2346 #endif
2347 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2348 {
2349 *errorcodeptr = ERR52;
2350 goto FAILED;
2351 }
2352
2353 /* There is at least one situation where code goes backwards: this is the
2354 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2355 the class is simply eliminated. However, it is created first, so we have to
2356 allow memory for it. Therefore, don't ever reduce the length at this point.
2357 */
2358
2359 if (code < last_code) code = last_code;
2360
2361 /* Paranoid check for integer overflow */
2362
2363 if (OFLOW_MAX - *lengthptr < code - last_code)
2364 {
2365 *errorcodeptr = ERR20;
2366 goto FAILED;
2367 }
2368
2369 *lengthptr += code - last_code;
2370 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2371
2372 /* If "previous" is set and it is not at the start of the work space, move
2373 it back to there, in order to avoid filling up the work space. Otherwise,
2374 if "previous" is NULL, reset the current code pointer to the start. */
2375
2376 if (previous != NULL)
2377 {
2378 if (previous > orig_code)
2379 {
2380 memmove(orig_code, previous, code - previous);
2381 code -= previous - orig_code;
2382 previous = orig_code;
2383 }
2384 }
2385 else code = orig_code;
2386
2387 /* Remember where this code item starts so we can pick up the length
2388 next time round. */
2389
2390 last_code = code;
2391 }
2392
2393 /* In the real compile phase, just check the workspace used by the forward
2394 reference list. */
2395
2396 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2397 {
2398 *errorcodeptr = ERR52;
2399 goto FAILED;
2400 }
2401
2402 /* If in \Q...\E, check for the end; if not, we have a literal */
2403
2404 if (inescq && c != 0)
2405 {
2406 if (c == '\\' && ptr[1] == 'E')
2407 {
2408 inescq = FALSE;
2409 ptr++;
2410 continue;
2411 }
2412 else
2413 {
2414 if (previous_callout != NULL)
2415 {
2416 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2417 complete_callout(previous_callout, ptr, cd);
2418 previous_callout = NULL;
2419 }
2420 if ((options & PCRE_AUTO_CALLOUT) != 0)
2421 {
2422 previous_callout = code;
2423 code = auto_callout(code, ptr, cd);
2424 }
2425 goto NORMAL_CHAR;
2426 }
2427 }
2428
2429 /* Fill in length of a previous callout, except when the next thing is
2430 a quantifier. */
2431
2432 is_quantifier = c == '*' || c == '+' || c == '?' ||
2433 (c == '{' && is_counted_repeat(ptr+1));
2434
2435 if (!is_quantifier && previous_callout != NULL &&
2436 after_manual_callout-- <= 0)
2437 {
2438 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2439 complete_callout(previous_callout, ptr, cd);
2440 previous_callout = NULL;
2441 }
2442
2443 /* In extended mode, skip white space and comments */
2444
2445 if ((options & PCRE_EXTENDED) != 0)
2446 {
2447 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2448 if (c == '#')
2449 {
2450 while (*(++ptr) != 0)
2451 {
2452 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2453 }
2454 if (*ptr != 0) continue;
2455
2456 /* Else fall through to handle end of string */
2457 c = 0;
2458 }
2459 }
2460
2461 /* No auto callout for quantifiers. */
2462
2463 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2464 {
2465 previous_callout = code;
2466 code = auto_callout(code, ptr, cd);
2467 }
2468
2469 switch(c)
2470 {
2471 /* ===================================================================*/
2472 case 0: /* The branch terminates at string end */
2473 case '|': /* or | or ) */
2474 case ')':
2475 *firstbyteptr = firstbyte;
2476 *reqbyteptr = reqbyte;
2477 *codeptr = code;
2478 *ptrptr = ptr;
2479 if (lengthptr != NULL)
2480 {
2481 if (OFLOW_MAX - *lengthptr < code - last_code)
2482 {
2483 *errorcodeptr = ERR20;
2484 goto FAILED;
2485 }
2486 *lengthptr += code - last_code; /* To include callout length */
2487 DPRINTF((">> end branch\n"));
2488 }
2489 return TRUE;
2490
2491
2492 /* ===================================================================*/
2493 /* Handle single-character metacharacters. In multiline mode, ^ disables
2494 the setting of any following char as a first character. */
2495
2496 case '^':
2497 if ((options & PCRE_MULTILINE) != 0)
2498 {
2499 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2500 }
2501 previous = NULL;
2502 *code++ = OP_CIRC;
2503 break;
2504
2505 case '$':
2506 previous = NULL;
2507 *code++ = OP_DOLL;
2508 break;
2509
2510 /* There can never be a first char if '.' is first, whatever happens about
2511 repeats. The value of reqbyte doesn't change either. */
2512
2513 case '.':
2514 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2515 zerofirstbyte = firstbyte;
2516 zeroreqbyte = reqbyte;
2517 previous = code;
2518 *code++ = OP_ANY;
2519 break;
2520
2521
2522 /* ===================================================================*/
2523 /* Character classes. If the included characters are all < 256, we build a
2524 32-byte bitmap of the permitted characters, except in the special case
2525 where there is only one such character. For negated classes, we build the
2526 map as usual, then invert it at the end. However, we use a different opcode
2527 so that data characters > 255 can be handled correctly.
2528
2529 If the class contains characters outside the 0-255 range, a different
2530 opcode is compiled. It may optionally have a bit map for characters < 256,
2531 but those above are explicitly listed afterwards. A flag byte tells
2532 whether the bitmap is present, and whether this is a negated class or not.
2533 */
2534
2535 case '[':
2536 previous = code;
2537
2538 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2539 they are encountered at the top level, so we'll do that too. */
2540
2541 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2542 check_posix_syntax(ptr, &tempptr, cd))
2543 {
2544 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2545 goto FAILED;
2546 }
2547
2548 /* If the first character is '^', set the negation flag and skip it. Also,
2549 if the first few characters (either before or after ^) are \Q\E or \E we
2550 skip them too. This makes for compatibility with Perl. */
2551
2552 negate_class = FALSE;
2553 for (;;)
2554 {
2555 c = *(++ptr);
2556 if (c == '\\')
2557 {
2558 if (ptr[1] == 'E') ptr++;
2559 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2560 else break;
2561 }
2562 else if (!negate_class && c == '^')
2563 negate_class = TRUE;
2564 else break;
2565 }
2566
2567 /* Keep a count of chars with values < 256 so that we can optimize the case
2568 of just a single character (as long as it's < 256). However, for higher
2569 valued UTF-8 characters, we don't yet do any optimization. */
2570
2571 class_charcount = 0;
2572 class_lastchar = -1;
2573
2574 /* Initialize the 32-char bit map to all zeros. We build the map in a
2575 temporary bit of memory, in case the class contains only 1 character (less
2576 than 256), because in that case the compiled code doesn't use the bit map.
2577 */
2578
2579 memset(classbits, 0, 32 * sizeof(uschar));
2580
2581 #ifdef SUPPORT_UTF8
2582 class_utf8 = FALSE; /* No chars >= 256 */
2583 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2584 #endif
2585
2586 /* Process characters until ] is reached. By writing this as a "do" it
2587 means that an initial ] is taken as a data character. At the start of the
2588 loop, c contains the first byte of the character. */
2589
2590 if (c != 0) do
2591 {
2592 const uschar *oldptr;
2593
2594 #ifdef SUPPORT_UTF8
2595 if (utf8 && c > 127)
2596 { /* Braces are required because the */
2597 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2598 }
2599 #endif
2600
2601 /* Inside \Q...\E everything is literal except \E */
2602
2603 if (inescq)
2604 {
2605 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2606 {
2607 inescq = FALSE; /* Reset literal state */
2608 ptr++; /* Skip the 'E' */
2609 continue; /* Carry on with next */
2610 }
2611 goto CHECK_RANGE; /* Could be range if \E follows */
2612 }
2613
2614 /* Handle POSIX class names. Perl allows a negation extension of the
2615 form [:^name:]. A square bracket that doesn't match the syntax is
2616 treated as a literal. We also recognize the POSIX constructions
2617 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2618 5.6 and 5.8 do. */
2619
2620 if (c == '[' &&
2621 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2622 check_posix_syntax(ptr, &tempptr, cd))
2623 {
2624 BOOL local_negate = FALSE;
2625 int posix_class, taboffset, tabopt;
2626 register const uschar *cbits = cd->cbits;
2627 uschar pbits[32];
2628
2629 if (ptr[1] != ':')
2630 {
2631 *errorcodeptr = ERR31;
2632 goto FAILED;
2633 }
2634
2635 ptr += 2;
2636 if (*ptr == '^')
2637 {
2638 local_negate = TRUE;
2639 ptr++;
2640 }
2641
2642 posix_class = check_posix_name(ptr, tempptr - ptr);
2643 if (posix_class < 0)
2644 {
2645 *errorcodeptr = ERR30;
2646 goto FAILED;
2647 }
2648
2649 /* If matching is caseless, upper and lower are converted to
2650 alpha. This relies on the fact that the class table starts with
2651 alpha, lower, upper as the first 3 entries. */
2652
2653 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2654 posix_class = 0;
2655
2656 /* We build the bit map for the POSIX class in a chunk of local store
2657 because we may be adding and subtracting from it, and we don't want to
2658 subtract bits that may be in the main map already. At the end we or the
2659 result into the bit map that is being built. */
2660
2661 posix_class *= 3;
2662
2663 /* Copy in the first table (always present) */
2664
2665 memcpy(pbits, cbits + posix_class_maps[posix_class],
2666 32 * sizeof(uschar));
2667
2668 /* If there is a second table, add or remove it as required. */
2669
2670 taboffset = posix_class_maps[posix_class + 1];
2671 tabopt = posix_class_maps[posix_class + 2];
2672
2673 if (taboffset >= 0)
2674 {
2675 if (tabopt >= 0)
2676 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2677 else
2678 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2679 }
2680
2681 /* Now see if we need to remove any special characters. An option
2682 value of 1 removes vertical space and 2 removes underscore. */
2683
2684 if (tabopt < 0) tabopt = -tabopt;
2685 if (tabopt == 1) pbits[1] &= ~0x3c;
2686 else if (tabopt == 2) pbits[11] &= 0x7f;
2687
2688 /* Add the POSIX table or its complement into the main table that is
2689 being built and we are done. */
2690
2691 if (local_negate)
2692 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2693 else
2694 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2695
2696 ptr = tempptr + 1;
2697 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2698 continue; /* End of POSIX syntax handling */
2699 }
2700
2701 /* Backslash may introduce a single character, or it may introduce one
2702 of the specials, which just set a flag. The sequence \b is a special
2703 case. Inside a class (and only there) it is treated as backspace.
2704 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2705 to 'or' into the one we are building. We assume they have more than one
2706 character in them, so set class_charcount bigger than one. */
2707
2708 if (c == '\\')
2709 {
2710 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2711 if (*errorcodeptr != 0) goto FAILED;
2712
2713 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2714 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2715 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2716 else if (-c == ESC_Q) /* Handle start of quoted string */
2717 {
2718 if (ptr[1] == '\\' && ptr[2] == 'E')
2719 {
2720 ptr += 2; /* avoid empty string */
2721 }
2722 else inescq = TRUE;
2723 continue;
2724 }
2725 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2726
2727 if (c < 0)
2728 {
2729 register const uschar *cbits = cd->cbits;
2730 class_charcount += 2; /* Greater than 1 is what matters */
2731
2732 /* Save time by not doing this in the pre-compile phase. */
2733
2734 if (lengthptr == NULL) switch (-c)
2735 {
2736 case ESC_d:
2737 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2738 continue;
2739
2740 case ESC_D:
2741 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2742 continue;
2743
2744 case ESC_w:
2745 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2746 continue;
2747
2748 case ESC_W:
2749 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2750 continue;
2751
2752 case ESC_s:
2753 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2754 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2755 continue;
2756
2757 case ESC_S:
2758 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2759 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2760 continue;
2761
2762 case ESC_E: /* Perl ignores an orphan \E */
2763 continue;
2764
2765 default: /* Not recognized; fall through */
2766 break; /* Need "default" setting to stop compiler warning. */
2767 }
2768
2769 /* In the pre-compile phase, just do the recognition. */
2770
2771 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2772 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2773
2774 /* We need to deal with \H, \h, \V, and \v in both phases because
2775 they use extra memory. */
2776
2777 if (-c == ESC_h)
2778 {
2779 SETBIT(classbits, 0x09); /* VT */
2780 SETBIT(classbits, 0x20); /* SPACE */
2781 SETBIT(classbits, 0xa0); /* NSBP */
2782 #ifdef SUPPORT_UTF8
2783 if (utf8)
2784 {
2785 class_utf8 = TRUE;
2786 *class_utf8data++ = XCL_SINGLE;
2787 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2788 *class_utf8data++ = XCL_SINGLE;
2789 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2790 *class_utf8data++ = XCL_RANGE;
2791 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2792 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2793 *class_utf8data++ = XCL_SINGLE;
2794 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2795 *class_utf8data++ = XCL_SINGLE;
2796 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2797 *class_utf8data++ = XCL_SINGLE;
2798 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2799 }
2800 #endif
2801 continue;
2802 }
2803
2804 if (-c == ESC_H)
2805 {
2806 for (c = 0; c < 32; c++)
2807 {
2808 int x = 0xff;
2809 switch (c)
2810 {
2811 case 0x09/8: x ^= 1 << (0x09%8); break;
2812 case 0x20/8: x ^= 1 << (0x20%8); break;
2813 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2814 default: break;
2815 }
2816 classbits[c] |= x;
2817 }
2818
2819 #ifdef SUPPORT_UTF8
2820 if (utf8)
2821 {
2822 class_utf8 = TRUE;
2823 *class_utf8data++ = XCL_RANGE;
2824 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2825 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2826 *class_utf8data++ = XCL_RANGE;
2827 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2828 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2829 *class_utf8data++ = XCL_RANGE;
2830 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2831 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2832 *class_utf8data++ = XCL_RANGE;
2833 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2834 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2835 *class_utf8data++ = XCL_RANGE;
2836 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2837 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2838 *class_utf8data++ = XCL_RANGE;
2839 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2840 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2841 *class_utf8data++ = XCL_RANGE;
2842 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2843 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2844 }
2845 #endif
2846 continue;
2847 }
2848
2849 if (-c == ESC_v)
2850 {
2851 SETBIT(classbits, 0x0a); /* LF */
2852 SETBIT(classbits, 0x0b); /* VT */
2853 SETBIT(classbits, 0x0c); /* FF */
2854 SETBIT(classbits, 0x0d); /* CR */
2855 SETBIT(classbits, 0x85); /* NEL */
2856 #ifdef SUPPORT_UTF8
2857 if (utf8)
2858 {
2859 class_utf8 = TRUE;
2860 *class_utf8data++ = XCL_RANGE;
2861 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2862 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2863 }
2864 #endif
2865 continue;
2866 }
2867
2868 if (-c == ESC_V)
2869 {
2870 for (c = 0; c < 32; c++)
2871 {
2872 int x = 0xff;
2873 switch (c)
2874 {
2875 case 0x0a/8: x ^= 1 << (0x0a%8);
2876 x ^= 1 << (0x0b%8);
2877 x ^= 1 << (0x0c%8);
2878 x ^= 1 << (0x0d%8);
2879 break;
2880 case 0x85/8: x ^= 1 << (0x85%8); break;
2881 default: break;
2882 }
2883 classbits[c] |= x;
2884 }
2885
2886 #ifdef SUPPORT_UTF8
2887 if (utf8)
2888 {
2889 class_utf8 = TRUE;
2890 *class_utf8data++ = XCL_RANGE;
2891 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2892 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2893 *class_utf8data++ = XCL_RANGE;
2894 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2895 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2896 }
2897 #endif
2898 continue;
2899 }
2900
2901 /* We need to deal with \P and \p in both phases. */
2902
2903 #ifdef SUPPORT_UCP
2904 if (-c == ESC_p || -c == ESC_P)
2905 {
2906 BOOL negated;
2907 int pdata;
2908 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2909 if (ptype < 0) goto FAILED;
2910 class_utf8 = TRUE;
2911 *class_utf8data++ = ((-c == ESC_p) != negated)?
2912 XCL_PROP : XCL_NOTPROP;
2913 *class_utf8data++ = ptype;
2914 *class_utf8data++ = pdata;
2915 class_charcount -= 2; /* Not a < 256 character */
2916 continue;
2917 }
2918 #endif
2919 /* Unrecognized escapes are faulted if PCRE is running in its
2920 strict mode. By default, for compatibility with Perl, they are
2921 treated as literals. */
2922
2923 if ((options & PCRE_EXTRA) != 0)
2924 {
2925 *errorcodeptr = ERR7;
2926 goto FAILED;
2927 }
2928
2929 class_charcount -= 2; /* Undo the default count from above */
2930 c = *ptr; /* Get the final character and fall through */
2931 }
2932
2933 /* Fall through if we have a single character (c >= 0). This may be
2934 greater than 256 in UTF-8 mode. */
2935
2936 } /* End of backslash handling */
2937
2938 /* A single character may be followed by '-' to form a range. However,
2939 Perl does not permit ']' to be the end of the range. A '-' character
2940 at the end is treated as a literal. Perl ignores orphaned \E sequences
2941 entirely. The code for handling \Q and \E is messy. */
2942
2943 CHECK_RANGE:
2944 while (ptr[1] == '\\' && ptr[2] == 'E')
2945 {
2946 inescq = FALSE;
2947 ptr += 2;
2948 }
2949
2950 oldptr = ptr;
2951
2952 if (!inescq && ptr[1] == '-')
2953 {
2954 int d;
2955 ptr += 2;
2956 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2957
2958 /* If we hit \Q (not followed by \E) at this point, go into escaped
2959 mode. */
2960
2961 while (*ptr == '\\' && ptr[1] == 'Q')
2962 {
2963 ptr += 2;
2964 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2965 inescq = TRUE;
2966 break;
2967 }
2968
2969 if (*ptr == 0 || (!inescq && *ptr == ']'))
2970 {
2971 ptr = oldptr;
2972 goto LONE_SINGLE_CHARACTER;
2973 }
2974
2975 #ifdef SUPPORT_UTF8
2976 if (utf8)
2977 { /* Braces are required because the */
2978 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2979 }
2980 else
2981 #endif
2982 d = *ptr; /* Not UTF-8 mode */
2983
2984 /* The second part of a range can be a single-character escape, but
2985 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2986 in such circumstances. */
2987
2988 if (!inescq && d == '\\')
2989 {
2990 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2991 if (*errorcodeptr != 0) goto FAILED;
2992
2993 /* \b is backspace; \X is literal X; \R is literal R; any other
2994 special means the '-' was literal */
2995
2996 if (d < 0)
2997 {
2998 if (d == -ESC_b) d = '\b';
2999 else if (d == -ESC_X) d = 'X';
3000 else if (d == -ESC_R) d = 'R'; else
3001 {
3002 ptr = oldptr;
3003 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3004 }
3005 }
3006 }
3007
3008 /* Check that the two values are in the correct order. Optimize
3009 one-character ranges */
3010
3011 if (d < c)
3012 {
3013 *errorcodeptr = ERR8;
3014 goto FAILED;
3015 }
3016
3017 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3018
3019 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3020 matching, we have to use an XCLASS with extra data items. Caseless
3021 matching for characters > 127 is available only if UCP support is
3022 available. */
3023
3024 #ifdef SUPPORT_UTF8
3025 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3026 {
3027 class_utf8 = TRUE;
3028
3029 /* With UCP support, we can find the other case equivalents of
3030 the relevant characters. There may be several ranges. Optimize how
3031 they fit with the basic range. */
3032
3033 #ifdef SUPPORT_UCP
3034 if ((options & PCRE_CASELESS) != 0)
3035 {
3036 unsigned int occ, ocd;
3037 unsigned int cc = c;
3038 unsigned int origd = d;
3039 while (get_othercase_range(&cc, origd, &occ, &ocd))
3040 {
3041 if (occ >= (unsigned int)c &&
3042 ocd <= (unsigned int)d)
3043 continue; /* Skip embedded ranges */
3044
3045 if (occ < (unsigned int)c &&
3046 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3047 { /* if there is overlap, */
3048 c = occ; /* noting that if occ < c */
3049 continue; /* we can't have ocd > d */
3050 } /* because a subrange is */
3051 if (ocd > (unsigned int)d &&
3052 occ <= (unsigned int)d + 1) /* always shorter than */
3053 { /* the basic range. */
3054 d = ocd;
3055 continue;
3056 }
3057
3058 if (occ == ocd)
3059 {
3060 *class_utf8data++ = XCL_SINGLE;
3061 }
3062 else
3063 {
3064 *class_utf8data++ = XCL_RANGE;
3065 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3066 }
3067 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3068 }
3069 }
3070 #endif /* SUPPORT_UCP */
3071
3072 /* Now record the original range, possibly modified for UCP caseless
3073 overlapping ranges. */
3074
3075 *class_utf8data++ = XCL_RANGE;
3076 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3077 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3078
3079 /* With UCP support, we are done. Without UCP support, there is no
3080 caseless matching for UTF-8 characters > 127; we can use the bit map
3081 for the smaller ones. */
3082
3083 #ifdef SUPPORT_UCP
3084 continue; /* With next character in the class */
3085 #else
3086 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3087
3088 /* Adjust upper limit and fall through to set up the map */
3089
3090 d = 127;
3091
3092 #endif /* SUPPORT_UCP */
3093 }
3094 #endif /* SUPPORT_UTF8 */
3095
3096 /* We use the bit map for all cases when not in UTF-8 mode; else
3097 ranges that lie entirely within 0-127 when there is UCP support; else
3098 for partial ranges without UCP support. */
3099
3100 class_charcount += d - c + 1;
3101 class_lastchar = d;
3102
3103 /* We can save a bit of time by skipping this in the pre-compile. */
3104
3105 if (lengthptr == NULL) for (; c <= d; c++)
3106 {
3107 classbits[c/8] |= (1 << (c&7));
3108 if ((options & PCRE_CASELESS) != 0)
3109 {
3110 int uc = cd->fcc[c]; /* flip case */
3111 classbits[uc/8] |= (1 << (uc&7));
3112 }
3113 }
3114
3115 continue; /* Go get the next char in the class */
3116 }
3117
3118 /* Handle a lone single character - we can get here for a normal
3119 non-escape char, or after \ that introduces a single character or for an
3120 apparent range that isn't. */
3121
3122 LONE_SINGLE_CHARACTER:
3123
3124 /* Handle a character that cannot go in the bit map */
3125
3126 #ifdef SUPPORT_UTF8
3127 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3128 {
3129 class_utf8 = TRUE;
3130 *class_utf8data++ = XCL_SINGLE;
3131 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3132
3133 #ifdef SUPPORT_UCP
3134 if ((options & PCRE_CASELESS) != 0)
3135 {
3136 unsigned int othercase;
3137 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3138 {
3139 *class_utf8data++ = XCL_SINGLE;
3140 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3141 }
3142 }
3143 #endif /* SUPPORT_UCP */
3144
3145 }
3146 else
3147 #endif /* SUPPORT_UTF8 */
3148
3149 /* Handle a single-byte character */
3150 {
3151 classbits[c/8] |= (1 << (c&7));
3152 if ((options & PCRE_CASELESS) != 0)
3153 {
3154 c = cd->fcc[c]; /* flip case */
3155 classbits[c/8] |= (1 << (c&7));
3156 }
3157 class_charcount++;
3158 class_lastchar = c;
3159 }
3160 }
3161
3162 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3163
3164 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3165
3166 if (c == 0) /* Missing terminating ']' */
3167 {
3168 *errorcodeptr = ERR6;
3169 goto FAILED;
3170 }
3171
3172 /* If class_charcount is 1, we saw precisely one character whose value is
3173 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3174 can optimize the negative case only if there were no characters >= 128
3175 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3176 single-bytes only. This is an historical hangover. Maybe one day we can
3177 tidy these opcodes to handle multi-byte characters.
3178
3179 The optimization throws away the bit map. We turn the item into a
3180 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3181 that OP_NOT does not support multibyte characters. In the positive case, it
3182 can cause firstbyte to be set. Otherwise, there can be no first char if
3183 this item is first, whatever repeat count may follow. In the case of
3184 reqbyte, save the previous value for reinstating. */
3185
3186 #ifdef SUPPORT_UTF8
3187 if (class_charcount == 1 &&
3188 (!utf8 ||
3189 (!class_utf8 && (!negate_class || class_lastchar < 128))))
3190
3191 #else
3192 if (class_charcount == 1)
3193 #endif
3194 {
3195 zeroreqbyte = reqbyte;
3196
3197 /* The OP_NOT opcode works on one-byte characters only. */
3198
3199 if (negate_class)
3200 {
3201 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3202 zerofirstbyte = firstbyte;
3203 *code++ = OP_NOT;
3204 *code++ = class_lastchar;
3205 break;
3206 }
3207
3208 /* For a single, positive character, get the value into mcbuffer, and
3209 then we can handle this with the normal one-character code. */
3210
3211 #ifdef SUPPORT_UTF8
3212 if (utf8 && class_lastchar > 127)
3213 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3214 else
3215 #endif
3216 {
3217 mcbuffer[0] = class_lastchar;
3218 mclength = 1;
3219 }
3220 goto ONE_CHAR;
3221 } /* End of 1-char optimization */
3222
3223 /* The general case - not the one-char optimization. If this is the first
3224 thing in the branch, there can be no first char setting, whatever the
3225 repeat count. Any reqbyte setting must remain unchanged after any kind of
3226 repeat. */
3227
3228 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3229 zerofirstbyte = firstbyte;
3230 zeroreqbyte = reqbyte;
3231
3232 /* If there are characters with values > 255, we have to compile an
3233 extended class, with its own opcode. If there are no characters < 256,
3234 we can omit the bitmap in the actual compiled code. */
3235
3236 #ifdef SUPPORT_UTF8
3237 if (class_utf8)
3238 {
3239 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3240 *code++ = OP_XCLASS;
3241 code += LINK_SIZE;
3242 *code = negate_class? XCL_NOT : 0;
3243
3244 /* If the map is required, move up the extra data to make room for it;
3245 otherwise just move the code pointer to the end of the extra data. */
3246
3247 if (class_charcount > 0)
3248 {
3249 *code++ |= XCL_MAP;
3250 memmove(code + 32, code, class_utf8data - code);
3251 memcpy(code, classbits, 32);
3252 code = class_utf8data + 32;
3253 }
3254 else code = class_utf8data;
3255
3256 /* Now fill in the complete length of the item */
3257
3258 PUT(previous, 1, code - previous);
3259 break; /* End of class handling */
3260 }
3261 #endif
3262
3263 /* If there are no characters > 255, negate the 32-byte map if necessary,
3264 and copy it into the code vector. If this is the first thing in the branch,
3265 there can be no first char setting, whatever the repeat count. Any reqbyte
3266 setting must remain unchanged after any kind of repeat. */
3267
3268 if (negate_class)
3269 {
3270 *code++ = OP_NCLASS;
3271 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3272 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3273 }
3274 else
3275 {
3276 *code++ = OP_CLASS;
3277 memcpy(code, classbits, 32);
3278 }
3279 code += 32;
3280 break;
3281
3282
3283 /* ===================================================================*/
3284 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3285 has been tested above. */
3286
3287 case '{':
3288 if (!is_quantifier) goto NORMAL_CHAR;
3289 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3290 if (*errorcodeptr != 0) goto FAILED;
3291 goto REPEAT;
3292
3293 case '*':
3294 repeat_min = 0;
3295 repeat_max = -1;
3296 goto REPEAT;
3297
3298 case '+':
3299 repeat_min = 1;
3300 repeat_max = -1;
3301 goto REPEAT;
3302
3303 case '?':
3304 repeat_min = 0;
3305 repeat_max = 1;
3306
3307 REPEAT:
3308 if (previous == NULL)
3309 {
3310 *errorcodeptr = ERR9;
3311 goto FAILED;
3312 }
3313
3314 if (repeat_min == 0)
3315 {
3316 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3317 reqbyte = zeroreqbyte; /* Ditto */
3318 }
3319
3320 /* Remember whether this is a variable length repeat */
3321
3322 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3323
3324 op_type = 0; /* Default single-char op codes */
3325 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3326
3327 /* Save start of previous item, in case we have to move it up to make space
3328 for an inserted OP_ONCE for the additional '+' extension. */
3329
3330 tempcode = previous;
3331
3332 /* If the next character is '+', we have a possessive quantifier. This
3333 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3334 If the next character is '?' this is a minimizing repeat, by default,
3335 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3336 repeat type to the non-default. */
3337
3338 if (ptr[1] == '+')
3339 {
3340 repeat_type = 0; /* Force greedy */
3341 possessive_quantifier = TRUE;
3342 ptr++;
3343 }
3344 else if (ptr[1] == '?')
3345 {
3346 repeat_type = greedy_non_default;
3347 ptr++;
3348 }
3349 else repeat_type = greedy_default;
3350
3351 /* If previous was a character match, abolish the item and generate a
3352 repeat item instead. If a char item has a minimum of more than one, ensure
3353 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3354 the first thing in a branch because the x will have gone into firstbyte
3355 instead. */
3356
3357 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3358 {
3359 /* Deal with UTF-8 characters that take up more than one byte. It's
3360 easier to write this out separately than try to macrify it. Use c to
3361 hold the length of the character in bytes, plus 0x80 to flag that it's a
3362 length rather than a small character. */
3363
3364 #ifdef SUPPORT_UTF8
3365 if (utf8 && (code[-1] & 0x80) != 0)
3366 {
3367 uschar *lastchar = code - 1;
3368 while((*lastchar & 0xc0) == 0x80) lastchar--;
3369 c = code - lastchar; /* Length of UTF-8 character */
3370 memcpy(utf8_char, lastchar, c); /* Save the char */
3371 c |= 0x80; /* Flag c as a length */
3372 }
3373 else
3374 #endif
3375
3376 /* Handle the case of a single byte - either with no UTF8 support, or
3377 with UTF-8 disabled, or for a UTF-8 character < 128. */
3378
3379 {
3380 c = code[-1];
3381 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3382 }
3383
3384 /* If the repetition is unlimited, it pays to see if the next thing on
3385 the line is something that cannot possibly match this character. If so,
3386 automatically possessifying this item gains some performance in the case
3387 where the match fails. */
3388
3389 if (!possessive_quantifier &&
3390 repeat_max < 0 &&
3391 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3392 options, cd))
3393 {
3394 repeat_type = 0; /* Force greedy */
3395 possessive_quantifier = TRUE;
3396 }
3397
3398 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3399 }
3400
3401 /* If previous was a single negated character ([^a] or similar), we use
3402 one of the special opcodes, replacing it. The code is shared with single-
3403 character repeats by setting op_type to add a suitable offset into
3404 repeat_type. We can also test for auto-possessification. OP_NOT is
3405 currently used only for single-byte chars. */
3406
3407 else if (*previous == OP_NOT)
3408 {
3409 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3410 c = previous[1];
3411 if (!possessive_quantifier &&
3412 repeat_max < 0 &&
3413 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3414 {
3415 repeat_type = 0; /* Force greedy */
3416 possessive_quantifier = TRUE;
3417 }
3418 goto OUTPUT_SINGLE_REPEAT;
3419 }
3420
3421 /* If previous was a character type match (\d or similar), abolish it and
3422 create a suitable repeat item. The code is shared with single-character
3423 repeats by setting op_type to add a suitable offset into repeat_type. Note
3424 that the Unicode property types will be present only when SUPPORT_UCP is
3425 defined, but we don't wrap the little bits of code here because it just
3426 makes it horribly messy. */
3427
3428 else if (*previous < OP_EODN)
3429 {
3430 uschar *oldcode;
3431 int prop_type, prop_value;
3432 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3433 c = *previous;
3434
3435 if (!possessive_quantifier &&
3436 repeat_max < 0 &&
3437 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3438 {
3439 repeat_type = 0; /* Force greedy */
3440 possessive_quantifier = TRUE;
3441 }
3442
3443 OUTPUT_SINGLE_REPEAT:
3444 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3445 {
3446 prop_type = previous[1];
3447 prop_value = previous[2];
3448 }
3449 else prop_type = prop_value = -1;
3450
3451 oldcode = code;
3452 code = previous; /* Usually overwrite previous item */
3453
3454 /* If the maximum is zero then the minimum must also be zero; Perl allows
3455 this case, so we do too - by simply omitting the item altogether. */
3456
3457 if (repeat_max == 0) goto END_REPEAT;
3458
3459 /* All real repeats make it impossible to handle partial matching (maybe
3460 one day we will be able to remove this restriction). */
3461
3462 if (repeat_max != 1) cd->nopartial = TRUE;
3463
3464 /* Combine the op_type with the repeat_type */
3465
3466 repeat_type += op_type;
3467
3468 /* A minimum of zero is handled either as the special case * or ?, or as
3469 an UPTO, with the maximum given. */
3470
3471 if (repeat_min == 0)
3472 {
3473 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3474 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3475 else
3476 {
3477 *code++ = OP_UPTO + repeat_type;
3478 PUT2INC(code, 0, repeat_max);
3479 }
3480 }
3481
3482 /* A repeat minimum of 1 is optimized into some special cases. If the
3483 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3484 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3485 one less than the maximum. */
3486
3487 else if (repeat_min == 1)
3488 {
3489 if (repeat_max == -1)
3490 *code++ = OP_PLUS + repeat_type;
3491 else
3492 {
3493 code = oldcode; /* leave previous item in place */
3494 if (repeat_max == 1) goto END_REPEAT;
3495 *code++ = OP_UPTO + repeat_type;
3496 PUT2INC(code, 0, repeat_max - 1);
3497 }
3498 }
3499
3500 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3501 handled as an EXACT followed by an UPTO. */
3502
3503 else
3504 {
3505 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3506 PUT2INC(code, 0, repeat_min);
3507
3508 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3509 we have to insert the character for the previous code. For a repeated
3510 Unicode property match, there are two extra bytes that define the
3511 required property. In UTF-8 mode, long characters have their length in
3512 c, with the 0x80 bit as a flag. */
3513
3514 if (repeat_max < 0)
3515 {
3516 #ifdef SUPPORT_UTF8
3517 if (utf8 && c >= 128)
3518 {
3519 memcpy(code, utf8_char, c & 7);
3520 code += c & 7;
3521 }
3522 else
3523 #endif
3524 {
3525 *code++ = c;
3526 if (prop_type >= 0)
3527 {
3528 *code++ = prop_type;
3529 *code++ = prop_value;
3530 }
3531 }
3532 *code++ = OP_STAR + repeat_type;
3533 }
3534
3535 /* Else insert an UPTO if the max is greater than the min, again
3536 preceded by the character, for the previously inserted code. If the
3537 UPTO is just for 1 instance, we can use QUERY instead. */
3538
3539 else if (repeat_max != repeat_min)
3540 {
3541 #ifdef SUPPORT_UTF8
3542 if (utf8 && c >= 128)
3543 {
3544 memcpy(code, utf8_char, c & 7);
3545 code += c & 7;
3546 }
3547 else
3548 #endif
3549 *code++ = c;
3550 if (prop_type >= 0)
3551 {
3552 *code++ = prop_type;
3553 *code++ = prop_value;
3554 }
3555 repeat_max -= repeat_min;
3556
3557 if (repeat_max == 1)
3558 {
3559 *code++ = OP_QUERY + repeat_type;
3560 }
3561 else
3562 {
3563 *code++ = OP_UPTO + repeat_type;
3564 PUT2INC(code, 0, repeat_max);
3565 }
3566 }
3567 }
3568
3569 /* The character or character type itself comes last in all cases. */
3570
3571 #ifdef SUPPORT_UTF8
3572 if (utf8 && c >= 128)
3573 {
3574 memcpy(code, utf8_char, c & 7);
3575 code += c & 7;
3576 }
3577 else
3578 #endif
3579 *code++ = c;
3580
3581 /* For a repeated Unicode property match, there are two extra bytes that
3582 define the required property. */
3583
3584 #ifdef SUPPORT_UCP
3585 if (prop_type >= 0)
3586 {
3587 *code++ = prop_type;
3588 *code++ = prop_value;
3589 }
3590 #endif
3591 }
3592
3593 /* If previous was a character class or a back reference, we put the repeat
3594 stuff after it, but just skip the item if the repeat was {0,0}. */
3595
3596 else if (*previous == OP_CLASS ||
3597 *previous == OP_NCLASS ||
3598 #ifdef SUPPORT_UTF8
3599 *previous == OP_XCLASS ||
3600 #endif
3601 *previous == OP_REF)
3602 {
3603 if (repeat_max == 0)
3604 {
3605 code = previous;
3606 goto END_REPEAT;
3607 }
3608
3609 /* All real repeats make it impossible to handle partial matching (maybe
3610 one day we will be able to remove this restriction). */
3611
3612 if (repeat_max != 1) cd->nopartial = TRUE;
3613
3614 if (repeat_min == 0 && repeat_max == -1)
3615 *code++ = OP_CRSTAR + repeat_type;
3616 else if (repeat_min == 1 && repeat_max == -1)
3617 *code++ = OP_CRPLUS + repeat_type;
3618 else if (repeat_min == 0 && repeat_max == 1)
3619 *code++ = OP_CRQUERY + repeat_type;
3620 else
3621 {
3622 *code++ = OP_CRRANGE + repeat_type;
3623 PUT2INC(code, 0, repeat_min);
3624 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3625 PUT2INC(code, 0, repeat_max);
3626 }
3627 }
3628
3629 /* If previous was a bracket group, we may have to replicate it in certain
3630 cases. */
3631
3632 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3633 *previous == OP_ONCE || *previous == OP_COND)
3634 {
3635 register int i;
3636 int ketoffset = 0;
3637 int len = code - previous;
3638 uschar *bralink = NULL;
3639
3640 /* Repeating a DEFINE group is pointless */
3641
3642 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3643 {
3644 *errorcodeptr = ERR55;
3645 goto FAILED;
3646 }
3647
3648 /* If the maximum repeat count is unlimited, find the end of the bracket
3649 by scanning through from the start, and compute the offset back to it
3650 from the current code pointer. There may be an OP_OPT setting following
3651 the final KET, so we can't find the end just by going back from the code
3652 pointer. */
3653
3654 if (repeat_max == -1)
3655 {
3656 register uschar *ket = previous;
3657 do ket += GET(ket, 1); while (*ket != OP_KET);
3658 ketoffset = code - ket;
3659 }
3660
3661 /* The case of a zero minimum is special because of the need to stick
3662 OP_BRAZERO in front of it, and because the group appears once in the
3663 data, whereas in other cases it appears the minimum number of times. For
3664 this reason, it is simplest to treat this case separately, as otherwise
3665 the code gets far too messy. There are several special subcases when the
3666 minimum is zero. */
3667
3668 if (repeat_min == 0)
3669 {
3670 /* If the maximum is also zero, we just omit the group from the output
3671 altogether. */
3672
3673 if (repeat_max == 0)
3674 {
3675 code = previous;
3676 goto END_REPEAT;
3677 }
3678
3679 /* If the maximum is 1 or unlimited, we just have to stick in the
3680 BRAZERO and do no more at this point. However, we do need to adjust
3681 any OP_RECURSE calls inside the group that refer to the group itself or
3682 any internal or forward referenced group, because the offset is from
3683 the start of the whole regex. Temporarily terminate the pattern while
3684 doing this. */
3685
3686 if (repeat_max <= 1)
3687 {
3688 *code = OP_END;
3689 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3690 memmove(previous+1, previous, len);
3691 code++;
3692 *previous++ = OP_BRAZERO + repeat_type;
3693 }
3694
3695 /* If the maximum is greater than 1 and limited, we have to replicate
3696 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3697 The first one has to be handled carefully because it's the original
3698 copy, which has to be moved up. The remainder can be handled by code
3699 that is common with the non-zero minimum case below. We have to
3700 adjust the value of repeat_max, since one less copy is required. Once
3701 again, we may have to adjust any OP_RECURSE calls inside the group. */
3702
3703 else
3704 {
3705 int offset;
3706 *code = OP_END;
3707 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3708 memmove(previous + 2 + LINK_SIZE, previous, len);
3709 code += 2 + LINK_SIZE;
3710 *previous++ = OP_BRAZERO + repeat_type;
3711 *previous++ = OP_BRA;
3712
3713 /* We chain together the bracket offset fields that have to be
3714 filled in later when the ends of the brackets are reached. */
3715
3716 offset = (bralink == NULL)? 0 : previous - bralink;
3717 bralink = previous;
3718 PUTINC(previous, 0, offset);
3719 }
3720
3721 repeat_max--;
3722 }
3723
3724 /* If the minimum is greater than zero, replicate the group as many
3725 times as necessary, and adjust the maximum to the number of subsequent
3726 copies that we need. If we set a first char from the group, and didn't
3727 set a required char, copy the latter from the former. If there are any
3728 forward reference subroutine calls in the group, there will be entries on
3729 the workspace list; replicate these with an appropriate increment. */
3730
3731 else
3732 {
3733 if (repeat_min > 1)
3734 {
3735 /* In the pre-compile phase, we don't actually do the replication. We
3736 just adjust the length as if we had. Do some paranoid checks for
3737 potential integer overflow. */
3738
3739 if (lengthptr != NULL)
3740 {
3741 int delta = (repeat_min - 1)*length_prevgroup;
3742 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3743 (double)INT_MAX ||
3744 OFLOW_MAX - *lengthptr < delta)
3745 {
3746 *errorcodeptr = ERR20;
3747 goto FAILED;
3748 }
3749 *lengthptr += delta;
3750 }
3751
3752 /* This is compiling for real */
3753
3754 else
3755 {
3756 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3757 for (i = 1; i < repeat_min; i++)
3758 {
3759 uschar *hc;
3760 uschar *this_hwm = cd->hwm;
3761 memcpy(code, previous, len);
3762 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3763 {
3764 PUT(cd->hwm, 0, GET(hc, 0) + len);
3765 cd->hwm += LINK_SIZE;
3766 }
3767 save_hwm = this_hwm;
3768 code += len;
3769 }
3770 }
3771 }
3772
3773 if (repeat_max > 0) repeat_max -= repeat_min;
3774 }
3775
3776 /* This code is common to both the zero and non-zero minimum cases. If
3777 the maximum is limited, it replicates the group in a nested fashion,
3778 remembering the bracket starts on a stack. In the case of a zero minimum,
3779 the first one was set up above. In all cases the repeat_max now specifies
3780 the number of additional copies needed. Again, we must remember to
3781 replicate entries on the forward reference list. */
3782
3783 if (repeat_max >= 0)
3784 {
3785 /* In the pre-compile phase, we don't actually do the replication. We
3786 just adjust the length as if we had. For each repetition we must add 1
3787 to the length for BRAZERO and for all but the last repetition we must
3788 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3789 paranoid checks to avoid integer overflow. */
3790
3791 if (lengthptr != NULL && repeat_max > 0)
3792 {
3793 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3794 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3795 if ((double)repeat_max *
3796 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3797 > (double)INT_MAX ||
3798 OFLOW_MAX - *lengthptr < delta)
3799 {
3800 *errorcodeptr = ERR20;
3801 goto FAILED;
3802 }
3803 *lengthptr += delta;
3804 }
3805
3806 /* This is compiling for real */
3807
3808 else for (i = repeat_max - 1; i >= 0; i--)
3809 {
3810 uschar *hc;
3811 uschar *this_hwm = cd->hwm;
3812
3813 *code++ = OP_BRAZERO + repeat_type;
3814
3815 /* All but the final copy start a new nesting, maintaining the
3816 chain of brackets outstanding. */
3817
3818 if (i != 0)
3819 {
3820 int offset;
3821 *code++ = OP_BRA;
3822 offset = (bralink == NULL)? 0 : code - bralink;
3823 bralink = code;
3824 PUTINC(code, 0, offset);
3825 }
3826
3827 memcpy(code, previous, len);
3828 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3829 {
3830 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3831 cd->hwm += LINK_SIZE;
3832 }
3833 save_hwm = this_hwm;
3834 code += len;
3835 }
3836
3837 /* Now chain through the pending brackets, and fill in their length
3838 fields (which are holding the chain links pro tem). */
3839
3840 while (bralink != NULL)
3841 {
3842 int oldlinkoffset;
3843 int offset = code - bralink + 1;
3844 uschar *bra = code - offset;
3845 oldlinkoffset = GET(bra, 1);
3846 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3847 *code++ = OP_KET;
3848 PUTINC(code, 0, offset);
3849 PUT(bra, 1, offset);
3850 }
3851 }
3852
3853 /* If the maximum is unlimited, set a repeater in the final copy. We
3854 can't just offset backwards from the current code point, because we
3855 don't know if there's been an options resetting after the ket. The
3856 correct offset was computed above.
3857
3858 Then, when we are doing the actual compile phase, check to see whether
3859 this group is a non-atomic one that could match an empty string. If so,
3860 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3861 that runtime checking can be done. [This check is also applied to
3862 atomic groups at runtime, but in a different way.] */
3863
3864 else
3865 {
3866 uschar *ketcode = code - ketoffset;
3867 uschar *bracode = ketcode - GET(ketcode, 1);
3868 *ketcode = OP_KETRMAX + repeat_type;
3869 if (lengthptr == NULL && *bracode != OP_ONCE)
3870 {
3871 uschar *scode = bracode;
3872 do
3873 {
3874 if (could_be_empty_branch(scode, ketcode, utf8))
3875 {
3876 *bracode += OP_SBRA - OP_BRA;
3877 break;
3878 }
3879 scode += GET(scode, 1);
3880 }
3881 while (*scode == OP_ALT);
3882 }
3883 }
3884 }
3885
3886 /* Else there's some kind of shambles */
3887
3888 else
3889 {
3890 *errorcodeptr = ERR11;
3891 goto FAILED;
3892 }
3893
3894 /* If the character following a repeat is '+', or if certain optimization
3895 tests above succeeded, possessive_quantifier is TRUE. For some of the
3896 simpler opcodes, there is a special alternative opcode for this. For
3897 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3898 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3899 but the special opcodes can optimize it a bit. The repeated item starts at
3900 tempcode, not at previous, which might be the first part of a string whose
3901 (former) last char we repeated.
3902
3903 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3904 an 'upto' may follow. We skip over an 'exact' item, and then test the
3905 length of what remains before proceeding. */
3906
3907 if (possessive_quantifier)
3908 {
3909 int len;
3910 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3911 *tempcode == OP_NOTEXACT)
3912 tempcode += _pcre_OP_lengths[*tempcode];
3913 len = code - tempcode;
3914 if (len > 0) switch (*tempcode)
3915 {
3916 case OP_STAR: *tempcode = OP_POSSTAR; break;
3917 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3918 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3919 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3920
3921 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3922 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3923 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3924 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3925
3926 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3927 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3928 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3929 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3930
3931 default:
3932 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3933 code += 1 + LINK_SIZE;
3934 len += 1 + LINK_SIZE;
3935 tempcode[0] = OP_ONCE;
3936 *code++ = OP_KET;
3937 PUTINC(code, 0, len);
3938 PUT(tempcode, 1, len);
3939 break;
3940 }
3941 }
3942
3943 /* In all cases we no longer have a previous item. We also set the
3944 "follows varying string" flag for subsequently encountered reqbytes if
3945 it isn't already set and we have just passed a varying length item. */
3946
3947 END_REPEAT:
3948 previous = NULL;
3949 cd->req_varyopt |= reqvary;
3950 break;
3951
3952
3953 /* ===================================================================*/
3954 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3955 lookbehind or option setting or condition or all the other extended
3956 parenthesis forms. */
3957
3958 case '(':
3959 newoptions = options;
3960 skipbytes = 0;
3961 bravalue = OP_CBRA;
3962 save_hwm = cd->hwm;
3963 reset_bracount = FALSE;
3964
3965 /* First deal with various "verbs" that can be introduced by '*'. */
3966
3967 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3968 {
3969 int i, namelen;
3970 const uschar *name = ++ptr;
3971 previous = NULL;
3972 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3973 if (*ptr == ':')
3974 {
3975 *errorcodeptr = ERR59; /* Not supported */
3976 goto FAILED;
3977 }
3978 if (*ptr != ')')
3979 {
3980 *errorcodeptr = ERR60;
3981 goto FAILED;
3982 }
3983 namelen = ptr - name;
3984 for (i = 0; i < verbcount; i++)
3985 {
3986 if (namelen == verbs[i].len &&
3987 strncmp((char *)name, verbs[i].name, namelen) == 0)
3988 {
3989 *code = verbs[i].op;
3990 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3991 break;
3992 }
3993 }
3994 if (i < verbcount) continue;
3995 *errorcodeptr = ERR60;
3996 goto FAILED;
3997 }
3998
3999 /* Deal with the extended parentheses; all are introduced by '?', and the
4000 appearance of any of them means that this is not a capturing group. */
4001
4002 else if (*ptr == '?')
4003 {
4004 int i, set, unset, namelen;
4005 int *optset;
4006 const uschar *name;
4007 uschar *slot;
4008
4009 switch (*(++ptr))
4010 {
4011 case '#': /* Comment; skip to ket */
4012 ptr++;
4013 while (*ptr != 0 && *ptr != ')') ptr++;
4014 if (*ptr == 0)
4015 {
4016 *errorcodeptr = ERR18;
4017 goto FAILED;
4018 }
4019 continue;
4020
4021
4022 /* ------------------------------------------------------------ */
4023 case '|': /* Reset capture count for each branch */
4024 reset_bracount = TRUE;
4025 /* Fall through */
4026
4027 /* ------------------------------------------------------------ */
4028 case ':': /* Non-capturing bracket */
4029 bravalue = OP_BRA;
4030 ptr++;
4031 break;
4032
4033
4034 /* ------------------------------------------------------------ */
4035 case '(':
4036 bravalue = OP_COND; /* Conditional group */
4037
4038 /* A condition can be an assertion, a number (referring to a numbered
4039 group), a name (referring to a named group), or 'R', referring to
4040 recursion. R<digits> and R&name are also permitted for recursion tests.
4041
4042 There are several syntaxes for testing a named group: (?(name)) is used
4043 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4044
4045 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4046 be the recursive thing or the name 'R' (and similarly for 'R' followed
4047 by digits), and (b) a number could be a name that consists of digits.
4048 In both cases, we look for a name first; if not found, we try the other
4049 cases. */
4050
4051 /* For conditions that are assertions, check the syntax, and then exit
4052 the switch. This will take control down to where bracketed groups,
4053 including assertions, are processed. */
4054
4055 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4056 break;
4057
4058 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4059 below), and all need to skip 3 bytes at the start of the group. */
4060
4061 code[1+LINK_SIZE] = OP_CREF;
4062 skipbytes = 3;
4063 refsign = -1;
4064
4065 /* Check for a test for recursion in a named group. */
4066
4067 if (ptr[1] == 'R' && ptr[2] == '&')
4068 {
4069 terminator = -1;
4070 ptr += 2;
4071 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4072 }
4073
4074 /* Check for a test for a named group's having been set, using the Perl
4075 syntax (?(<name>) or (?('name') */
4076
4077 else if (ptr[1] == '<')
4078 {
4079 terminator = '>';
4080 ptr++;
4081 }
4082 else if (ptr[1] == '\'')
4083 {
4084 terminator = '\'';
4085 ptr++;
4086 }
4087 else
4088 {
4089 terminator = 0;
4090 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4091 }
4092
4093 /* We now expect to read a name; anything else is an error */
4094
4095 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4096 {
4097 ptr += 1; /* To get the right offset */
4098 *errorcodeptr = ERR28;
4099 goto FAILED;
4100 }
4101
4102 /* Read the name, but also get it as a number if it's all digits */
4103
4104 recno = 0;
4105 name = ++ptr;
4106 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4107 {
4108 if (recno >= 0)
4109 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4110 recno * 10 + *ptr - '0' : -1;
4111 ptr++;
4112 }
4113 namelen = ptr - name;
4114
4115 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4116 {
4117 ptr--; /* Error offset */
4118 *errorcodeptr = ERR26;
4119 goto FAILED;
4120 }
4121
4122 /* Do no further checking in the pre-compile phase. */
4123
4124 if (lengthptr != NULL) break;
4125
4126 /* In the real compile we do the work of looking for the actual
4127 reference. If the string started with "+" or "-" we require the rest to
4128 be digits, in which case recno will be set. */
4129
4130 if (refsign > 0)
4131 {
4132 if (recno <= 0)
4133 {
4134 *errorcodeptr = ERR58;
4135 goto FAILED;
4136 }
4137 if (refsign == '-')
4138 {
4139 recno = cd->bracount - recno + 1;
4140 if (recno <= 0)
4141 {
4142 *errorcodeptr = ERR15;
4143 goto FAILED;
4144 }
4145 }
4146 else recno += cd->bracount;
4147 PUT2(code, 2+LINK_SIZE, recno);
4148 break;
4149 }
4150
4151 /* Otherwise (did not start with "+" or "-"), start by looking for the
4152 name. */
4153
4154 slot = cd->name_table;
4155 for (i = 0; i < cd->names_found; i++)
4156 {
4157 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4158 slot += cd->name_entry_size;
4159 }
4160
4161 /* Found a previous named subpattern */
4162
4163 if (i < cd->names_found)
4164 {
4165 recno = GET2(slot, 0);
4166 PUT2(code, 2+LINK_SIZE, recno);
4167 }
4168
4169 /* Search the pattern for a forward reference */
4170
4171 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4172 (options & PCRE_EXTENDED) != 0)) > 0)
4173 {
4174 PUT2(code, 2+LINK_SIZE, i);
4175 }
4176
4177 /* If terminator == 0 it means that the name followed directly after
4178 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4179 some further alternatives to try. For the cases where terminator != 0
4180 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4181 now checked all the possibilities, so give an error. */
4182
4183 else if (terminator != 0)
4184 {
4185 *errorcodeptr = ERR15;
4186 goto FAILED;
4187 }
4188
4189 /* Check for (?(R) for recursion. Allow digits after R to specify a
4190 specific group number. */
4191
4192 else if (*name == 'R')
4193 {
4194 recno = 0;
4195 for (i = 1; i < namelen; i++)
4196 {
4197 if ((digitab[name[i]] & ctype_digit) == 0)
4198 {
4199 *errorcodeptr = ERR15;
4200 goto FAILED;
4201 }
4202 recno = recno * 10 + name[i] - '0';
4203 }
4204 if (recno == 0) recno = RREF_ANY;
4205 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4206 PUT2(code, 2+LINK_SIZE, recno);
4207 }
4208
4209 /* Similarly, check for the (?(DEFINE) "condition", which is always
4210 false. */
4211
4212 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4213 {
4214 code[1+LINK_SIZE] = OP_DEF;
4215 skipbytes = 1;
4216 }
4217
4218 /* Check for the "name" actually being a subpattern number. */
4219
4220 else if (recno > 0)
4221 {
4222 PUT2(code, 2+LINK_SIZE, recno);
4223 }
4224
4225 /* Either an unidentified subpattern, or a reference to (?(0) */
4226
4227 else
4228 {
4229 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4230 goto FAILED;
4231 }
4232 break;
4233
4234
4235 /* ------------------------------------------------------------ */
4236 case '=': /* Positive lookahead */
4237 bravalue = OP_ASSERT;
4238 ptr++;
4239 break;
4240
4241
4242 /* ------------------------------------------------------------ */
4243 case '!': /* Negative lookahead */
4244 ptr++;
4245 if (*ptr == ')') /* Optimize (?!) */
4246 {
4247 *code++ = OP_FAIL;
4248 previous = NULL;
4249 continue;
4250 }
4251 bravalue = OP_ASSERT_NOT;
4252 break;
4253
4254
4255 /* ------------------------------------------------------------ */
4256 case '<': /* Lookbehind or named define */
4257 switch (ptr[1])
4258 {
4259 case '=': /* Positive lookbehind */
4260 bravalue = OP_ASSERTBACK;
4261 ptr += 2;
4262 break;
4263
4264 case '!': /* Negative lookbehind */
4265 bravalue = OP_ASSERTBACK_NOT;
4266 ptr += 2;
4267 break;
4268
4269 default: /* Could be name define, else bad */
4270 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4271 ptr++; /* Correct offset for error */
4272 *errorcodeptr = ERR24;
4273 goto FAILED;
4274 }
4275 break;
4276
4277
4278 /* ------------------------------------------------------------ */
4279 case '>': /* One-time brackets */
4280 bravalue = OP_ONCE;
4281 ptr++;
4282 break;
4283
4284
4285 /* ------------------------------------------------------------ */
4286 case 'C': /* Callout - may be followed by digits; */
4287 previous_callout = code; /* Save for later completion */
4288 after_manual_callout = 1; /* Skip one item before completing */
4289 *code++ = OP_CALLOUT;
4290 {
4291 int n = 0;
4292 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4293 n = n * 10 + *ptr - '0';
4294 if (*ptr != ')')
4295 {
4296 *errorcodeptr = ERR39;
4297 goto FAILED;
4298 }
4299 if (n > 255)
4300 {
4301 *errorcodeptr = ERR38;
4302 goto FAILED;
4303 }
4304 *code++ = n;
4305 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4306 PUT(code, LINK_SIZE, 0); /* Default length */
4307 code += 2 * LINK_SIZE;
4308 }
4309 previous = NULL;
4310 continue;
4311
4312
4313 /* ------------------------------------------------------------ */
4314 case 'P': /* Python-style named subpattern handling */
4315 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4316 {
4317 is_recurse = *ptr == '>';
4318 terminator = ')';
4319 goto NAMED_REF_OR_RECURSE;
4320 }
4321 else if (*ptr != '<') /* Test for Python-style definition */
4322 {
4323 *errorcodeptr = ERR41;
4324 goto FAILED;
4325 }
4326 /* Fall through to handle (?P< as (?< is handled */
4327
4328
4329 /* ------------------------------------------------------------ */
4330 DEFINE_NAME: /* Come here from (?< handling */
4331 case '\'':
4332 {
4333 terminator = (*ptr == '<')? '>' : '\'';
4334 name = ++ptr;
4335
4336 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4337 namelen = ptr - name;
4338
4339 /* In the pre-compile phase, just do a syntax check. */
4340
4341 if (lengthptr != NULL)
4342 {
4343 if (*ptr != terminator)
4344 {
4345 *errorcodeptr = ERR42;
4346 goto FAILED;
4347 }
4348 if (cd->names_found >= MAX_NAME_COUNT)
4349 {
4350 *errorcodeptr = ERR49;
4351 goto FAILED;
4352 }
4353 if (namelen + 3 > cd->name_entry_size)
4354 {
4355 cd->name_entry_size = namelen + 3;
4356 if (namelen > MAX_NAME_SIZE)
4357 {
4358 *errorcodeptr = ERR48;
4359 goto FAILED;
4360 }
4361 }
4362 }
4363
4364 /* In the real compile, create the entry in the table */
4365
4366 else
4367 {
4368 slot = cd->name_table;
4369 for (i = 0; i < cd->names_found; i++)
4370 {
4371 int crc = memcmp(name, slot+2, namelen);
4372 if (crc == 0)
4373 {
4374 if (slot[2+namelen] == 0)
4375 {
4376 if ((options & PCRE_DUPNAMES) == 0)
4377 {
4378 *errorcodeptr = ERR43;
4379 goto FAILED;
4380 }
4381 }
4382 else crc = -1; /* Current name is substring */
4383 }
4384 if (crc < 0)
4385 {
4386 memmove(slot + cd->name_entry_size, slot,
4387 (cd->names_found - i) * cd->name_entry_size);
4388 break;
4389 }
4390 slot += cd->name_entry_size;
4391 }
4392
4393 PUT2(slot, 0, cd->bracount + 1);
4394 memcpy(slot + 2, name, namelen);
4395 slot[2+namelen] = 0;
4396 }
4397 }
4398
4399 /* In both cases, count the number of names we've encountered. */
4400
4401 ptr++; /* Move past > or ' */
4402 cd->names_found++;
4403 goto NUMBERED_GROUP;
4404
4405
4406 /* ------------------------------------------------------------ */
4407 case '&': /* Perl recursion/subroutine syntax */
4408 terminator = ')';
4409 is_recurse = TRUE;
4410 /* Fall through */
4411
4412 /* We come here from the Python syntax above that handles both
4413 references (?P=name) and recursion (?P>name), as well as falling
4414 through from the Perl recursion syntax (?&name). */
4415
4416 NAMED_REF_OR_RECURSE:
4417 name = ++ptr;
4418 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4419 namelen = ptr - name;
4420
4421 /* In the pre-compile phase, do a syntax check and set a dummy
4422 reference number. */
4423
4424 if (lengthptr != NULL)
4425 {
4426 if (*ptr != terminator)
4427 {
4428 *errorcodeptr = ERR42;
4429 goto FAILED;
4430 }
4431 if (namelen > MAX_NAME_SIZE)
4432 {
4433 *errorcodeptr = ERR48;
4434 goto FAILED;
4435 }
4436 recno = 0;
4437 }
4438
4439 /* In the real compile, seek the name in the table */
4440
4441 else
4442 {
4443 slot = cd->name_table;
4444 for (i = 0; i < cd->names_found; i++)
4445 {
4446 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4447 slot += cd->name_entry_size;
4448 }
4449
4450 if (i < cd->names_found) /* Back reference */
4451 {
4452 recno = GET2(slot, 0);
4453 }
4454 else if ((recno = /* Forward back reference */
4455 find_parens(ptr, cd->bracount, name, namelen,
4456 (options & PCRE_EXTENDED) != 0)) <= 0)
4457 {
4458 *errorcodeptr = ERR15;
4459 goto FAILED;
4460 }
4461 }
4462
4463 /* In both phases, we can now go to the code that handles numerical
4464 recursion or backreferences. */
4465
4466 if (is_recurse) goto HANDLE_RECURSION;
4467 else goto HANDLE_REFERENCE;
4468
4469
4470 /* ------------------------------------------------------------ */
4471 case 'R': /* Recursion */
4472 ptr++; /* Same as (?0) */
4473 /* Fall through */
4474
4475
4476 /* ------------------------------------------------------------ */
4477 case '-': case '+':
4478 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4479 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4480 {
4481 const uschar *called;
4482
4483 if ((refsign = *ptr) == '+') ptr++;
4484 else if (refsign == '-')
4485 {
4486 if ((digitab[ptr[1]] & ctype_digit) == 0)
4487 goto OTHER_CHAR_AFTER_QUERY;
4488 ptr++;
4489 }
4490
4491 recno = 0;
4492 while((digitab[*ptr] & ctype_digit) != 0)
4493 recno = recno * 10 + *ptr++ - '0';
4494
4495 if (*ptr != ')')
4496 {
4497 *errorcodeptr = ERR29;
4498 goto FAILED;
4499 }
4500
4501 if (refsign == '-')
4502 {
4503 if (recno == 0)
4504 {
4505 *errorcodeptr = ERR58;
4506 goto FAILED;
4507 }
4508 recno = cd->bracount - recno + 1;
4509 if (recno <= 0)
4510 {
4511 *errorcodeptr = ERR15;
4512 goto FAILED;
4513 }
4514 }
4515 else if (refsign == '+')
4516 {
4517 if (recno == 0)
4518 {
4519 *errorcodeptr = ERR58;
4520 goto FAILED;
4521 }
4522 recno += cd->bracount;
4523 }
4524
4525 /* Come here from code above that handles a named recursion */
4526
4527 HANDLE_RECURSION:
4528
4529 previous = code;
4530 called = cd->start_code;
4531
4532 /* When we are actually compiling, find the bracket that is being
4533 referenced. Temporarily end the regex in case it doesn't exist before
4534 this point. If we end up with a forward reference, first check that
4535 the bracket does occur later so we can give the error (and position)
4536 now. Then remember this forward reference in the workspace so it can
4537 be filled in at the end. */
4538
4539 if (lengthptr == NULL)
4540 {
4541 *code = OP_END;
4542 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4543
4544 /* Forward reference */
4545
4546 if (called == NULL)
4547 {
4548 if (find_parens(ptr, cd->bracount, NULL, recno,
4549 (options & PCRE_EXTENDED) != 0) < 0)
4550 {
4551 *errorcodeptr = ERR15;
4552 goto FAILED;
4553 }
4554 called = cd->start_code + recno;
4555 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4556 }
4557
4558 /* If not a forward reference, and the subpattern is still open,
4559 this is a recursive call. We check to see if this is a left
4560 recursion that could loop for ever, and diagnose that case. */
4561
4562 else if (GET(called, 1) == 0 &&
4563 could_be_empty(called, code, bcptr, utf8))
4564 {
4565 *errorcodeptr = ERR40;
4566 goto FAILED;
4567 }
4568 }
4569
4570 /* Insert the recursion/subroutine item, automatically wrapped inside
4571 "once" brackets. Set up a "previous group" length so that a
4572 subsequent quantifier will work. */
4573
4574 *code = OP_ONCE;
4575 PUT(code, 1, 2 + 2*LINK_SIZE);
4576 code += 1 + LINK_SIZE;
4577
4578 *code = OP_RECURSE;
4579 PUT(code, 1, called - cd->start_code);
4580 code += 1 + LINK_SIZE;
4581
4582 *code = OP_KET;
4583 PUT(code, 1, 2 + 2*LINK_SIZE);
4584 code += 1 + LINK_SIZE;
4585
4586 length_prevgroup = 3 + 3*LINK_SIZE;
4587 }
4588
4589 /* Can't determine a first byte now */
4590
4591 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4592 continue;
4593
4594
4595 /* ------------------------------------------------------------ */
4596 default: /* Other characters: check option setting */
4597 OTHER_CHAR_AFTER_QUERY:
4598 set = unset = 0;
4599 optset = &set;
4600
4601 while (*ptr != ')' && *ptr != ':')
4602 {
4603 switch (*ptr++)
4604 {
4605 case '-': optset = &unset; break;
4606
4607 case 'J': /* Record that it changed in the external options */
4608 *optset |= PCRE_DUPNAMES;
4609 cd->external_options |= PCRE_JCHANGED;
4610 break;
4611
4612 case 'i': *optset |= PCRE_CASELESS; break;
4613 case 'm': *optset |= PCRE_MULTILINE; break;
4614 case 's': *optset |= PCRE_DOTALL; break;
4615 case 'x': *optset |= PCRE_EXTENDED; break;
4616 case 'U': *optset |= PCRE_UNGREEDY; break;
4617 case 'X': *optset |= PCRE_EXTRA; break;
4618
4619 default: *errorcodeptr = ERR12;
4620 ptr--; /* Correct the offset */
4621 goto FAILED;
4622 }
4623 }
4624
4625 /* Set up the changed option bits, but don't change anything yet. */
4626
4627 newoptions = (options | set) & (~unset);
4628
4629 /* If the options ended with ')' this is not the start of a nested
4630 group with option changes, so the options change at this level. If this
4631 item is right at the start of the pattern, the options can be
4632 abstracted and made external in the pre-compile phase, and ignored in
4633 the compile phase. This can be helpful when matching -- for instance in
4634 caseless checking of required bytes.
4635
4636 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4637 definitely *not* at the start of the pattern because something has been
4638 compiled. In the pre-compile phase, however, the code pointer can have
4639 that value after the start, because it gets reset as code is discarded
4640 during the pre-compile. However, this can happen only at top level - if
4641 we are within parentheses, the starting BRA will still be present. At
4642 any parenthesis level, the length value can be used to test if anything
4643 has been compiled at that level. Thus, a test for both these conditions
4644 is necessary to ensure we correctly detect the start of the pattern in
4645 both phases.
4646
4647 If we are not at the pattern start, compile code to change the ims
4648 options if this setting actually changes any of them. We also pass the
4649 new setting back so that it can be put at the start of any following
4650 branches, and when this group ends (if we are in a group), a resetting
4651 item can be compiled. */
4652
4653 if (*ptr == ')')
4654 {
4655 if (code == cd->start_code + 1 + LINK_SIZE &&
4656 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4657 {
4658 cd->external_options = newoptions;
4659 options = newoptions;
4660 }
4661 else
4662 {
4663 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4664 {
4665 *code++ = OP_OPT;
4666 *code++ = newoptions & PCRE_IMS;
4667 }
4668
4669 /* Change options at this level, and pass them back for use
4670 in subsequent branches. Reset the greedy defaults and the case
4671 value for firstbyte and reqbyte. */
4672
4673 *optionsptr = options = newoptions;
4674 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4675 greedy_non_default = greedy_default ^ 1;
4676 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4677 }
4678
4679 previous = NULL; /* This item can't be repeated */
4680 continue; /* It is complete */
4681 }
4682
4683 /* If the options ended with ':' we are heading into a nested group
4684 with possible change of options. Such groups are non-capturing and are
4685 not assertions of any kind. All we need to do is skip over the ':';
4686 the newoptions value is handled below. */
4687
4688 bravalue = OP_BRA;
4689 ptr++;
4690 } /* End of switch for character following (? */
4691 } /* End of (? handling */
4692
4693 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4694 all unadorned brackets become non-capturing and behave like (?:...)
4695 brackets. */
4696
4697 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4698 {
4699 bravalue = OP_BRA;
4700 }
4701
4702 /* Else we have a capturing group. */
4703
4704 else
4705 {
4706 NUMBERED_GROUP:
4707 cd->bracount += 1;
4708 PUT2(code, 1+LINK_SIZE, cd->bracount);
4709 skipbytes = 2;
4710 }
4711
4712 /* Process nested bracketed regex. Assertions may not be repeated, but
4713 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4714 non-register variable in order to be able to pass its address because some
4715 compilers complain otherwise. Pass in a new setting for the ims options if
4716 they have changed. */
4717
4718 previous = (bravalue >= OP_ONCE)? code : NULL;
4719 *code = bravalue;
4720 tempcode = code;
4721 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4722 length_prevgroup = 0; /* Initialize for pre-compile phase */
4723
4724 if (!compile_regex(
4725 newoptions, /* The complete new option state */
4726 options & PCRE_IMS, /* The previous ims option state */
4727 &tempcode, /* Where to put code (updated) */
4728 &ptr, /* Input pointer (updated) */
4729 errorcodeptr, /* Where to put an error message */
4730 (bravalue == OP_ASSERTBACK ||
4731 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4732 reset_bracount, /* True if (?| group */
4733 skipbytes, /* Skip over bracket number */
4734 &subfirstbyte, /* For possible first char */
4735 &subreqbyte, /* For possible last char */
4736 bcptr, /* Current branch chain */
4737 cd, /* Tables block */
4738 (lengthptr == NULL)? NULL : /* Actual compile phase */
4739 &length_prevgroup /* Pre-compile phase */
4740 ))
4741 goto FAILED;
4742
4743 /* At the end of compiling, code is still pointing to the start of the
4744 group, while tempcode has been updated to point past the end of the group
4745 and any option resetting that may follow it. The pattern pointer (ptr)
4746 is on the bracket. */
4747
4748 /* If this is a conditional bracket, check that there are no more than
4749 two branches in the group, or just one if it's a DEFINE group. We do this
4750 in the real compile phase, not in the pre-pass, where the whole group may
4751 not be available. */
4752
4753 if (bravalue == OP_COND && lengthptr == NULL)
4754 {
4755 uschar *tc = code;
4756 int condcount = 0;
4757
4758 do {
4759 condcount++;
4760 tc += GET(tc,1);
4761 }
4762 while (*tc != OP_KET);
4763
4764 /* A DEFINE group is never obeyed inline (the "condition" is always
4765 false). It must have only one branch. */
4766
4767 if (code[LINK_SIZE+1] == OP_DEF)
4768 {
4769 if (condcount > 1)
4770 {
4771 *errorcodeptr = ERR54;
4772 goto FAILED;
4773 }
4774 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4775 }
4776
4777 /* A "normal" conditional group. If there is just one branch, we must not
4778 make use of its firstbyte or reqbyte, because this is equivalent to an
4779 empty second branch. */
4780
4781 else
4782 {
4783 if (condcount > 2)
4784 {
4785 *errorcodeptr = ERR27;
4786 goto FAILED;
4787 }
4788 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4789 }
4790 }
4791
4792 /* Error if hit end of pattern */
4793
4794 if (*ptr != ')')
4795 {
4796 *errorcodeptr = ERR14;
4797 goto FAILED;
4798 }
4799
4800 /* In the pre-compile phase, update the length by the length of the group,
4801 less the brackets at either end. Then reduce the compiled code to just a
4802 set of non-capturing brackets so that it doesn't use much memory if it is
4803 duplicated by a quantifier.*/
4804
4805 if (lengthptr != NULL)
4806 {
4807 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4808 {
4809 *errorcodeptr = ERR20;
4810 goto FAILED;
4811 }
4812 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4813 *code++ = OP_BRA;
4814 PUTINC(code, 0, 1 + LINK_SIZE);
4815 *code++ = OP_KET;
4816 PUTINC(code, 0, 1 + LINK_SIZE);
4817 break; /* No need to waste time with special character handling */
4818 }
4819
4820 /* Otherwise update the main code pointer to the end of the group. */
4821
4822 code = tempcode;
4823
4824 /* For a DEFINE group, required and first character settings are not
4825 relevant. */
4826
4827 if (bravalue == OP_DEF) break;
4828
4829 /* Handle updating of the required and first characters for other types of
4830 group. Update for normal brackets of all kinds, and conditions with two
4831 branches (see code above). If the bracket is followed by a quantifier with
4832 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4833 zerofirstbyte outside the main loop so that they can be accessed for the
4834 back off. */
4835
4836 zeroreqbyte = reqbyte;
4837 zerofirstbyte = firstbyte;
4838 groupsetfirstbyte = FALSE;
4839
4840 if (bravalue >= OP_ONCE)
4841 {
4842 /* If we have not yet set a firstbyte in this branch, take it from the
4843 subpattern, remembering that it was set here so that a repeat of more
4844 than one can replicate it as reqbyte if necessary. If the subpattern has
4845 no firstbyte, set "none" for the whole branch. In both cases, a zero
4846 repeat forces firstbyte to "none". */
4847
4848 if (firstbyte == REQ_UNSET)
4849 {
4850 if (subfirstbyte >= 0)
4851 {
4852 firstbyte = subfirstbyte;
4853 groupsetfirstbyte = TRUE;
4854 }
4855 else firstbyte = REQ_NONE;
4856 zerofirstbyte = REQ_NONE;
4857 }
4858
4859 /* If firstbyte was previously set, convert the subpattern's firstbyte
4860 into reqbyte if there wasn't one, using the vary flag that was in
4861 existence beforehand. */
4862
4863 else if (subfirstbyte >= 0 && subreqbyte < 0)
4864 subreqbyte = subfirstbyte | tempreqvary;
4865
4866 /* If the subpattern set a required byte (or set a first byte that isn't
4867 really the first byte - see above), set it. */
4868
4869 if (subreqbyte >= 0) reqbyte = subreqbyte;
4870 }
4871
4872 /* For a forward assertion, we take the reqbyte, if set. This can be
4873 helpful if the pattern that follows the assertion doesn't set a different
4874 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4875 for an assertion, however because it leads to incorrect effect for patterns
4876 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4877 of a firstbyte. This is overcome by a scan at the end if there's no
4878 firstbyte, looking for an asserted first char. */
4879
4880 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4881 break; /* End of processing '(' */
4882
4883
4884 /* ===================================================================*/
4885 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4886 are arranged to be the negation of the corresponding OP_values. For the
4887 back references, the values are ESC_REF plus the reference number. Only
4888 back references and those types that consume a character may be repeated.
4889 We can test for values between ESC_b and ESC_Z for the latter; this may
4890 have to change if any new ones are ever created. */
4891
4892 case '\\':
4893 tempptr = ptr;
4894 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4895 if (*errorcodeptr != 0) goto FAILED;
4896
4897 if (c < 0)
4898 {
4899 if (-c == ESC_Q) /* Handle start of quoted string */
4900 {
4901 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4902 else inescq = TRUE;
4903 continue;
4904 }
4905
4906 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4907
4908 /* For metasequences that actually match a character, we disable the
4909 setting of a first character if it hasn't already been set. */
4910
4911 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4912 firstbyte = REQ_NONE;
4913
4914 /* Set values to reset to if this is followed by a zero repeat. */
4915
4916 zerofirstbyte = firstbyte;
4917 zeroreqbyte = reqbyte;
4918
4919 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4920 We also support \k{name} (.NET syntax) */
4921
4922 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4923 {
4924 is_recurse = FALSE;
4925 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4926 goto NAMED_REF_OR_RECURSE;
4927 }
4928
4929 /* Back references are handled specially; must disable firstbyte if
4930 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4931 ':' later. */
4932
4933 if (-c >= ESC_REF)
4934 {
4935 recno = -c - ESC_REF;
4936
4937 HANDLE_REFERENCE: /* Come here from named backref handling */
4938 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4939 previous = code;
4940 *code++ = OP_REF;
4941 PUT2INC(code, 0, recno);
4942 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4943 if (recno > cd->top_backref) cd->top_backref = recno;
4944 }
4945
4946 /* So are Unicode property matches, if supported. */
4947
4948 #ifdef SUPPORT_UCP
4949 else if (-c == ESC_P || -c == ESC_p)
4950 {
4951 BOOL negated;
4952 int pdata;
4953 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4954 if (ptype < 0) goto FAILED;
4955 previous = code;
4956 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4957 *code++ = ptype;
4958 *code++ = pdata;
4959 }
4960 #else
4961
4962 /* If Unicode properties are not supported, \X, \P, and \p are not
4963 allowed. */
4964
4965 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4966 {
4967 *errorcodeptr = ERR45;
4968 goto FAILED;
4969 }
4970 #endif
4971
4972 /* For the rest (including \X when Unicode properties are supported), we
4973 can obtain the OP value by negating the escape value. */
4974
4975 else
4976 {
4977 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4978 *code++ = -c;
4979 }
4980 continue;
4981 }
4982
4983 /* We have a data character whose value is in c. In UTF-8 mode it may have
4984 a value > 127. We set its representation in the length/buffer, and then
4985 handle it as a data character. */
4986
4987 #ifdef SUPPORT_UTF8
4988 if (utf8 && c > 127)
4989 mclength = _pcre_ord2utf8(c, mcbuffer);
4990 else
4991 #endif
4992
4993 {
4994 mcbuffer[0] = c;
4995 mclength = 1;
4996 }
4997 goto ONE_CHAR;
4998
4999
5000 /* ===================================================================*/
5001 /* Handle a literal character. It is guaranteed not to be whitespace or #
5002 when the extended flag is set. If we are in UTF-8 mode, it may be a
5003 multi-byte literal character. */
5004
5005 default:
5006 NORMAL_CHAR:
5007 mclength = 1;
5008 mcbuffer[0] = c;
5009
5010 #ifdef SUPPORT_UTF8
5011 if (utf8 && c >= 0xc0)
5012 {
5013 while ((ptr[1] & 0xc0) == 0x80)
5014 mcbuffer[mclength++] = *(++ptr);
5015 }
5016 #endif
5017
5018 /* At this point we have the character's bytes in mcbuffer, and the length
5019 in mclength. When not in UTF-8 mode, the length is always 1. */
5020
5021 ONE_CHAR:
5022 previous = code;
5023 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5024 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5025
5026 /* Set the first and required bytes appropriately. If no previous first
5027 byte, set it from this character, but revert to none on a zero repeat.
5028 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5029 repeat. */
5030
5031 if (firstbyte == REQ_UNSET)
5032 {
5033 zerofirstbyte = REQ_NONE;
5034 zeroreqbyte = reqbyte;
5035
5036 /* If the character is more than one byte long, we can set firstbyte
5037 only if it is not to be matched caselessly. */
5038
5039 if (mclength == 1 || req_caseopt == 0)
5040 {
5041 firstbyte = mcbuffer[0] | req_caseopt;
5042 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5043 }
5044 else firstbyte = reqbyte = REQ_NONE;
5045 }
5046
5047 /* firstbyte was previously set; we can set reqbyte only if the length is
5048 1 or the matching is caseful. */
5049
5050 else
5051 {
5052 zerofirstbyte = firstbyte;
5053 zeroreqbyte = reqbyte;
5054 if (mclength == 1 || req_caseopt == 0)
5055 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5056 }
5057
5058 break; /* End of literal character handling */
5059 }
5060 } /* end of big loop */
5061
5062
5063 /* Control never reaches here by falling through, only by a goto for all the
5064 error states. Pass back the position in the pattern so that it can be displayed
5065 to the user for diagnosing the error. */
5066
5067 FAILED:
5068 *ptrptr = ptr;
5069 return FALSE;
5070 }
5071
5072
5073
5074
5075 /*************************************************
5076 * Compile sequence of alternatives *
5077 *************************************************/
5078
5079 /* On entry, ptr is pointing past the bracket character, but on return it
5080 points to the closing bracket, or vertical bar, or end of string. The code
5081 variable is pointing at the byte into which the BRA operator has been stored.
5082 If the ims options are changed at the start (for a (?ims: group) or during any
5083 branch, we need to insert an OP_OPT item at the start of every following branch
5084 to ensure they get set correctly at run time, and also pass the new options
5085 into every subsequent branch compile.
5086
5087 This function is used during the pre-compile phase when we are trying to find
5088 out the amount of memory needed, as well as during the real compile phase. The
5089 value of lengthptr distinguishes the two phases.
5090
5091 Arguments:
5092 options option bits, including any changes for this subpattern
5093 oldims previous settings of ims option bits
5094 codeptr -> the address of the current code pointer
5095 ptrptr -> the address of the current pattern pointer
5096 errorcodeptr -> pointer to error code variable
5097 lookbehind TRUE if this is a lookbehind assertion
5098 reset_bracount TRUE to reset the count for each branch
5099 skipbytes skip this many bytes at start (for brackets and OP_COND)
5100 firstbyteptr place to put the first required character, or a negative number
5101 reqbyteptr place to put the last required character, or a negative number
5102 bcptr pointer to the chain of currently open branches
5103 cd points to the data block with tables pointers etc.
5104 lengthptr NULL during the real compile phase
5105 points to length accumulator during pre-compile phase
5106
5107 Returns: TRUE on success
5108 */
5109
5110 static BOOL
5111 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5112 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5113 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5114 int *lengthptr)
5115 {
5116 const uschar *ptr = *ptrptr;
5117 uschar *code = *codeptr;
5118 uschar *last_branch = code;
5119 uschar *start_bracket = code;
5120 uschar *reverse_count = NULL;
5121 int firstbyte, reqbyte;
5122 int branchfirstbyte, branchreqbyte;
5123 int length;
5124 int orig_bracount;
5125 int max_bracount;
5126 branch_chain bc;
5127
5128 bc.outer = bcptr;
5129 bc.current = code;
5130
5131 firstbyte = reqbyte = REQ_UNSET;
5132
5133 /* Accumulate the length for use in the pre-compile phase. Start with the
5134 length of the BRA and KET and any extra bytes that are required at the
5135 beginning. We accumulate in a local variable to save frequent testing of
5136 lengthptr for NULL. We cannot do this by looking at the value of code at the
5137 start and end of each alternative, because compiled items are discarded during
5138 the pre-compile phase so that the work space is not exceeded. */
5139
5140 length = 2 + 2*LINK_SIZE + skipbytes;
5141
5142 /* WARNING: If the above line is changed for any reason, you must also change
5143 the code that abstracts option settings at the start of the pattern and makes
5144 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5145 pre-compile phase to find out whether anything has yet been compiled or not. */
5146
5147 /* Offset is set zero to mark that this bracket is still open */
5148
5149 PUT(code, 1, 0);
5150 code += 1 + LINK_SIZE + skipbytes;
5151
5152 /* Loop for each alternative branch */
5153
5154 orig_bracount = max_bracount = cd->bracount;
5155 for (;;)
5156 {
5157 /* For a (?| group, reset the capturing bracket count so that each branch
5158 uses the same numbers. */
5159
5160 if (reset_bracount) cd->bracount = orig_bracount;
5161
5162 /* Handle a change of ims options at the start of the branch */
5163
5164 if ((options & PCRE_IMS) != oldims)
5165 {
5166 *code++ = OP_OPT;
5167 *code++ = options & PCRE_IMS;
5168 length += 2;
5169 }
5170
5171 /* Set up dummy OP_REVERSE if lookbehind assertion */
5172
5173 if (lookbehind)
5174 {
5175 *code++ = OP_REVERSE;
5176 reverse_count = code;
5177 PUTINC(code, 0, 0);
5178 length += 1 + LINK_SIZE;
5179 }
5180
5181 /* Now compile the branch; in the pre-compile phase its length gets added
5182 into the length. */
5183
5184 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5185 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5186 {
5187 *ptrptr = ptr;
5188 return FALSE;
5189 }
5190
5191 /* Keep the highest bracket count in case (?| was used and some branch
5192 has fewer than the rest. */
5193
5194 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5195
5196 /* In the real compile phase, there is some post-processing to be done. */
5197
5198 if (lengthptr == NULL)
5199 {
5200 /* If this is the first branch, the firstbyte and reqbyte values for the
5201 branch become the values for the regex. */
5202
5203 if (*last_branch != OP_ALT)
5204 {
5205 firstbyte = branchfirstbyte;
5206 reqbyte = branchreqbyte;
5207 }
5208
5209 /* If this is not the first branch, the first char and reqbyte have to
5210 match the values from all the previous branches, except that if the
5211 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5212 and we set REQ_VARY for the regex. */
5213
5214 else
5215 {
5216 /* If we previously had a firstbyte, but it doesn't match the new branch,
5217 we have to abandon the firstbyte for the regex, but if there was
5218 previously no reqbyte, it takes on the value of the old firstbyte. */
5219
5220 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5221 {
5222 if (reqbyte < 0) reqbyte = firstbyte;
5223 firstbyte = REQ_NONE;
5224 }
5225
5226 /* If we (now or from before) have no firstbyte, a firstbyte from the
5227 branch becomes a reqbyte if there isn't a branch reqbyte. */
5228
5229 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5230 branchreqbyte = branchfirstbyte;
5231
5232 /* Now ensure that the reqbytes match */
5233
5234 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5235 reqbyte = REQ_NONE;
5236 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5237 }
5238
5239 /* If lookbehind, check that this branch matches a fixed-length string, and
5240 put the length into the OP_REVERSE item. Temporarily mark the end of the
5241 branch with OP_END. */
5242
5243 if (lookbehind)
5244 {
5245 int fixed_length;
5246 *code = OP_END;
5247 fixed_length = find_fixedlength(last_branch, options);
5248 DPRINTF(("fixed length = %d\n", fixed_length));
5249 if (fixed_length < 0)
5250 {
5251 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5252 *ptrptr = ptr;
5253 return FALSE;
5254 }
5255 PUT(reverse_count, 0, fixed_length);
5256 }
5257 }
5258
5259 /* Reached end of expression, either ')' or end of pattern. In the real
5260 compile phase, go back through the alternative branches and reverse the chain
5261 of offsets, with the field in the BRA item now becoming an offset to the
5262 first alternative. If there are no alternatives, it points to the end of the
5263 group. The length in the terminating ket is always the length of the whole
5264 bracketed item. If any of the ims options were changed inside the group,
5265 compile a resetting op-code following, except at the very end of the pattern.
5266 Return leaving the pointer at the terminating char. */
5267
5268 if (*ptr != '|')
5269 {
5270 if (lengthptr == NULL)
5271 {
5272 int branch_length = code - last_branch;
5273 do
5274 {
5275 int prev_length = GET(last_branch, 1);
5276 PUT(last_branch, 1, branch_length);
5277 branch_length = prev_length;
5278 last_branch -= branch_length;
5279 }
5280 while (branch_length > 0);
5281 }
5282
5283 /* Fill in the ket */
5284
5285 *code = OP_KET;
5286 PUT(code, 1, code - start_bracket);
5287 code += 1 + LINK_SIZE;
5288
5289 /* Resetting option if needed */
5290
5291 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5292 {
5293 *code++ = OP_OPT;
5294 *code++ = oldims;
5295 length += 2;
5296 }
5297
5298 /* Retain the highest bracket number, in case resetting was used. */
5299
5300 cd->bracount = max_bracount;
5301
5302 /* Set values to pass back */
5303
5304 *codeptr = code;
5305 *ptrptr = ptr;
5306 *firstbyteptr = firstbyte;
5307 *reqbyteptr = reqbyte;
5308 if (lengthptr != NULL)
5309 {
5310 if (OFLOW_MAX - *lengthptr < length)
5311 {
5312 *errorcodeptr = ERR20;
5313 return FALSE;
5314 }
5315 *lengthptr += length;
5316 }
5317 return TRUE;
5318 }
5319
5320 /* Another branch follows. In the pre-compile phase, we can move the code
5321 pointer back to where it was for the start of the first branch. (That is,
5322 pretend that each branch is the only one.)
5323
5324 In the real compile phase, insert an ALT node. Its length field points back
5325 to the previous branch while the bracket remains open. At the end the chain
5326 is reversed. It's done like this so that the start of the bracket has a
5327 zero offset until it is closed, making it possible to detect recursion. */
5328
5329 if (lengthptr != NULL)
5330 {
5331 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5332 length += 1 + LINK_SIZE;
5333 }
5334 else
5335 {
5336 *code = OP_ALT;
5337 PUT(code, 1, code - last_branch);
5338 bc.current = last_branch = code;
5339 code += 1 + LINK_SIZE;
5340 }
5341
5342 ptr++;
5343 }
5344 /* Control never reaches here */
5345 }
5346
5347
5348
5349
5350 /*************************************************
5351 * Check for anchored expression *
5352 *************************************************/
5353
5354 /* Try to find out if this is an anchored regular expression. Consider each
5355 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5356 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5357 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5358 counts, since OP_CIRC can match in the middle.
5359
5360 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5361 This is the code for \G, which means "match at start of match position, taking
5362 into account the match offset".
5363
5364 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5365 because that will try the rest of the pattern at all possible matching points,
5366 so there is no point trying again.... er ....
5367
5368 .... except when the .* appears inside capturing parentheses, and there is a
5369 subsequent back reference to those parentheses. We haven't enough information
5370 to catch that case precisely.
5371
5372 At first, the best we could do was to detect when .* was in capturing brackets
5373 and the highest back reference was greater than or equal to that level.
5374 However, by keeping a bitmap of the first 31 back references, we can catch some
5375 of the more common cases more precisely.
5376
5377 Arguments:
5378 code points to start of expression (the bracket)
5379 options points to the options setting
5380 bracket_map a bitmap of which brackets we are inside while testing; this
5381 handles up to substring 31; after that we just have to take
5382 the less precise approach
5383 backref_map the back reference bitmap
5384
5385 Returns: TRUE or FALSE
5386 */
5387
5388 static BOOL
5389 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5390 unsigned int backref_map)
5391 {
5392 do {
5393 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5394 options, PCRE_MULTILINE, FALSE);
5395 register int op = *scode;
5396
5397 /* Non-capturing brackets */
5398
5399 if (op == OP_BRA)
5400 {
5401 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5402 }
5403
5404 /* Capturing brackets */
5405
5406 else if (op == OP_CBRA)
5407 {
5408 int n = GET2(scode, 1+LINK_SIZE);
5409 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5410 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5411 }
5412
5413 /* Other brackets */
5414
5415 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5416 {
5417 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5418 }
5419
5420 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5421 are or may be referenced. */
5422
5423 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5424 op == OP_TYPEPOSSTAR) &&
5425 (*options & PCRE_DOTALL) != 0)
5426 {
5427 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5428 }
5429
5430 /* Check for explicit anchoring */
5431
5432 else if (op != OP_SOD && op != OP_SOM &&
5433 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5434 return FALSE;
5435 code += GET(code, 1);
5436 }
5437 while (*code == OP_ALT); /* Loop for each alternative */
5438 return TRUE;
5439 }
5440
5441
5442
5443 /*************************************************
5444 * Check for starting with ^ or .* *
5445 *************************************************/
5446
5447 /* This is called to find out if every branch starts with ^ or .* so that
5448 "first char" processing can be done to speed things up in multiline
5449 matching and for non-DOTALL patterns that start with .* (which must start at
5450 the beginning or after \n). As in the case of is_anchored() (see above), we
5451 have to take account of back references to capturing brackets that contain .*
5452 because in that case we can't make the assumption.
5453
5454 Arguments:
5455 code points to start of expression (the bracket)
5456 bracket_map a bitmap of which brackets we are inside while testing; this
5457 handles up to substring 31; after that we just have to take
5458 the less precise approach
5459 backref_map the back reference bitmap
5460
5461 Returns: TRUE or FALSE
5462 */
5463
5464 static BOOL
5465 is_startline(const uschar *code, unsigned int bracket_map,
5466 unsigned int backref_map)
5467 {
5468 do {
5469 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5470 NULL, 0, FALSE);
5471 register int op = *scode;
5472
5473 /* Non-capturing brackets */
5474
5475 if (op == OP_BRA)
5476 {
5477 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5478 }
5479
5480 /* Capturing brackets */
5481
5482 else if (op == OP_CBRA)
5483 {
5484 int n = GET2(scode, 1+LINK_SIZE);
5485 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5486 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5487 }
5488
5489 /* Other brackets */
5490
5491 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5492 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5493
5494 /* .* means "start at start or after \n" if it isn't in brackets that
5495 may be referenced. */
5496
5497 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5498 {
5499 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5500 }
5501
5502 /* Check for explicit circumflex */
5503
5504 else if (op != OP_CIRC) return FALSE;
5505
5506 /* Move on to the next alternative */
5507
5508 code += GET(code, 1);
5509 }
5510 while (*code == OP_ALT); /* Loop for each alternative */
5511 return TRUE;
5512 }
5513
5514
5515
5516 /*************************************************
5517 * Check for asserted fixed first char *
5518 *************************************************/
5519
5520 /* During compilation, the "first char" settings from forward assertions are
5521 discarded, because they can cause conflicts with actual literals that follow.
5522 However, if we end up without a first char setting for an unanchored pattern,
5523 it is worth scanning the regex to see if there is an initial asserted first
5524 char. If all branches start with the same asserted char, or with a bracket all
5525 of whose alternatives start with the same asserted char (recurse ad lib), then
5526 we return that char, otherwise -1.
5527
5528 Arguments:
5529 code points to start of expression (the bracket)
5530 options pointer to the options (used to check casing changes)
5531 inassert TRUE if in an assertion
5532
5533 Returns: -1 or the fixed first char
5534 */
5535
5536 static int
5537 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5538 {
5539 register int c = -1;
5540 do {
5541 int d;
5542 const uschar *scode =
5543 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5544 register int op = *scode;
5545
5546 switch(op)
5547 {
5548 default:
5549 return -1;
5550
5551 case OP_BRA:
5552 case OP_CBRA:
5553 case OP_ASSERT:
5554 case OP_ONCE:
5555 case OP_COND:
5556 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5557 return -1;
5558 if (c < 0) c = d; else if (c != d) return -1;
5559 break;
5560
5561 case OP_EXACT: /* Fall through */
5562 scode += 2;
5563
5564 case OP_CHAR:
5565 case OP_CHARNC:
5566 case OP_PLUS:
5567 case OP_MINPLUS:
5568 case OP_POSPLUS:
5569 if (!inassert) return -1;
5570 if (c < 0)
5571 {
5572 c = scode[1];
5573 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5574 }
5575 else if (c != scode[1]) return -1;
5576 break;
5577 }
5578
5579 code += GET(code, 1);
5580 }
5581 while (*code == OP_ALT);
5582 return c;
5583 }
5584
5585
5586
5587 /*************************************************
5588 * Compile a Regular Expression *
5589 *************************************************/
5590
5591 /* This function takes a string and returns a pointer to a block of store
5592 holding a compiled version of the expression. The original API for this
5593 function had no error code return variable; it is retained for backwards
5594 compatibility. The new function is given a new name.
5595
5596 Arguments:
5597 pattern the regular expression
5598 options various option bits
5599 errorcodeptr pointer to error code variable (pcre_compile2() only)
5600 can be NULL if you don't want a code value
5601 errorptr pointer to pointer to error text
5602 erroroffset ptr offset in pattern where error was detected
5603 tables pointer to character tables or NULL
5604
5605 Returns: pointer to compiled data block, or NULL on error,
5606 with errorptr and erroroffset set
5607 */
5608
5609 PCRE_EXP_DEFN pcre *
5610 pcre_compile(const char *pattern, int options, const char **errorptr,
5611 int *erroroffset, const unsigned char *tables)
5612 {
5613 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5614 }
5615
5616
5617 PCRE_EXP_DEFN pcre *
5618 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5619 const char **errorptr, int *erroroffset, const unsigned char *tables)
5620 {
5621 real_pcre *re;
5622 int length = 1; /* For final END opcode */
5623 int firstbyte, reqbyte, newline;
5624 int errorcode = 0;
5625 #ifdef SUPPORT_UTF8
5626 BOOL utf8;
5627 #endif
5628 size_t size;
5629 uschar *code;
5630 const uschar *codestart;
5631 const uschar *ptr;
5632 compile_data compile_block;
5633 compile_data *cd = &compile_block;
5634
5635 /* This space is used for "compiling" into during the first phase, when we are
5636 computing the amount of memory that is needed. Compiled items are thrown away
5637 as soon as possible, so that a fairly large buffer should be sufficient for
5638 this purpose. The same space is used in the second phase for remembering where
5639 to fill in forward references to subpatterns. */
5640
5641 uschar cworkspace[COMPILE_WORK_SIZE];
5642
5643
5644 /* Set this early so that early errors get offset 0. */
5645
5646 ptr = (const uschar *)pattern;
5647
5648 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5649 can do is just return NULL, but we can set a code value if there is a code
5650 pointer. */
5651
5652 if (errorptr == NULL)
5653 {
5654 if (errorcodeptr != NULL) *errorcodeptr = 99;
5655 return NULL;
5656 }
5657
5658 *errorptr = NULL;
5659 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5660
5661 /* However, we can give a message for this error */
5662
5663 if (erroroffset == NULL)
5664 {
5665 errorcode = ERR16;
5666 goto PCRE_EARLY_ERROR_RETURN2;
5667 }
5668
5669 *erroroffset = 0;
5670
5671 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5672
5673 #ifdef SUPPORT_UTF8
5674 utf8 = (options & PCRE_UTF8) != 0;
5675 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5676 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5677 {
5678 errorcode = ERR44;
5679 goto PCRE_EARLY_ERROR_RETURN2;
5680 }
5681 #else
5682 if ((options & PCRE_UTF8) != 0)
5683 {
5684 errorcode = ERR32;
5685 goto PCRE_EARLY_ERROR_RETURN;
5686 }
5687 #endif
5688
5689 if ((options & ~PUBLIC_OPTIONS) != 0)
5690 {
5691 errorcode = ERR17;
5692 goto PCRE_EARLY_ERROR_RETURN;
5693 }
5694
5695 /* Set up pointers to the individual character tables */
5696
5697 if (tables == NULL) tables = _pcre_default_tables;
5698 cd->lcc = tables + lcc_offset;
5699 cd->fcc = tables + fcc_offset;
5700 cd->cbits = tables + cbits_offset;
5701 cd->ctypes = tables + ctypes_offset;
5702
5703 /* Handle different types of newline. The three bits give seven cases. The
5704 current code allows for fixed one- or two-byte sequences, plus "any" and
5705 "anycrlf". */
5706
5707 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5708 {
5709 case 0: newline = NEWLINE; break; /* Compile-time default */
5710 case PCRE_NEWLINE_CR: newline = '\r'; break;
5711 case PCRE_NEWLINE_LF: newline = '\n'; break;
5712 case PCRE_NEWLINE_CR+
5713 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5714 case PCRE_NEWLINE_ANY: newline = -1; break;
5715 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5716 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5717 }
5718
5719 if (newline == -2)
5720 {
5721 cd->nltype = NLTYPE_ANYCRLF;
5722 }
5723 else if (newline < 0)
5724 {
5725 cd->nltype = NLTYPE_ANY;
5726 }
5727 else
5728 {
5729 cd->nltype = NLTYPE_FIXED;
5730 if (newline > 255)
5731 {
5732 cd->nllen = 2;
5733 cd->nl[0] = (newline >> 8) & 255;
5734 cd->nl[1] = newline & 255;
5735 }
5736 else
5737 {
5738 cd->nllen = 1;
5739 cd->nl[0] = newline;
5740 }
5741 }
5742
5743 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5744 references to help in deciding whether (.*) can be treated as anchored or not.
5745 */
5746
5747 cd->top_backref = 0;
5748 cd->backref_map = 0;
5749
5750 /* Reflect pattern for debugging output */
5751
5752 DPRINTF(("------------------------------------------------------------------\n"));
5753 DPRINTF(("%s\n", pattern));
5754
5755 /* Pretend to compile the pattern while actually just accumulating the length
5756 of memory required. This behaviour is triggered by passing a non-NULL final
5757 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5758 to compile parts of the pattern into; the compiled code is discarded when it is
5759 no longer needed, so hopefully this workspace will never overflow, though there
5760 is a test for its doing so. */
5761
5762 cd->bracount = 0;
5763 cd->names_found = 0;
5764 cd->name_entry_size = 0;
5765 cd->name_table = NULL;
5766 cd->start_workspace = cworkspace;
5767 cd->start_code = cworkspace;
5768 cd->hwm = cworkspace;
5769 cd->start_pattern = (const uschar *)pattern;
5770 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5771 cd->req_varyopt = 0;
5772 cd->nopartial = FALSE;
5773 cd->external_options = options;
5774
5775 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5776 don't need to look at the result of the function here. The initial options have
5777 been put into the cd block so that they can be changed if an option setting is
5778 found within the regex right at the beginning. Bringing initial option settings
5779 outside can help speed up starting point checks. */
5780
5781 code = cworkspace;
5782 *code = OP_BRA;
5783 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5784 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5785 &length);
5786 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5787
5788 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5789 cd->hwm - cworkspace));
5790
5791 if (length > MAX_PATTERN_SIZE)
5792 {
5793 errorcode = ERR20;
5794 goto PCRE_EARLY_ERROR_RETURN;
5795 }
5796
5797 /* Compute the size of data block needed and get it, either from malloc or
5798 externally provided function. Integer overflow should no longer be possible
5799 because nowadays we limit the maximum value of cd->names_found and
5800 cd->name_entry_size. */
5801
5802 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5803 re = (real_pcre *)(pcre_malloc)(size);
5804
5805 if (re == NULL)
5806 {
5807 errorcode = ERR21;
5808 goto PCRE_EARLY_ERROR_RETURN;
5809 }
5810
5811 /* Put in the magic number, and save the sizes, initial options, and character
5812 table pointer. NULL is used for the default character tables. The nullpad field
5813 is at the end; it's there to help in the case when a regex compiled on a system
5814 with 4-byte pointers is run on another with 8-byte pointers. */
5815
5816 re->magic_number = MAGIC_NUMBER;
5817 re->size = size;
5818 re->options = cd->external_options;
5819 re->dummy1 = 0;
5820 re->first_byte = 0;
5821 re->req_byte = 0;
5822 re->name_table_offset = sizeof(real_pcre);
5823 re->name_entry_size = cd->name_entry_size;
5824 re->name_count = cd->names_found;
5825 re->ref_count = 0;
5826 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5827 re->nullpad = NULL;
5828
5829 /* The starting points of the name/number translation table and of the code are
5830 passed around in the compile data block. The start/end pattern and initial
5831 options are already set from the pre-compile phase, as is the name_entry_size
5832 field. Reset the bracket count and the names_found field. Also reset the hwm
5833 field; this time it's used for remembering forward references to subpatterns.
5834 */
5835
5836 cd->bracount = 0;
5837 cd->names_found = 0;
5838 cd->name_table = (uschar *)re + re->name_table_offset;
5839 codestart = cd->name_table + re->name_entry_size * re->name_count;
5840 cd->start_code = codestart;
5841 cd->hwm = cworkspace;
5842 cd->req_varyopt = 0;
5843 cd->nopartial = FALSE;
5844 cd->had_accept = FALSE;
5845
5846 /* Set up a starting, non-extracting bracket, then compile the expression. On
5847 error, errorcode will be set non-zero, so we don't need to look at the result
5848 of the function here. */
5849
5850 ptr = (const uschar *)pattern;
5851 code = (uschar *)codestart;
5852 *code = OP_BRA;
5853 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5854 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5855 re->top_bracket = cd->bracount;
5856 re->top_backref = cd->top_backref;
5857
5858 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5859 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
5860
5861 /* If not reached end of pattern on success, there's an excess bracket. */
5862
5863 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5864
5865 /* Fill in the terminating state and check for disastrous overflow, but
5866 if debugging, leave the test till after things are printed out. */
5867
5868 *code++ = OP_END;
5869
5870 #ifndef DEBUG
5871 if (code - codestart > length) errorcode = ERR23;
5872 #endif
5873
5874 /* Fill in any forward references that are required. */
5875
5876 while (errorcode == 0 && cd->hwm > cworkspace)
5877 {
5878 int offset, recno;
5879 const uschar *groupptr;
5880 cd->hwm -= LINK_SIZE;
5881 offset = GET(cd->hwm, 0);
5882 recno = GET(codestart, offset);
5883 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5884 if (groupptr == NULL) errorcode = ERR53;
5885 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5886 }
5887
5888 /* Give an error if there's back reference to a non-existent capturing
5889 subpattern. */
5890
5891 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5892
5893 /* Failed to compile, or error while post-processing */
5894
5895 if (errorcode != 0)
5896 {
5897 (pcre_free)(re);
5898 PCRE_EARLY_ERROR_RETURN:
5899 *erroroffset = ptr - (const uschar *)pattern;
5900 PCRE_EARLY_ERROR_RETURN2:
5901 *errorptr = error_texts[errorcode];
5902 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5903 return NULL;
5904 }
5905
5906 /* If the anchored option was not passed, set the flag if we can determine that
5907 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5908 as starting with .* when DOTALL is set).
5909
5910 Otherwise, if we know what the first byte has to be, save it, because that
5911 speeds up unanchored matches no end. If not, see if we can set the
5912 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5913 start with ^. and also when all branches start with .* for non-DOTALL matches.
5914 */
5915
5916 if ((re->options & PCRE_ANCHORED) == 0)
5917 {
5918 int temp_options = re->options; /* May get changed during these scans */
5919 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5920 re->options |= PCRE_ANCHORED;
5921 else
5922 {
5923 if (firstbyte < 0)
5924 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5925 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5926 {
5927 int ch = firstbyte & 255;
5928 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5929 cd->fcc[ch] == ch)? ch : firstbyte;
5930 re->options |= PCRE_FIRSTSET;
5931 }
5932 else if (is_startline(codestart, 0, cd->backref_map))
5933 re->options |= PCRE_STARTLINE;
5934 }
5935 }
5936
5937 /* For an anchored pattern, we use the "required byte" only if it follows a
5938 variable length item in the regex. Remove the caseless flag for non-caseable
5939 bytes. */
5940
5941 if (reqbyte >= 0 &&
5942 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5943 {
5944 int ch = reqbyte & 255;
5945 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5946 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5947 re->options |= PCRE_REQCHSET;
5948 }
5949
5950 /* Print out the compiled data if debugging is enabled. This is never the
5951 case when building a production library. */
5952
5953 #ifdef DEBUG
5954
5955 printf("Length = %d top_bracket = %d top_backref = %d\n",
5956 length, re->top_bracket, re->top_backref);
5957
5958 if (re->options != 0)
5959 {
5960 printf("%s%s%s%s%s%s%s%s%s\n",
5961 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5962 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5963 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5964 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5965 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5966 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5967 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5968 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5969 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5970 }
5971
5972 if ((re->options & PCRE_FIRSTSET) != 0)
5973 {
5974 int ch = re->first_byte & 255;
5975 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5976 "" : " (caseless)";
5977 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5978 else printf("First char = \\x%02x%s\n", ch, caseless);
5979 }
5980
5981 if ((re->options & PCRE_REQCHSET) != 0)
5982 {
5983 int ch = re->req_byte & 255;
5984 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5985 "" : " (caseless)";
5986 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5987 else printf("Req char = \\x%02x%s\n", ch, caseless);
5988 }
5989
5990 pcre_printint(re, stdout, TRUE);
5991
5992 /* This check is done here in the debugging case so that the code that
5993 was compiled can be seen. */
5994
5995 if (code - codestart > length)
5996 {
5997 (pcre_free)(re);
5998 *errorptr = error_texts[ERR23];
5999 *erroroffset = ptr - (uschar *)pattern;
6000 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6001 return NULL;
6002 }
6003 #endif /* DEBUG */
6004
6005 return (pcre *)re;
6006 }
6007
6008 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12