/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 605 - (show annotations) (download)
Fri Jun 3 18:18:30 2011 UTC (3 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 238607 byte(s)
Make pcre_study() more robust against update omissions; fix ONCE oversight.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
/* These three names are defined before pcre_internal.h is included (see the
#include that follows). Presumably macros in that header expand to expressions
that use these names - TODO confirm against pcre_internal.h. */
49 #define NLBLOCK cd /* Name of the block containing newline information */
50 #define PSSTART start_pattern /* Name of the field containing the processed string start */
51 #define PSEND end_pattern /* Name of the field containing the processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57 also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. */
59
60 #ifdef PCRE_DEBUG
61 #include "pcre_printint.src"
62 #endif
63
64
/* Macro for setting individual bits in class bitmaps. SETBIT(a,b) sets bit
number b in the byte array a: the bit lives in byte b/8 at bit position b%8.
Both arguments and the whole expansion are parenthesized so that expression
arguments (for example SETBIT(map, c + 1)) select the intended byte and bit.
Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68
69 /* Maximum length value to check against when making sure that the integer that
70 holds the compiled pattern length does not overflow. We make it a bit less than
71 INT_MAX to allow for adding in group terminating bytes, so that we don't have
72 to check them every time. The margin chosen here is 20. INT_MAX is not defined
in the visible part of this file; presumably it comes from <limits.h> via an
included header - TODO confirm. */
73
74 #define OFLOW_MAX (INT_MAX - 20)
75
76
77 /*************************************************
78 * Code parameters and static tables *
79 *************************************************/
80
81 /* This value specifies the size of stack workspace that is used during the
82 first pre-compile phase that determines how much memory is required. The regex
83 is partly compiled into this space, but the compiled parts are discarded as
84 soon as they can be, so that hopefully there will never be an overrun. The code
85 does, however, check for an overrun. The largest amount I've seen used is 218,
86 so this number is very generous.
87
88 The same workspace is used during the second, actual compile phase for
89 remembering forward references to groups so that they can be filled in at the
90 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91 is 4 there is plenty of room. */
92
93 #define COMPILE_WORK_SIZE (4096)
94
95 /* The overrun tests check for a slightly smaller size so that they detect the
96 overrun before it actually does run off the end of the data block. With the
values defined here the check threshold is 4096 - 100 = 3996. */
97
98 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99
100
101 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102 are simple data values; negative values are for special things like \d and so
103 on. Zero means further processing is needed (for things like \x), or the escape
104 is invalid. */
105
106 #ifndef EBCDIC
107
108 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 in UTF-8 mode. It is indexed by (c - CHAR_0), where c is the character that
follows the backslash; check_escape() consults it only for characters in the
range CHAR_0 to CHAR_z. There are two entries per row below. A non-zero entry
is used directly as the result (positive: a simple data value; negative:
-ESC_xxx for a special escape). A zero entry makes check_escape() go on to its
switch for further processing. */
110
111 static const short int escapes[] = {
112 0, 0,
113 0, 0,
114 0, 0,
115 0, 0,
116 0, 0,
117 CHAR_COLON, CHAR_SEMICOLON,
118 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 CHAR_COMMERCIAL_AT, -ESC_A,
121 -ESC_B, -ESC_C,
122 -ESC_D, -ESC_E,
123 0, -ESC_G,
124 -ESC_H, 0,
125 0, -ESC_K,
126 0, 0,
127 -ESC_N, 0,
128 -ESC_P, -ESC_Q,
129 -ESC_R, -ESC_S,
130 0, 0,
131 -ESC_V, -ESC_W,
132 -ESC_X, 0,
133 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 CHAR_GRAVE_ACCENT, 7,
137 -ESC_b, 0,
138 -ESC_d, ESC_e,
139 ESC_f, 0,
140 -ESC_h, 0,
141 0, -ESC_k,
142 0, 0,
143 ESC_n, 0,
144 -ESC_p, 0,
145 ESC_r, -ESC_s,
146 ESC_tee, 0,
147 -ESC_v, -ESC_w,
148 0, 0,
149 -ESC_z
150 };
151
152 #else
153
154 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. It is
indexed by (c - 0x48), where c is the character that follows the backslash
(see the EBCDIC branch of check_escape()). There are eight entries per row; the
comment at the start of each row appears to be the hexadecimal character code
of the row's first entry. As in the ASCII table, a non-zero entry is used
directly as the result and zero means further processing is needed. */
155
156 static const short int escapes[] = {
157 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180 };
181 #endif
182
183
184 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185 searched linearly. Put all the names into a single string, in order to reduce
186 the number of relocations when a shared library is dynamically linked. The
187 string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 platforms. */
189
/* One entry in the table of special "verbs". The three fields are all plain
ints and are filled in positionally by the table initializers, so their order
must not change. */

struct verbitem {
  int len;      /* Number of characters in the verb's name */
  int op;       /* Opcode to use when no argument is given; -1 means an
                   argument is mandatory */
  int op_arg;   /* Opcode to use when an argument is given; -1 means an
                   argument is not allowed */
};

typedef struct verbitem verbitem;
195
/* All the verb names, concatenated into one string. The first name is the
empty string. The STRING_xxx0 macros are defined elsewhere; presumably each
expands to the name followed by a zero byte, so that the names are
zero-separated - TODO confirm. The order here must match the order of the
entries in the verbs[] table. */
196 static const char verbnames[] =
197 "\0" /* Empty name is a shorthand for MARK */
198 STRING_MARK0
199 STRING_ACCEPT0
200 STRING_COMMIT0
201 STRING_F0
202 STRING_FAIL0
203 STRING_PRUNE0
204 STRING_SKIP0
205 STRING_THEN;
206
/* One entry per verb name, in the same order as the names in verbnames. Each
entry is { name length, opcode without argument, opcode with argument }, where
-1 in the second field means an argument is mandatory and -1 in the third field
means an argument is not allowed (see the verbitem field comments). */
207 static const verbitem verbs[] = {
208 { 0, -1, OP_MARK }, /* empty name: shorthand for MARK */
209 { 4, -1, OP_MARK }, /* MARK */
210 { 6, OP_ACCEPT, -1 }, /* ACCEPT */
211 { 6, OP_COMMIT, -1 }, /* COMMIT */
212 { 1, OP_FAIL, -1 }, /* F */
213 { 4, OP_FAIL, -1 }, /* FAIL */
214 { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE */
215 { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP */
216 { 4, OP_THEN, OP_THEN_ARG } /* THEN */
217 };
218
219 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220
221
222 /* Tables of names of POSIX character classes and their lengths. The names are
223 now all in a single string, to reduce the number of relocations when a shared
224 library is dynamically loaded. The list of lengths is terminated by a zero
225 length entry. The first three must be alpha, lower, upper, as this is assumed
226 for handling case independence. The STRING_xxx0 macros are defined elsewhere;
presumably each expands to the class name followed by a zero byte - TODO
confirm. The order of the names here is also the order used by the other POSIX
class tables in this file. */
227
228 static const char posix_names[] =
229 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232 STRING_word0 STRING_xdigit;
233
/* Lengths of the POSIX class names, in the same order as the names in
posix_names: twelve names of length 5 (alpha ... space), then "word" (4) and
"xdigit" (6). The final 0 is the terminator of the list. */
234 static const uschar posix_name_lengths[] = {
235 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236
237 /* Table of class bit maps for each POSIX class. Each class is formed from a
238 base map, with an optional addition or removal of another map. Then, for some
239 classes, there is some additional tweaking: for [:blank:] the vertical space
240 characters are removed, and for [:alpha:] and [:alnum:] the underscore
241 character is removed. The triples in the table consist of the base map offset,
242 second map offset or -1 if no second map, and a non-negative value for map
243 addition or a negative value for map subtraction (if there are two maps). The
244 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245 remove vertical space characters, 2 => remove underscore.

There is one row of three ints per class, and the rows are in the same order
as the class names in posix_names (see the comment at the end of each row). The
cbit_xxx offsets are defined elsewhere. */
246
247 static const int posix_class_maps[] = {
248 cbit_word, cbit_digit, -2, /* alpha */
249 cbit_lower, -1, 0, /* lower */
250 cbit_upper, -1, 0, /* upper */
251 cbit_word, -1, 2, /* alnum - word without underscore */
252 cbit_print, cbit_cntrl, 0, /* ascii */
253 cbit_space, -1, 1, /* blank - a GNU extension */
254 cbit_cntrl, -1, 0, /* cntrl */
255 cbit_digit, -1, 0, /* digit */
256 cbit_graph, -1, 0, /* graph */
257 cbit_print, -1, 0, /* print */
258 cbit_punct, -1, 0, /* punct */
259 cbit_space, -1, 0, /* space */
260 cbit_word, -1, 0, /* word - a Perl extension */
261 cbit_xdigit,-1, 0 /* xdigit */
262 };
263
264 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265 substitutes must be in the order of the names, defined above, and there are
266 both positive and negative cases. NULL means no substitute. */
267
268 #ifdef SUPPORT_UCP
/* Replacement pattern text for the six escapes \D \d \S \s \W \w when PCRE_UCP
is set, one string per escape in the order shown by the row comments. Each
string is a Unicode property escape (\p{..} or its negation \P{..}). This table
is compiled only when SUPPORT_UCP is defined. */
269 static const uschar *substitutes[] = {
270 (uschar *)"\\P{Nd}", /* \D */
271 (uschar *)"\\p{Nd}", /* \d */
272 (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
273 (uschar *)"\\p{Xsp}", /* \s */
274 (uschar *)"\\P{Xwd}", /* \W */
275 (uschar *)"\\p{Xwd}" /* \w */
276 };
277
/* Replacement pattern text for the POSIX classes when PCRE_UCP is set. The
first 14 entries are the positive classes, in the same order as the names in
posix_names; the second 14 are the corresponding negated classes in the same
order. NULL means that the class has no substitute. This table is compiled only
when SUPPORT_UCP is defined. */
278 static const uschar *posix_substitutes[] = {
279 (uschar *)"\\p{L}", /* alpha */
280 (uschar *)"\\p{Ll}", /* lower */
281 (uschar *)"\\p{Lu}", /* upper */
282 (uschar *)"\\p{Xan}", /* alnum */
283 NULL, /* ascii */
284 (uschar *)"\\h", /* blank */
285 NULL, /* cntrl */
286 (uschar *)"\\p{Nd}", /* digit */
287 NULL, /* graph */
288 NULL, /* print */
289 NULL, /* punct */
290 (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
291 (uschar *)"\\p{Xwd}", /* word */
292 NULL, /* xdigit */
293 /* Negated cases */
294 (uschar *)"\\P{L}", /* ^alpha */
295 (uschar *)"\\P{Ll}", /* ^lower */
296 (uschar *)"\\P{Lu}", /* ^upper */
297 (uschar *)"\\P{Xan}", /* ^alnum */
298 NULL, /* ^ascii */
299 (uschar *)"\\H", /* ^blank */
300 NULL, /* ^cntrl */
301 (uschar *)"\\P{Nd}", /* ^digit */
302 NULL, /* ^graph */
303 NULL, /* ^print */
304 NULL, /* ^punct */
305 (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
306 (uschar *)"\\P{Xwd}", /* ^word */
307 NULL /* ^xdigit */
308 };
/* Number of entries (positive plus negated) in posix_substitutes. */
309 #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
310 #endif
311
/* Stringizing helpers. STRING(x) turns its argument into a string literal
exactly as written. XSTRING(x) macro-expands its argument first and then
stringizes the result, so XSTRING(SOME_LIMIT) yields the text of the limit's
value rather than the text "SOME_LIMIT". */

#define STRING(x) #x
#define XSTRING(x) STRING(x)
314
315 /* The texts of compile-time error messages. These are "char *" because they
316 are passed to the outside world. Do not ever re-use any error number, because
317 they are documented. Always add a new error instead. Messages marked DEAD below
318 are no longer used. This used to be a table of strings, but in order to reduce
319 the number of relocations needed when a shared library is loaded dynamically,
320 it is now one long string. We cannot use a table of offsets, because the
321 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322 simply count through to the one we want - this isn't a performance issue
323 because these strings are used only when there is a compilation error.
324
325 Each substring ends with \0 to insert a null character. This includes the final
326 substring, so that the whole string ends with \0\0, which can be detected when
327 counting through. */
328
/* All the compile-time error messages as one string of zero-terminated
substrings; message number n is the (n+1)th substring, with "no error" being
number 0. The numeric comments (5, 10, 15, ...) give the number of the message
on the following line, as an aid to counting. Because the last substring also
ends with an explicit \0, the whole array ends with two zero bytes, which is
how find_error_text() detects the end of the list. */
329 static const char error_texts[] =
330 "no error\0"
331 "\\ at end of pattern\0"
332 "\\c at end of pattern\0"
333 "unrecognized character follows \\\0"
334 "numbers out of order in {} quantifier\0"
335 /* 5 */
336 "number too big in {} quantifier\0"
337 "missing terminating ] for character class\0"
338 "invalid escape sequence in character class\0"
339 "range out of order in character class\0"
340 "nothing to repeat\0"
341 /* 10 */
342 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
343 "internal error: unexpected repeat\0"
344 "unrecognized character after (? or (?-\0"
345 "POSIX named classes are supported only within a class\0"
346 "missing )\0"
347 /* 15 */
348 "reference to non-existent subpattern\0"
349 "erroffset passed as NULL\0"
350 "unknown option bit(s) set\0"
351 "missing ) after comment\0"
352 "parentheses nested too deeply\0" /** DEAD **/
353 /* 20 */
354 "regular expression is too large\0"
355 "failed to get memory\0"
356 "unmatched parentheses\0"
357 "internal error: code overflow\0"
358 "unrecognized character after (?<\0"
359 /* 25 */
360 "lookbehind assertion is not fixed length\0"
361 "malformed number or name after (?(\0"
362 "conditional group contains more than two branches\0"
363 "assertion expected after (?(\0"
364 "(?R or (?[+-]digits must be followed by )\0"
365 /* 30 */
366 "unknown POSIX class name\0"
367 "POSIX collating elements are not supported\0"
368 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369 "spare error\0" /** DEAD **/
370 "character value in \\x{...} sequence is too large\0"
371 /* 35 */
372 "invalid condition (?(0)\0"
373 "\\C not allowed in lookbehind assertion\0"
374 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375 "number after (?C is > 255\0"
376 "closing ) for (?C expected\0"
377 /* 40 */
378 "recursive call could loop indefinitely\0"
379 "unrecognized character after (?P\0"
380 "syntax error in subpattern name (missing terminator)\0"
381 "two named subpatterns have the same name\0"
382 "invalid UTF-8 string\0"
383 /* 45 */
384 "support for \\P, \\p, and \\X has not been compiled\0"
385 "malformed \\P or \\p sequence\0"
386 "unknown property name after \\P or \\p\0"
387 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389 /* 50 */
390 "repeated subpattern is too long\0" /** DEAD **/
391 "octal value is greater than \\377 (not in UTF-8 mode)\0"
392 "internal error: overran compiling workspace\0"
393 "internal error: previously-checked referenced subpattern not found\0"
394 "DEFINE group contains more than one branch\0"
395 /* 55 */
396 "repeating a DEFINE group is not allowed\0"
397 "inconsistent NEWLINE options\0"
398 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399 "a numbered reference must not be zero\0"
400 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401 /* 60 */
402 "(*VERB) not recognized\0"
403 "number is too big\0"
404 "subpattern name expected\0"
405 "digit expected after (?+\0"
406 "] is an invalid data character in JavaScript compatibility mode\0"
407 /* 65 */
408 "different names for subpatterns of the same number are not allowed\0"
409 "(*MARK) must have an argument\0"
410 "this version of PCRE is not compiled with PCRE_UCP support\0"
411 "\\c must be followed by an ASCII character\0"
412 ;
413
414 /* Table to identify digits and hex digits. This is used when compiling
415 patterns. Note that the tables in chartables are dependent on the locale, and
416 may mark arbitrary characters as digits - but the PCRE compiling code expects
417 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
418 a private table here. It costs 256 bytes, but it is a lot faster than doing
419 character value tests (at least in some simple cases I timed), and in some
420 applications one wants PCRE to compile efficiently as well as match
421 efficiently.
422
423 For convenience, we use the same bit definitions as in chartables:
424
425 0x04 decimal digit
426 0x08 hexadecimal digit
427
428 Then we can use ctype_digit and ctype_xdigit in the code. */
429
430 #ifndef EBCDIC
431
432 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
433 UTF-8 mode. */
434
/* Locale-independent digit table for ASCII: 256 entries (32 rows of 8), indexed
by character value. Entries for '0' to '9' are 0x0c (decimal digit 0x04 plus
hexadecimal digit 0x08), entries for 'A'-'F' and 'a'-'f' are 0x08, and all
other entries are zero. The comment on each row gives the range of characters
that the row covers. */
435 static const unsigned char digitab[] =
436 {
437 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
438 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
439 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
440 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
441 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
442 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
443 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
444 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
445 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
446 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
447 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
448 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
449 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
450 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
451 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
452 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
453 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
454 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
455 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
456 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
457 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
458 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
459 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
460 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
461 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
462 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
463 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
464 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
465 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
466 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
467 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
468 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
469
470 #else
471
472 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
The table has the same layout and bit values as the ASCII version (256 entries,
0x0c for decimal digits, 0x08 for the letters a-f and A-F), but the non-zero
entries sit at the EBCDIC character codes. The second column of the row
comments gives the hexadecimal code of the first entry of every other row. */
473
474 static const unsigned char digitab[] =
475 {
476 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
477 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
478 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
479 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
480 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
481 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
482 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
483 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
484 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
485 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
486 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
487 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
488 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
489 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
490 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
491 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
492 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
493 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
494 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
495 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
496 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
497 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
498 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
499 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
500 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
501 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
502 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
503 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
504 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
505 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
506 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
507 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
508
/* EBCDIC-only character type table: 256 entries (32 rows of 8), indexed by
character value. check_escape() tests an entry against the mask 0x0E to decide
whether a character is alphanumeric. The meanings of the individual bits are
not defined in this part of the file; presumably they follow the ctype bit
definitions used by chartables, of which this is a partial duplicate - TODO
confirm. */
509 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
510 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
511 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
512 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
513 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
514 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
515 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
516 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
517 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
518 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
519 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
520 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
521 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
522 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
523 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
524 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
525 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
526 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
527 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
528 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
529 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
530 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
531 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
532 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
533 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
534 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
535 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
536 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
537 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
538 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
539 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
540 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
541 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
542 #endif
543
544
545 /* Forward declaration to allow mutual recursion: compile_regex() is declared
here, ahead of its definition, presumably so that functions defined earlier in
the file can call it. The definition is not visible in this part of the file,
so the meanings of the unnamed parameters are not described here. */
546
547 static BOOL
548 compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int *,
549 int *, branch_chain *, compile_data *, int *);
550
551
552
553 /*************************************************
554 * Find an error text *
555 *************************************************/
556
557 /* The error texts are now all in one long string, to save on relocations. As
558 some of the text is of unknown length, we can't use a table of offsets.
559 Instead, just count through the strings. This is not a performance issue
560 because it happens only when there has been a compilation error.
561
562 Argument: the error number
563 Returns: pointer to the error string
564 */
565
566 static const char *
567 find_error_text(int n)
568 {
569 const char *s = error_texts;
570 for (; n > 0; n--)
571 {
572 while (*s++ != 0) {};
573 if (*s == 0) return "Error text not found (please report)";
574 }
575 return s;
576 }
577
578
579 /*************************************************
580 * Handle escapes *
581 *************************************************/
582
583 /* This function is called when a \ has been encountered. It either returns a
584 positive value for a simple escape such as \n, or a negative value which
585 encodes one of the more complicated things such as \d. A backreference to group
586 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
587 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
588 ptr is pointing at the \. On exit, it is on the final character of the escape
589 sequence.
590
591 Arguments:
592 ptrptr points to the pattern position pointer
593 errorcodeptr points to the errorcode variable
594 bracount number of previous extracting brackets
595 options the options bits
596 isclass TRUE if inside a character class
597
598 Returns: zero or positive => a data character
599 negative => a special escape sequence
600 on error, errorcodeptr is set
601 */
602
603 static int
604 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
605 int options, BOOL isclass)
606 {
607 BOOL utf8 = (options & PCRE_UTF8) != 0;
608 const uschar *ptr = *ptrptr + 1;
609 int c, i;
610
611 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
612 ptr--; /* Set pointer back to the last byte */
613
614 /* If backslash is at the end of the pattern, it's an error. */
615
616 if (c == 0) *errorcodeptr = ERR1;
617
618 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
619 in a table. A non-zero result is something that can be returned immediately.
620 Otherwise further processing may be required. */
621
622 #ifndef EBCDIC /* ASCII/UTF-8 coding */
623 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
624 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
625
626 #else /* EBCDIC coding */
627 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
628 else if ((i = escapes[c - 0x48]) != 0) c = i;
629 #endif
630
631 /* Escapes that need further processing, or are illegal. */
632
633 else
634 {
635 const uschar *oldptr;
636 BOOL braced, negated;
637
638 switch (c)
639 {
640 /* A number of Perl escapes are not handled by PCRE. We give an explicit
641 error. */
642
643 case CHAR_l:
644 case CHAR_L:
645 case CHAR_u:
646 case CHAR_U:
647 *errorcodeptr = ERR37;
648 break;
649
650 /* \g must be followed by one of a number of specific things:
651
652 (1) A number, either plain or braced. If positive, it is an absolute
653 backreference. If negative, it is a relative backreference. This is a Perl
654 5.10 feature.
655
656 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
657 is part of Perl's movement towards a unified syntax for back references. As
658 this is synonymous with \k{name}, we fudge it up by pretending it really
659 was \k.
660
661 (3) For Oniguruma compatibility we also support \g followed by a name or a
662 number either in angle brackets or in single quotes. However, these are
663 (possibly recursive) subroutine calls, _not_ backreferences. Just return
664 the -ESC_g code (cf \k). */
665
666 case CHAR_g:
667 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
668 {
669 c = -ESC_g;
670 break;
671 }
672
673 /* Handle the Perl-compatible cases */
674
675 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
676 {
677 const uschar *p;
678 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
679 if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
680 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
681 {
682 c = -ESC_k;
683 break;
684 }
685 braced = TRUE;
686 ptr++;
687 }
688 else braced = FALSE;
689
690 if (ptr[1] == CHAR_MINUS)
691 {
692 negated = TRUE;
693 ptr++;
694 }
695 else negated = FALSE;
696
697 c = 0;
698 while ((digitab[ptr[1]] & ctype_digit) != 0)
699 c = c * 10 + *(++ptr) - CHAR_0;
700
701 if (c < 0) /* Integer overflow */
702 {
703 *errorcodeptr = ERR61;
704 break;
705 }
706
707 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
708 {
709 *errorcodeptr = ERR57;
710 break;
711 }
712
713 if (c == 0)
714 {
715 *errorcodeptr = ERR58;
716 break;
717 }
718
719 if (negated)
720 {
721 if (c > bracount)
722 {
723 *errorcodeptr = ERR15;
724 break;
725 }
726 c = bracount - (c - 1);
727 }
728
729 c = -(ESC_REF + c);
730 break;
731
732 /* The handling of escape sequences consisting of a string of digits
733 starting with one that is not zero is not straightforward. By experiment,
734 the way Perl works seems to be as follows:
735
736 Outside a character class, the digits are read as a decimal number. If the
737 number is less than 10, or if there are that many previous extracting
738 left brackets, then it is a back reference. Otherwise, up to three octal
739 digits are read to form an escaped byte. Thus \123 is likely to be octal
740 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
741 value is greater than 377, the least significant 8 bits are taken. Inside a
742 character class, \ followed by a digit is always an octal number. */
743
744 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
745 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
746
747 if (!isclass)
748 {
749 oldptr = ptr;
750 c -= CHAR_0;
751 while ((digitab[ptr[1]] & ctype_digit) != 0)
752 c = c * 10 + *(++ptr) - CHAR_0;
753 if (c < 0) /* Integer overflow */
754 {
755 *errorcodeptr = ERR61;
756 break;
757 }
758 if (c < 10 || c <= bracount)
759 {
760 c = -(ESC_REF + c);
761 break;
762 }
763 ptr = oldptr; /* Put the pointer back and fall through */
764 }
765
766 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
767 generates a binary zero byte and treats the digit as a following literal.
768 Thus we have to pull back the pointer by one. */
769
770 if ((c = *ptr) >= CHAR_8)
771 {
772 ptr--;
773 c = 0;
774 break;
775 }
776
777 /* \0 always starts an octal number, but we may drop through to here with a
778 larger first octal digit. The original code used just to take the least
779 significant 8 bits of octal numbers (I think this is what early Perls used
780 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
781 than 3 octal digits. */
782
783 case CHAR_0:
784 c -= CHAR_0;
785 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
786 c = c * 8 + *(++ptr) - CHAR_0;
787 if (!utf8 && c > 255) *errorcodeptr = ERR51;
788 break;
789
790 /* \x is complicated. \x{ddd} is a character number which can be greater
791 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
792 treated as a data character. */
793
794 case CHAR_x:
795 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
796 {
797 const uschar *pt = ptr + 2;
798 int count = 0;
799
800 c = 0;
801 while ((digitab[*pt] & ctype_xdigit) != 0)
802 {
803 register int cc = *pt++;
804 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
805 count++;
806
807 #ifndef EBCDIC /* ASCII/UTF-8 coding */
808 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
809 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
810 #else /* EBCDIC coding */
811 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
812 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
813 #endif
814 }
815
816 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
817 {
818 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
819 ptr = pt;
820 break;
821 }
822
823 /* If the sequence of hex digits does not end with '}', then we don't
824 recognize this construct; fall through to the normal \x handling. */
825 }
826
827 /* Read just a single-byte hex-defined char */
828
829 c = 0;
830 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
831 {
832 int cc; /* Some compilers don't like */
833 cc = *(++ptr); /* ++ in initializers */
834 #ifndef EBCDIC /* ASCII/UTF-8 coding */
835 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
836 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
837 #else /* EBCDIC coding */
838 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
839 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
840 #endif
841 }
842 break;
843
844 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
845 An error is given if the byte following \c is not an ASCII character. This
846 coding is ASCII-specific, but then the whole concept of \cx is
847 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
848
849 case CHAR_c:
850 c = *(++ptr);
851 if (c == 0)
852 {
853 *errorcodeptr = ERR2;
854 break;
855 }
856 #ifndef EBCDIC /* ASCII/UTF-8 coding */
857 if (c > 127) /* Excludes all non-ASCII in either mode */
858 {
859 *errorcodeptr = ERR68;
860 break;
861 }
862 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
863 c ^= 0x40;
864 #else /* EBCDIC coding */
865 if (c >= CHAR_a && c <= CHAR_z) c += 64;
866 c ^= 0xC0;
867 #endif
868 break;
869
870 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
871 other alphanumeric following \ is an error if PCRE_EXTRA was set;
872 otherwise, for Perl compatibility, it is a literal. This code looks a bit
873 odd, but there used to be some cases other than the default, and there may
874 be again in future, so I haven't "optimized" it. */
875
876 default:
877 if ((options & PCRE_EXTRA) != 0) switch(c)
878 {
879 default:
880 *errorcodeptr = ERR3;
881 break;
882 }
883 break;
884 }
885 }
886
887 /* Perl supports \N{name} for character names, as well as plain \N for "not
888 newline". PCRE does not support \N{name}. */
889
890 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
891 *errorcodeptr = ERR37;
892
893 /* If PCRE_UCP is set, we change the values for \d etc. */
894
895 if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
896 c -= (ESC_DU - ESC_D);
897
898 /* Set the pointer to the final character before returning. */
899
900 *ptrptr = ptr;
901 return c;
902 }
903
904
905
906 #ifdef SUPPORT_UCP
907 /*************************************************
908 * Handle \P and \p *
909 *************************************************/
910
911 /* This function is called after \P or \p has been encountered, provided that
912 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
913 pointing at the P or p. On exit, it is pointing at the final character of the
914 escape sequence.
915
916 Argument:
917 ptrptr points to the pattern position pointer
918 negptr points to a boolean that is set TRUE for negation else FALSE
919 dptr points to an int that is set to the detailed property value
920 errorcodeptr points to the error code variable
921
922 Returns: type value from ucp_type_table, or -1 for an invalid type
923 */
924
925 static int
926 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
927 {
928 int c, i, bot, top;
929 const uschar *ptr = *ptrptr;
930 char name[32];
931
932 c = *(++ptr);
933 if (c == 0) goto ERROR_RETURN;
934
935 *negptr = FALSE;
936
937 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
938 negation. */
939
940 if (c == CHAR_LEFT_CURLY_BRACKET)
941 {
942 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
943 {
944 *negptr = TRUE;
945 ptr++;
946 }
947 for (i = 0; i < (int)sizeof(name) - 1; i++)
948 {
949 c = *(++ptr);
950 if (c == 0) goto ERROR_RETURN;
951 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
952 name[i] = c;
953 }
954 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
955 name[i] = 0;
956 }
957
958 /* Otherwise there is just one following character */
959
960 else
961 {
962 name[0] = c;
963 name[1] = 0;
964 }
965
966 *ptrptr = ptr;
967
968 /* Search for a recognized property name using binary chop */
969
970 bot = 0;
971 top = _pcre_utt_size;
972
973 while (bot < top)
974 {
975 i = (bot + top) >> 1;
976 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
977 if (c == 0)
978 {
979 *dptr = _pcre_utt[i].value;
980 return _pcre_utt[i].type;
981 }
982 if (c > 0) bot = i + 1; else top = i;
983 }
984
985 *errorcodeptr = ERR47;
986 *ptrptr = ptr;
987 return -1;
988
989 ERROR_RETURN:
990 *errorcodeptr = ERR46;
991 *ptrptr = ptr;
992 return -1;
993 }
994 #endif
995
996
997
998
999 /*************************************************
1000 * Check for counted repeat *
1001 *************************************************/
1002
1003 /* This function is called when a '{' is encountered in a place where it might
1004 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1005 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1006 where the ddds are digits.
1007
1008 Arguments:
1009 p pointer to the first char after '{'
1010
1011 Returns: TRUE or FALSE
1012 */
1013
1014 static BOOL
1015 is_counted_repeat(const uschar *p)
1016 {
1017 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1018 while ((digitab[*p] & ctype_digit) != 0) p++;
1019 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1020
1021 if (*p++ != CHAR_COMMA) return FALSE;
1022 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1023
1024 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1025 while ((digitab[*p] & ctype_digit) != 0) p++;
1026
1027 return (*p == CHAR_RIGHT_CURLY_BRACKET);
1028 }
1029
1030
1031
1032 /*************************************************
1033 * Read repeat counts *
1034 *************************************************/
1035
1036 /* Read an item of the form {n,m} and return the values. This is called only
1037 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1038 so the syntax is guaranteed to be correct, but we need to check the values.
1039
1040 Arguments:
1041 p pointer to first char after '{'
1042 minp pointer to int for min
1043 maxp pointer to int for max
1044 returned as -1 if no max
1045 errorcodeptr points to error code variable
1046
1047 Returns: pointer to '}' on success;
1048 current ptr on error, with errorcodeptr set non-zero
1049 */
1050
1051 static const uschar *
1052 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1053 {
1054 int min = 0;
1055 int max = -1;
1056
1057 /* Read the minimum value and do a paranoid check: a negative value indicates
1058 an integer overflow. */
1059
1060 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1061 if (min < 0 || min > 65535)
1062 {
1063 *errorcodeptr = ERR5;
1064 return p;
1065 }
1066
1067 /* Read the maximum value if there is one, and again do a paranoid on its size.
1068 Also, max must not be less than min. */
1069
1070 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1071 {
1072 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1073 {
1074 max = 0;
1075 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1076 if (max < 0 || max > 65535)
1077 {
1078 *errorcodeptr = ERR5;
1079 return p;
1080 }
1081 if (max < min)
1082 {
1083 *errorcodeptr = ERR4;
1084 return p;
1085 }
1086 }
1087 }
1088
1089 /* Fill in the required variables, and pass back the pointer to the terminating
1090 '}'. */
1091
1092 *minp = min;
1093 *maxp = max;
1094 return p;
1095 }
1096
1097
1098
1099 /*************************************************
1100 * Subroutine for finding forward reference *
1101 *************************************************/
1102
1103 /* This recursive function is called only from find_parens() below. The
1104 top-level call starts at the beginning of the pattern. All other calls must
1105 start at a parenthesis. It scans along a pattern's text looking for capturing
1106 subpatterns, and counting them. If it finds a named pattern that matches the
1107 name it is given, it returns its number. Alternatively, if the name is NULL, it
1108 returns when it reaches a given numbered subpattern. Recursion is used to keep
1109 track of subpatterns that reset the capturing group numbers - the (?| feature.
1110
1111 This function was originally called only from the second pass, in which we know
1112 that if (?< or (?' or (?P< is encountered, the name will be correctly
1113 terminated because that is checked in the first pass. There is now one call to
1114 this function in the first pass, to check for a recursive back reference by
1115 name (so that we can make the whole group atomic). In this case, we need check
1116 only up to the current position in the pattern, and that is still OK because
1117 and previous occurrences will have been checked. To make this work, the test
1118 for "end of pattern" is a check against cd->end_pattern in the main loop,
1119 instead of looking for a binary zero. This means that the special first-pass
1120 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1121 processing items within the loop are OK, because afterwards the main loop will
1122 terminate.)
1123
1124 Arguments:
1125 ptrptr address of the current character pointer (updated)
1126 cd compile background data
1127 name name to seek, or NULL if seeking a numbered subpattern
1128 lorn name length, or subpattern number if name is NULL
1129 xmode TRUE if we are in /x mode
1130 utf8 TRUE if we are in UTF-8 mode
1131 count pointer to the current capturing subpattern number (updated)
1132
1133 Returns: the number of the named subpattern, or -1 if not found
1134 */
1135
1136 static int
1137 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1138 BOOL xmode, BOOL utf8, int *count)
1139 {
1140 uschar *ptr = *ptrptr;
1141 int start_count = *count;
1142 int hwm_count = start_count;
1143 BOOL dup_parens = FALSE;
1144
1145 /* If the first character is a parenthesis, check on the type of group we are
1146 dealing with. The very first call may not start with a parenthesis. */
1147
1148 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1149 {
1150 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1151
1152 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1153
1154 /* Handle a normal, unnamed capturing parenthesis. */
1155
1156 else if (ptr[1] != CHAR_QUESTION_MARK)
1157 {
1158 *count += 1;
1159 if (name == NULL && *count == lorn) return *count;
1160 ptr++;
1161 }
1162
1163 /* All cases now have (? at the start. Remember when we are in a group
1164 where the parenthesis numbers are duplicated. */
1165
1166 else if (ptr[2] == CHAR_VERTICAL_LINE)
1167 {
1168 ptr += 3;
1169 dup_parens = TRUE;
1170 }
1171
1172 /* Handle comments; all characters are allowed until a ket is reached. */
1173
1174 else if (ptr[2] == CHAR_NUMBER_SIGN)
1175 {
1176 for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1177 goto FAIL_EXIT;
1178 }
1179
1180 /* Handle a condition. If it is an assertion, just carry on so that it
1181 is processed as normal. If not, skip to the closing parenthesis of the
1182 condition (there can't be any nested parens). */
1183
1184 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1185 {
1186 ptr += 2;
1187 if (ptr[1] != CHAR_QUESTION_MARK)
1188 {
1189 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1190 if (*ptr != 0) ptr++;
1191 }
1192 }
1193
1194 /* Start with (? but not a condition. */
1195
1196 else
1197 {
1198 ptr += 2;
1199 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1200
1201 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1202
1203 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1204 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1205 {
1206 int term;
1207 const uschar *thisname;
1208 *count += 1;
1209 if (name == NULL && *count == lorn) return *count;
1210 term = *ptr++;
1211 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1212 thisname = ptr;
1213 while (*ptr != term) ptr++;
1214 if (name != NULL && lorn == ptr - thisname &&
1215 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1216 return *count;
1217 term++;
1218 }
1219 }
1220 }
1221
1222 /* Past any initial parenthesis handling, scan for parentheses or vertical
1223 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1224 first-pass call when this value is temporarily adjusted to stop at the current
1225 position. So DO NOT change this to a test for binary zero. */
1226
1227 for (; ptr < cd->end_pattern; ptr++)
1228 {
1229 /* Skip over backslashed characters and also entire \Q...\E */
1230
1231 if (*ptr == CHAR_BACKSLASH)
1232 {
1233 if (*(++ptr) == 0) goto FAIL_EXIT;
1234 if (*ptr == CHAR_Q) for (;;)
1235 {
1236 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1237 if (*ptr == 0) goto FAIL_EXIT;
1238 if (*(++ptr) == CHAR_E) break;
1239 }
1240 continue;
1241 }
1242
1243 /* Skip over character classes; this logic must be similar to the way they
1244 are handled for real. If the first character is '^', skip it. Also, if the
1245 first few characters (either before or after ^) are \Q\E or \E we skip them
1246 too. This makes for compatibility with Perl. Note the use of STR macros to
1247 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1248
1249 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1250 {
1251 BOOL negate_class = FALSE;
1252 for (;;)
1253 {
1254 if (ptr[1] == CHAR_BACKSLASH)
1255 {
1256 if (ptr[2] == CHAR_E)
1257 ptr+= 2;
1258 else if (strncmp((const char *)ptr+2,
1259 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1260 ptr += 4;
1261 else
1262 break;
1263 }
1264 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1265 {
1266 negate_class = TRUE;
1267 ptr++;
1268 }
1269 else break;
1270 }
1271
1272 /* If the next character is ']', it is a data character that must be
1273 skipped, except in JavaScript compatibility mode. */
1274
1275 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1276 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1277 ptr++;
1278
1279 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1280 {
1281 if (*ptr == 0) return -1;
1282 if (*ptr == CHAR_BACKSLASH)
1283 {
1284 if (*(++ptr) == 0) goto FAIL_EXIT;
1285 if (*ptr == CHAR_Q) for (;;)
1286 {
1287 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1288 if (*ptr == 0) goto FAIL_EXIT;
1289 if (*(++ptr) == CHAR_E) break;
1290 }
1291 continue;
1292 }
1293 }
1294 continue;
1295 }
1296
1297 /* Skip comments in /x mode */
1298
1299 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1300 {
1301 ptr++;
1302 while (*ptr != 0)
1303 {
1304 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1305 ptr++;
1306 #ifdef SUPPORT_UTF8
1307 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1308 #endif
1309 }
1310 if (*ptr == 0) goto FAIL_EXIT;
1311 continue;
1312 }
1313
1314 /* Check for the special metacharacters */
1315
1316 if (*ptr == CHAR_LEFT_PARENTHESIS)
1317 {
1318 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1319 if (rc > 0) return rc;
1320 if (*ptr == 0) goto FAIL_EXIT;
1321 }
1322
1323 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1324 {
1325 if (dup_parens && *count < hwm_count) *count = hwm_count;
1326 goto FAIL_EXIT;
1327 }
1328
1329 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1330 {
1331 if (*count > hwm_count) hwm_count = *count;
1332 *count = start_count;
1333 }
1334 }
1335
1336 FAIL_EXIT:
1337 *ptrptr = ptr;
1338 return -1;
1339 }
1340
1341
1342
1343
1344 /*************************************************
1345 * Find forward referenced subpattern *
1346 *************************************************/
1347
1348 /* This function scans along a pattern's text looking for capturing
1349 subpatterns, and counting them. If it finds a named pattern that matches the
1350 name it is given, it returns its number. Alternatively, if the name is NULL, it
1351 returns when it reaches a given numbered subpattern. This is used for forward
1352 references to subpatterns. We used to be able to start this scan from the
1353 current compiling point, using the current count value from cd->bracount, and
1354 do it all in a single loop, but the addition of the possibility of duplicate
1355 subpattern numbers means that we have to scan from the very start, in order to
1356 take account of such duplicates, and to use a recursive function to keep track
1357 of the different types of group.
1358
1359 Arguments:
1360 cd compile background data
1361 name name to seek, or NULL if seeking a numbered subpattern
1362 lorn name length, or subpattern number if name is NULL
1363 xmode TRUE if we are in /x mode
1364 utf8 TRUE if we are in UTF-8 mode
1365
1366 Returns: the number of the found subpattern, or -1 if not found
1367 */
1368
1369 static int
1370 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1371 BOOL utf8)
1372 {
1373 uschar *ptr = (uschar *)cd->start_pattern;
1374 int count = 0;
1375 int rc;
1376
1377 /* If the pattern does not start with an opening parenthesis, the first call
1378 to find_parens_sub() will scan right to the end (if necessary). However, if it
1379 does start with a parenthesis, find_parens_sub() will return when it hits the
1380 matching closing parens. That is why we have to have a loop. */
1381
1382 for (;;)
1383 {
1384 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1385 if (rc > 0 || *ptr++ == 0) break;
1386 }
1387
1388 return rc;
1389 }
1390
1391
1392
1393
1394 /*************************************************
1395 * Find first significant op code *
1396 *************************************************/
1397
1398 /* This is called by several functions that scan a compiled expression looking
1399 for a fixed first character, or an anchoring op code etc. It skips over things
1400 that do not influence this. For some calls, it makes sense to skip negative
1401 forward and all backward assertions, and also the \b assertion; for others it
1402 does not.
1403
1404 Arguments:
1405 code pointer to the start of the group
1406 skipassert TRUE if certain assertions are to be skipped
1407
1408 Returns: pointer to the first significant opcode
1409 */
1410
1411 static const uschar*
1412 first_significant_code(const uschar *code, BOOL skipassert)
1413 {
1414 for (;;)
1415 {
1416 switch ((int)*code)
1417 {
1418 case OP_ASSERT_NOT:
1419 case OP_ASSERTBACK:
1420 case OP_ASSERTBACK_NOT:
1421 if (!skipassert) return code;
1422 do code += GET(code, 1); while (*code == OP_ALT);
1423 code += _pcre_OP_lengths[*code];
1424 break;
1425
1426 case OP_WORD_BOUNDARY:
1427 case OP_NOT_WORD_BOUNDARY:
1428 if (!skipassert) return code;
1429 /* Fall through */
1430
1431 case OP_CALLOUT:
1432 case OP_CREF:
1433 case OP_NCREF:
1434 case OP_RREF:
1435 case OP_NRREF:
1436 case OP_DEF:
1437 code += _pcre_OP_lengths[*code];
1438 break;
1439
1440 default:
1441 return code;
1442 }
1443 }
1444 /* Control never reaches here */
1445 }
1446
1447
1448
1449
1450 /*************************************************
1451 * Find the fixed length of a branch *
1452 *************************************************/
1453
1454 /* Scan a branch and compute the fixed length of subject that will match it,
1455 if the length is fixed. This is needed for dealing with backward assertions.
1456 In UTF8 mode, the result is in characters rather than bytes. The branch is
1457 temporarily terminated with OP_END when this function is called.
1458
1459 This function is called when a backward assertion is encountered, so that if it
1460 fails, the error message can point to the correct place in the pattern.
1461 However, we cannot do this when the assertion contains subroutine calls,
1462 because they can be forward references. We solve this by remembering this case
1463 and doing the check at the end; a flag specifies which mode we are running in.
1464
1465 Arguments:
1466 code points to the start of the pattern (the bracket)
1467 utf8 TRUE in UTF-8 mode
1468 atend TRUE if called when the pattern is complete
1469 cd the "compile data" structure
1470
1471 Returns: the fixed length,
1472 or -1 if there is no fixed length,
1473 or -2 if \C was encountered
1474 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1475 */
1476
1477 static int
1478 find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
1479 {
1480 int length = -1;
1481
1482 register int branchlength = 0;
1483 register uschar *cc = code + 1 + LINK_SIZE;
1484
1485 /* Scan along the opcodes for this branch. If we get to the end of the
1486 branch, check the length against that of the other branches. */
1487
1488 for (;;)
1489 {
1490 int d;
1491 uschar *ce, *cs;
1492 register int op = *cc;
1493 switch (op)
1494 {
1495 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1496 OP_BRA (normal non-capturing bracket) because the other variants of these
1497 opcodes are all concerned with unlimited repeated groups, which of course
1498 are not of fixed length. They will cause a -1 response from the default
1499 case of this switch. */
1500
1501 case OP_CBRA:
1502 case OP_BRA:
1503 case OP_ONCE:
1504 case OP_COND:
1505 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1506 if (d < 0) return d;
1507 branchlength += d;
1508 do cc += GET(cc, 1); while (*cc == OP_ALT);
1509 cc += 1 + LINK_SIZE;
1510 break;
1511
1512 /* Reached end of a branch; if it's a ket it is the end of a nested
1513 call. If it's ALT it is an alternation in a nested call. If it is
1514 END it's the end of the outer call. All can be handled by the same code.
1515 Note that we must not include the OP_KETRxxx opcodes here, because they
1516 all imply an unlimited repeat. */
1517
1518 case OP_ALT:
1519 case OP_KET:
1520 case OP_END:
1521 if (length < 0) length = branchlength;
1522 else if (length != branchlength) return -1;
1523 if (*cc != OP_ALT) return length;
1524 cc += 1 + LINK_SIZE;
1525 branchlength = 0;
1526 break;
1527
1528 /* A true recursion implies not fixed length, but a subroutine call may
1529 be OK. If the subroutine is a forward reference, we can't deal with
1530 it until the end of the pattern, so return -3. */
1531
1532 case OP_RECURSE:
1533 if (!atend) return -3;
1534 cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1535 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1536 if (cc > cs && cc < ce) return -1; /* Recursion */
1537 d = find_fixedlength(cs + 2, utf8, atend, cd);
1538 if (d < 0) return d;
1539 branchlength += d;
1540 cc += 1 + LINK_SIZE;
1541 break;
1542
1543 /* Skip over assertive subpatterns */
1544
1545 case OP_ASSERT:
1546 case OP_ASSERT_NOT:
1547 case OP_ASSERTBACK:
1548 case OP_ASSERTBACK_NOT:
1549 do cc += GET(cc, 1); while (*cc == OP_ALT);
1550 /* Fall through */
1551
1552 /* Skip over things that don't match chars */
1553
1554 case OP_REVERSE:
1555 case OP_CREF:
1556 case OP_NCREF:
1557 case OP_RREF:
1558 case OP_NRREF:
1559 case OP_DEF:
1560 case OP_CALLOUT:
1561 case OP_SOD:
1562 case OP_SOM:
1563 case OP_SET_SOM:
1564 case OP_EOD:
1565 case OP_EODN:
1566 case OP_CIRC:
1567 case OP_CIRCM:
1568 case OP_DOLL:
1569 case OP_DOLLM:
1570 case OP_NOT_WORD_BOUNDARY:
1571 case OP_WORD_BOUNDARY:
1572 cc += _pcre_OP_lengths[*cc];
1573 break;
1574
1575 /* Handle literal characters */
1576
1577 case OP_CHAR:
1578 case OP_CHARI:
1579 case OP_NOT:
1580 case OP_NOTI:
1581 branchlength++;
1582 cc += 2;
1583 #ifdef SUPPORT_UTF8
1584 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1585 #endif
1586 break;
1587
1588 /* Handle exact repetitions. The count is already in characters, but we
1589 need to skip over a multibyte character in UTF8 mode. */
1590
1591 case OP_EXACT:
1592 branchlength += GET2(cc,1);
1593 cc += 4;
1594 #ifdef SUPPORT_UTF8
1595 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1596 #endif
1597 break;
1598
1599 case OP_TYPEEXACT:
1600 branchlength += GET2(cc,1);
1601 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1602 cc += 4;
1603 break;
1604
1605 /* Handle single-char matchers */
1606
1607 case OP_PROP:
1608 case OP_NOTPROP:
1609 cc += 2;
1610 /* Fall through */
1611
1612 case OP_NOT_DIGIT:
1613 case OP_DIGIT:
1614 case OP_NOT_WHITESPACE:
1615 case OP_WHITESPACE:
1616 case OP_NOT_WORDCHAR:
1617 case OP_WORDCHAR:
1618 case OP_ANY:
1619 case OP_ALLANY:
1620 branchlength++;
1621 cc++;
1622 break;
1623
1624 /* The single-byte matcher isn't allowed */
1625
1626 case OP_ANYBYTE:
1627 return -2;
1628
1629 /* Check a class for variable quantification */
1630
1631 #ifdef SUPPORT_UTF8
1632 case OP_XCLASS:
1633 cc += GET(cc, 1) - 33;
1634 /* Fall through */
1635 #endif
1636
1637 case OP_CLASS:
1638 case OP_NCLASS:
1639 cc += 33;
1640
1641 switch (*cc)
1642 {
1643 case OP_CRSTAR:
1644 case OP_CRMINSTAR:
1645 case OP_CRQUERY:
1646 case OP_CRMINQUERY:
1647 return -1;
1648
1649 case OP_CRRANGE:
1650 case OP_CRMINRANGE:
1651 if (GET2(cc,1) != GET2(cc,3)) return -1;
1652 branchlength += GET2(cc,1);
1653 cc += 5;
1654 break;
1655
1656 default:
1657 branchlength++;
1658 }
1659 break;
1660
1661 /* Anything else is variable length */
1662
1663 default:
1664 return -1;
1665 }
1666 }
1667 /* Control never gets here */
1668 }
1669
1670
1671
1672
1673 /*************************************************
1674 * Scan compiled regex for specific bracket *
1675 *************************************************/
1676
1677 /* This little function scans through a compiled pattern until it finds a
1678 capturing bracket with the given number, or, if the number is negative, an
1679 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1680 so that it can be called from pcre_study() when finding the minimum matching
1681 length.
1682
1683 Arguments:
1684 code points to start of expression
1685 utf8 TRUE in UTF-8 mode
1686 number the required bracket number or negative to find a lookbehind
1687
1688 Returns: pointer to the opcode for the bracket, or NULL if not found
1689 */
1690
1691 const uschar *
1692 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1693 {
1694 for (;;)
1695 {
1696 register int c = *code;
1697 if (c == OP_END) return NULL;
1698
1699 /* XCLASS is used for classes that cannot be represented just by a bit
1700 map. This includes negated single high-valued characters. The length in
1701 the table is zero; the actual length is stored in the compiled code. */
1702
1703 if (c == OP_XCLASS) code += GET(code, 1);
1704
1705 /* Handle recursion */
1706
1707 else if (c == OP_REVERSE)
1708 {
1709 if (number < 0) return (uschar *)code;
1710 code += _pcre_OP_lengths[c];
1711 }
1712
1713 /* Handle capturing bracket */
1714
1715 else if (c == OP_CBRA || c == OP_SCBRA ||
1716 c == OP_CBRAPOS || c == OP_SCBRAPOS)
1717 {
1718 int n = GET2(code, 1+LINK_SIZE);
1719 if (n == number) return (uschar *)code;
1720 code += _pcre_OP_lengths[c];
1721 }
1722
1723 /* Otherwise, we can get the item's length from the table, except that for
1724 repeated character types, we have to test for \p and \P, which have an extra
1725 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1726 must add in its length. */
1727
1728 else
1729 {
1730 switch(c)
1731 {
1732 case OP_TYPESTAR:
1733 case OP_TYPEMINSTAR:
1734 case OP_TYPEPLUS:
1735 case OP_TYPEMINPLUS:
1736 case OP_TYPEQUERY:
1737 case OP_TYPEMINQUERY:
1738 case OP_TYPEPOSSTAR:
1739 case OP_TYPEPOSPLUS:
1740 case OP_TYPEPOSQUERY:
1741 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1742 break;
1743
1744 case OP_TYPEUPTO:
1745 case OP_TYPEMINUPTO:
1746 case OP_TYPEEXACT:
1747 case OP_TYPEPOSUPTO:
1748 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1749 break;
1750
1751 case OP_MARK:
1752 case OP_PRUNE_ARG:
1753 case OP_SKIP_ARG:
1754 code += code[1];
1755 break;
1756
1757 case OP_THEN_ARG:
1758 code += code[1+LINK_SIZE];
1759 break;
1760 }
1761
1762 /* Add in the fixed length from the table */
1763
1764 code += _pcre_OP_lengths[c];
1765
1766 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1767 a multi-byte character. The length in the table is a minimum, so we have to
1768 arrange to skip the extra bytes. */
1769
1770 #ifdef SUPPORT_UTF8
1771 if (utf8) switch(c)
1772 {
1773 case OP_CHAR:
1774 case OP_CHARI:
1775 case OP_EXACT:
1776 case OP_EXACTI:
1777 case OP_UPTO:
1778 case OP_UPTOI:
1779 case OP_MINUPTO:
1780 case OP_MINUPTOI:
1781 case OP_POSUPTO:
1782 case OP_POSUPTOI:
1783 case OP_STAR:
1784 case OP_STARI:
1785 case OP_MINSTAR:
1786 case OP_MINSTARI:
1787 case OP_POSSTAR:
1788 case OP_POSSTARI:
1789 case OP_PLUS:
1790 case OP_PLUSI:
1791 case OP_MINPLUS:
1792 case OP_MINPLUSI:
1793 case OP_POSPLUS:
1794 case OP_POSPLUSI:
1795 case OP_QUERY:
1796 case OP_QUERYI:
1797 case OP_MINQUERY:
1798 case OP_MINQUERYI:
1799 case OP_POSQUERY:
1800 case OP_POSQUERYI:
1801 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1802 break;
1803 }
1804 #else
1805 (void)(utf8); /* Keep compiler happy by referencing function argument */
1806 #endif
1807 }
1808 }
1809 }
1810
1811
1812
1813 /*************************************************
1814 * Scan compiled regex for recursion reference *
1815 *************************************************/
1816
1817 /* This little function scans through a compiled pattern until it finds an
1818 instance of OP_RECURSE.
1819
1820 Arguments:
1821 code points to start of expression
1822 utf8 TRUE in UTF-8 mode
1823
1824 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1825 */
1826
1827 static const uschar *
1828 find_recurse(const uschar *code, BOOL utf8)
1829 {
1830 for (;;)
1831 {
1832 register int c = *code;
1833 if (c == OP_END) return NULL;
1834 if (c == OP_RECURSE) return code;
1835
1836 /* XCLASS is used for classes that cannot be represented just by a bit
1837 map. This includes negated single high-valued characters. The length in
1838 the table is zero; the actual length is stored in the compiled code. */
1839
1840 if (c == OP_XCLASS) code += GET(code, 1);
1841
1842 /* Otherwise, we can get the item's length from the table, except that for
1843 repeated character types, we have to test for \p and \P, which have an extra
1844 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1845 must add in its length. */
1846
1847 else
1848 {
1849 switch(c)
1850 {
1851 case OP_TYPESTAR:
1852 case OP_TYPEMINSTAR:
1853 case OP_TYPEPLUS:
1854 case OP_TYPEMINPLUS:
1855 case OP_TYPEQUERY:
1856 case OP_TYPEMINQUERY:
1857 case OP_TYPEPOSSTAR:
1858 case OP_TYPEPOSPLUS:
1859 case OP_TYPEPOSQUERY:
1860 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1861 break;
1862
1863 case OP_TYPEPOSUPTO:
1864 case OP_TYPEUPTO:
1865 case OP_TYPEMINUPTO:
1866 case OP_TYPEEXACT:
1867 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1868 break;
1869
1870 case OP_MARK:
1871 case OP_PRUNE_ARG:
1872 case OP_SKIP_ARG:
1873 code += code[1];
1874 break;
1875
1876 case OP_THEN_ARG:
1877 code += code[1+LINK_SIZE];
1878 break;
1879 }
1880
1881 /* Add in the fixed length from the table */
1882
1883 code += _pcre_OP_lengths[c];
1884
1885 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1886 by a multi-byte character. The length in the table is a minimum, so we have
1887 to arrange to skip the extra bytes. */
1888
1889 #ifdef SUPPORT_UTF8
1890 if (utf8) switch(c)
1891 {
1892 case OP_CHAR:
1893 case OP_CHARI:
1894 case OP_EXACT:
1895 case OP_EXACTI:
1896 case OP_UPTO:
1897 case OP_UPTOI:
1898 case OP_MINUPTO:
1899 case OP_MINUPTOI:
1900 case OP_POSUPTO:
1901 case OP_POSUPTOI:
1902 case OP_STAR:
1903 case OP_STARI:
1904 case OP_MINSTAR:
1905 case OP_MINSTARI:
1906 case OP_POSSTAR:
1907 case OP_POSSTARI:
1908 case OP_PLUS:
1909 case OP_PLUSI:
1910 case OP_MINPLUS:
1911 case OP_MINPLUSI:
1912 case OP_POSPLUS:
1913 case OP_POSPLUSI:
1914 case OP_QUERY:
1915 case OP_QUERYI:
1916 case OP_MINQUERY:
1917 case OP_MINQUERYI:
1918 case OP_POSQUERY:
1919 case OP_POSQUERYI:
1920 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1921 break;
1922 }
1923 #else
1924 (void)(utf8); /* Keep compiler happy by referencing function argument */
1925 #endif
1926 }
1927 }
1928 }
1929
1930
1931
1932 /*************************************************
1933 * Scan compiled branch for non-emptiness *
1934 *************************************************/
1935
1936 /* This function scans through a branch of a compiled pattern to see whether it
1937 can match the empty string or not. It is called from could_be_empty()
1938 below and from compile_branch() when checking for an unlimited repeat of a
1939 group that can match nothing. Note that first_significant_code() skips over
1940 backward and negative forward assertions when its final argument is TRUE. If we
1941 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1942 bracket whose current branch will already have been scanned.
1943
1944 Arguments:
1945 code points to start of search
1946 endcode points to where to stop
1947 utf8 TRUE if in UTF8 mode
1948 cd contains pointers to tables etc.
1949
1950 Returns: TRUE if what is matched could be empty
1951 */
1952
1953 static BOOL
1954 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1955 compile_data *cd)
1956 {
1957 register int c;
1958 for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
1959 code < endcode;
1960 code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
1961 {
1962 const uschar *ccode;
1963
1964 c = *code;
1965
1966 /* Skip over forward assertions; the other assertions are skipped by
1967 first_significant_code() with a TRUE final argument. */
1968
1969 if (c == OP_ASSERT)
1970 {
1971 do code += GET(code, 1); while (*code == OP_ALT);
1972 c = *code;
1973 continue;
1974 }
1975
1976 /* For a recursion/subroutine call, if its end has been reached, which
1977 implies a subroutine call, we can scan it. */
1978
1979 if (c == OP_RECURSE)
1980 {
1981 BOOL empty_branch = FALSE;
1982 const uschar *scode = cd->start_code + GET(code, 1);
1983 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1984 do
1985 {
1986 if (could_be_empty_branch(scode, endcode, utf8, cd))
1987 {
1988 empty_branch = TRUE;
1989 break;
1990 }
1991 scode += GET(scode, 1);
1992 }
1993 while (*scode == OP_ALT);
1994 if (!empty_branch) return FALSE; /* All branches are non-empty */
1995 continue;
1996 }
1997
1998 /* Groups with zero repeats can of course be empty; skip them. */
1999
2000 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2001 c == OP_BRAPOSZERO)
2002 {
2003 code += _pcre_OP_lengths[c];
2004 do code += GET(code, 1); while (*code == OP_ALT);
2005 c = *code;
2006 continue;
2007 }
2008
2009 /* A nested group that is already marked as "could be empty" can just be
2010 skipped. */
2011
2012 if (c == OP_SBRA || c == OP_SBRAPOS ||
2013 c == OP_SCBRA || c == OP_SCBRAPOS)
2014 {
2015 do code += GET(code, 1); while (*code == OP_ALT);
2016 c = *code;
2017 continue;
2018 }
2019
2020 /* For other groups, scan the branches. */
2021
2022 if (c == OP_BRA || c == OP_BRAPOS ||
2023 c == OP_CBRA || c == OP_CBRAPOS ||
2024 c == OP_ONCE || c == OP_COND)
2025 {
2026 BOOL empty_branch;
2027 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2028
2029 /* If a conditional group has only one branch, there is a second, implied,
2030 empty branch, so just skip over the conditional, because it could be empty.
2031 Otherwise, scan the individual branches of the group. */
2032
2033 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2034 code += GET(code, 1);
2035 else
2036 {
2037 empty_branch = FALSE;
2038 do
2039 {
2040 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2041 empty_branch = TRUE;
2042 code += GET(code, 1);
2043 }
2044 while (*code == OP_ALT);
2045 if (!empty_branch) return FALSE; /* All branches are non-empty */
2046 }
2047
2048 c = *code;
2049 continue;
2050 }
2051
2052 /* Handle the other opcodes */
2053
2054 switch (c)
2055 {
2056 /* Check for quantifiers after a class. XCLASS is used for classes that
2057 cannot be represented just by a bit map. This includes negated single
2058 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2059 actual length is stored in the compiled code, so we must update "code"
2060 here. */
2061
2062 #ifdef SUPPORT_UTF8
2063 case OP_XCLASS:
2064 ccode = code += GET(code, 1);
2065 goto CHECK_CLASS_REPEAT;
2066 #endif
2067
2068 case OP_CLASS:
2069 case OP_NCLASS:
2070 ccode = code + 33;
2071
2072 #ifdef SUPPORT_UTF8
2073 CHECK_CLASS_REPEAT:
2074 #endif
2075
2076 switch (*ccode)
2077 {
2078 case OP_CRSTAR: /* These could be empty; continue */
2079 case OP_CRMINSTAR:
2080 case OP_CRQUERY:
2081 case OP_CRMINQUERY:
2082 break;
2083
2084 default: /* Non-repeat => class must match */
2085 case OP_CRPLUS: /* These repeats aren't empty */
2086 case OP_CRMINPLUS:
2087 return FALSE;
2088
2089 case OP_CRRANGE:
2090 case OP_CRMINRANGE:
2091 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2092 break;
2093 }
2094 break;
2095
2096 /* Opcodes that must match a character */
2097
2098 case OP_PROP:
2099 case OP_NOTPROP:
2100 case OP_EXTUNI:
2101 case OP_NOT_DIGIT:
2102 case OP_DIGIT:
2103 case OP_NOT_WHITESPACE:
2104 case OP_WHITESPACE:
2105 case OP_NOT_WORDCHAR:
2106 case OP_WORDCHAR:
2107 case OP_ANY:
2108 case OP_ALLANY:
2109 case OP_ANYBYTE:
2110 case OP_CHAR:
2111 case OP_CHARI:
2112 case OP_NOT:
2113 case OP_NOTI:
2114 case OP_PLUS:
2115 case OP_MINPLUS:
2116 case OP_POSPLUS:
2117 case OP_EXACT:
2118 case OP_NOTPLUS:
2119 case OP_NOTMINPLUS:
2120 case OP_NOTPOSPLUS:
2121 case OP_NOTEXACT:
2122 case OP_TYPEPLUS:
2123 case OP_TYPEMINPLUS:
2124 case OP_TYPEPOSPLUS:
2125 case OP_TYPEEXACT:
2126 return FALSE;
2127
2128 /* These are going to continue, as they may be empty, but we have to
2129 fudge the length for the \p and \P cases. */
2130
2131 case OP_TYPESTAR:
2132 case OP_TYPEMINSTAR:
2133 case OP_TYPEPOSSTAR:
2134 case OP_TYPEQUERY:
2135 case OP_TYPEMINQUERY:
2136 case OP_TYPEPOSQUERY:
2137 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2138 break;
2139
2140 /* Same for these */
2141
2142 case OP_TYPEUPTO:
2143 case OP_TYPEMINUPTO:
2144 case OP_TYPEPOSUPTO:
2145 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2146 break;
2147
2148 /* End of branch */
2149
2150 case OP_KET:
2151 case OP_KETRMAX:
2152 case OP_KETRMIN:
2153 case OP_KETRPOS:
2154 case OP_ALT:
2155 return TRUE;
2156
2157 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2158 MINUPTO, and POSUPTO may be followed by a multibyte character */
2159
2160 #ifdef SUPPORT_UTF8
2161 case OP_STAR:
2162 case OP_STARI:
2163 case OP_MINSTAR:
2164 case OP_MINSTARI:
2165 case OP_POSSTAR:
2166 case OP_POSSTARI:
2167 case OP_QUERY:
2168 case OP_QUERYI:
2169 case OP_MINQUERY:
2170 case OP_MINQUERYI:
2171 case OP_POSQUERY:
2172 case OP_POSQUERYI:
2173 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2174 break;
2175
2176 case OP_UPTO:
2177 case OP_UPTOI:
2178 case OP_MINUPTO:
2179 case OP_MINUPTOI:
2180 case OP_POSUPTO:
2181 case OP_POSUPTOI:
2182 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2183 break;
2184 #endif
2185
2186 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2187 string. */
2188
2189 case OP_MARK:
2190 case OP_PRUNE_ARG:
2191 case OP_SKIP_ARG:
2192 code += code[1];
2193 break;
2194
2195 case OP_THEN_ARG:
2196 code += code[1+LINK_SIZE];
2197 break;
2198
2199 /* None of the remaining opcodes are required to match a character. */
2200
2201 default:
2202 break;
2203 }
2204 }
2205
2206 return TRUE;
2207 }
2208
2209
2210
2211 /*************************************************
2212 * Scan compiled regex for non-emptiness *
2213 *************************************************/
2214
2215 /* This function is called to check for left recursive calls. We want to check
2216 the current branch of the current pattern to see if it could match the empty
2217 string. If it could, we must look outwards for branches at other levels,
2218 stopping when we pass beyond the bracket which is the subject of the recursion.
2219
2220 Arguments:
2221 code points to start of the recursion
2222 endcode points to where to stop (current RECURSE item)
2223 bcptr points to the chain of current (unclosed) branch starts
2224 utf8 TRUE if in UTF-8 mode
2225 cd pointers to tables etc
2226
2227 Returns: TRUE if what is matched could be empty
2228 */
2229
2230 static BOOL
2231 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2232 BOOL utf8, compile_data *cd)
2233 {
2234 while (bcptr != NULL && bcptr->current_branch >= code)
2235 {
2236 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2237 return FALSE;
2238 bcptr = bcptr->outer;
2239 }
2240 return TRUE;
2241 }
2242
2243
2244
2245 /*************************************************
2246 * Check for POSIX class syntax *
2247 *************************************************/
2248
2249 /* This function is called when the sequence "[:" or "[." or "[=" is
2250 encountered in a character class. It checks whether this is followed by a
2251 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2252 reach an unescaped ']' without the special preceding character, return FALSE.
2253
2254 Originally, this function only recognized a sequence of letters between the
2255 terminators, but it seems that Perl recognizes any sequence of characters,
2256 though of course unknown POSIX names are subsequently rejected. Perl gives an
2257 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2258 didn't consider this to be a POSIX class. Likewise for [:1234:].
2259
2260 The problem in trying to be exactly like Perl is in the handling of escapes. We
2261 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2262 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2263 below handles the special case of \], but does not try to do any other escape
2264 processing. This makes it different from Perl for cases such as [:l\ower:]
2265 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2266 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2267 I think.
2268
2269 Arguments:
2270 ptr pointer to the initial [
2271 endptr where to return the end pointer
2272
2273 Returns: TRUE or FALSE
2274 */
2275
2276 static BOOL
2277 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2278 {
2279 int terminator; /* Don't combine these lines; the Solaris cc */
2280 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2281 for (++ptr; *ptr != 0; ptr++)
2282 {
2283 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2284 {
2285 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2286 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2287 {
2288 *endptr = ptr;
2289 return TRUE;
2290 }
2291 }
2292 }
2293 return FALSE;
2294 }
2295
2296
2297
2298
2299 /*************************************************
2300 * Check POSIX class name *
2301 *************************************************/
2302
2303 /* This function is called to check the name given in a POSIX-style class entry
2304 such as [:alnum:].
2305
2306 Arguments:
2307 ptr points to the first letter
2308 len the length of the name
2309
2310 Returns: a value representing the name, or -1 if unknown
2311 */
2312
2313 static int
2314 check_posix_name(const uschar *ptr, int len)
2315 {
2316 const char *pn = posix_names;
2317 register int yield = 0;
2318 while (posix_name_lengths[yield] != 0)
2319 {
2320 if (len == posix_name_lengths[yield] &&
2321 strncmp((const char *)ptr, pn, len) == 0) return yield;
2322 pn += posix_name_lengths[yield] + 1;
2323 yield++;
2324 }
2325 return -1;
2326 }
2327
2328
2329 /*************************************************
2330 * Adjust OP_RECURSE items in repeated group *
2331 *************************************************/
2332
2333 /* OP_RECURSE items contain an offset from the start of the regex to the group
2334 that is referenced. This means that groups can be replicated for fixed
2335 repetition simply by copying (because the recursion is allowed to refer to
2336 earlier groups that are outside the current group). However, when a group is
2337 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2338 inserted before it, after it has been compiled. This means that any OP_RECURSE
2339 items within it that refer to the group itself or any contained groups have to
2340 have their offsets adjusted. That is one of the jobs of this function. Before it
2341 is called, the partially compiled regex must be temporarily terminated with
2342 OP_END.
2343
2344 This function has been extended with the possibility of forward references for
2345 recursions and subroutine calls. It must also check the list of such references
2346 for the group we are dealing with. If it finds that one of the recursions in
2347 the current group is on this list, it adjusts the offset in the list, not the
2348 value in the reference (which is a group number).
2349
2350 Arguments:
2351 group points to the start of the group
2352 adjust the amount by which the group is to be moved
2353 utf8 TRUE in UTF-8 mode
2354 cd contains pointers to tables etc.
2355 save_hwm the hwm forward reference pointer at the start of the group
2356
2357 Returns: nothing
2358 */
2359
2360 static void
2361 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2362 uschar *save_hwm)
2363 {
2364 uschar *ptr = group;
2365
2366 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2367 {
2368 int offset;
2369 uschar *hc;
2370
2371 /* See if this recursion is on the forward reference list. If so, adjust the
2372 reference. */
2373
2374 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2375 {
2376 offset = GET(hc, 0);
2377 if (cd->start_code + offset == ptr + 1)
2378 {
2379 PUT(hc, 0, offset + adjust);
2380 break;
2381 }
2382 }
2383
2384 /* Otherwise, adjust the recursion offset if it's after the start of this
2385 group. */
2386
2387 if (hc >= cd->hwm)
2388 {
2389 offset = GET(ptr, 1);
2390 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2391 }
2392
2393 ptr += 1 + LINK_SIZE;
2394 }
2395 }
2396
2397
2398
2399 /*************************************************
2400 * Insert an automatic callout point *
2401 *************************************************/
2402
2403 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2404 callout points before each pattern item.
2405
2406 Arguments:
2407 code current code pointer
2408 ptr current pattern pointer
2409 cd pointers to tables etc
2410
2411 Returns: new code pointer
2412 */
2413
2414 static uschar *
2415 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2416 {
2417 *code++ = OP_CALLOUT;
2418 *code++ = 255;
2419 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2420 PUT(code, LINK_SIZE, 0); /* Default length */
2421 return code + 2*LINK_SIZE;
2422 }
2423
2424
2425
2426 /*************************************************
2427 * Complete a callout item *
2428 *************************************************/
2429
2430 /* A callout item contains the length of the next item in the pattern, which
2431 we can't fill in till after we have reached the relevant point. This is used
2432 for both automatic and manual callouts.
2433
2434 Arguments:
2435 previous_callout points to previous callout item
2436 ptr current pattern pointer
2437 cd pointers to tables etc
2438
2439 Returns: nothing
2440 */
2441
2442 static void
2443 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2444 {
2445 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2446 PUT(previous_callout, 2 + LINK_SIZE, length);
2447 }
2448
2449
2450
2451 #ifdef SUPPORT_UCP
2452 /*************************************************
2453 * Get othercase range *
2454 *************************************************/
2455
2456 /* This function is passed the start and end of a class range, in UTF-8 mode
2457 with UCP support. It searches up the characters, looking for internal ranges of
2458 characters in the "other" case. Each call returns the next one, updating the
2459 start address.
2460
2461 Arguments:
2462 cptr points to starting character value; updated
2463 d end value
2464 ocptr where to put start of othercase range
2465 odptr where to put end of othercase range
2466
2467 Yield: TRUE when range returned; FALSE when no more
2468 */
2469
2470 static BOOL
2471 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2472 unsigned int *odptr)
2473 {
2474 unsigned int c, othercase, next;
2475
2476 for (c = *cptr; c <= d; c++)
2477 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2478
2479 if (c > d) return FALSE;
2480
2481 *ocptr = othercase;
2482 next = othercase + 1;
2483
2484 for (++c; c <= d; c++)
2485 {
2486 if (UCD_OTHERCASE(c) != next) break;
2487 next++;
2488 }
2489
2490 *odptr = next - 1;
2491 *cptr = c;
2492
2493 return TRUE;
2494 }
2495
2496
2497
2498 /*************************************************
2499 * Check a character and a property *
2500 *************************************************/
2501
2502 /* This function is called by check_auto_possessive() when a property item
2503 is adjacent to a fixed character.
2504
2505 Arguments:
2506 c the character
2507 ptype the property type
2508 pdata the data for the type
2509 negated TRUE if it's a negated property (\P or \p{^)
2510
2511 Returns: TRUE if auto-possessifying is OK
2512 */
2513
2514 static BOOL
2515 check_char_prop(int c, int ptype, int pdata, BOOL negated)
2516 {
2517 const ucd_record *prop = GET_UCD(c);
2518 switch(ptype)
2519 {
2520 case PT_LAMP:
2521 return (prop->chartype == ucp_Lu ||
2522 prop->chartype == ucp_Ll ||
2523 prop->chartype == ucp_Lt) == negated;
2524
2525 case PT_GC:
2526 return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2527
2528 case PT_PC:
2529 return (pdata == prop->chartype) == negated;
2530
2531 case PT_SC:
2532 return (pdata == prop->script) == negated;
2533
2534 /* These are specials */
2535
2536 case PT_ALNUM:
2537 return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2538 _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2539
2540 case PT_SPACE: /* Perl space */
2541 return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2542 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2543 == negated;
2544
2545 case PT_PXSPACE: /* POSIX space */
2546 return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2547 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2548 c == CHAR_FF || c == CHAR_CR)
2549 == negated;
2550
2551 case PT_WORD:
2552 return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2553 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2554 c == CHAR_UNDERSCORE) == negated;
2555 }
2556 return FALSE;
2557 }
2558 #endif /* SUPPORT_UCP */
2559
2560
2561
2562 /*************************************************
2563 * Check if auto-possessifying is possible *
2564 *************************************************/
2565
2566 /* This function is called for unlimited repeats of certain items, to see
2567 whether the next thing could possibly match the repeated item. If not, it makes
2568 sense to automatically possessify the repeated item.
2569
2570 Arguments:
2571 previous pointer to the repeated opcode
2572 utf8 TRUE in UTF-8 mode
2573 ptr next character in pattern
2574 options options bits
2575 cd contains pointers to tables etc.
2576
2577 Returns: TRUE if possessifying is wanted
2578 */
2579
2580 static BOOL
2581 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2582 int options, compile_data *cd)
2583 {
2584 int c, next;
2585 int op_code = *previous++;
2586
2587 /* Skip whitespace and comments in extended mode */
2588
2589 if ((options & PCRE_EXTENDED) != 0)
2590 {
2591 for (;;)
2592 {
2593 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2594 if (*ptr == CHAR_NUMBER_SIGN)
2595 {
2596 ptr++;
2597 while (*ptr != 0)
2598 {
2599 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2600 ptr++;
2601 #ifdef SUPPORT_UTF8
2602 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2603 #endif
2604 }
2605 }
2606 else break;
2607 }
2608 }
2609
2610 /* If the next item is one that we can handle, get its value. A non-negative
2611 value is a character, a negative value is an escape value. */
2612
2613 if (*ptr == CHAR_BACKSLASH)
2614 {
2615 int temperrorcode = 0;
2616 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2617 if (temperrorcode != 0) return FALSE;
2618 ptr++; /* Point after the escape sequence */
2619 }
2620
2621 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2622 {
2623 #ifdef SUPPORT_UTF8
2624 if (utf8) { GETCHARINC(next, ptr); } else
2625 #endif
2626 next = *ptr++;
2627 }
2628
2629 else return FALSE;
2630
2631 /* Skip whitespace and comments in extended mode */
2632
2633 if ((options & PCRE_EXTENDED) != 0)
2634 {
2635 for (;;)
2636 {
2637 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2638 if (*ptr == CHAR_NUMBER_SIGN)
2639 {
2640 ptr++;
2641 while (*ptr != 0)
2642 {
2643 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2644 ptr++;
2645 #ifdef SUPPORT_UTF8
2646 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2647 #endif
2648 }
2649 }
2650 else break;
2651 }
2652 }
2653
2654 /* If the next thing is itself optional, we have to give up. */
2655
2656 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2657 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2658 return FALSE;
2659
2660 /* Now compare the next item with the previous opcode. First, handle cases when
2661 the next item is a character. */
2662
2663 if (next >= 0) switch(op_code)
2664 {
2665 case OP_CHAR:
2666 #ifdef SUPPORT_UTF8
2667 GETCHARTEST(c, previous);
2668 #else
2669 c = *previous;
2670 #endif
2671 return c != next;
2672
2673 /* For CHARI (caseless character) we must check the other case. If we have
2674 Unicode property support, we can use it to test the other case of
2675 high-valued characters. */
2676
2677 case OP_CHARI:
2678 #ifdef SUPPORT_UTF8
2679 GETCHARTEST(c, previous);
2680 #else
2681 c = *previous;
2682 #endif
2683 if (c == next) return FALSE;
2684 #ifdef SUPPORT_UTF8
2685 if (utf8)
2686 {
2687 unsigned int othercase;
2688 if (next < 128) othercase = cd->fcc[next]; else
2689 #ifdef SUPPORT_UCP
2690 othercase = UCD_OTHERCASE((unsigned int)next);
2691 #else
2692 othercase = NOTACHAR;
2693 #endif
2694 return (unsigned int)c != othercase;
2695 }
2696 else
2697 #endif /* SUPPORT_UTF8 */
2698 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2699
2700 /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2701 opcodes are not used for multi-byte characters, because they are coded using
2702 an XCLASS instead. */
2703
2704 case OP_NOT:
2705 return (c = *previous) == next;
2706
2707 case OP_NOTI:
2708 if ((c = *previous) == next) return TRUE;
2709 #ifdef SUPPORT_UTF8
2710 if (utf8)
2711 {
2712 unsigned int othercase;
2713 if (next < 128) othercase = cd->fcc[next]; else
2714 #ifdef SUPPORT_UCP
2715 othercase = UCD_OTHERCASE(next);
2716 #else
2717 othercase = NOTACHAR;
2718 #endif
2719 return (unsigned int)c == othercase;
2720 }
2721 else
2722 #endif /* SUPPORT_UTF8 */
2723 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2724
2725 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2726 When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2727
2728 case OP_DIGIT:
2729 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2730
2731 case OP_NOT_DIGIT:
2732 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2733
2734 case OP_WHITESPACE:
2735 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2736
2737 case OP_NOT_WHITESPACE:
2738 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2739
2740 case OP_WORDCHAR:
2741 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2742
2743 case OP_NOT_WORDCHAR:
2744 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2745
2746 case OP_HSPACE:
2747 case OP_NOT_HSPACE:
2748 switch(next)
2749 {
2750 case 0x09:
2751 case 0x20:
2752 case 0xa0:
2753 case 0x1680:
2754 case 0x180e:
2755 case 0x2000:
2756 case 0x2001:
2757 case 0x2002:
2758 case 0x2003:
2759 case 0x2004:
2760 case 0x2005:
2761 case 0x2006:
2762 case 0x2007:
2763 case 0x2008:
2764 case 0x2009:
2765 case 0x200A:
2766 case 0x202f:
2767 case 0x205f:
2768 case 0x3000:
2769 return op_code == OP_NOT_HSPACE;
2770 default:
2771 return op_code != OP_NOT_HSPACE;
2772 }
2773
2774 case OP_ANYNL:
2775 case OP_VSPACE:
2776 case OP_NOT_VSPACE:
2777 switch(next)
2778 {
2779 case 0x0a:
2780 case 0x0b:
2781 case 0x0c:
2782 case 0x0d:
2783 case 0x85:
2784 case 0x2028:
2785 case 0x2029:
2786 return op_code == OP_NOT_VSPACE;
2787 default:
2788 return op_code != OP_NOT_VSPACE;
2789 }
2790
2791 #ifdef SUPPORT_UCP
2792 case OP_PROP:
2793 return check_char_prop(next, previous[0], previous[1], FALSE);
2794
2795 case OP_NOTPROP:
2796 return check_char_prop(next, previous[0], previous[1], TRUE);
2797 #endif
2798
2799 default:
2800 return FALSE;
2801 }
2802
2803
2804 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2805 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2806 generated only when PCRE_UCP is *not* set, that is, when only ASCII
2807 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2808 replaced by OP_PROP codes when PCRE_UCP is set. */
2809
2810 switch(op_code)
2811 {
2812 case OP_CHAR:
2813 case OP_CHARI:
2814 #ifdef SUPPORT_UTF8
2815 GETCHARTEST(c, previous);
2816 #else
2817 c = *previous;
2818 #endif
2819 switch(-next)
2820 {
2821 case ESC_d:
2822 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
2823
2824 case ESC_D:
2825 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
2826
2827 case ESC_s:
2828 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
2829
2830 case ESC_S:
2831 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
2832
2833 case ESC_w:
2834 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
2835
2836 case ESC_W:
2837 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
2838
2839 case ESC_h:
2840 case ESC_H:
2841 switch(c)
2842 {
2843 case 0x09:
2844 case 0x20:
2845 case 0xa0:
2846 case 0x1680:
2847 case 0x180e:
2848 case 0x2000:
2849 case 0x2001:
2850 case 0x2002:
2851 case 0x2003:
2852 case 0x2004:
2853 case 0x2005:
2854 case 0x2006:
2855 case 0x2007:
2856 case 0x2008:
2857 case 0x2009:
2858 case 0x200A:
2859 case 0x202f:
2860 case 0x205f:
2861 case 0x3000:
2862 return -next != ESC_h;
2863 default:
2864 return -next == ESC_h;
2865 }
2866
2867 case ESC_v:
2868 case ESC_V:
2869 switch(c)
2870 {
2871 case 0x0a:
2872 case 0x0b:
2873 case 0x0c:
2874 case 0x0d:
2875 case 0x85:
2876 case 0x2028:
2877 case 0x2029:
2878 return -next != ESC_v;
2879 default:
2880 return -next == ESC_v;
2881 }
2882
2883 /* When PCRE_UCP is set, these values get generated for \d etc. Find
2884 their substitutions and process them. The result will always be either
2885 -ESC_p or -ESC_P. Then fall through to process those values. */
2886
2887 #ifdef SUPPORT_UCP
2888 case ESC_du:
2889 case ESC_DU:
2890 case ESC_wu:
2891 case ESC_WU:
2892 case ESC_su:
2893 case ESC_SU:
2894 {
2895 int temperrorcode = 0;
2896 ptr = substitutes[-next - ESC_DU];
2897 next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
2898 if (temperrorcode != 0) return FALSE;
2899 ptr++; /* For compatibility */
2900 }
2901 /* Fall through */
2902
2903 case ESC_p:
2904 case ESC_P:
2905 {
2906 int ptype, pdata, errorcodeptr;
2907 BOOL negated;
2908
2909 ptr--; /* Make ptr point at the p or P */
2910 ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2911 if (ptype < 0) return FALSE;
2912 ptr++; /* Point past the final curly ket */
2913
2914 /* If the property item is optional, we have to give up. (When generated
2915 from \d etc by PCRE_UCP, this test will have been applied much earlier,
2916 to the original \d etc. At this point, ptr will point to a zero byte. */
2917
2918 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2919 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2920 return FALSE;
2921
2922 /* Do the property check. */
2923
2924 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2925 }
2926 #endif
2927
2928 default:
2929 return FALSE;
2930 }
2931
2932 /* In principle, support for Unicode properties should be integrated here as
2933 well. It means re-organizing the above code so as to get hold of the property
2934 values before switching on the op-code. However, I wonder how many patterns
2935 combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2936 these op-codes are never generated.) */
2937
2938 case OP_DIGIT:
2939 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2940 next == -ESC_h || next == -ESC_v || next == -ESC_R;
2941
2942 case OP_NOT_DIGIT:
2943 return next == -ESC_d;
2944
2945 case OP_WHITESPACE:
2946 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
2947
2948 case OP_NOT_WHITESPACE:
2949 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2950
2951 case OP_HSPACE:
2952 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2953 next == -ESC_w || next == -ESC_v || next == -ESC_R;
2954
2955 case OP_NOT_HSPACE:
2956 return next == -ESC_h;
2957
2958 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2959 case OP_ANYNL:
2960 case OP_VSPACE:
2961 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2962
2963 case OP_NOT_VSPACE:
2964 return next == -ESC_v || next == -ESC_R;
2965
2966 case OP_WORDCHAR:
2967 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
2968 next == -ESC_v || next == -ESC_R;
2969
2970 case OP_NOT_WORDCHAR:
2971 return next == -ESC_w || next == -ESC_d;
2972
2973 default:
2974 return FALSE;
2975 }
2976
2977 /* Control does not reach here */
2978 }
2979
2980
2981
2982 /*************************************************
2983 * Compile one branch *
2984 *************************************************/
2985
2986 /* Scan the pattern, compiling it into a vector. If the options are
2987 changed during the branch, the pointer is used to change the external options
2988 bits. This function is used during the pre-compile phase when we are trying
2989 to find out the amount of memory needed, as well as during the real compile
2990 phase. The value of lengthptr distinguishes the two phases.
2991
2992 Arguments:
2993 optionsptr pointer to the option bits
2994 codeptr points to the pointer to the current code point
2995 ptrptr points to the current pattern pointer
2996 errorcodeptr points to error code variable
2997 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2998 reqbyteptr set to the last literal character required, else < 0
2999 bcptr points to current branch chain
3000 cd contains pointers to tables etc.
3001 lengthptr NULL during the real compile phase
3002 points to length accumulator during pre-compile phase
3003
3004 Returns: TRUE on success
3005 FALSE, with *errorcodeptr set non-zero on error
3006 */
3007
3008 static BOOL
3009 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
3010 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
3011 compile_data *cd, int *lengthptr)
3012 {
3013 int repeat_type, op_type;
3014 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3015 int bravalue = 0;
3016 int greedy_default, greedy_non_default;
3017 int firstbyte, reqbyte;
3018 int zeroreqbyte, zerofirstbyte;
3019 int req_caseopt, reqvary, tempreqvary;
3020 int options = *optionsptr;
3021 int after_manual_callout = 0;
3022 int length_prevgroup = 0;
3023 register int c;
3024 register uschar *code = *codeptr;
3025 uschar *last_code = code;
3026 uschar *orig_code = code;
3027 uschar *tempcode;
3028 BOOL inescq = FALSE;
3029 BOOL groupsetfirstbyte = FALSE;
3030 const uschar *ptr = *ptrptr;
3031 const uschar *tempptr;
3032 const uschar *nestptr = NULL;
3033 uschar *previous = NULL;
3034 uschar *previous_callout = NULL;
3035 uschar *save_hwm = NULL;
3036 uschar classbits[32];
3037
3038 #ifdef SUPPORT_UTF8
3039 BOOL class_utf8;
3040 BOOL utf8 = (options & PCRE_UTF8) != 0;
3041 uschar *class_utf8data;
3042 uschar *class_utf8data_base;
3043 uschar utf8_char[6];
3044 #else
3045 BOOL utf8 = FALSE;
3046 uschar *utf8_char = NULL;
3047 #endif
3048
3049 #ifdef PCRE_DEBUG
3050 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3051 #endif
3052
3053 /* Set up the default and non-default settings for greediness */
3054
3055 greedy_default = ((options & PCRE_UNGREEDY) != 0);
3056 greedy_non_default = greedy_default ^ 1;
3057
3058 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3059 matching encountered yet". It gets changed to REQ_NONE if we hit something that
3060 matches a non-fixed char first char; reqbyte just remains unset if we never
3061 find one.
3062
3063 When we hit a repeat whose minimum is zero, we may have to adjust these values
3064 to take the zero repeat into account. This is implemented by setting them to
3065 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3066 item types that can be repeated set these backoff variables appropriately. */
3067
3068 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3069
3070 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3071 according to the current setting of the caseless flag. REQ_CASELESS is a bit
3072 value > 255. It is added into the firstbyte or reqbyte variables to record the
3073 case status of the value. This is used only for ASCII characters. */
3074
3075 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3076
3077 /* Switch on next character until the end of the branch */
3078
3079 for (;; ptr++)
3080 {
3081 BOOL negate_class;
3082 BOOL should_flip_negation;
3083 BOOL possessive_quantifier;
3084 BOOL is_quantifier;
3085 BOOL is_recurse;
3086 BOOL reset_bracount;
3087 int class_charcount;
3088 int class_lastchar;
3089 int newoptions;
3090 int recno;
3091 int refsign;
3092 int skipbytes;
3093 int subreqbyte;
3094 int subfirstbyte;
3095 int terminator;
3096 int mclength;
3097 uschar mcbuffer[8];
3098
3099 /* Get next byte in the pattern */
3100
3101 c = *ptr;
3102
3103 /* If we are at the end of a nested substitution, revert to the outer level
3104 string. Nesting only happens one level deep. */
3105
3106 if (c == 0 && nestptr != NULL)
3107 {
3108 ptr = nestptr;
3109 nestptr = NULL;
3110 c = *ptr;
3111 }
3112
3113 /* If we are in the pre-compile phase, accumulate the length used for the
3114 previous cycle of this loop. */
3115
3116 if (lengthptr != NULL)
3117 {
3118 #ifdef PCRE_DEBUG
3119 if (code > cd->hwm) cd->hwm = code; /* High water info */
3120 #endif
3121 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3122 {
3123 *errorcodeptr = ERR52;
3124 goto FAILED;
3125 }
3126
3127 /* There is at least one situation where code goes backwards: this is the
3128 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3129 the class is simply eliminated. However, it is created first, so we have to
3130 allow memory for it. Therefore, don't ever reduce the length at this point.
3131 */
3132
3133 if (code < last_code) code = last_code;
3134
3135 /* Paranoid check for integer overflow */
3136
3137 if (OFLOW_MAX - *lengthptr < code - last_code)
3138 {
3139 *errorcodeptr = ERR20;
3140 goto FAILED;
3141 }
3142
3143 *lengthptr += (int)(code - last_code);
3144 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
3145
3146 /* If "previous" is set and it is not at the start of the work space, move
3147 it back to there, in order to avoid filling up the work space. Otherwise,
3148 if "previous" is NULL, reset the current code pointer to the start. */
3149
3150 if (previous != NULL)
3151 {
3152 if (previous > orig_code)
3153 {
3154 memmove(orig_code, previous, code - previous);
3155 code -= previous - orig_code;
3156 previous = orig_code;
3157 }
3158 }
3159 else code = orig_code;
3160
3161 /* Remember where this code item starts so we can pick up the length
3162 next time round. */
3163
3164 last_code = code;
3165 }
3166
3167 /* In the real compile phase, just check the workspace used by the forward
3168 reference list. */
3169
3170 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3171 {
3172 *errorcodeptr = ERR52;
3173 goto FAILED;
3174 }
3175
3176 /* If in \Q...\E, check for the end; if not, we have a literal */
3177
3178 if (inescq && c != 0)
3179 {
3180 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3181 {
3182 inescq = FALSE;
3183 ptr++;
3184 continue;
3185 }
3186 else
3187 {
3188 if (previous_callout != NULL)
3189 {
3190 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3191 complete_callout(previous_callout, ptr, cd);
3192 previous_callout = NULL;
3193 }
3194 if ((options & PCRE_AUTO_CALLOUT) != 0)
3195 {
3196 previous_callout = code;
3197 code = auto_callout(code, ptr, cd);
3198 }
3199 goto NORMAL_CHAR;
3200 }
3201 }
3202
3203 /* Fill in length of a previous callout, except when the next thing is
3204 a quantifier. */
3205
3206 is_quantifier =
3207 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3208 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3209
3210 if (!is_quantifier && previous_callout != NULL &&
3211 after_manual_callout-- <= 0)
3212 {
3213 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3214 complete_callout(previous_callout, ptr, cd);
3215 previous_callout = NULL;
3216 }
3217
3218 /* In extended mode, skip white space and comments */
3219
3220 if ((options & PCRE_EXTENDED) != 0)
3221 {
3222 if ((cd->ctypes[c] & ctype_space) != 0) continue;
3223 if (c == CHAR_NUMBER_SIGN)
3224 {
3225 ptr++;
3226 while (*ptr != 0)
3227 {
3228 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3229 ptr++;
3230 #ifdef SUPPORT_UTF8
3231 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3232 #endif
3233 }
3234 if (*ptr != 0) continue;
3235
3236 /* Else fall through to handle end of string */
3237 c = 0;
3238 }
3239 }
3240
3241 /* No auto callout for quantifiers. */
3242
3243 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3244 {
3245 previous_callout = code;
3246 code = auto_callout(code, ptr, cd);
3247 }
3248
3249 switch(c)
3250 {
3251 /* ===================================================================*/
3252 case 0: /* The branch terminates at string end */
3253 case CHAR_VERTICAL_LINE: /* or | or ) */
3254 case CHAR_RIGHT_PARENTHESIS:
3255 *firstbyteptr = firstbyte;
3256 *reqbyteptr = reqbyte;
3257 *codeptr = code;
3258 *ptrptr = ptr;
3259 if (lengthptr != NULL)
3260 {
3261 if (OFLOW_MAX - *lengthptr < code - last_code)
3262 {
3263 *errorcodeptr = ERR20;
3264 goto FAILED;
3265 }
3266 *lengthptr += (int)(code - last_code); /* To include callout length */
3267 DPRINTF((">> end branch\n"));
3268 }
3269 return TRUE;
3270
3271
3272 /* ===================================================================*/
3273 /* Handle single-character metacharacters. In multiline mode, ^ disables
3274 the setting of any following char as a first character. */
3275
3276 case CHAR_CIRCUMFLEX_ACCENT:
3277 previous = NULL;
3278 if ((options & PCRE_MULTILINE) != 0)
3279 {
3280 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3281 *code++ = OP_CIRCM;
3282 }
3283 else *code++ = OP_CIRC;
3284 break;
3285
3286 case CHAR_DOLLAR_SIGN:
3287 previous = NULL;
3288 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3289 break;
3290
3291 /* There can never be a first char if '.' is first, whatever happens about
3292 repeats. The value of reqbyte doesn't change either. */
3293
3294 case CHAR_DOT:
3295 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3296 zerofirstbyte = firstbyte;
3297 zeroreqbyte = reqbyte;
3298 previous = code;
3299 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3300 break;
3301
3302
3303 /* ===================================================================*/
3304 /* Character classes. If the included characters are all < 256, we build a
3305 32-byte bitmap of the permitted characters, except in the special case
3306 where there is only one such character. For negated classes, we build the
3307 map as usual, then invert it at the end. However, we use a different opcode
3308 so that data characters > 255 can be handled correctly.
3309
3310 If the class contains characters outside the 0-255 range, a different
3311 opcode is compiled. It may optionally have a bit map for characters < 256,
3312 but those above are explicitly listed afterwards. A flag byte tells
3313 whether the bitmap is present, and whether this is a negated class or not.
3314
3315 In JavaScript compatibility mode, an isolated ']' causes an error. In
3316 default (Perl) mode, it is treated as a data character. */
3317
3318 case CHAR_RIGHT_SQUARE_BRACKET:
3319 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3320 {
3321 *errorcodeptr = ERR64;
3322 goto FAILED;
3323 }
3324 goto NORMAL_CHAR;
3325
3326 case CHAR_LEFT_SQUARE_BRACKET:
3327 previous = code;
3328
3329 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3330 they are encountered at the top level, so we'll do that too. */
3331
3332 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3333 ptr[1] == CHAR_EQUALS_SIGN) &&
3334 check_posix_syntax(ptr, &tempptr))
3335 {
3336 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3337 goto FAILED;
3338 }
3339
3340 /* If the first character is '^', set the negation flag and skip it. Also,
3341 if the first few characters (either before or after ^) are \Q\E or \E we
3342 skip them too. This makes for compatibility with Perl. */
3343
3344 negate_class = FALSE;
3345 for (;;)
3346 {
3347 c = *(++ptr);
3348 if (c == CHAR_BACKSLASH)
3349 {
3350 if (ptr[1] == CHAR_E)
3351 ptr++;
3352 else if (strncmp((const char *)ptr+1,
3353 STR_Q STR_BACKSLASH STR_E, 3) == 0)
3354 ptr += 3;
3355 else
3356 break;
3357 }
3358 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3359 negate_class = TRUE;
3360 else break;
3361 }
3362
3363 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3364 an initial ']' is taken as a data character -- the code below handles
3365 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3366 [^] must match any character, so generate OP_ALLANY. */
3367
3368 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3369 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3370 {
3371 *code++ = negate_class? OP_ALLANY : OP_FAIL;
3372 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3373 zerofirstbyte = firstbyte;
3374 break;
3375 }
3376
3377 /* If a class contains a negative special such as \S, we need to flip the
3378 negation flag at the end, so that support for characters > 255 works
3379 correctly (they are all included in the class). */
3380
3381 should_flip_negation = FALSE;
3382
3383 /* Keep a count of chars with values < 256 so that we can optimize the case
3384 of just a single character (as long as it's < 256). However, for higher
3385 valued UTF-8 characters, we don't yet do any optimization. */
3386
3387 class_charcount = 0;
3388 class_lastchar = -1;
3389
3390 /* Initialize the 32-char bit map to all zeros. We build the map in a
3391 temporary bit of memory, in case the class contains only 1 character (less
3392 than 256), because in that case the compiled code doesn't use the bit map.
3393 */
3394
3395 memset(classbits, 0, 32 * sizeof(uschar));
3396
3397 #ifdef SUPPORT_UTF8
3398 class_utf8 = FALSE; /* No chars >= 256 */
3399 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3400 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3401 #endif
3402
3403 /* Process characters until ] is reached. By writing this as a "do" it
3404 means that an initial ] is taken as a data character. At the start of the
3405 loop, c contains the first byte of the character. */
3406
3407 if (c != 0) do
3408 {
3409 const uschar *oldptr;
3410
3411 #ifdef SUPPORT_UTF8
3412 if (utf8 && c > 127)
3413 { /* Braces are required because the */
3414 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3415 }
3416
3417 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3418 data and reset the pointer. This is so that very large classes that
3419 contain a zillion UTF-8 characters no longer overwrite the work space
3420 (which is on the stack). */
3421
3422 if (lengthptr != NULL)
3423 {
3424 *lengthptr += class_utf8data - class_utf8data_base;
3425 class_utf8data = class_utf8data_base;
3426 }
3427
3428 #endif
3429
3430 /* Inside \Q...\E everything is literal except \E */
3431
3432 if (inescq)
3433 {
3434 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3435 {
3436 inescq = FALSE; /* Reset literal state */
3437 ptr++; /* Skip the 'E' */
3438 continue; /* Carry on with next */
3439 }
3440 goto CHECK_RANGE; /* Could be range if \E follows */
3441 }
3442
3443 /* Handle POSIX class names. Perl allows a negation extension of the
3444 form [:^name:]. A square bracket that doesn't match the syntax is
3445 treated as a literal. We also recognize the POSIX constructions
3446 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3447 5.6 and 5.8 do. */
3448
3449 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3450 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3451 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3452 {
3453 BOOL local_negate = FALSE;
3454 int posix_class, taboffset, tabopt;
3455 register const uschar *cbits = cd->cbits;
3456 uschar pbits[32];
3457
3458 if (ptr[1] != CHAR_COLON)
3459 {
3460 *errorcodeptr = ERR31;
3461 goto FAILED;
3462 }
3463
3464 ptr += 2;
3465 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3466 {
3467 local_negate = TRUE;
3468 should_flip_negation = TRUE; /* Note negative special */
3469 ptr++;
3470 }
3471
3472 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3473 if (posix_class < 0)
3474 {
3475 *errorcodeptr = ERR30;
3476 goto FAILED;
3477 }
3478
3479 /* If matching is caseless, upper and lower are converted to
3480 alpha. This relies on the fact that the class table starts with
3481 alpha, lower, upper as the first 3 entries. */
3482
3483 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3484 posix_class = 0;
3485
3486 /* When PCRE_UCP is set, some of the POSIX classes are converted to
3487 different escape sequences that use Unicode properties. */
3488
3489 #ifdef SUPPORT_UCP
3490 if ((options & PCRE_UCP) != 0)
3491 {
3492 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3493 if (posix_substitutes[pc] != NULL)
3494 {
3495 nestptr = tempptr + 1;
3496 ptr = posix_substitutes[pc] - 1;
3497 continue;
3498 }
3499 }
3500 #endif
3501 /* In the non-UCP case, we build the bit map for the POSIX class in a
3502 chunk of local store because we may be adding and subtracting from it,
3503 and we don't want to subtract bits that may be in the main map already.
3504 At the end we or the result into the bit map that is being built. */
3505
3506 posix_class *= 3;
3507
3508 /* Copy in the first table (always present) */
3509
3510 memcpy(pbits, cbits + posix_class_maps[posix_class],
3511 32 * sizeof(uschar));
3512
3513 /* If there is a second table, add or remove it as required. */
3514
3515 taboffset = posix_class_maps[posix_class + 1];
3516 tabopt = posix_class_maps[posix_class + 2];
3517
3518 if (taboffset >= 0)
3519 {
3520 if (tabopt >= 0)
3521 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3522 else
3523 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3524 }
3525
3526 /* Now see if we need to remove any special characters. An option
3527 value of 1 removes vertical space and 2 removes underscore. */
3528
3529 if (tabopt < 0) tabopt = -tabopt;
3530 if (tabopt == 1) pbits[1] &= ~0x3c;
3531 else if (tabopt == 2) pbits[11] &= 0x7f;
3532
3533 /* Add the POSIX table or its complement into the main table that is
3534 being built and we are done. */
3535
3536 if (local_negate)
3537 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3538 else
3539 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3540
3541 ptr = tempptr + 1;
3542 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3543 continue; /* End of POSIX syntax handling */
3544 }
3545
3546 /* Backslash may introduce a single character, or it may introduce one
3547 of the specials, which just set a flag. The sequence \b is a special
3548 case. Inside a class (and only there) it is treated as backspace. We
3549 assume that other escapes have more than one character in them, so set
3550 class_charcount bigger than one. Unrecognized escapes fall through and
3551 are either treated as literal characters (by default), or are faulted if
3552 PCRE_EXTRA is set. */
3553
3554 if (c == CHAR_BACKSLASH)
3555 {
3556 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3557 if (*errorcodeptr != 0) goto FAILED;
3558
3559 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3560 else if (-c == ESC_Q) /* Handle start of quoted string */
3561 {
3562 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3563 {
3564 ptr += 2; /* avoid empty string */
3565 }
3566 else inescq = TRUE;
3567 continue;
3568 }
3569 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3570
3571 if (c < 0)
3572 {
3573 register const uschar *cbits = cd->cbits;
3574 class_charcount += 2; /* Greater than 1 is what matters */
3575
3576 switch (-c)
3577 {
3578 #ifdef SUPPORT_UCP
3579 case ESC_du: /* These are the values given for \d etc */
3580 case ESC_DU: /* when PCRE_UCP is set. We replace the */
3581 case ESC_wu: /* escape sequence with an appropriate \p */
3582 case ESC_WU: /* or \P to test Unicode properties instead */
3583 case ESC_su: /* of the default ASCII testing. */
3584 case ESC_SU:
3585 nestptr = ptr;
3586 ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3587 class_charcount -= 2; /* Undo! */
3588 continue;
3589 #endif
3590 case ESC_d:
3591 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3592 continue;
3593
3594 case ESC_D:
3595 should_flip_negation = TRUE;
3596 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3597 continue;
3598
3599 case ESC_w:
3600 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3601 continue;
3602
3603 case ESC_W:
3604 should_flip_negation = TRUE;
3605 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3606 continue;
3607
3608 /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3609 if it was previously set by something earlier in the character
3610 class. */
3611
3612 case ESC_s:
3613 classbits[0] |= cbits[cbit_space];
3614 classbits[1] |= cbits[cbit_space+1] & ~0x08;
3615 for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3616 continue;
3617
3618 case ESC_S:
3619 should_flip_negation = TRUE;
3620 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3621 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3622 continue;
3623
3624 case ESC_h:
3625 SETBIT(classbits, 0x09); /* HT */
3626 SETBIT(classbits, 0x20); /* SPACE */
3627 SETBIT(classbits, 0xa0); /* NBSP */
3628 #ifdef SUPPORT_UTF8
3629 if (utf8)
3630 {
3631 class_utf8 = TRUE;
3632 *class_utf8data++ = XCL_SINGLE;
3633 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3634 *class_utf8data++ = XCL_SINGLE;
3635 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3636 *class_utf8data++ = XCL_RANGE;
3637 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3638 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3639 *class_utf8data++ = XCL_SINGLE;
3640 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3641 *class_utf8data++ = XCL_SINGLE;
3642 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3643 *class_utf8data++ = XCL_SINGLE;
3644 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3645 }
3646 #endif
3647 continue;
3648
3649 case ESC_H:
3650 for (c = 0; c < 32; c++)
3651 {
3652 int x = 0xff;
3653 switch (c)
3654 {
3655 case 0x09/8: x ^= 1 << (0x09%8); break;
3656 case 0x20/8: x ^= 1 << (0x20%8); break;
3657 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3658 default: break;
3659 }
3660 classbits[c] |= x;
3661 }
3662
3663 #ifdef SUPPORT_UTF8
3664 if (utf8)
3665 {
3666 class_utf8 = TRUE;
3667 *class_utf8data++ = XCL_RANGE;
3668 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3669 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3670 *class_utf8data++ = XCL_RANGE;
3671 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3672 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3673 *class_utf8data++ = XCL_RANGE;
3674 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3675 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3676 *class_utf8data++ = XCL_RANGE;
3677 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3678 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3679 *class_utf8data++ = XCL_RANGE;
3680 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3681 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3682 *class_utf8data++ = XCL_RANGE;
3683 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3684 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3685 *class_utf8data++ = XCL_RANGE;
3686 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3687 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3688 }
3689 #endif
3690 continue;
3691
3692 case ESC_v:
3693 SETBIT(classbits, 0x0a); /* LF */
3694 SETBIT(classbits, 0x0b); /* VT */
3695 SETBIT(classbits, 0x0c); /* FF */
3696 SETBIT(classbits, 0x0d); /* CR */
3697 SETBIT(classbits, 0x85); /* NEL */
3698 #ifdef SUPPORT_UTF8
3699 if (utf8)
3700 {
3701 class_utf8 = TRUE;
3702 *class_utf8data++ = XCL_RANGE;
3703 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3704 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3705 }
3706 #endif
3707 continue;
3708
3709 case ESC_V:
3710 for (c = 0; c < 32; c++)
3711 {
3712 int x = 0xff;
3713 switch (c)
3714 {
3715 case 0x0a/8: x ^= 1 << (0x0a%8);
3716 x ^= 1 << (0x0b%8);
3717 x ^= 1 << (0x0c%8);
3718 x ^= 1 << (0x0d%8);
3719 break;
3720 case 0x85/8: x ^= 1 << (0x85%8); break;
3721 default: break;
3722 }
3723 classbits[c] |= x;
3724 }
3725
3726 #ifdef SUPPORT_UTF8
3727 if (utf8)
3728 {
3729 class_utf8 = TRUE;
3730 *class_utf8data++ = XCL_RANGE;
3731 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3732 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3733 *class_utf8data++ = XCL_RANGE;
3734 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3735 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3736 }
3737 #endif
3738 continue;
3739
3740 #ifdef SUPPORT_UCP
3741 case ESC_p:
3742 case ESC_P:
3743 {
3744 BOOL negated;
3745 int pdata;
3746 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3747 if (ptype < 0) goto FAILED;
3748 class_utf8 = TRUE;
3749 *class_utf8data++ = ((-c == ESC_p) != negated)?
3750 XCL_PROP : XCL_NOTPROP;
3751 *class_utf8data++ = ptype;
3752 *class_utf8data++ = pdata;
3753 class_charcount -= 2; /* Not a < 256 character */
3754 continue;
3755 }
3756 #endif
3757 /* Unrecognized escapes are faulted if PCRE is running in its
3758 strict mode. By default, for compatibility with Perl, they are
3759 treated as literals. */
3760
3761 default:
3762 if ((options & PCRE_EXTRA) != 0)
3763 {
3764 *errorcodeptr = ERR7;
3765 goto FAILED;
3766 }
3767 class_charcount -= 2; /* Undo the default count from above */
3768 c = *ptr; /* Get the final character and fall through */
3769 break;
3770 }
3771 }
3772
3773 /* Fall through if we have a single character (c >= 0). This may be
3774 greater than 256 in UTF-8 mode. */
3775
3776 } /* End of backslash handling */
3777
3778 /* A single character may be followed by '-' to form a range. However,
3779 Perl does not permit ']' to be the end of the range. A '-' character
3780 at the end is treated as a literal. Perl ignores orphaned \E sequences
3781 entirely. The code for handling \Q and \E is messy. */
3782
3783 CHECK_RANGE:
3784 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3785 {
3786 inescq = FALSE;
3787 ptr += 2;
3788 }
3789
3790 oldptr = ptr;
3791
3792 /* Remember \r or \n */
3793
3794 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3795
3796 /* Check for range */
3797
3798 if (!inescq && ptr[1] == CHAR_MINUS)
3799 {
3800 int d;
3801 ptr += 2;
3802 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3803
3804 /* If we hit \Q (not followed by \E) at this point, go into escaped
3805 mode. */
3806
3807 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3808 {
3809 ptr += 2;
3810 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3811 { ptr += 2; continue; }
3812 inescq = TRUE;
3813 break;
3814 }
3815
3816 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3817 {
3818 ptr = oldptr;
3819 goto LONE_SINGLE_CHARACTER;
3820 }
3821
3822 #ifdef SUPPORT_UTF8
3823 if (utf8)
3824 { /* Braces are required because the */
3825 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3826 }
3827 else
3828 #endif
3829 d = *ptr; /* Not UTF-8 mode */
3830
3831 /* The second part of a range can be a single-character escape, but
3832 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3833 in such circumstances. */
3834
3835 if (!inescq && d == CHAR_BACKSLASH)
3836 {
3837 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3838 if (*errorcodeptr != 0) goto FAILED;
3839
3840 /* \b is backspace; any other special means the '-' was literal */
3841
3842 if (d < 0)
3843 {
3844 if (d == -ESC_b) d = CHAR_BS; else
3845 {
3846 ptr = oldptr;
3847 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3848 }
3849 }
3850 }
3851
3852 /* Check that the two values are in the correct order. Optimize
3853 one-character ranges */
3854
3855 if (d < c)
3856 {
3857 *errorcodeptr = ERR8;
3858 goto FAILED;
3859 }
3860
3861 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3862
3863 /* Remember \r or \n */
3864
3865 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3866
3867 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3868 matching, we have to use an XCLASS with extra data items. Caseless
3869 matching for characters > 127 is available only if UCP support is
3870 available. */
3871
3872 #ifdef SUPPORT_UTF8
3873 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3874 {
3875 class_utf8 = TRUE;
3876
3877 /* With UCP support, we can find the other case equivalents of
3878 the relevant characters. There may be several ranges. Optimize how
3879 they fit with the basic range. */
3880
3881 #ifdef SUPPORT_UCP
3882 if ((options & PCRE_CASELESS) != 0)
3883 {
3884 unsigned int occ, ocd;
3885 unsigned int cc = c;
3886 unsigned int origd = d;
3887 while (get_othercase_range(&cc, origd, &occ, &ocd))
3888 {
3889 if (occ >= (unsigned int)c &&
3890 ocd <= (unsigned int)d)
3891 continue; /* Skip embedded ranges */
3892
3893 if (occ < (unsigned int)c &&
3894 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3895 { /* if there is overlap, */
3896 c = occ; /* noting that if occ < c */
3897 continue; /* we can't have ocd > d */
3898 } /* because a subrange is */
3899 if (ocd > (unsigned int)d &&
3900 occ <= (unsigned int)d + 1) /* always shorter than */
3901 { /* the basic range. */
3902 d = ocd;
3903 continue;
3904 }
3905
3906 if (occ == ocd)
3907 {
3908 *class_utf8data++ = XCL_SINGLE;
3909 }
3910 else
3911 {
3912 *class_utf8data++ = XCL_RANGE;
3913 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3914 }
3915 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3916 }
3917 }
3918 #endif /* SUPPORT_UCP */
3919
3920 /* Now record the original range, possibly modified for UCP caseless
3921 overlapping ranges. */
3922
3923 *class_utf8data++ = XCL_RANGE;
3924 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3925 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3926
3927 /* With UCP support, we are done. Without UCP support, there is no
3928 caseless matching for UTF-8 characters > 127; we can use the bit map
3929 for the smaller ones. */
3930
3931 #ifdef SUPPORT_UCP
3932 continue; /* With next character in the class */
3933 #else
3934 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3935
3936 /* Adjust upper limit and fall through to set up the map */
3937
3938 d = 127;
3939
3940 #endif /* SUPPORT_UCP */
3941 }
3942 #endif /* SUPPORT_UTF8 */
3943
3944 /* We use the bit map for all cases when not in UTF-8 mode; else
3945 ranges that lie entirely within 0-127 when there is UCP support; else
3946 for partial ranges without UCP support. */
3947
3948 class_charcount += d - c + 1;
3949 class_lastchar = d;
3950
3951 /* We can save a bit of time by skipping this in the pre-compile. */
3952
3953 if (lengthptr == NULL) for (; c <= d; c++)
3954 {
3955 classbits[c/8] |= (1 << (c&7));
3956 if ((options & PCRE_CASELESS) != 0)
3957 {
3958 int uc = cd->fcc[c]; /* flip case */
3959 classbits[uc/8] |= (1 << (uc&7));
3960 }
3961 }
3962
3963 continue; /* Go get the next char in the class */
3964 }
3965
3966 /* Handle a lone single character - we can get here for a normal
3967 non-escape char, or after \ that introduces a single character or for an
3968 apparent range that isn't. */
3969
3970 LONE_SINGLE_CHARACTER:
3971
3972 /* Handle a character that cannot go in the bit map */
3973
3974 #ifdef SUPPORT_UTF8
3975 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3976 {
3977 class_utf8 = TRUE;
3978 *class_utf8data++ = XCL_SINGLE;
3979 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3980
3981 #ifdef SUPPORT_UCP
3982 if ((options & PCRE_CASELESS) != 0)
3983 {
3984 unsigned int othercase;
3985 if ((othercase = UCD_OTHERCASE(c)) != c)
3986 {
3987 *class_utf8data++ = XCL_SINGLE;
3988 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3989 }
3990 }
3991 #endif /* SUPPORT_UCP */
3992
3993 }
3994 else
3995 #endif /* SUPPORT_UTF8 */
3996
3997 /* Handle a single-byte character */
3998 {
3999 classbits[c/8] |= (1 << (c&7));
4000 if ((options & PCRE_CASELESS) != 0)
4001 {
4002 c = cd->fcc[c]; /* flip case */
4003 classbits[c/8] |= (1 << (c&7));
4004 }
4005 class_charcount++;
4006 class_lastchar = c;
4007 }
4008 }
4009
4010 /* Loop until ']' reached. This "while" is the end of the "do" far above.
4011 If we are at the end of an internal nested string, revert to the outer
4012 string. */
4013
4014 while (((c = *(++ptr)) != 0 ||
4015 (nestptr != NULL &&
4016 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
4017 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4018
4019 /* Check for missing terminating ']' */
4020
4021 if (c == 0)
4022 {
4023 *errorcodeptr = ERR6;
4024 goto FAILED;
4025 }
4026
4027 /* If class_charcount is 1, we saw precisely one character whose value is
4028 less than 256. As long as there were no characters >= 128 and there was no
4029 use of \p or \P, in other words, no use of any XCLASS features, we can
4030 optimize.
4031
4032 In UTF-8 mode, we can optimize the negative case only if there were no
4033 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4034 operate on single-byte characters only. This is an historical hangover.
4035 Maybe one day we can tidy these opcodes to handle multi-byte characters.
4036
4037 The optimization throws away the bit map. We turn the item into a
4038 1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4039 Note that OP_NOT[I] does not support multibyte characters. In the positive
4040 case, it can cause firstbyte to be set. Otherwise, there can be no first
4041 char if this item is first, whatever repeat count may follow. In the case
4042 of reqbyte, save the previous value for reinstating. */
4043
4044 #ifdef SUPPORT_UTF8
4045 if (class_charcount == 1 && !class_utf8 &&
4046 (!utf8 || !negate_class || class_lastchar < 128))
4047 #else
4048 if (class_charcount == 1)
4049 #endif
4050 {
4051 zeroreqbyte = reqbyte;
4052
4053 /* The OP_NOT[I] opcodes work on one-byte characters only. */
4054
4055 if (negate_class)
4056 {
4057 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4058 zerofirstbyte = firstbyte;
4059 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4060 *code++ = class_lastchar;
4061 break;
4062 }
4063
4064 /* For a single, positive character, get the value into mcbuffer, and
4065 then we can handle this with the normal one-character code. */
4066
4067 #ifdef SUPPORT_UTF8
4068 if (utf8 && class_lastchar > 127)
4069 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
4070 else
4071 #endif
4072 {
4073 mcbuffer[0] = class_lastchar;
4074 mclength = 1;
4075 }
4076 goto ONE_CHAR;
4077 } /* End of 1-char optimization */
4078
4079 /* The general case - not the one-char optimization. If this is the first
4080 thing in the branch, there can be no first char setting, whatever the
4081 repeat count. Any reqbyte setting must remain unchanged after any kind of
4082 repeat. */
4083
4084 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4085 zerofirstbyte = firstbyte;
4086 zeroreqbyte = reqbyte;
4087
4088 /* If there are characters with values > 255, we have to compile an
4089 extended class, with its own opcode, unless there was a negated special
4090 such as \S in the class, and PCRE_UCP is not set, because in that case all
4091 characters > 255 are in the class, so any that were explicitly given as
4092 well can be ignored. If (when there are explicit characters > 255 that must
4093 be listed) there are no characters < 256, we can omit the bitmap in the
4094 actual compiled code. */
4095
4096 #ifdef SUPPORT_UTF8
4097 if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
4098 {
4099 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
4100 *code++ = OP_XCLASS;
4101 code += LINK_SIZE;
4102 *code = negate_class? XCL_NOT : 0;
4103
4104 /* If the map is required, move up the extra data to make room for it;
4105 otherwise just move the code pointer to the end of the extra data. */
4106
4107 if (class_charcount > 0)
4108 {
4109 *code++ |= XCL_MAP;
4110 memmove(code + 32, code, class_utf8data - code);
4111 memcpy(code, classbits, 32);
4112 code = class_utf8data + 32;
4113 }
4114 else code = class_utf8data;
4115
4116 /* Now fill in the complete length of the item */
4117
4118 PUT(previous, 1, code - previous);
4119 break; /* End of class handling */
4120 }
4121 #endif
4122
4123 /* If there are no characters > 255, or they are all to be included or
4124 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4125 whole class was negated and whether there were negative specials such as \S
4126 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4127 negating it if necessary. */
4128
4129 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4130 if (negate_class)
4131 {
4132 if (lengthptr == NULL) /* Save time in the pre-compile phase */
4133 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
4134 }
4135 else
4136 {
4137 memcpy(code, classbits, 32);
4138 }
4139 code += 32;
4140 break;
4141
4142
4143 /* ===================================================================*/
4144 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4145 has been tested above. */
4146
4147 case CHAR_LEFT_CURLY_BRACKET:
4148 if (!is_quantifier) goto NORMAL_CHAR;
4149 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4150 if (*errorcodeptr != 0) goto FAILED;
4151 goto REPEAT;
4152
4153 case CHAR_ASTERISK:
4154 repeat_min = 0;
4155 repeat_max = -1;
4156 goto REPEAT;
4157
4158 case CHAR_PLUS:
4159 repeat_min = 1;
4160 repeat_max = -1;
4161 goto REPEAT;
4162
4163 case CHAR_QUESTION_MARK:
4164 repeat_min = 0;
4165 repeat_max = 1;
4166
4167 REPEAT:
4168 if (previous == NULL)
4169 {
4170 *errorcodeptr = ERR9;
4171 goto FAILED;
4172 }
4173
4174 if (repeat_min == 0)
4175 {
4176 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
4177 reqbyte = zeroreqbyte; /* Ditto */
4178 }
4179
4180 /* Remember whether this is a variable length repeat */
4181
4182 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4183
4184 op_type = 0; /* Default single-char op codes */
4185 possessive_quantifier = FALSE; /* Default not possessive quantifier */
4186
4187 /* Save start of previous item, in case we have to move it up to make space
4188 for an inserted OP_ONCE for the additional '+' extension. */
4189
4190 tempcode = previous;
4191
4192 /* If the next character is '+', we have a possessive quantifier. This
4193 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4194 If the next character is '?' this is a minimizing repeat, by default,
4195 but if PCRE_UNGREEDY is set, it works the other way round. We change the
4196 repeat type to the non-default. */
4197
4198 if (ptr[1] == CHAR_PLUS)
4199 {
4200 repeat_type = 0; /* Force greedy */
4201 possessive_quantifier = TRUE;
4202 ptr++;
4203 }
4204 else if (ptr[1] == CHAR_QUESTION_MARK)
4205 {
4206 repeat_type = greedy_non_default;
4207 ptr++;
4208 }
4209 else repeat_type = greedy_default;
4210
4211 /* If previous was a character match, abolish the item and generate a
4212 repeat item instead. If a char item has a minimum of more than one, ensure
4213 that it is set in reqbyte - it might not be if a sequence such as x{3} is
4214 the first thing in a branch because the x will have gone into firstbyte
4215 instead. */
4216
4217 if (*previous == OP_CHAR || *previous == OP_CHARI)
4218 {
4219 op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
4220
4221 /* Deal with UTF-8 characters that take up more than one byte. It's
4222 easier to write this out separately than try to macrify it. Use c to
4223 hold the length of the character in bytes, plus 0x80 to flag that it's a
4224 length rather than a small character. */
4225
4226 #ifdef SUPPORT_UTF8
4227 if (utf8 && (code[-1] & 0x80) != 0)
4228 {
4229 uschar *lastchar = code - 1;
4230 while((*lastchar & 0xc0) == 0x80) lastchar--;
4231 c = code - lastchar; /* Length of UTF-8 character */
4232 memcpy(utf8_char, lastchar, c); /* Save the char */
4233 c |= 0x80; /* Flag c as a length */
4234 }
4235 else
4236 #endif
4237
4238 /* Handle the case of a single byte - either with no UTF8 support, or
4239 with UTF-8 disabled, or for a UTF-8 character < 128. */
4240
4241 {
4242 c = code[-1];
4243 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
4244 }
4245
4246 /* If the repetition is unlimited, it pays to see if the next thing on
4247 the line is something that cannot possibly match this character. If so,
4248 automatically possessifying this item gains some performance in the case
4249 where the match fails. */
4250
4251 if (!possessive_quantifier &&
4252 repeat_max < 0 &&
4253 check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4254 {
4255 repeat_type = 0; /* Force greedy */
4256 possessive_quantifier = TRUE;
4257 }
4258
4259 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
4260 }
4261
4262 /* If previous was a single negated character ([^a] or similar), we use
4263 one of the special opcodes, replacing it. The code is shared with single-
4264 character repeats by setting op_type to add a suitable offset into
4265 repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
4266 are currently used only for single-byte chars. */
4267
4268 else if (*previous == OP_NOT || *previous == OP_NOTI)
4269 {
4270 op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
4271 c = previous[1];
4272 if (!possessive_quantifier &&
4273 repeat_max < 0 &&
4274 check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4275 {
4276 repeat_type = 0; /* Force greedy */
4277 possessive_quantifier = TRUE;
4278 }
4279 goto OUTPUT_SINGLE_REPEAT;
4280 }
4281
4282 /* If previous was a character type match (\d or similar), abolish it and
4283 create a suitable repeat item. The code is shared with single-character
4284 repeats by setting op_type to add a suitable offset into repeat_type. Note
4285 that the Unicode property types will be present only when SUPPORT_UCP is
4286 defined, but we don't wrap the little bits of code here because it just
4287 makes it horribly messy. */
4288
4289 else if (*previous < OP_EODN)
4290 {
4291 uschar *oldcode;
4292 int prop_type, prop_value;
4293 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
4294 c = *previous;
4295
4296 if (!possessive_quantifier &&
4297 repeat_max < 0 &&
4298 check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4299 {
4300 repeat_type = 0; /* Force greedy */
4301 possessive_quantifier = TRUE;
4302 }
4303
4304 OUTPUT_SINGLE_REPEAT:
4305 if (*previous == OP_PROP || *previous == OP_NOTPROP)
4306 {
4307 prop_type = previous[1];
4308 prop_value = previous[2];
4309 }
4310 else prop_type = prop_value = -1;
4311
4312 oldcode = code;
4313 code = previous; /* Usually overwrite previous item */
4314
4315 /* If the maximum is zero then the minimum must also be zero; Perl allows
4316 this case, so we do too - by simply omitting the item altogether. */
4317
4318 if (repeat_max == 0) goto END_REPEAT;
4319
4320 /*--------------------------------------------------------------------*/
4321 /* This code is obsolete from release 8.00; the restriction was finally
4322 removed: */
4323
4324 /* All real repeats make it impossible to handle partial matching (maybe
4325 one day we will be able to remove this restriction). */
4326
4327 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4328 /*--------------------------------------------------------------------*/
4329
4330 /* Combine the op_type with the repeat_type */
4331
4332 repeat_type += op_type;
4333
4334 /* A minimum of zero is handled either as the special case * or ?, or as
4335 an UPTO, with the maximum given. */
4336
4337 if (repeat_min == 0)
4338 {
4339 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4340 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4341 else
4342 {
4343 *code++ = OP_UPTO + repeat_type;
4344 PUT2INC(code, 0, repeat_max);
4345 }
4346 }
4347
4348 /* A repeat minimum of 1 is optimized into some special cases. If the
4349 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4350 left in place and, if the maximum is greater than 1, we use OP_UPTO with
4351 one less than the maximum. */
4352
4353 else if (repeat_min == 1)
4354 {
4355 if (repeat_max == -1)
4356 *code++ = OP_PLUS + repeat_type;
4357 else
4358 {
4359 code = oldcode; /* leave previous item in place */
4360 if (repeat_max == 1) goto END_REPEAT;
4361 *code++ = OP_UPTO + repeat_type;
4362 PUT2INC(code, 0, repeat_max - 1);
4363 }
4364 }
4365
4366 /* The case {n,n} is just an EXACT, while the general case {n,m} is
4367 handled as an EXACT followed by an UPTO. */
4368
4369 else
4370 {
4371 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
4372 PUT2INC(code, 0, repeat_min);
4373
4374 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4375 we have to insert the character for the previous code. For a repeated
4376 Unicode property match, there are two extra bytes that define the
4377 required property. In UTF-8 mode, long characters have their length in
4378 c, with the 0x80 bit as a flag. */
4379
4380 if (repeat_max < 0)
4381 {
4382 #ifdef SUPPORT_UTF8
4383 if (utf8 && c >= 128)
4384 {
4385 memcpy(code, utf8_char, c & 7);
4386 code += c & 7;
4387 }
4388 else
4389 #endif
4390 {
4391 *code++ = c;
4392 if (prop_type >= 0)
4393 {
4394 *code++ = prop_type;
4395 *code++ = prop_value;
4396 }
4397 }
4398 *code++ = OP_STAR + repeat_type;
4399 }
4400
4401 /* Else insert an UPTO if the max is greater than the min, again
4402 preceded by the character, for the previously inserted code. If the
4403 UPTO is just for 1 instance, we can use QUERY instead. */
4404
4405 else if (repeat_max != repeat_min)
4406 {
4407 #ifdef SUPPORT_UTF8
4408 if (utf8 && c >= 128)
4409 {
4410 memcpy(code, utf8_char, c & 7);
4411 code += c & 7;
4412 }
4413 else
4414 #endif
4415 *code++ = c;
4416 if (prop_type >= 0)
4417 {
4418 *code++ = prop_type;
4419 *code++ = prop_value;
4420 }
4421 repeat_max -= repeat_min;
4422
4423 if (repeat_max == 1)
4424 {
4425 *code++ = OP_QUERY + repeat_type;
4426 }
4427 else
4428 {
4429 *code++ = OP_UPTO + repeat_type;
4430 PUT2INC(code, 0, repeat_max);
4431 }
4432 }
4433 }
4434
4435 /* The character or character type itself comes last in all cases. */
4436
4437 #ifdef SUPPORT_UTF8
4438 if (utf8 && c >= 128)
4439 {
4440 memcpy(code, utf8_char, c & 7);
4441 code += c & 7;
4442 }
4443 else
4444 #endif
4445 *code++ = c;
4446
4447 /* For a repeated Unicode property match, there are two extra bytes that
4448 define the required property. */
4449
4450 #ifdef SUPPORT_UCP
4451 if (prop_type >= 0)
4452 {
4453 *code++ = prop_type;
4454 *code++ = prop_value;
4455 }
4456 #endif
4457 }
4458
4459 /* If previous was a character class or a back reference, we put the repeat
4460 stuff after it, but just skip the item if the repeat was {0,0}. */
4461
4462 else if (*previous == OP_CLASS ||
4463 *previous == OP_NCLASS ||
4464 #ifdef SUPPORT_UTF8
4465 *previous == OP_XCLASS ||
4466 #endif
4467 *previous == OP_REF ||
4468 *previous == OP_REFI)
4469 {
4470 if (repeat_max == 0)
4471 {
4472 code = previous;
4473 goto END_REPEAT;
4474 }
4475
4476 /*--------------------------------------------------------------------*/
4477 /* This code is obsolete from release 8.00; the restriction was finally
4478 removed: */
4479
4480 /* All real repeats make it impossible to handle partial matching (maybe
4481 one day we will be able to remove this restriction). */
4482
4483 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4484 /*--------------------------------------------------------------------*/
4485
4486 if (repeat_min == 0 && repeat_max == -1)
4487 *code++ = OP_CRSTAR + repeat_type;
4488 else if (repeat_min == 1 && repeat_max == -1)
4489 *code++ = OP_CRPLUS + repeat_type;
4490 else if (repeat_min == 0 && repeat_max == 1)
4491 *code++ = OP_CRQUERY + repeat_type;
4492 else
4493 {
4494 *code++ = OP_CRRANGE + repeat_type;
4495 PUT2INC(code, 0, repeat_min);
4496 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4497 PUT2INC(code, 0, repeat_max);
4498 }
4499 }
4500
4501 /* If previous was a bracket group, we may have to replicate it in certain
4502 cases. Note that at this point we can encounter only the "basic" BRA and
4503 KET opcodes, as this is the place where they get converted into the more
4504 special varieties. */
4505
4506 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4507 *previous == OP_ONCE || *previous == OP_COND)
4508 {
4509 register int i;
4510 int len = (int)(code - previous);
4511 uschar *bralink = NULL;
4512 uschar *brazeroptr = NULL;
4513
4514 /* Repeating a DEFINE group is pointless */
4515
4516 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4517 {
4518 *errorcodeptr = ERR55;
4519 goto FAILED;
4520 }
4521
4522 /* The case of a zero minimum is special because of the need to stick
4523 OP_BRAZERO in front of it, and because the group appears once in the
4524 data, whereas in other cases it appears the minimum number of times. For
4525 this reason, it is simplest to treat this case separately, as otherwise
4526 the code gets far too messy. There are several special subcases when the
4527 minimum is zero. */
4528
4529 if (repeat_min == 0)
4530 {
4531 /* If the maximum is also zero, we used to just omit the group from the
4532 output altogether, like this:
4533
4534 ** if (repeat_max == 0)
4535 ** {
4536 ** code = previous;
4537 ** goto END_REPEAT;
4538 ** }
4539
4540 However, that fails when a group is referenced as a subroutine from
4541 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4542 so that it is skipped on execution. As we don't have a list of which
4543 groups are referenced, we cannot do this selectively.
4544
4545 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4546 and do no more at this point. However, we do need to adjust any
4547 OP_RECURSE calls inside the group that refer to the group itself or any
4548 internal or forward referenced group, because the offset is from the
4549 start of the whole regex. Temporarily terminate the pattern while doing
4550 this. */
4551
4552 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4553 {
4554 *code = OP_END;
4555 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4556 memmove(previous+1, previous, len);
4557 code++;
4558 if (repeat_max == 0)
4559 {
4560 *previous++ = OP_SKIPZERO;
4561 goto END_REPEAT;
4562 }
4563 brazeroptr = previous; /* Save for possessive optimizing */
4564 *previous++ = OP_BRAZERO + repeat_type;
4565 }
4566
4567 /* If the maximum is greater than 1 and limited, we have to replicate
4568 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4569 The first one has to be handled carefully because it's the original
4570 copy, which has to be moved up. The remainder can be handled by code
4571 that is common with the non-zero minimum case below. We have to
4572 adjust the value of repeat_max, since one less copy is required. Once
4573 again, we may have to adjust any OP_RECURSE calls inside the group. */
4574
4575 else
4576 {
4577 int offset;
4578 *code = OP_END;
4579 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4580 memmove(previous + 2 + LINK_SIZE, previous, len);
4581 code += 2 + LINK_SIZE;
4582 *previous++ = OP_BRAZERO + repeat_type;
4583 *previous++ = OP_BRA;
4584
4585 /* We chain together the bracket offset fields that have to be
4586 filled in later when the ends of the brackets are reached. */
4587
4588 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
4589 bralink = previous;
4590 PUTINC(previous, 0, offset);
4591 }
4592
4593 repeat_max--;
4594 }
4595
4596 /* If the minimum is greater than zero, replicate the group as many
4597 times as necessary, and adjust the maximum to the number of subsequent
4598 copies that we need. If we set a first char from the group, and didn't
4599 set a required char, copy the latter from the former. If there are any
4600 forward reference subroutine calls in the group, there will be entries on
4601 the workspace list; replicate these with an appropriate increment. */
4602
4603 else
4604 {
4605 if (repeat_min > 1)
4606 {
4607 /* In the pre-compile phase, we don't actually do the replication. We
4608 just adjust the length as if we had. Do some paranoid checks for
4609 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4610 integer type when available, otherwise double. */
4611
4612 if (lengthptr != NULL)
4613 {
4614 int delta = (repeat_min - 1)*length_prevgroup;
4615 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4616 (INT64_OR_DOUBLE)length_prevgroup >
4617 (INT64_OR_DOUBLE)INT_MAX ||
4618 OFLOW_MAX - *lengthptr < delta)
4619 {
4620 *errorcodeptr = ERR20;
4621 goto FAILED;
4622 }
4623 *lengthptr += delta;
4624 }
4625
4626 /* This is compiling for real */
4627
4628 else
4629 {
4630 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4631 for (i = 1; i < repeat_min; i++)
4632 {
4633 uschar *hc;
4634 uschar *this_hwm = cd->hwm;
4635 memcpy(code, previous, len);
4636 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4637 {
4638 PUT(cd->hwm, 0, GET(hc, 0) + len);
4639 cd->hwm += LINK_SIZE;
4640 }
4641 save_hwm = this_hwm;
4642 code += len;
4643 }
4644 }
4645 }
4646
4647 if (repeat_max > 0) repeat_max -= repeat_min;
4648 }
4649
4650 /* This code is common to both the zero and non-zero minimum cases. If
4651 the maximum is limited, it replicates the group in a nested fashion,
4652 remembering the bracket starts on a stack. In the case of a zero minimum,
4653 the first one was set up above. In all cases the repeat_max now specifies
4654 the number of additional copies needed. Again, we must remember to
4655 replicate entries on the forward reference list. */
4656
4657 if (repeat_max >= 0)
4658 {
4659 /* In the pre-compile phase, we don't actually do the replication. We
4660 just adjust the length as if we had. For each repetition we must add 1
4661 to the length for BRAZERO and for all but the last repetition we must
4662 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4663 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4664 a 64-bit integer type when available, otherwise double. */
4665
4666 if (lengthptr != NULL && repeat_max > 0)
4667 {
4668 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4669 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4670 if ((INT64_OR_DOUBLE)repeat_max *
4671 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4672 > (INT64_OR_DOUBLE)INT_MAX ||
4673 OFLOW_MAX - *lengthptr < delta)
4674 {
4675 *errorcodeptr = ERR20;
4676 goto FAILED;
4677 }
4678 *lengthptr += delta;
4679 }
4680
4681 /* This is compiling for real */
4682
4683 else for (i = repeat_max - 1; i >= 0; i--)
4684 {
4685 uschar *hc;
4686 uschar *this_hwm = cd->hwm;
4687
4688 *code++ = OP_BRAZERO + repeat_type;
4689
4690 /* All but the final copy start a new nesting, maintaining the
4691 chain of brackets outstanding. */
4692
4693 if (i != 0)
4694 {
4695 int offset;
4696 *code++ = OP_BRA;
4697 offset = (bralink == NULL)? 0 : (int)(code - bralink);
4698 bralink = code;
4699 PUTINC(code, 0, offset);
4700 }
4701
4702 memcpy(code, previous, len);
4703 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4704 {
4705 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4706 cd->hwm += LINK_SIZE;
4707 }
4708 save_hwm = this_hwm;
4709 code += len;
4710 }
4711
4712 /* Now chain through the pending brackets, and fill in their length
4713 fields (which are holding the chain links pro tem). */
4714
4715 while (bralink != NULL)
4716 {
4717 int oldlinkoffset;
4718 int offset = (int)(code - bralink + 1);
4719 uschar *bra = code - offset;
4720 oldlinkoffset = GET(bra, 1);
4721 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4722 *code++ = OP_KET;
4723 PUTINC(code, 0, offset);
4724 PUT(bra, 1, offset);
4725 }
4726 }
4727
4728 /* If the maximum is unlimited, set a repeater in the final copy. For
4729 ONCE brackets, that's all we need to do.
4730
4731 Otherwise, if the quantifier was possessive, we convert the BRA code to
4732 the POS form, and the KET code to KETRPOS. (It turns out to be convenient
4733 at runtime to detect this kind of subpattern at both the start and at the
4734 end.) The use of special opcodes makes it possible to reduce greatly the
4735 stack usage in pcre_exec(). If the group is preceded by OP_BRAZERO,
4736 convert this to OP_BRAPOSZERO. Then cancel the possessive flag so that
4737 the default action below, of wrapping everything inside atomic brackets,
4738 does not happen.
4739
4740 Then, when we are doing the actual compile phase, check to see whether
4741 this group is one that could match an empty string. If so, convert the
4742 initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so that runtime
4743 checking can be done. [This check is also applied to ONCE groups at
4744 runtime, but in a different way.] */
4745
4746 else
4747 {
4748 uschar *ketcode = code - 1 - LINK_SIZE;
4749 uschar *bracode = ketcode - GET(ketcode, 1);
4750
4751 if (*bracode == OP_ONCE)
4752 *ketcode = OP_KETRMAX + repeat_type;
4753 else
4754 {
4755 if (possessive_quantifier)
4756 {
4757 *bracode += 1; /* Switch to xxxPOS opcodes */
4758 *ketcode = OP_KETRPOS;
4759 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
4760 possessive_quantifier = FALSE;
4761 }
4762 else *ketcode = OP_KETRMAX + repeat_type;
4763
4764 if (lengthptr == NULL)
4765 {
4766 uschar *scode = bracode;
4767 do
4768 {
4769 if (could_be_empty_branch(scode, ketcode, utf8, cd))
4770 {
4771 *bracode += OP_SBRA - OP_BRA;
4772 break;
4773 }
4774 scode += GET(scode, 1);
4775 }
4776 while (*scode == OP_ALT);
4777 }
4778 }
4779 }
4780 }
4781
4782 /* If previous is OP_FAIL, it was generated by an empty class [] in
4783 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4784 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4785 error above. We can just ignore the repeat in JS case. */
4786
4787 else if (*previous == OP_FAIL) goto END_REPEAT;
4788
4789 /* Else there's some kind of shambles */
4790
4791 else
4792 {
4793 *errorcodeptr = ERR11;
4794 goto FAILED;
4795 }
4796
4797 /* If the character following a repeat is '+', or if certain optimization
4798 tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
4799 there are special alternative opcodes for this case. For anything else, we
4800 wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
4801 notation is just syntactic sugar, taken from Sun's Java package, but the
4802 special opcodes can optimize it.
4803
4804 Possessively repeated subpatterns have already been handled in the code
4805 just above, so possessive_quantifier is always FALSE for them at this
4806 stage.
4807
4808 Note that the repeated item starts at tempcode, not at previous, which
4809 might be the first part of a string whose (former) last char we repeated.
4810
4811 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4812 an 'upto' may follow. We skip over an 'exact' item, and then test the
4813 length of what remains before proceeding. */
4814
4815 if (possessive_quantifier)
4816 {
4817 int len;
4818
4819 if (*tempcode == OP_TYPEEXACT)
4820 tempcode += _pcre_OP_lengths[*tempcode] +
4821 ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4822
4823 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4824 {
4825 tempcode += _pcre_OP_lengths[*tempcode];
4826 #ifdef SUPPORT_UTF8
4827 if (utf8 && tempcode[-1] >= 0xc0)
4828 tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4829 #endif
4830 }
4831
4832 len = (int)(code - tempcode);
4833 if (len > 0) switch (*tempcode)
4834 {
4835 case OP_STAR: *tempcode = OP_POSSTAR; break;
4836 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4837 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4838 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4839
4840 case OP_STARI: *tempcode = OP_POSSTARI; break;
4841 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
4842 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
4843 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
4844
4845 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4846 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4847 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4848 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4849
4850 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
4851 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
4852 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
4853 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
4854
4855 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4856 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4857 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4858 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4859
4860 /* Because we are moving code along, we must ensure that any
4861 pending recursive references are updated. */
4862
4863 default:
4864 *code = OP_END;
4865 adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4866 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4867 code += 1 + LINK_SIZE;
4868 len += 1 + LINK_SIZE;
4869 tempcode[0] = OP_ONCE;
4870 *code++ = OP_KET;
4871 PUTINC(code, 0, len);
4872 PUT(tempcode, 1, len);
4873 break;
4874 }
4875 }
4876
4877 /* In all cases we no longer have a previous item. We also set the
4878 "follows varying string" flag for subsequently encountered reqbytes if
4879 it isn't already set and we have just passed a varying length item. */
4880
4881 END_REPEAT:
4882 previous = NULL;
4883 cd->req_varyopt |= reqvary;
4884 break;
4885
4886
4887 /* ===================================================================*/
4888 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4889 lookbehind or option setting or condition or all the other extended
4890 parenthesis forms. */
4891
4892 case CHAR_LEFT_PARENTHESIS:
4893 newoptions = options;
4894 skipbytes = 0;
4895 bravalue = OP_CBRA;
4896 save_hwm = cd->hwm;
4897 reset_bracount = FALSE;
4898
4899 /* First deal with various "verbs" that can be introduced by '*'. */
4900
4901 if (*(++ptr) == CHAR_ASTERISK &&
4902 ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
4903 {
4904 int i, namelen;
4905 int arglen = 0;
4906 const char *vn = verbnames;
4907 const uschar *name = ptr + 1;
4908 const uschar *arg = NULL;
4909 previous = NULL;
4910 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4911 namelen = (int)(ptr - name);
4912
4913 if (*ptr == CHAR_COLON)
4914 {
4915 arg = ++ptr;
4916 while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0
4917 || *ptr == '_') ptr++;
4918 arglen = (int)(ptr - arg);
4919 }
4920
4921 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4922 {
4923 *errorcodeptr = ERR60;
4924 goto FAILED;
4925 }
4926
4927 /* Scan the table of verb names */
4928
4929 for (i = 0; i < verbcount; i++)
4930 {
4931 if (namelen == verbs[i].len &&
4932 strncmp((char *)name, vn, namelen) == 0)
4933 {
4934 /* Check for open captures before ACCEPT */
4935
4936 if (verbs[i].op == OP_ACCEPT)
4937 {
4938 open_capitem *oc;
4939 cd->had_accept = TRUE;
4940 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4941 {
4942 *code++ = OP_CLOSE;
4943 PUT2INC(code, 0, oc->number);
4944 }
4945 }
4946
4947 /* Handle the cases with/without an argument */
4948
4949 if (arglen == 0)
4950 {
4951 if (verbs[i].op < 0) /* Argument is mandatory */
4952 {
4953 *errorcodeptr = ERR66;
4954 goto FAILED;
4955 }
4956 *code = verbs[i].op;
4957 if (*code++ == OP_THEN)
4958 {
4959 PUT(code, 0, code - bcptr->current_branch - 1);
4960 code += LINK_SIZE;
4961 }
4962 }
4963
4964 else
4965 {
4966 if (verbs[i].op_arg < 0) /* Argument is forbidden */
4967 {
4968 *errorcodeptr = ERR59;
4969 goto FAILED;
4970 }
4971 *code = verbs[i].op_arg;
4972 if (*code++ == OP_THEN_ARG)
4973 {
4974 PUT(code, 0, code - bcptr->current_branch - 1);
4975 code += LINK_SIZE;
4976 }
4977 *code++ = arglen;
4978 memcpy(code, arg, arglen);
4979 code += arglen;
4980 *code++ = 0;
4981 }
4982
4983 break; /* Found verb, exit loop */
4984 }
4985
4986 vn += verbs[i].len + 1;
4987 }
4988
4989 if (i < verbcount) continue; /* Successfully handled a verb */
4990 *errorcodeptr = ERR60; /* Verb not recognized */
4991 goto FAILED;
4992 }
4993
4994 /* Deal with the extended parentheses; all are introduced by '?', and the
4995 appearance of any of them means that this is not a capturing group. */
4996
4997 else if (*ptr == CHAR_QUESTION_MARK)
4998 {
4999 int i, set, unset, namelen;
5000 int *optset;
5001 const uschar *name;
5002 uschar *slot;
5003
5004 switch (*(++ptr))
5005 {
5006 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
5007 ptr++;
5008 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5009 if (*ptr == 0)
5010 {
5011 *errorcodeptr = ERR18;
5012 goto FAILED;
5013 }
5014 continue;
5015
5016
5017 /* ------------------------------------------------------------ */
5018 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
5019 reset_bracount = TRUE;
5020 /* Fall through */
5021
5022 /* ------------------------------------------------------------ */
5023 case CHAR_COLON: /* Non-capturing bracket */
5024 bravalue = OP_BRA;
5025 ptr++;
5026 break;
5027
5028
5029 /* ------------------------------------------------------------ */
5030 case CHAR_LEFT_PARENTHESIS:
5031 bravalue = OP_COND; /* Conditional group */
5032
5033 /* A condition can be an assertion, a number (referring to a numbered
5034 group), a name (referring to a named group), or 'R', referring to
5035 recursion. R<digits> and R&name are also permitted for recursion tests.
5036
5037 There are several syntaxes for testing a named group: (?(name)) is used
5038 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5039
5040 There are two unfortunate ambiguities, caused by history. (a) 'R' can
5041 be the recursive thing or the name 'R' (and similarly for 'R' followed
5042 by digits), and (b) a number could be a name that consists of digits.
5043 In both cases, we look for a name first; if not found, we try the other
5044 cases. */
5045
5046 /* For conditions that are assertions, check the syntax, and then exit
5047 the switch. This will take control down to where bracketed groups,
5048 including assertions, are processed. */
5049
5050 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
5051 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
5052 break;
5053
5054 /* Most other conditions use OP_CREF (a couple change to OP_RREF
5055 below), and all need to skip 3 bytes at the start of the group. */
5056
5057 code[1+LINK_SIZE] = OP_CREF;
5058 skipbytes = 3;
5059 refsign = -1;
5060
5061 /* Check for a test for recursion in a named group. */
5062
5063 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
5064 {
5065 terminator = -1;
5066 ptr += 2;
5067 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
5068 }
5069
5070 /* Check for a test for a named group's having been set, using the Perl
5071 syntax (?(<name>) or (?('name') */
5072
5073 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
5074 {
5075 terminator = CHAR_GREATER_THAN_SIGN;
5076 ptr++;
5077 }
5078 else if (ptr[1] == CHAR_APOSTROPHE)
5079 {
5080 terminator = CHAR_APOSTROPHE;
5081 ptr++;
5082 }
5083 else
5084 {
5085 terminator = 0;
5086 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
5087 }
5088
5089 /* We now expect to read a name; anything else is an error */
5090
5091 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
5092 {
5093 ptr += 1; /* To get the right offset */
5094 *errorcodeptr = ERR28;
5095 goto FAILED;
5096 }
5097
5098 /* Read the name, but also get it as a number if it's all digits */
5099
5100 recno = 0;
5101 name = ++ptr;
5102 while ((cd->ctypes[*ptr] & ctype_word) != 0)
5103 {
5104 if (recno >= 0)
5105 recno = ((digitab[*ptr] & ctype_digit) != 0)?
5106 recno * 10 + *ptr - CHAR_0 : -1;
5107 ptr++;
5108 }
5109 namelen = (int)(ptr - name);
5110
5111 if ((terminator > 0 && *ptr++ != terminator) ||
5112 *ptr++ != CHAR_RIGHT_PARENTHESIS)
5113 {
5114 ptr--; /* Error offset */
5115 *errorcodeptr = ERR26;
5116 goto FAILED;
5117 }
5118
5119 /* Do no further checking in the pre-compile phase. */
5120
5121 if (lengthptr != NULL) break;
5122
5123 /* In the real compile we do the work of looking for the actual
5124 reference. If the string started with "+" or "-" we require the rest to
5125 be digits, in which case recno will be set. */
5126
5127 if (refsign > 0)
5128 {
5129 if (recno <= 0)
5130 {
5131 *errorcodeptr = ERR58;
5132 goto FAILED;
5133 }
5134 recno = (refsign == CHAR_MINUS)?
5135 cd->bracount - recno + 1 : recno +cd->bracount;
5136 if (recno <= 0 || recno > cd->final_bracount)
5137 {
5138 *errorcodeptr = ERR15;
5139 goto FAILED;
5140 }
5141 PUT2(code, 2+LINK_SIZE, recno);
5142 break;
5143 }
5144
5145 /* Otherwise (did not start with "+" or "-"), start by looking for the
5146 name. If we find a name, add one to the opcode to change OP_CREF or
5147 OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5148 except they record that the reference was originally to a name. The
5149 information is used to check duplicate names. */
5150
5151 slot = cd->name_table;
5152 for (i = 0; i < cd->names_found; i++)
5153 {
5154 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
5155 slot += cd->name_entry_size;
5156 }
5157
5158 /* Found a previous named subpattern */
5159
5160 if (i < cd->names_found)
5161 {
5162 recno = GET2(slot, 0);
5163 PUT2(code, 2+LINK_SIZE, recno);
5164 code[1+LINK_SIZE]++;
5165 }
5166
5167 /* Search the pattern for a forward reference */
5168
5169 else if ((i = find_parens(cd, name, namelen,
5170 (options & PCRE_EXTENDED) != 0, utf8)) > 0)
5171 {
5172 PUT2(code, 2+LINK_SIZE, i);
5173 code[1+LINK_SIZE]++;
5174 }
5175
5176 /* If terminator == 0 it means that the name followed directly after
5177 the opening parenthesis [e.g. (?(abc)...] and in this case there are
5178 some further alternatives to try. For the cases where terminator != 0
5179 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
5180 now checked all the possibilities, so give an error. */
5181
5182 else if (terminator != 0)
5183 {
5184 *errorcodeptr = ERR15;
5185 goto FAILED;
5186 }
5187
5188 /* Check for (?(R) for recursion. Allow digits after R to specify a
5189 specific group number. */
5190
5191 else if (*name == CHAR_R)
5192 {
5193 recno = 0;
5194 for (i = 1; i < namelen; i++)
5195 {
5196 if ((digitab[name[i]] & ctype_digit) == 0)
5197 {
5198 *errorcodeptr = ERR15;
5199 goto FAILED;
5200 }
5201 recno = recno * 10 + name[i] - CHAR_0;
5202 }
5203 if (recno == 0) recno = RREF_ANY;
5204 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
5205 PUT2(code, 2+LINK_SIZE, recno);
5206 }
5207
5208 /* Similarly, check for the (?(DEFINE) "condition", which is always
5209 false. */
5210
5211 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
5212 {
5213 code[1+LINK_SIZE] = OP_DEF;
5214 skipbytes = 1;
5215 }
5216
5217 /* Check for the "name" actually being a subpattern number. We are
5218 in the second pass here, so final_bracount is set. */
5219
5220 else if (recno > 0 && recno <= cd->final_bracount)
5221 {
5222 PUT2(code, 2+LINK_SIZE, recno);
5223 }
5224
5225 /* Either an unidentified subpattern, or a reference to (?(0) */
5226
5227 else
5228 {
5229 *errorcodeptr = (recno == 0)? ERR35: ERR15;
5230 goto FAILED;
5231 }
5232 break;
5233
5234
5235 /* ------------------------------------------------------------ */
5236 case CHAR_EQUALS_SIGN: /* Positive lookahead */
5237 bravalue = OP_ASSERT;
5238 ptr++;
5239 break;
5240
5241
5242 /* ------------------------------------------------------------ */
5243 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
5244 ptr++;
5245 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
5246 {
5247 *code++ = OP_FAIL;
5248 previous = NULL;
5249 continue;
5250 }
5251 bravalue = OP_ASSERT_NOT;
5252 break;
5253
5254
5255 /* ------------------------------------------------------------ */
5256 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
5257 switch (ptr[1])
5258 {
5259 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
5260 bravalue = OP_ASSERTBACK;
5261 ptr += 2;
5262 break;
5263
5264 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
5265 bravalue = OP_ASSERTBACK_NOT;
5266 ptr += 2;
5267 break;
5268
5269 default: /* Could be name define, else bad */
5270 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
5271 ptr++; /* Correct offset for error */
5272 *errorcodeptr = ERR24;
5273 goto FAILED;
5274 }
5275 break;
5276
5277
5278 /* ------------------------------------------------------------ */
5279 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
5280 bravalue = OP_ONCE;
5281 ptr++;
5282 break;
5283
5284
5285 /* ------------------------------------------------------------ */
5286 case CHAR_C: /* Callout - may be followed by digits; */
5287 previous_callout = code; /* Save for later completion */
5288 after_manual_callout = 1; /* Skip one item before completing */
5289 *code++ = OP_CALLOUT;
5290 {
5291 int n = 0;
5292 while ((digitab[*(++ptr)] & ctype_digit) != 0)
5293 n = n * 10 + *ptr - CHAR_0;
5294 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5295 {
5296 *errorcodeptr = ERR39;
5297 goto FAILED;
5298 }
5299 if (n > 255)
5300 {
5301 *errorcodeptr = ERR38;
5302 goto FAILED;
5303 }
5304 *code++ = n;
5305 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
5306 PUT(code, LINK_SIZE, 0); /* Default length */
5307 code += 2 * LINK_SIZE;
5308 }
5309 previous = NULL;
5310 continue;
5311
5312
5313 /* ------------------------------------------------------------ */
5314 case CHAR_P: /* Python-style named subpattern handling */
5315 if (*(++ptr) == CHAR_EQUALS_SIGN ||
5316 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
5317 {
5318 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
5319 terminator = CHAR_RIGHT_PARENTHESIS;
5320 goto NAMED_REF_OR_RECURSE;
5321 }
5322 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
5323 {
5324 *errorcodeptr = ERR41;
5325 goto FAILED;
5326 }
5327 /* Fall through to handle (?P< as (?< is handled */
5328
5329
5330 /* ------------------------------------------------------------ */
5331 DEFINE_NAME: /* Come here from (?< handling */
5332 case CHAR_APOSTROPHE:
5333 {
5334 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
5335 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5336 name = ++ptr;
5337
5338 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5339 namelen = (int)(ptr - name);
5340
5341 /* In the pre-compile phase, just do a syntax check. */
5342
5343 if (lengthptr != NULL)
5344 {
5345 if (*ptr != terminator)
5346 {
5347 *errorcodeptr = ERR42;
5348 goto FAILED;
5349 }
5350 if (cd->names_found >= MAX_NAME_COUNT)
5351 {
5352 *errorcodeptr = ERR49;
5353 goto FAILED;
5354 }
5355 if (namelen + 3 > cd->name_entry_size)
5356 {
5357 cd->name_entry_size = namelen + 3;
5358 if (namelen > MAX_NAME_SIZE)
5359 {
5360 *errorcodeptr = ERR48;
5361 goto FAILED;
5362 }
5363 }
5364 }
5365
5366 /* In the real compile, create the entry in the table, maintaining
5367 alphabetical order. Duplicate names for different numbers are
5368 permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
5369 number are always OK. (An existing number can be re-used if (?|
5370 appears in the pattern.) In either event, a duplicate name results in
5371 a duplicate entry in the table, even if the number is the same. This
5372 is because the number of names, and hence the table size, is computed
5373 in the pre-compile, and it affects various numbers and pointers which
5374 would all have to be modified, and the compiled code moved down, if
5375 duplicates with the same number were omitted from the table. This
5376 doesn't seem worth the hassle. However, *different* names for the
5377 same number are not permitted. */
5378
5379 else
5380 {
5381 BOOL dupname = FALSE;
5382 slot = cd->name_table;
5383
5384 for (i = 0; i < cd->names_found; i++)
5385 {
5386 int crc = memcmp(name, slot+2, namelen);
5387 if (crc == 0)
5388 {
5389 if (slot[2+namelen] == 0)
5390 {
5391 if (GET2(slot, 0) != cd->bracount + 1 &&
5392 (options & PCRE_DUPNAMES) == 0)
5393 {
5394 *errorcodeptr = ERR43;
5395 goto FAILED;
5396 }
5397 else dupname = TRUE;
5398 }
5399 else crc = -1; /* Current name is a substring */
5400 }
5401
5402 /* Make space in the table and break the loop for an earlier
5403 name. For a duplicate or later name, carry on. We do this for
5404 duplicates so that in the simple case (when (?| is not used) they
5405 are in order of their numbers. */
5406
5407 if (crc < 0)
5408 {
5409 memmove(slot + cd->name_entry_size, slot,
5410 (cd->names_found - i) * cd->name_entry_size);
5411 break;
5412 }
5413
5414 /* Continue the loop for a later or duplicate name */
5415
5416 slot += cd->name_entry_size;
5417 }
5418
5419 /* For non-duplicate names, check for a duplicate number before
5420 adding the new name. */
5421
5422 if (!dupname)
5423 {
5424 uschar *cslot = cd->name_table;
5425 for (i = 0; i < cd->names_found; i++)
5426 {
5427 if (cslot != slot)
5428 {
5429 if (GET2(cslot, 0) == cd->bracount + 1)
5430 {
5431 *errorcodeptr = ERR65;
5432 goto FAILED;
5433 }
5434 }
5435 else i--;
5436 cslot += cd->name_entry_size;
5437 }
5438 }
5439
5440 PUT2(slot, 0, cd->bracount + 1);
5441 memcpy(slot + 2, name, namelen);
5442 slot[2+namelen] = 0;
5443 }
5444 }
5445
5446 /* In both pre-compile and compile, count the number of names we've
5447 encountered. */
5448
5449 cd->names_found++;
5450 ptr++; /* Move past > or ' */
5451 goto NUMBERED_GROUP;
5452
5453
5454 /* ------------------------------------------------------------ */
5455 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
5456 terminator = CHAR_RIGHT_PARENTHESIS;
5457 is_recurse = TRUE;
5458 /* Fall through */
5459
5460 /* We come here from the Python syntax above that handles both
5461 references (?P=name) and recursion (?P>name), as well as falling
5462 through from the Perl recursion syntax (?&name). We also come here from
5463 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
5464 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
5465
5466 NAMED_REF_OR_RECURSE:
5467 name = ++ptr;
5468 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5469 namelen = (int)(ptr - name);
5470
5471 /* In the pre-compile phase, do a syntax check. We used to just set
5472 a dummy reference number, because it was not used in the first pass.
5473 However, with the change of recursive back references to be atomic,
5474 we have to look for the number so that this state can be identified, as
5475 otherwise the incorrect length is computed. If it's not a backwards
5476 reference, the dummy number will do. */
5477
5478 if (lengthptr != NULL)
5479 {
5480 const uschar *temp;
5481
5482 if (namelen == 0)
5483 {
5484 *errorcodeptr = ERR62;
5485 goto FAILED;
5486 }
5487 if (*ptr != terminator)
5488 {
5489 *errorcodeptr = ERR42;
5490 goto FAILED;
5491 }
5492 if (namelen > MAX_NAME_SIZE)
5493 {
5494 *errorcodeptr = ERR48;
5495 goto FAILED;
5496 }
5497
5498 /* The name table does not exist in the first pass, so we cannot
5499 do a simple search as in the code below. Instead, we have to scan the
5500 pattern to find the number. It is important that we scan it only as
5501 far as we have got because the syntax of named subpatterns has not
5502 been checked for the rest of the pattern, and find_parens() assumes
5503 correct syntax. In any case, it's a waste of resources to scan
5504 further. We stop the scan at the current point by temporarily
5505 adjusting the value of cd->end_pattern. */
5506
5507 temp = cd->end_pattern;
5508 cd->end_pattern = ptr;
5509 recno = find_parens(cd, name, namelen,
5510 (options & PCRE_EXTENDED) != 0, utf8);
5511 cd->end_pattern = temp;
5512 if (recno < 0) recno = 0; /* Forward ref; set dummy number */
5513 }
5514
5515 /* In the real compile, seek the name in the table. We check the name
5516 first, and then check that we have reached the end of the name in the
5517 table. That way, if the name is longer than any in the table,
5518 the comparison will fail without reading beyond the table entry. */
5519
5520 else
5521 {
5522 slot = cd->name_table;
5523 for (i = 0; i < cd->names_found; i++)
5524 {
5525 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
5526 slot[2+namelen] == 0)
5527 break;
5528 slot += cd->name_entry_size;
5529 }
5530
5531 if (i < cd->names_found) /* Back reference */
5532 {
5533 recno = GET2(slot, 0);
5534 }
5535 else if ((recno = /* Forward back reference */
5536 find_parens(cd, name, namelen,
5537 (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
5538 {
5539 *errorcodeptr = ERR15;
5540 goto FAILED;
5541 }
5542 }
5543
5544 /* In both phases, we can now go to the code that handles numerical
5545 recursion or backreferences. */
5546
5547 if (is_recurse) goto HANDLE_RECURSION;
5548 else goto HANDLE_REFERENCE;
5549
5550
5551 /* ------------------------------------------------------------ */
5552 case CHAR_R: /* Recursion */
5553 ptr++; /* Same as (?0) */
5554 /* Fall through */
5555
5556
5557 /* ------------------------------------------------------------ */
5558 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
5559 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5560 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5561 {
5562 const uschar *called;
5563 terminator = CHAR_RIGHT_PARENTHESIS;
5564
5565 /* Come here from the \g<...> and \g'...' code (Oniguruma
5566 compatibility). However, the syntax has been checked to ensure that
5567 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5568 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5569 ever be taken. */
5570
5571 HANDLE_NUMERICAL_RECURSION:
5572
5573 if ((refsign = *ptr) == CHAR_PLUS)
5574 {
5575 ptr++;
5576 if ((digitab[*ptr] & ctype_digit) == 0)
5577 {
5578 *errorcodeptr = ERR63;
5579 goto FAILED;
5580 }
5581 }
5582 else if (refsign == CHAR_MINUS)
5583 {
5584 if ((digitab[ptr[1]] & ctype_digit) == 0)
5585 goto OTHER_CHAR_AFTER_QUERY;
5586 ptr++;
5587 }
5588
5589 recno = 0;
5590 while((digitab[*ptr] & ctype_digit) != 0)
5591 recno = recno * 10 + *ptr++ - CHAR_0;
5592
5593 if (*ptr != terminator)
5594 {
5595 *errorcodeptr = ERR29;
5596 goto FAILED;
5597 }
5598
5599 if (refsign == CHAR_MINUS)
5600 {
5601 if (recno == 0)
5602 {
5603 *errorcodeptr = ERR58;
5604 goto FAILED;
5605 }
5606 recno = cd->bracount - recno + 1;
5607 if (recno <= 0)
5608 {
5609 *errorcodeptr = ERR15;
5610 goto FAILED;
5611 }
5612 }
5613 else if (refsign == CHAR_PLUS)
5614 {
5615 if (recno == 0)
5616 {
5617 *errorcodeptr = ERR58;
5618 goto FAILED;
5619 }
5620 recno += cd->bracount;
5621 }
5622
5623 /* Come here from code above that handles a named recursion */
5624
5625 HANDLE_RECURSION:
5626
5627 previous = code;
5628 called = cd->start_code;
5629
5630 /* When we are actually compiling, find the bracket that is being
5631 referenced. Temporarily end the regex in case it doesn't exist before
5632 this point. If we end up with a forward reference, first check that
5633 the bracket does occur later so we can give the error (and position)
5634 now. Then remember this forward reference in the workspace so it can
5635 be filled in at the end. */
5636
5637 if (lengthptr == NULL)
5638 {
5639 *code = OP_END;
5640 if (recno != 0)
5641 called = _pcre_find_bracket(cd->start_code, utf8, recno);
5642
5643 /* Forward reference */
5644
5645 if (called == NULL)
5646 {
5647 if (find_parens(cd, NULL, recno,
5648 (options & PCRE_EXTENDED) != 0, utf8) < 0)
5649 {
5650 *errorcodeptr = ERR15;
5651 goto FAILED;
5652 }
5653
5654 /* Fudge the value of "called" so that when it is inserted as an
5655 offset below, what is actually inserted is the reference number
5656 of the group. */
5657
5658 called = cd->start_code + recno;
5659 PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));
5660 }
5661
5662 /* If not a forward reference, and the subpattern is still open,
5663 this is a recursive call. We check to see if this is a left
5664 recursion that could loop for ever, and diagnose that case. */
5665
5666 else if (GET(called, 1) == 0 &&
5667 could_be_empty(called, code, bcptr, utf8, cd))
5668 {
5669 *errorcodeptr = ERR40;
5670 goto FAILED;
5671 }
5672 }
5673
5674 /* Insert the recursion/subroutine item, automatically wrapped inside
5675 "once" brackets. Set up a "previous group" length so that a
5676 subsequent quantifier will work. */
5677
5678 *code = OP_ONCE;
5679 PUT(code, 1, 2 + 2*LINK_SIZE);
5680 code += 1 + LINK_SIZE;
5681
5682 *code = OP_RECURSE;
5683 PUT(code, 1, (int)(called - cd->start_code));
5684 code += 1 + LINK_SIZE;
5685
5686 *code = OP_KET;
5687 PUT(code, 1, 2 + 2*LINK_SIZE);
5688 code += 1 + LINK_SIZE;
5689
5690 length_prevgroup = 3 + 3*LINK_SIZE;
5691 }
5692
5693 /* Can't determine a first byte now */
5694
5695 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5696 continue;
5697
5698
5699 /* ------------------------------------------------------------ */
5700 default: /* Other characters: check option setting */
5701 OTHER_CHAR_AFTER_QUERY:
5702 set = unset = 0;
5703 optset = &set;
5704
5705 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5706 {
5707 switch (*ptr++)
5708 {
5709 case CHAR_MINUS: optset = &unset; break;
5710
5711 case CHAR_J: /* Record that it changed in the external options */
5712 *optset |= PCRE_DUPNAMES;
5713 cd->external_flags |= PCRE_JCHANGED;
5714 break;
5715
5716 case CHAR_i: *optset |= PCRE_CASELESS; break;
5717 case CHAR_m: *optset |= PCRE_MULTILINE; break;
5718 case CHAR_s: *optset |= PCRE_DOTALL; break;
5719 case CHAR_x: *optset |= PCRE_EXTENDED; break;
5720 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5721 case CHAR_X: *optset |= PCRE_EXTRA; break;
5722
5723 default: *errorcodeptr = ERR12;
5724 ptr--; /* Correct the offset */
5725 goto FAILED;
5726 }
5727 }
5728
5729 /* Set up the changed option bits, but don't change anything yet. */
5730
5731 newoptions = (options | set) & (~unset);
5732
5733 /* If the options ended with ')' this is not the start of a nested
5734 group with option changes, so the options change at this level. If this
5735 item is right at the start of the pattern, the options can be
5736 abstracted and made external in the pre-compile phase, and ignored in
5737 the compile phase. This can be helpful when matching -- for instance in
5738 caseless checking of required bytes.
5739
5740 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5741 definitely *not* at the start of the pattern because something has been
5742 compiled. In the pre-compile phase, however, the code pointer can have
5743 that value after the start, because it gets reset as code is discarded
5744 during the pre-compile. However, this can happen only at top level - if
5745 we are within parentheses, the starting BRA will still be present. At
5746 any parenthesis level, the length value can be used to test if anything
5747 has been compiled at that level. Thus, a test for both these conditions
5748 is necessary to ensure we correctly detect the start of the pattern in
5749 both phases.
5750
5751 If we are not at the pattern start, reset the greedy defaults and the
5752 case value for firstbyte and reqbyte. */
5753
5754 if (*ptr == CHAR_RIGHT_PARENTHESIS)
5755 {
5756 if (code == cd->start_code + 1 + LINK_SIZE &&
5757 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5758 {
5759 cd->external_options = newoptions;
5760 }
5761 else
5762 {
5763 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5764 greedy_non_default = greedy_default ^ 1;
5765 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5766 }
5767
5768 /* Change options at this level, and pass them back for use
5769 in subsequent branches. */
5770
5771 *optionsptr = options = newoptions;
5772 previous = NULL; /* This item can't be repeated */
5773 continue; /* It is complete */
5774 }
5775
5776 /* If the options ended with ':' we are heading into a nested group
5777 with possible change of options. Such groups are non-capturing and are
5778 not assertions of any kind. All we need to do is skip over the ':';
5779 the newoptions value is handled below. */
5780
5781 bravalue = OP_BRA;
5782 ptr++;
5783 } /* End of switch for character following (? */
5784 } /* End of (? handling */
5785
5786 /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
5787 is set, all unadorned brackets become non-capturing and behave like (?:...)
5788 brackets. */
5789
5790 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5791 {
5792 bravalue = OP_BRA;
5793 }
5794
5795 /* Else we have a capturing group. */
5796
5797 else
5798 {
5799 NUMBERED_GROUP:
5800 cd->bracount += 1;
5801 PUT2(code, 1+LINK_SIZE, cd->bracount);
5802 skipbytes = 2;
5803 }
5804
5805 /* Process nested bracketed regex. Assertions may not be repeated, but
5806 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5807 non-register variable (tempcode) in order to be able to pass its address
5808 because some compilers complain otherwise. */
5809
5810 previous = (bravalue >= OP_ONCE)? code : NULL;
5811 *code = bravalue;
5812 tempcode = code;
5813 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5814 length_prevgroup = 0; /* Initialize for pre-compile phase */
5815
5816 if (!compile_regex(
5817 newoptions, /* The complete new option state */
5818 &tempcode, /* Where to put code (updated) */
5819 &ptr, /* Input pointer (updated) */
5820 errorcodeptr, /* Where to put an error message */
5821 (bravalue == OP_ASSERTBACK ||
5822 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5823 reset_bracount, /* True if (?| group */
5824 skipbytes, /* Skip over bracket number */
5825 &subfirstbyte, /* For possible first char */
5826 &subreqbyte, /* For possible last char */
5827 bcptr, /* Current branch chain */
5828 cd, /* Tables block */
5829 (lengthptr == NULL)? NULL : /* Actual compile phase */
5830 &length_prevgroup /* Pre-compile phase */
5831 ))
5832 goto FAILED;
5833
5834 /* At the end of compiling, code is still pointing to the start of the
5835 group, while tempcode has been updated to point past the end of the group
5836 and any option resetting that may follow it. The pattern pointer (ptr)
5837 is on the bracket. */
5838
5839 /* If this is a conditional bracket, check that there are no more than
5840 two branches in the group, or just one if it's a DEFINE group. We do this
5841 in the real compile phase, not in the pre-pass, where the whole group may
5842 not be available. */
5843
5844 if (bravalue == OP_COND && lengthptr == NULL)
5845 {
5846 uschar *tc = code;
5847 int condcount = 0;
5848
5849 do {
5850 condcount++;
5851 tc += GET(tc,1);
5852 }
5853 while (*tc != OP_KET);
5854
5855 /* A DEFINE group is never obeyed inline (the "condition" is always
5856 false). It must have only one branch. */
5857
5858 if (code[LINK_SIZE+1] == OP_DEF)
5859 {
5860 if (condcount > 1)
5861 {
5862 *errorcodeptr = ERR54;
5863 goto FAILED;
5864 }
5865 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5866 }
5867
5868 /* A "normal" conditional group. If there is just one branch, we must not
5869 make use of its firstbyte or reqbyte, because this is equivalent to an
5870 empty second branch. */
5871
5872 else
5873 {
5874 if (condcount > 2)
5875 {
5876 *errorcodeptr = ERR27;
5877 goto FAILED;
5878 }
5879 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5880 }
5881 }
5882
5883 /* Error if hit end of pattern */
5884
5885 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5886 {
5887 *errorcodeptr = ERR14;
5888 goto FAILED;
5889 }
5890
5891 /* In the pre-compile phase, update the length by the length of the group,
5892 less the brackets at either end. Then reduce the compiled code to just a
5893 set of non-capturing brackets so that it doesn't use much memory if it is
5894 duplicated by a quantifier.*/
5895
5896 if (lengthptr != NULL)
5897 {
5898 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5899 {
5900 *errorcodeptr = ERR20;
5901 goto FAILED;
5902 }
5903 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5904 code++; /* This already contains bravalue */
5905 PUTINC(code, 0, 1 + LINK_SIZE);
5906 *code++ = OP_KET;
5907 PUTINC(code, 0, 1 + LINK_SIZE);
5908 break; /* No need to waste time with special character handling */
5909 }
5910
5911 /* Otherwise update the main code pointer to the end of the group. */
5912
5913 code = tempcode;
5914
5915 /* For a DEFINE group, required and first character settings are not
5916 relevant. */
5917
5918 if (bravalue == OP_DEF) break;
5919
5920 /* Handle updating of the required and first characters for other types of
5921 group. Update for normal brackets of all kinds, and conditions with two
5922 branches (see code above). If the bracket is followed by a quantifier with
5923 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5924 zerofirstbyte outside the main loop so that they can be accessed for the
5925 back off. */
5926
5927 zeroreqbyte = reqbyte;
5928 zerofirstbyte = firstbyte;
5929 groupsetfirstbyte = FALSE;
5930
5931 if (bravalue >= OP_ONCE)
5932 {
5933 /* If we have not yet set a firstbyte in this branch, take it from the
5934 subpattern, remembering that it was set here so that a repeat of more
5935 than one can replicate it as reqbyte if necessary. If the subpattern has
5936 no firstbyte, set "none" for the whole branch. In both cases, a zero
5937 repeat forces firstbyte to "none". */
5938
5939 if (firstbyte == REQ_UNSET)
5940 {
5941 if (subfirstbyte >= 0)
5942 {
5943 firstbyte = subfirstbyte;
5944 groupsetfirstbyte = TRUE;
5945 }
5946 else firstbyte = REQ_NONE;
5947 zerofirstbyte = REQ_NONE;
5948 }
5949
5950 /* If firstbyte was previously set, convert the subpattern's firstbyte
5951 into reqbyte if there wasn't one, using the vary flag that was in
5952 existence beforehand. */
5953
5954 else if (subfirstbyte >= 0 && subreqbyte < 0)
5955 subreqbyte = subfirstbyte | tempreqvary;
5956
5957 /* If the subpattern set a required byte (or set a first byte that isn't
5958 really the first byte - see above), set it. */
5959
5960 if (subreqbyte >= 0) reqbyte = subreqbyte;
5961 }
5962
5963 /* For a forward assertion, we take the reqbyte, if set. This can be
5964 helpful if the pattern that follows the assertion doesn't set a different
5965 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5966 for an assertion, however because it leads to incorrect effect for patterns
5967 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5968 of a firstbyte. This is overcome by a scan at the end if there's no
5969 firstbyte, looking for an asserted first char. */
5970
5971 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5972 break; /* End of processing '(' */
5973
5974
5975 /* ===================================================================*/
5976 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5977 are arranged to be the negation of the corresponding OP_values in the
5978 default case when PCRE_UCP is not set. For the back references, the values
5979 are ESC_REF plus the reference number. Only back references and those types
5980 that consume a character may be repeated. We can test for values between
5981 ESC_b and ESC_Z for the latter; this may have to change if any new ones are
5982 ever created. */
5983
5984 case CHAR_BACKSLASH:
5985 tempptr = ptr;
5986 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5987 if (*errorcodeptr != 0) goto FAILED;
5988
5989 if (c < 0)
5990 {
5991 if (-c == ESC_Q) /* Handle start of quoted string */
5992 {
5993 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5994 ptr += 2; /* avoid empty string */
5995 else inescq = TRUE;
5996 continue;
5997 }
5998
5999 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
6000
6001 /* For metasequences that actually match a character, we disable the
6002 setting of a first character if it hasn't already been set. */
6003
6004 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
6005 firstbyte = REQ_NONE;
6006
6007 /* Set values to reset to if this is followed by a zero repeat. */
6008
6009 zerofirstbyte = firstbyte;
6010 zeroreqbyte = reqbyte;
6011
6012 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
6013 is a subroutine call by number (Oniguruma syntax). In fact, the value
6014 -ESC_g is returned only for these cases. So we don't need to check for <
6015 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
6016 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
6017 that is a synonym for a named back reference). */
6018
6019 if (-c == ESC_g)
6020 {
6021 const uschar *p;
6022 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
6023 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6024 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6025
6026 /* These two statements stop the compiler from warning about possibly
6027 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
6028 fact, because we actually check for a number below, the paths that
6029 would actually be in error are never taken. */
6030
6031 skipbytes = 0;
6032 reset_bracount = FALSE;
6033
6034 /* Test for a name */
6035
6036 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
6037 {
6038 BOOL isnumber = TRUE;
6039 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
6040 {
6041 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
6042 if ((cd->ctypes[*p] & ctype_word) == 0) break;
6043 }
6044 if (*p != terminator)
6045 {
6046 *errorcodeptr = ERR57;
6047 break;
6048 }
6049 if (isnumber)
6050 {
6051 ptr++;
6052 goto HANDLE_NUMERICAL_RECURSION;
6053 }
6054 is_recurse = TRUE;
6055 goto NAMED_REF_OR_RECURSE;
6056 }
6057
6058 /* Test a signed number in angle brackets or quotes. */
6059
6060 p = ptr + 2;
6061 while ((digitab[*p] & ctype_digit) != 0) p++;
6062 if (*p != terminator)
6063 {
6064 *errorcodeptr = ERR57;
6065 break;
6066 }
6067 ptr++;
6068 goto HANDLE_NUMERICAL_RECURSION;
6069 }
6070
6071 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6072 We also support \k{name} (.NET syntax) */
6073
6074 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
6075 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
6076 {
6077 is_recurse = FALSE;
6078 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6079 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6080 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6081 goto NAMED_REF_OR_RECURSE;
6082 }
6083
6084 /* Back references are handled specially; must disable firstbyte if
6085 not set to cope with cases like (?=(\w+))\1: which would otherwise set
6086 ':' later. */
6087
6088 if (-c >= ESC_REF)
6089 {
6090 open_capitem *oc;
6091 recno = -c - ESC_REF;
6092
6093 HANDLE_REFERENCE: /* Come here from named backref handling */
6094 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
6095 previous = code;
6096 *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
6097 PUT2INC(code, 0, recno);
6098 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6099 if (recno > cd->top_backref) cd->top_backref = recno;
6100
6101 /* Check to see if this back reference is recursive, that is, it
6102 is inside the group that it references. A flag is set so that the
6103 group can be made atomic. */
6104
6105 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6106 {
6107 if (oc->number == recno)
6108 {
6109 oc->flag = TRUE;
6110 break;
6111 }
6112 }
6113 }
6114
6115 /* So are Unicode property matches, if supported. */
6116
6117 #ifdef SUPPORT_UCP
6118 else if (-c == ESC_P || -c == ESC_p)
6119 {
6120 BOOL negated;
6121 int pdata;
6122 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
6123 if (ptype < 0) goto FAILED;
6124 previous = code;
6125 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
6126 *code++ = ptype;
6127 *code++ = pdata;
6128 }
6129 #else
6130
6131 /* If Unicode properties are not supported, \X, \P, and \p are not
6132 allowed. */
6133
6134 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
6135 {
6136 *errorcodeptr = ERR45;
6137 goto FAILED;
6138 }
6139 #endif
6140
6141 /* For the rest (including \X when Unicode properties are supported), we
6142 can obtain the OP value by negating the escape value in the default
6143 situation when PCRE_UCP is not set. When it *is* set, we substitute
6144 Unicode property tests. */
6145
6146 else
6147 {
6148 #ifdef SUPPORT_UCP
6149 if (-c >= ESC_DU && -c <= ESC_wu)
6150 {
6151 nestptr = ptr + 1; /* Where to resume */
6152 ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
6153 }
6154 else
6155 #endif
6156 {
6157 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6158 *code++ = -c;
6159 }
6160 }
6161 continue;
6162 }
6163
6164 /* We have a data character whose value is in c. In UTF-8 mode it may have
6165 a value > 127. We set its representation in the length/buffer, and then
6166 handle it as a data character. */
6167
6168 #ifdef SUPPORT_UTF8
6169 if (utf8 && c > 127)
6170 mclength = _pcre_ord2utf8(c, mcbuffer);
6171 else
6172 #endif
6173
6174 {
6175 mcbuffer[0] = c;
6176 mclength = 1;
6177 }
6178 goto ONE_CHAR;
6179
6180
6181 /* ===================================================================*/
6182 /* Handle a literal character. It is guaranteed not to be whitespace or #
6183 when the extended flag is set. If we are in UTF-8 mode, it may be a
6184 multi-byte literal character. */
6185
6186 default:
6187 NORMAL_CHAR:
6188 mclength = 1;
6189 mcbuffer[0] = c;
6190
6191 #ifdef SUPPORT_UTF8
6192 if (utf8 && c >= 0xc0)
6193 {
6194 while ((ptr[1] & 0xc0) == 0x80)
6195 mcbuffer[mclength++] = *(++ptr);
6196 }
6197 #endif
6198
6199 /* At this point we have the character's bytes in mcbuffer, and the length
6200 in mclength. When not in UTF-8 mode, the length is always 1. */
6201
6202 ONE_CHAR:
6203 previous = code;
6204 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
6205 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
6206
6207 /* Remember if \r or \n were seen */
6208
6209 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
6210 cd->external_flags |= PCRE_HASCRORLF;
6211
6212 /* Set the first and required bytes appropriately. If no previous first
6213 byte, set it from this character, but revert to none on a zero repeat.
6214 Otherwise, leave the firstbyte value alone, and don't change it on a zero
6215 repeat. */
6216
6217 if (firstbyte == REQ_UNSET)
6218 {
6219 zerofirstbyte = REQ_NONE;
6220 zeroreqbyte = reqbyte;
6221
6222 /* If the character is more than one byte long, we can set firstbyte
6223 only if it is not to be matched caselessly. */
6224
6225 if (mclength == 1 || req_caseopt == 0)
6226 {
6227 firstbyte = mcbuffer[0] | req_caseopt;
6228 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
6229 }
6230 else firstbyte = reqbyte = REQ_NONE;
6231 }
6232
6233 /* firstbyte was previously set; we can set reqbyte only if the length is
6234 1 or the matching is caseful. */
6235
6236 else
6237 {
6238 zerofirstbyte = firstbyte;
6239 zeroreqbyte = reqbyte;
6240 if (mclength == 1 || req_caseopt == 0)
6241 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
6242 }
6243
6244 break; /* End of literal character handling */
6245 }
6246 } /* end of big loop */
6247
6248
6249 /* Control never reaches here by falling through, only by a goto for all the
6250 error states. Pass back the position in the pattern so that it can be displayed
6251 to the user for diagnosing the error. */
6252
6253 FAILED:
6254 *ptrptr = ptr;
6255 return FALSE;
6256 }
6257
6258
6259
6260
6261 /*************************************************
6262 * Compile sequence of alternatives *
6263 *************************************************/
6264
6265 /* On entry, ptr is pointing past the bracket character, but on return it
6266 points to the closing bracket, or vertical bar, or end of string. The code
6267 variable is pointing at the byte into which the BRA operator has been stored.
6268 This function is used during the pre-compile phase when we are trying to find
6269 out the amount of memory needed, as well as during the real compile phase. The
6270 value of lengthptr distinguishes the two phases.
6271
6272 Arguments:
6273 options option bits, including any changes for this subpattern
6274 codeptr -> the address of the current code pointer
6275 ptrptr -> the address of the current pattern pointer
6276 errorcodeptr -> pointer to error code variable
6277 lookbehind TRUE if this is a lookbehind assertion
6278 reset_bracount TRUE to reset the count for each branch
6279 skipbytes skip this many bytes at start (for brackets and OP_COND)
6280 firstbyteptr place to put the first required character, or a negative number
6281 reqbyteptr place to put the last required character, or a negative number
6282 bcptr pointer to the chain of currently open branches
6283 cd points to the data block with tables pointers etc.
6284 lengthptr NULL during the real compile phase
6285 points to length accumulator during pre-compile phase
6286
6287 Returns: TRUE on success
6288 */
6289
6290 static BOOL
6291 compile_regex(int options, uschar **codeptr, const uschar **ptrptr,
6292 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
6293 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
6294 int *lengthptr)
6295 {
6296 const uschar *ptr = *ptrptr;
6297 uschar *code = *codeptr;
6298 uschar *last_branch = code;
6299 uschar *start_bracket = code;
6300 uschar *reverse_count = NULL;
6301 open_capitem capitem;
6302 int capnumber = 0;
6303 int firstbyte, reqbyte;
6304 int branchfirstbyte, branchreqbyte;
6305 int length;
6306 int orig_bracount;
6307 int max_bracount;
6308 branch_chain bc;
6309
6310 bc.outer = bcptr;
6311 bc.current_branch = code;
6312
6313 firstbyte = reqbyte = REQ_UNSET;
6314
6315 /* Accumulate the length for use in the pre-compile phase. Start with the
6316 length of the BRA and KET and any extra bytes that are required at the
6317 beginning. We accumulate in a local variable to save frequent testing of
6318 lenthptr for NULL. We cannot do this by looking at the value of code at the
6319 start and end of each alternative, because compiled items are discarded during
6320 the pre-compile phase so that the work space is not exceeded. */
6321
6322 length = 2 + 2*LINK_SIZE + skipbytes;
6323
6324 /* WARNING: If the above line is changed for any reason, you must also change
6325 the code that abstracts option settings at the start of the pattern and makes
6326 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
6327 pre-compile phase to find out whether anything has yet been compiled or not. */
6328
6329 /* If this is a capturing subpattern, add to the chain of open capturing items
6330 so that we can detect them if (*ACCEPT) is encountered. This is also used to
6331 detect groups that contain recursive back references to themselves. Note that
6332 only OP_CBRA need be tested here; changing this opcode to one of its variants,
6333 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
6334
6335 if (*code == OP_CBRA)
6336 {
6337 capnumber = GET2(code, 1 + LINK_SIZE);
6338 capitem.number = capnumber;
6339 capitem.next = cd->open_caps;
6340 capitem.flag = FALSE;
6341 cd->open_caps = &capitem;
6342 }
6343
6344 /* Offset is set zero to mark that this bracket is still open */
6345
6346 PUT(code, 1, 0);
6347 code += 1 + LINK_SIZE + skipbytes;
6348
6349 /* Loop for each alternative branch */
6350
6351 orig_bracount = max_bracount = cd->bracount;
6352 for (;;)
6353 {
6354 /* For a (?| group, reset the capturing bracket count so that each branch
6355 uses the same numbers. */
6356
6357 if (reset_bracount) cd->bracount = orig_bracount;
6358
6359 /* Set up dummy OP_REVERSE if lookbehind assertion */
6360
6361 if (lookbehind)
6362 {
6363 *code++ = OP_REVERSE;
6364 reverse_count = code;
6365 PUTINC(code, 0, 0);
6366 length += 1 + LINK_SIZE;
6367 }
6368
6369 /* Now compile the branch; in the pre-compile phase its length gets added
6370 into the length. */
6371
6372 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
6373 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
6374 {
6375 *ptrptr = ptr;
6376 return FALSE;
6377 }
6378
6379 /* Keep the highest bracket count in case (?| was used and some branch
6380 has fewer than the rest. */
6381
6382 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
6383
6384 /* In the real compile phase, there is some post-processing to be done. */
6385
6386 if (lengthptr == NULL)
6387 {
6388 /* If this is the first branch, the firstbyte and reqbyte values for the
6389 branch become the values for the regex. */
6390
6391 if (*last_branch != OP_ALT)
6392 {
6393 firstbyte = branchfirstbyte;
6394 reqbyte = branchreqbyte;
6395 }
6396
6397 /* If this is not the first branch, the first char and reqbyte have to
6398 match the values from all the previous branches, except that if the
6399 previous value for reqbyte didn't have REQ_VARY set, it can still match,
6400 and we set REQ_VARY for the regex. */
6401
6402 else
6403 {
6404 /* If we previously had a firstbyte, but it doesn't match the new branch,
6405 we have to abandon the firstbyte for the regex, but if there was
6406 previously no reqbyte, it takes on the value of the old firstbyte. */
6407
6408 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
6409 {
6410 if (reqbyte < 0) reqbyte = firstbyte;
6411 firstbyte = REQ_NONE;
6412 }
6413
6414 /* If we (now or from before) have no firstbyte, a firstbyte from the
6415 branch becomes a reqbyte if there isn't a branch reqbyte. */
6416
6417 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
6418 branchreqbyte = branchfirstbyte;
6419
6420 /* Now ensure that the reqbytes match */
6421
6422 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
6423 reqbyte = REQ_NONE;
6424 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
6425 }
6426
6427 /* If lookbehind, check that this branch matches a fixed-length string, and
6428 put the length into the OP_REVERSE item. Temporarily mark the end of the
6429 branch with OP_END. If the branch contains OP_RECURSE, the result is -3
6430 because there may be forward references that we can't check here. Set a
6431 flag to cause another lookbehind check at the end. Why not do it all at the
6432 end? Because common, erroneous checks are picked up here and the offset of
6433 the problem can be shown. */
6434
6435 if (lookbehind)
6436 {
6437 int fixed_length;
6438 *code = OP_END;
6439 fixed_length = find_fixedlength(last_branch, (options & PCRE_UTF8) != 0,
6440 FALSE, cd);
6441 DPRINTF(("fixed length = %d\n", fixed_length));
6442 if (fixed_length == -3)
6443 {
6444 cd->check_lookbehind = TRUE;
6445 }
6446 else if (fixed_length < 0)
6447 {
6448 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
6449 *ptrptr = ptr;
6450 return FALSE;
6451 }
6452 else { PUT(reverse_count, 0, fixed_length); }
6453 }
6454 }
6455
6456 /* Reached end of expression, either ')' or end of pattern. In the real
6457 compile phase, go back through the alternative branches and reverse the chain
6458 of offsets, with the field in the BRA item now becoming an offset to the
6459 first alternative. If there are no alternatives, it points to the end of the
6460 group. The length in the terminating ket is always the length of the whole
6461 bracketed item. Return leaving the pointer at the terminating char. */
6462
6463 if (*ptr != CHAR_VERTICAL_LINE)
6464 {
6465 if (lengthptr == NULL)
6466 {
6467 int branch_length = (int)(code - last_branch);
6468 do
6469 {
6470 int prev_length = GET(last_branch, 1);
6471 PUT(last_branch, 1, branch_length);
6472 branch_length = prev_length;
6473 last_branch -= branch_length;
6474 }
6475 while (branch_length > 0);
6476 }
6477
6478 /* Fill in the ket */
6479
6480 *code = OP_KET;
6481 PUT(code, 1, (int)(code - start_bracket));
6482 code += 1 + LINK_SIZE;
6483
6484 /* If it was a capturing subpattern, check to see if it contained any
6485 recursive back references. If so, we must wrap it in atomic brackets.
6486 In any event, remove the block from the chain. */
6487
6488 if (capnumber > 0)
6489 {
6490 if (cd->open_caps->flag)
6491 {
6492 memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
6493 code - start_bracket);
6494 *start_bracket = OP_ONCE;
6495 code += 1 + LINK_SIZE;
6496 PUT(start_bracket, 1, (int)(code - start_bracket));
6497 *code = OP_KET;
6498 PUT(code, 1, (int)(code - start_bracket));
6499 code += 1 + LINK_SIZE;
6500 length += 2 + 2*LINK_SIZE;
6501 }
6502 cd->open_caps = cd->open_caps->next;
6503 }
6504
6505 /* Retain the highest bracket number, in case resetting was used. */
6506
6507 cd->bracount = max_bracount;
6508
6509 /* Set values to pass back */
6510
6511 *codeptr = code;
6512 *ptrptr = ptr;
6513 *firstbyteptr = firstbyte;
6514 *reqbyteptr = reqbyte;
6515 if (lengthptr != NULL)
6516 {
6517 if (OFLOW_MAX - *lengthptr < length)
6518 {
6519 *errorcodeptr = ERR20;
6520 return FALSE;
6521 }
6522 *lengthptr += length;
6523 }
6524 return TRUE;
6525 }
6526
6527 /* Another branch follows. In the pre-compile phase, we can move the code
6528 pointer back to where it was for the start of the first branch. (That is,
6529 pretend that each branch is the only one.)
6530
6531 In the real compile phase, insert an ALT node. Its length field points back
6532 to the previous branch while the bracket remains open. At the end the chain
6533 is reversed. It's done like this so that the start of the bracket has a
6534 zero offset until it is closed, making it possible to detect recursion. */
6535
6536 if (lengthptr != NULL)
6537 {
6538 code = *codeptr + 1 + LINK_SIZE + skipbytes;
6539 length += 1 + LINK_SIZE;
6540 }
6541 else
6542 {
6543 *code = OP_ALT;
6544 PUT(code, 1, (int)(code - last_branch));
6545 bc.current_branch = last_branch = code;
6546 code += 1 + LINK_SIZE;
6547 }
6548
6549 ptr++;
6550 }
6551 /* Control never reaches here */
6552 }
6553
6554
6555
6556
6557 /*************************************************
6558 * Check for anchored expression *
6559 *************************************************/
6560
6561 /* Try to find out if this is an anchored regular expression. Consider each
6562 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6563 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6564 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
6565 be found, because ^ generates OP_CIRCM in that mode.
6566
6567 We can also consider a regex to be anchored if OP_SOM starts all its branches.
6568 This is the code for \G, which means "match at start of match position, taking
6569 into account the match offset".
6570
6571 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6572 because that will try the rest of the pattern at all possible matching points,
6573 so there is no point trying again.... er ....
6574
6575 .... except when the .* appears inside capturing parentheses, and there is a
6576 subsequent back reference to those parentheses. We haven't enough information
6577 to catch that case precisely.
6578
6579 At first, the best we could do was to detect when .* was in capturing brackets
6580 and the highest back reference was greater than or equal to that level.
6581 However, by keeping a bitmap of the first 31 back references, we can catch some
6582 of the more common cases more precisely.
6583
6584 Arguments:
6585 code points to start of expression (the bracket)
6586 bracket_map a bitmap of which brackets we are inside while testing; this
6587 handles up to substring 31; after that we just have to take
6588 the less precise approach
6589 backref_map the back reference bitmap
6590
6591 Returns: TRUE or FALSE
6592 */
6593
6594 static BOOL
6595 is_anchored(register const uschar *code, unsigned int bracket_map,
6596 unsigned int backref_map)
6597 {
6598 do {
6599 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6600 FALSE);
6601 register int op = *scode;
6602
6603 /* Non-capturing brackets */
6604
6605 if (op == OP_BRA || op == OP_BRAPOS ||
6606 op == OP_SBRA || op == OP_SBRAPOS)
6607 {
6608 if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
6609 }
6610
6611 /* Capturing brackets */
6612
6613 else if (op == OP_CBRA || op == OP_CBRAPOS ||
6614 op == OP_SCBRA || op == OP_SCBRAPOS)
6615 {
6616 int n = GET2(scode, 1+LINK_SIZE);
6617 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6618 if (!is_anchored(scode, new_map, backref_map)) return FALSE;
6619 }
6620
6621 /* Other brackets */
6622
6623 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6624 {
6625 if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
6626 }
6627
6628 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6629 it isn't in brackets that are or may be referenced. */
6630
6631 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6632 op == OP_TYPEPOSSTAR))
6633 {
6634 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6635 return FALSE;
6636 }
6637
6638 /* Check for explicit anchoring */
6639
6640 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
6641 code += GET(code, 1);
6642 }
6643 while (*code == OP_ALT); /* Loop for each alternative */
6644 return TRUE;
6645 }
6646
6647
6648
6649