/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 556 - (show annotations) (download)
Tue Oct 26 11:06:44 2010 UTC (3 years, 11 months ago) by ph10
Original Path: code/trunk/pcre_compile.c
File MIME type: text/plain
File size: 234378 byte(s)
Fix #-comment bugs in UTF-8 mode with PCRE_NEWLINE_ANY.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57 also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. */
59
60 #ifdef PCRE_DEBUG
61 #include "pcre_printint.src"
62 #endif
63
64
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the bit number. Both arguments and the complete
expansion are parenthesized so that expression arguments such as "c + 1" select
the intended bit. Note that "b" is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68
69 /* Maximum length value to check against when making sure that the integer that
70 holds the compiled pattern length does not overflow. We make it a bit less than
71 INT_MAX to allow for adding in group terminating bytes, so that we don't have
72 to check them every time. */
73
74 #define OFLOW_MAX (INT_MAX - 20)
75
76
77 /*************************************************
78 * Code parameters and static tables *
79 *************************************************/
80
81 /* This value specifies the size of stack workspace that is used during the
82 first pre-compile phase that determines how much memory is required. The regex
83 is partly compiled into this space, but the compiled parts are discarded as
84 soon as they can be, so that hopefully there will never be an overrun. The code
85 does, however, check for an overrun. The largest amount I've seen used is 218,
86 so this number is very generous.
87
88 The same workspace is used during the second, actual compile phase for
89 remembering forward references to groups so that they can be filled in at the
90 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91 is 4 there is plenty of room. */
92
93 #define COMPILE_WORK_SIZE (4096)
94
95 /* The overrun tests check for a slightly smaller size so that they detect the
96 overrun before it actually does run off the end of the data block. */
97
98 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99
100
101 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102 are simple data values; negative values are for special things like \d and so
103 on. Zero means further processing is needed (for things like \x), or the escape
104 is invalid. */
105
106 #ifndef EBCDIC
107
108 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 in UTF-8 mode. */
110
111 static const short int escapes[] = {
112 0, 0,
113 0, 0,
114 0, 0,
115 0, 0,
116 0, 0,
117 CHAR_COLON, CHAR_SEMICOLON,
118 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 CHAR_COMMERCIAL_AT, -ESC_A,
121 -ESC_B, -ESC_C,
122 -ESC_D, -ESC_E,
123 0, -ESC_G,
124 -ESC_H, 0,
125 0, -ESC_K,
126 0, 0,
127 -ESC_N, 0,
128 -ESC_P, -ESC_Q,
129 -ESC_R, -ESC_S,
130 0, 0,
131 -ESC_V, -ESC_W,
132 -ESC_X, 0,
133 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 CHAR_GRAVE_ACCENT, 7,
137 -ESC_b, 0,
138 -ESC_d, ESC_e,
139 ESC_f, 0,
140 -ESC_h, 0,
141 0, -ESC_k,
142 0, 0,
143 ESC_n, 0,
144 -ESC_p, 0,
145 ESC_r, -ESC_s,
146 ESC_tee, 0,
147 -ESC_v, -ESC_w,
148 0, 0,
149 -ESC_z
150 };
151
152 #else
153
154 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155
156 static const short int escapes[] = {
157 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180 };
181 #endif
182
183
184 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185 searched linearly. Put all the names into a single string, in order to reduce
186 the number of relocations when a shared library is dynamically linked. The
187 string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 platforms. */
189
190 typedef struct verbitem {
191 int len; /* Length of verb name */
192 int op; /* Op when no arg, or -1 if arg mandatory */
193 int op_arg; /* Op when arg present, or -1 if not allowed */
194 } verbitem;
195
196 static const char verbnames[] =
197 "\0" /* Empty name is a shorthand for MARK */
198 STRING_MARK0
199 STRING_ACCEPT0
200 STRING_COMMIT0
201 STRING_F0
202 STRING_FAIL0
203 STRING_PRUNE0
204 STRING_SKIP0
205 STRING_THEN;
206
207 static const verbitem verbs[] = {
208 { 0, -1, OP_MARK },
209 { 4, -1, OP_MARK },
210 { 6, OP_ACCEPT, -1 },
211 { 6, OP_COMMIT, -1 },
212 { 1, OP_FAIL, -1 },
213 { 4, OP_FAIL, -1 },
214 { 5, OP_PRUNE, OP_PRUNE_ARG },
215 { 4, OP_SKIP, OP_SKIP_ARG },
216 { 4, OP_THEN, OP_THEN_ARG }
217 };
218
219 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220
221
222 /* Tables of names of POSIX character classes and their lengths. The names are
223 now all in a single string, to reduce the number of relocations when a shared
224 library is dynamically loaded. The list of lengths is terminated by a zero
225 length entry. The first three must be alpha, lower, upper, as this is assumed
226 for handling case independence. */
227
228 static const char posix_names[] =
229 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232 STRING_word0 STRING_xdigit;
233
234 static const uschar posix_name_lengths[] = {
235 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236
237 /* Table of class bit maps for each POSIX class. Each class is formed from a
238 base map, with an optional addition or removal of another map. Then, for some
239 classes, there is some additional tweaking: for [:blank:] the vertical space
240 characters are removed, and for [:alpha:] and [:alnum:] the underscore
241 character is removed. The triples in the table consist of the base map offset,
242 second map offset or -1 if no second map, and a non-negative value for map
243 addition or a negative value for map subtraction (if there are two maps). The
244 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245 remove vertical space characters, 2 => remove underscore. */
246
247 static const int posix_class_maps[] = {
248 cbit_word, cbit_digit, -2, /* alpha */
249 cbit_lower, -1, 0, /* lower */
250 cbit_upper, -1, 0, /* upper */
251 cbit_word, -1, 2, /* alnum - word without underscore */
252 cbit_print, cbit_cntrl, 0, /* ascii */
253 cbit_space, -1, 1, /* blank - a GNU extension */
254 cbit_cntrl, -1, 0, /* cntrl */
255 cbit_digit, -1, 0, /* digit */
256 cbit_graph, -1, 0, /* graph */
257 cbit_print, -1, 0, /* print */
258 cbit_punct, -1, 0, /* punct */
259 cbit_space, -1, 0, /* space */
260 cbit_word, -1, 0, /* word - a Perl extension */
261 cbit_xdigit,-1, 0 /* xdigit */
262 };
263
264 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265 substitutes must be in the order of the names, defined above, and there are
266 both positive and negative cases. NULL means no substitute. */
267
268 #ifdef SUPPORT_UCP
269 static const uschar *substitutes[] = {
270 (uschar *)"\\P{Nd}", /* \D */
271 (uschar *)"\\p{Nd}", /* \d */
272 (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
273 (uschar *)"\\p{Xsp}", /* \s */
274 (uschar *)"\\P{Xwd}", /* \W */
275 (uschar *)"\\p{Xwd}" /* \w */
276 };
277
278 static const uschar *posix_substitutes[] = {
279 (uschar *)"\\p{L}", /* alpha */
280 (uschar *)"\\p{Ll}", /* lower */
281 (uschar *)"\\p{Lu}", /* upper */
282 (uschar *)"\\p{Xan}", /* alnum */
283 NULL, /* ascii */
284 (uschar *)"\\h", /* blank */
285 NULL, /* cntrl */
286 (uschar *)"\\p{Nd}", /* digit */
287 NULL, /* graph */
288 NULL, /* print */
289 NULL, /* punct */
290 (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
291 (uschar *)"\\p{Xwd}", /* word */
292 NULL, /* xdigit */
293 /* Negated cases */
294 (uschar *)"\\P{L}", /* ^alpha */
295 (uschar *)"\\P{Ll}", /* ^lower */
296 (uschar *)"\\P{Lu}", /* ^upper */
297 (uschar *)"\\P{Xan}", /* ^alnum */
298 NULL, /* ^ascii */
299 (uschar *)"\\H", /* ^blank */
300 NULL, /* ^cntrl */
301 (uschar *)"\\P{Nd}", /* ^digit */
302 NULL, /* ^graph */
303 NULL, /* ^print */
304 NULL, /* ^punct */
305 (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
306 (uschar *)"\\P{Xwd}", /* ^word */
307 NULL /* ^xdigit */
308 };
309 #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
310 #endif
311
312 #define STRING(a) # a
313 #define XSTRING(s) STRING(s)
314
315 /* The texts of compile-time error messages. These are "char *" because they
316 are passed to the outside world. Do not ever re-use any error number, because
317 they are documented. Always add a new error instead. Messages marked DEAD below
318 are no longer used. This used to be a table of strings, but in order to reduce
319 the number of relocations needed when a shared library is loaded dynamically,
320 it is now one long string. We cannot use a table of offsets, because the
321 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322 simply count through to the one we want - this isn't a performance issue
323 because these strings are used only when there is a compilation error.
324
325 Each substring ends with \0 to insert a null character. This includes the final
326 substring, so that the whole string ends with \0\0, which can be detected when
327 counting through. */
328
329 static const char error_texts[] =
330 "no error\0"
331 "\\ at end of pattern\0"
332 "\\c at end of pattern\0"
333 "unrecognized character follows \\\0"
334 "numbers out of order in {} quantifier\0"
335 /* 5 */
336 "number too big in {} quantifier\0"
337 "missing terminating ] for character class\0"
338 "invalid escape sequence in character class\0"
339 "range out of order in character class\0"
340 "nothing to repeat\0"
341 /* 10 */
342 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
343 "internal error: unexpected repeat\0"
344 "unrecognized character after (? or (?-\0"
345 "POSIX named classes are supported only within a class\0"
346 "missing )\0"
347 /* 15 */
348 "reference to non-existent subpattern\0"
349 "erroffset passed as NULL\0"
350 "unknown option bit(s) set\0"
351 "missing ) after comment\0"
352 "parentheses nested too deeply\0" /** DEAD **/
353 /* 20 */
354 "regular expression is too large\0"
355 "failed to get memory\0"
356 "unmatched parentheses\0"
357 "internal error: code overflow\0"
358 "unrecognized character after (?<\0"
359 /* 25 */
360 "lookbehind assertion is not fixed length\0"
361 "malformed number or name after (?(\0"
362 "conditional group contains more than two branches\0"
363 "assertion expected after (?(\0"
364 "(?R or (?[+-]digits must be followed by )\0"
365 /* 30 */
366 "unknown POSIX class name\0"
367 "POSIX collating elements are not supported\0"
368 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369 "spare error\0" /** DEAD **/
370 "character value in \\x{...} sequence is too large\0"
371 /* 35 */
372 "invalid condition (?(0)\0"
373 "\\C not allowed in lookbehind assertion\0"
374 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375 "number after (?C is > 255\0"
376 "closing ) for (?C expected\0"
377 /* 40 */
378 "recursive call could loop indefinitely\0"
379 "unrecognized character after (?P\0"
380 "syntax error in subpattern name (missing terminator)\0"
381 "two named subpatterns have the same name\0"
382 "invalid UTF-8 string\0"
383 /* 45 */
384 "support for \\P, \\p, and \\X has not been compiled\0"
385 "malformed \\P or \\p sequence\0"
386 "unknown property name after \\P or \\p\0"
387 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389 /* 50 */
390 "repeated subpattern is too long\0" /** DEAD **/
391 "octal value is greater than \\377 (not in UTF-8 mode)\0"
392 "internal error: overran compiling workspace\0"
393 "internal error: previously-checked referenced subpattern not found\0"
394 "DEFINE group contains more than one branch\0"
395 /* 55 */
396 "repeating a DEFINE group is not allowed\0"
397 "inconsistent NEWLINE options\0"
398 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399 "a numbered reference must not be zero\0"
400 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401 /* 60 */
402 "(*VERB) not recognized\0"
403 "number is too big\0"
404 "subpattern name expected\0"
405 "digit expected after (?+\0"
406 "] is an invalid data character in JavaScript compatibility mode\0"
407 /* 65 */
408 "different names for subpatterns of the same number are not allowed\0"
409 "(*MARK) must have an argument\0"
410 "this version of PCRE is not compiled with PCRE_UCP support\0"
411 ;
412
413 /* Table to identify digits and hex digits. This is used when compiling
414 patterns. Note that the tables in chartables are dependent on the locale, and
415 may mark arbitrary characters as digits - but the PCRE compiling code expects
416 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
417 a private table here. It costs 256 bytes, but it is a lot faster than doing
418 character value tests (at least in some simple cases I timed), and in some
419 applications one wants PCRE to compile efficiently as well as match
420 efficiently.
421
422 For convenience, we use the same bit definitions as in chartables:
423
424 0x04 decimal digit
425 0x08 hexadecimal digit
426
427 Then we can use ctype_digit and ctype_xdigit in the code. */
428
429 #ifndef EBCDIC
430
431 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
432 UTF-8 mode. */
433
static const unsigned char digitab[] =
  {
  /* 0x00 - 0x0f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 - 0x1f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 - 0x2f: space to '/' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 - 0x3f: '0' to '9' are decimal and hex digits, then ':' to '?' */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 - 0x4f: '@', then 'A' to 'F' are hex digits, then 'G' to 'O' */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x50 - 0x5f: 'P' to '_' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60 - 0x6f: '`', then 'a' to 'f' are hex digits, then 'g' to 'o' */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x70 - 0x7f: 'p' to 127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 - 0x8f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 - 0x9f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xa0 - 0xaf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xb0 - 0xbf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xc0 - 0xcf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xd0 - 0xdf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xe0 - 0xef */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xf0 - 0xff */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };
468
469 #else
470
471 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
472
473 static const unsigned char digitab[] =
474 {
475 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
476 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
477 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
478 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
479 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
480 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
481 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
482 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
483 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
484 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
485 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
486 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
487 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
488 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
489 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
490 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
491 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
492 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
493 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
494 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
495 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
496 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
497 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
498 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
499 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
500 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
501 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
502 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
503 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
504 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
505 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
506 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
507
508 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
509 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
510 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
511 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
512 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
513 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
514 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
515 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
516 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
517 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
518 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
519 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
520 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
521 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
522 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
523 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
524 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
525 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
526 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
527 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
528 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
529 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
530 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
531 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
532 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
533 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
534 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
535 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
536 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
537 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
538 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
539 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
540 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
541 #endif
542
543
544 /* Definition to allow mutual recursion */
545
546 static BOOL
547 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
548 int *, int *, branch_chain *, compile_data *, int *);
549
550
551
552 /*************************************************
553 * Find an error text *
554 *************************************************/
555
556 /* The error texts are now all in one long string, to save on relocations. As
557 some of the text is of unknown length, we can't use a table of offsets.
558 Instead, just count through the strings. This is not a performance issue
559 because it happens only when there has been a compilation error.
560
561 Argument: the error number
562 Returns: pointer to the error string
563 */
564
565 static const char *
566 find_error_text(int n)
567 {
568 const char *s = error_texts;
569 for (; n > 0; n--)
570 {
571 while (*s++ != 0) {};
572 if (*s == 0) return "Error text not found (please report)";
573 }
574 return s;
575 }
576
577
578 /*************************************************
579 * Handle escapes *
580 *************************************************/
581
582 /* This function is called when a \ has been encountered. It either returns a
583 positive value for a simple escape such as \n, or a negative value which
584 encodes one of the more complicated things such as \d. A backreference to group
585 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
586 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
587 ptr is pointing at the \. On exit, it is on the final character of the escape
588 sequence.
589
590 Arguments:
591 ptrptr points to the pattern position pointer
592 errorcodeptr points to the errorcode variable
593 bracount number of previous extracting brackets
594 options the options bits
595 isclass TRUE if inside a character class
596
597 Returns: zero or positive => a data character
598 negative => a special escape sequence
599 on error, errorcodeptr is set
600 */
601
602 static int
603 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
604 int options, BOOL isclass)
605 {
606 BOOL utf8 = (options & PCRE_UTF8) != 0;
607 const uschar *ptr = *ptrptr + 1;
608 int c, i;
609
610 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
611 ptr--; /* Set pointer back to the last byte */
612
613 /* If backslash is at the end of the pattern, it's an error. */
614
615 if (c == 0) *errorcodeptr = ERR1;
616
617 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
618 in a table. A non-zero result is something that can be returned immediately.
619 Otherwise further processing may be required. */
620
621 #ifndef EBCDIC /* ASCII/UTF-8 coding */
622 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
623 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
624
625 #else /* EBCDIC coding */
626 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
627 else if ((i = escapes[c - 0x48]) != 0) c = i;
628 #endif
629
630 /* Escapes that need further processing, or are illegal. */
631
632 else
633 {
634 const uschar *oldptr;
635 BOOL braced, negated;
636
637 switch (c)
638 {
639 /* A number of Perl escapes are not handled by PCRE. We give an explicit
640 error. */
641
642 case CHAR_l:
643 case CHAR_L:
644 case CHAR_u:
645 case CHAR_U:
646 *errorcodeptr = ERR37;
647 break;
648
649 /* \g must be followed by one of a number of specific things:
650
651 (1) A number, either plain or braced. If positive, it is an absolute
652 backreference. If negative, it is a relative backreference. This is a Perl
653 5.10 feature.
654
655 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
656 is part of Perl's movement towards a unified syntax for back references. As
657 this is synonymous with \k{name}, we fudge it up by pretending it really
658 was \k.
659
660 (3) For Oniguruma compatibility we also support \g followed by a name or a
661 number either in angle brackets or in single quotes. However, these are
662 (possibly recursive) subroutine calls, _not_ backreferences. Just return
663 the -ESC_g code (cf \k). */
664
665 case CHAR_g:
666 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
667 {
668 c = -ESC_g;
669 break;
670 }
671
672 /* Handle the Perl-compatible cases */
673
674 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
675 {
676 const uschar *p;
677 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
678 if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
679 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
680 {
681 c = -ESC_k;
682 break;
683 }
684 braced = TRUE;
685 ptr++;
686 }
687 else braced = FALSE;
688
689 if (ptr[1] == CHAR_MINUS)
690 {
691 negated = TRUE;
692 ptr++;
693 }
694 else negated = FALSE;
695
696 c = 0;
697 while ((digitab[ptr[1]] & ctype_digit) != 0)
698 c = c * 10 + *(++ptr) - CHAR_0;
699
700 if (c < 0) /* Integer overflow */
701 {
702 *errorcodeptr = ERR61;
703 break;
704 }
705
706 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
707 {
708 *errorcodeptr = ERR57;
709 break;
710 }
711
712 if (c == 0)
713 {
714 *errorcodeptr = ERR58;
715 break;
716 }
717
718 if (negated)
719 {
720 if (c > bracount)
721 {
722 *errorcodeptr = ERR15;
723 break;
724 }
725 c = bracount - (c - 1);
726 }
727
728 c = -(ESC_REF + c);
729 break;
730
731 /* The handling of escape sequences consisting of a string of digits
732 starting with one that is not zero is not straightforward. By experiment,
733 the way Perl works seems to be as follows:
734
735 Outside a character class, the digits are read as a decimal number. If the
736 number is less than 10, or if there are that many previous extracting
737 left brackets, then it is a back reference. Otherwise, up to three octal
738 digits are read to form an escaped byte. Thus \123 is likely to be octal
739 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
740 value is greater than 377, the least significant 8 bits are taken. Inside a
741 character class, \ followed by a digit is always an octal number. */
742
743 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
744 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
745
746 if (!isclass)
747 {
748 oldptr = ptr;
749 c -= CHAR_0;
750 while ((digitab[ptr[1]] & ctype_digit) != 0)
751 c = c * 10 + *(++ptr) - CHAR_0;
752 if (c < 0) /* Integer overflow */
753 {
754 *errorcodeptr = ERR61;
755 break;
756 }
757 if (c < 10 || c <= bracount)
758 {
759 c = -(ESC_REF + c);
760 break;
761 }
762 ptr = oldptr; /* Put the pointer back and fall through */
763 }
764
765 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
766 generates a binary zero byte and treats the digit as a following literal.
767 Thus we have to pull back the pointer by one. */
768
769 if ((c = *ptr) >= CHAR_8)
770 {
771 ptr--;
772 c = 0;
773 break;
774 }
775
776 /* \0 always starts an octal number, but we may drop through to here with a
777 larger first octal digit. The original code used just to take the least
778 significant 8 bits of octal numbers (I think this is what early Perls used
779 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
780 than 3 octal digits. */
781
782 case CHAR_0:
783 c -= CHAR_0;
784 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
785 c = c * 8 + *(++ptr) - CHAR_0;
786 if (!utf8 && c > 255) *errorcodeptr = ERR51;
787 break;
788
789 /* \x is complicated. \x{ddd} is a character number which can be greater
790 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
791 treated as a data character. */
792
793 case CHAR_x:
794 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
795 {
796 const uschar *pt = ptr + 2;
797 int count = 0;
798
799 c = 0;
800 while ((digitab[*pt] & ctype_xdigit) != 0)
801 {
802 register int cc = *pt++;
803 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
804 count++;
805
806 #ifndef EBCDIC /* ASCII/UTF-8 coding */
807 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
808 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
809 #else /* EBCDIC coding */
810 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
811 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
812 #endif
813 }
814
815 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
816 {
817 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
818 ptr = pt;
819 break;
820 }
821
822 /* If the sequence of hex digits does not end with '}', then we don't
823 recognize this construct; fall through to the normal \x handling. */
824 }
825
826 /* Read just a single-byte hex-defined char */
827
828 c = 0;
829 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
830 {
831 int cc; /* Some compilers don't like */
832 cc = *(++ptr); /* ++ in initializers */
833 #ifndef EBCDIC /* ASCII/UTF-8 coding */
834 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
835 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
836 #else /* EBCDIC coding */
837 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
838 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
839 #endif
840 }
841 break;
842
843 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
844 This coding is ASCII-specific, but then the whole concept of \cx is
845 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
846
847 case CHAR_c:
848 c = *(++ptr);
849 if (c == 0)
850 {
851 *errorcodeptr = ERR2;
852 break;
853 }
854
855 #ifndef EBCDIC /* ASCII/UTF-8 coding */
856 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
857 c ^= 0x40;
858 #else /* EBCDIC coding */
859 if (c >= CHAR_a && c <= CHAR_z) c += 64;
860 c ^= 0xC0;
861 #endif
862 break;
863
864 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
865 other alphanumeric following \ is an error if PCRE_EXTRA was set;
866 otherwise, for Perl compatibility, it is a literal. This code looks a bit
867 odd, but there used to be some cases other than the default, and there may
868 be again in future, so I haven't "optimized" it. */
869
870 default:
871 if ((options & PCRE_EXTRA) != 0) switch(c)
872 {
873 default:
874 *errorcodeptr = ERR3;
875 break;
876 }
877 break;
878 }
879 }
880
881 /* Perl supports \N{name} for character names, as well as plain \N for "not
882 newline". PCRE does not support \N{name}. */
883
884 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
885 *errorcodeptr = ERR37;
886
887 /* If PCRE_UCP is set, we change the values for \d etc. */
888
889 if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
890 c -= (ESC_DU - ESC_D);
891
892 /* Set the pointer to the final character before returning. */
893
894 *ptrptr = ptr;
895 return c;
896 }
897
898
899
900 #ifdef SUPPORT_UCP
901 /*************************************************
902 * Handle \P and \p *
903 *************************************************/
904
905 /* This function is called after \P or \p has been encountered, provided that
906 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
907 pointing at the P or p. On exit, it is pointing at the final character of the
908 escape sequence.
909
910 Argument:
911 ptrptr points to the pattern position pointer
912 negptr points to a boolean that is set TRUE for negation else FALSE
913 dptr points to an int that is set to the detailed property value
914 errorcodeptr points to the error code variable
915
916 Returns: type value from ucp_type_table, or -1 for an invalid type
917 */
918
919 static int
920 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
921 {
922 int c, i, bot, top;
923 const uschar *ptr = *ptrptr;
924 char name[32];
925
926 c = *(++ptr);
927 if (c == 0) goto ERROR_RETURN;
928
929 *negptr = FALSE;
930
931 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
932 negation. */
933
934 if (c == CHAR_LEFT_CURLY_BRACKET)
935 {
936 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
937 {
938 *negptr = TRUE;
939 ptr++;
940 }
941 for (i = 0; i < (int)sizeof(name) - 1; i++)
942 {
943 c = *(++ptr);
944 if (c == 0) goto ERROR_RETURN;
945 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
946 name[i] = c;
947 }
948 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
949 name[i] = 0;
950 }
951
952 /* Otherwise there is just one following character */
953
954 else
955 {
956 name[0] = c;
957 name[1] = 0;
958 }
959
960 *ptrptr = ptr;
961
962 /* Search for a recognized property name using binary chop */
963
964 bot = 0;
965 top = _pcre_utt_size;
966
967 while (bot < top)
968 {
969 i = (bot + top) >> 1;
970 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
971 if (c == 0)
972 {
973 *dptr = _pcre_utt[i].value;
974 return _pcre_utt[i].type;
975 }
976 if (c > 0) bot = i + 1; else top = i;
977 }
978
979 *errorcodeptr = ERR47;
980 *ptrptr = ptr;
981 return -1;
982
983 ERROR_RETURN:
984 *errorcodeptr = ERR46;
985 *ptrptr = ptr;
986 return -1;
987 }
988 #endif
989
990
991
992
993 /*************************************************
994 * Check for counted repeat *
995 *************************************************/
996
997 /* This function is called when a '{' is encountered in a place where it might
998 start a quantifier. It looks ahead to see if it really is a quantifier or not.
999 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1000 where the ddds are digits.
1001
1002 Arguments:
1003 p pointer to the first char after '{'
1004
1005 Returns: TRUE or FALSE
1006 */
1007
1008 static BOOL
1009 is_counted_repeat(const uschar *p)
1010 {
1011 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1012 while ((digitab[*p] & ctype_digit) != 0) p++;
1013 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1014
1015 if (*p++ != CHAR_COMMA) return FALSE;
1016 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1017
1018 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1019 while ((digitab[*p] & ctype_digit) != 0) p++;
1020
1021 return (*p == CHAR_RIGHT_CURLY_BRACKET);
1022 }
1023
1024
1025
1026 /*************************************************
1027 * Read repeat counts *
1028 *************************************************/
1029
1030 /* Read an item of the form {n,m} and return the values. This is called only
1031 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1032 so the syntax is guaranteed to be correct, but we need to check the values.
1033
1034 Arguments:
1035 p pointer to first char after '{'
1036 minp pointer to int for min
1037 maxp pointer to int for max
1038 returned as -1 if no max
1039 errorcodeptr points to error code variable
1040
1041 Returns: pointer to '}' on success;
1042 current ptr on error, with errorcodeptr set non-zero
1043 */
1044
1045 static const uschar *
1046 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1047 {
1048 int min = 0;
1049 int max = -1;
1050
1051 /* Read the minimum value and do a paranoid check: a negative value indicates
1052 an integer overflow. */
1053
1054 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1055 if (min < 0 || min > 65535)
1056 {
1057 *errorcodeptr = ERR5;
1058 return p;
1059 }
1060
1061 /* Read the maximum value if there is one, and again do a paranoid on its size.
1062 Also, max must not be less than min. */
1063
1064 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1065 {
1066 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1067 {
1068 max = 0;
1069 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1070 if (max < 0 || max > 65535)
1071 {
1072 *errorcodeptr = ERR5;
1073 return p;
1074 }
1075 if (max < min)
1076 {
1077 *errorcodeptr = ERR4;
1078 return p;
1079 }
1080 }
1081 }
1082
1083 /* Fill in the required variables, and pass back the pointer to the terminating
1084 '}'. */
1085
1086 *minp = min;
1087 *maxp = max;
1088 return p;
1089 }
1090
1091
1092
1093 /*************************************************
1094 * Subroutine for finding forward reference *
1095 *************************************************/
1096
1097 /* This recursive function is called only from find_parens() below. The
1098 top-level call starts at the beginning of the pattern. All other calls must
1099 start at a parenthesis. It scans along a pattern's text looking for capturing
1100 subpatterns, and counting them. If it finds a named pattern that matches the
1101 name it is given, it returns its number. Alternatively, if the name is NULL, it
1102 returns when it reaches a given numbered subpattern. We know that if (?P< is
1103 encountered, the name will be terminated by '>' because that is checked in the
1104 first pass. Recursion is used to keep track of subpatterns that reset the
1105 capturing group numbers - the (?| feature.
1106
1107 Arguments:
1108 ptrptr address of the current character pointer (updated)
1109 cd compile background data
1110 name name to seek, or NULL if seeking a numbered subpattern
1111 lorn name length, or subpattern number if name is NULL
1112 xmode TRUE if we are in /x mode
1113 utf8 TRUE if we are in UTF-8 mode
1114 count pointer to the current capturing subpattern number (updated)
1115
1116 Returns: the number of the named subpattern, or -1 if not found
1117 */
1118
1119 static int
1120 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1121 BOOL xmode, BOOL utf8, int *count)
1122 {
1123 uschar *ptr = *ptrptr;
1124 int start_count = *count;
1125 int hwm_count = start_count;
1126 BOOL dup_parens = FALSE;
1127
1128 /* If the first character is a parenthesis, check on the type of group we are
1129 dealing with. The very first call may not start with a parenthesis. */
1130
1131 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1132 {
1133 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1134
1135 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1136
1137 /* Handle a normal, unnamed capturing parenthesis. */
1138
1139 else if (ptr[1] != CHAR_QUESTION_MARK)
1140 {
1141 *count += 1;
1142 if (name == NULL && *count == lorn) return *count;
1143 ptr++;
1144 }
1145
1146 /* All cases now have (? at the start. Remember when we are in a group
1147 where the parenthesis numbers are duplicated. */
1148
1149 else if (ptr[2] == CHAR_VERTICAL_LINE)
1150 {
1151 ptr += 3;
1152 dup_parens = TRUE;
1153 }
1154
1155 /* Handle comments; all characters are allowed until a ket is reached. */
1156
1157 else if (ptr[2] == CHAR_NUMBER_SIGN)
1158 {
1159 for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1160 goto FAIL_EXIT;
1161 }
1162
1163 /* Handle a condition. If it is an assertion, just carry on so that it
1164 is processed as normal. If not, skip to the closing parenthesis of the
1165 condition (there can't be any nested parens). */
1166
1167 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1168 {
1169 ptr += 2;
1170 if (ptr[1] != CHAR_QUESTION_MARK)
1171 {
1172 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1173 if (*ptr != 0) ptr++;
1174 }
1175 }
1176
1177 /* Start with (? but not a condition. */
1178
1179 else
1180 {
1181 ptr += 2;
1182 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1183
1184 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1185
1186 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1187 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1188 {
1189 int term;
1190 const uschar *thisname;
1191 *count += 1;
1192 if (name == NULL && *count == lorn) return *count;
1193 term = *ptr++;
1194 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1195 thisname = ptr;
1196 while (*ptr != term) ptr++;
1197 if (name != NULL && lorn == ptr - thisname &&
1198 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1199 return *count;
1200 term++;
1201 }
1202 }
1203 }
1204
1205 /* Past any initial parenthesis handling, scan for parentheses or vertical
1206 bars. */
1207
1208 for (; *ptr != 0; ptr++)
1209 {
1210 /* Skip over backslashed characters and also entire \Q...\E */
1211
1212 if (*ptr == CHAR_BACKSLASH)
1213 {
1214 if (*(++ptr) == 0) goto FAIL_EXIT;
1215 if (*ptr == CHAR_Q) for (;;)
1216 {
1217 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1218 if (*ptr == 0) goto FAIL_EXIT;
1219 if (*(++ptr) == CHAR_E) break;
1220 }
1221 continue;
1222 }
1223
1224 /* Skip over character classes; this logic must be similar to the way they
1225 are handled for real. If the first character is '^', skip it. Also, if the
1226 first few characters (either before or after ^) are \Q\E or \E we skip them
1227 too. This makes for compatibility with Perl. Note the use of STR macros to
1228 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1229
1230 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1231 {
1232 BOOL negate_class = FALSE;
1233 for (;;)
1234 {
1235 if (ptr[1] == CHAR_BACKSLASH)
1236 {
1237 if (ptr[2] == CHAR_E)
1238 ptr+= 2;
1239 else if (strncmp((const char *)ptr+2,
1240 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1241 ptr += 4;
1242 else
1243 break;
1244 }
1245 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1246 {
1247 negate_class = TRUE;
1248 ptr++;
1249 }
1250 else break;
1251 }
1252
1253 /* If the next character is ']', it is a data character that must be
1254 skipped, except in JavaScript compatibility mode. */
1255
1256 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1257 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1258 ptr++;
1259
1260 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1261 {
1262 if (*ptr == 0) return -1;
1263 if (*ptr == CHAR_BACKSLASH)
1264 {
1265 if (*(++ptr) == 0) goto FAIL_EXIT;
1266 if (*ptr == CHAR_Q) for (;;)
1267 {
1268 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1269 if (*ptr == 0) goto FAIL_EXIT;
1270 if (*(++ptr) == CHAR_E) break;
1271 }
1272 continue;
1273 }
1274 }
1275 continue;
1276 }
1277
1278 /* Skip comments in /x mode */
1279
1280 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1281 {
1282 ptr++;
1283 while (*ptr != 0)
1284 {
1285 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1286 ptr++;
1287 #ifdef SUPPORT_UTF8
1288 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1289 #endif
1290 }
1291 if (*ptr == 0) goto FAIL_EXIT;
1292 continue;
1293 }
1294
1295 /* Check for the special metacharacters */
1296
1297 if (*ptr == CHAR_LEFT_PARENTHESIS)
1298 {
1299 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1300 if (rc > 0) return rc;
1301 if (*ptr == 0) goto FAIL_EXIT;
1302 }
1303
1304 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1305 {
1306 if (dup_parens && *count < hwm_count) *count = hwm_count;
1307 goto FAIL_EXIT;
1308 }
1309
1310 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1311 {
1312 if (*count > hwm_count) hwm_count = *count;
1313 *count = start_count;
1314 }
1315 }
1316
1317 FAIL_EXIT:
1318 *ptrptr = ptr;
1319 return -1;
1320 }
1321
1322
1323
1324
1325 /*************************************************
1326 * Find forward referenced subpattern *
1327 *************************************************/
1328
1329 /* This function scans along a pattern's text looking for capturing
1330 subpatterns, and counting them. If it finds a named pattern that matches the
1331 name it is given, it returns its number. Alternatively, if the name is NULL, it
1332 returns when it reaches a given numbered subpattern. This is used for forward
1333 references to subpatterns. We used to be able to start this scan from the
1334 current compiling point, using the current count value from cd->bracount, and
1335 do it all in a single loop, but the addition of the possibility of duplicate
1336 subpattern numbers means that we have to scan from the very start, in order to
1337 take account of such duplicates, and to use a recursive function to keep track
1338 of the different types of group.
1339
1340 Arguments:
1341 cd compile background data
1342 name name to seek, or NULL if seeking a numbered subpattern
1343 lorn name length, or subpattern number if name is NULL
1344 xmode TRUE if we are in /x mode
1345 utf8 TRUE if we are in UTF-8 mode
1346
1347 Returns: the number of the found subpattern, or -1 if not found
1348 */
1349
1350 static int
1351 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1352 BOOL utf8)
1353 {
1354 uschar *ptr = (uschar *)cd->start_pattern;
1355 int count = 0;
1356 int rc;
1357
1358 /* If the pattern does not start with an opening parenthesis, the first call
1359 to find_parens_sub() will scan right to the end (if necessary). However, if it
1360 does start with a parenthesis, find_parens_sub() will return when it hits the
1361 matching closing parens. That is why we have to have a loop. */
1362
1363 for (;;)
1364 {
1365 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1366 if (rc > 0 || *ptr++ == 0) break;
1367 }
1368
1369 return rc;
1370 }
1371
1372
1373
1374
1375 /*************************************************
1376 * Find first significant op code *
1377 *************************************************/
1378
1379 /* This is called by several functions that scan a compiled expression looking
1380 for a fixed first character, or an anchoring op code etc. It skips over things
1381 that do not influence this. For some calls, a change of option is important.
1382 For some calls, it makes sense to skip negative forward and all backward
1383 assertions, and also the \b assertion; for others it does not.
1384
1385 Arguments:
1386 code pointer to the start of the group
1387 options pointer to external options
1388 optbit the option bit whose changing is significant, or
1389 zero if none are
1390 skipassert TRUE if certain assertions are to be skipped
1391
1392 Returns: pointer to the first significant opcode
1393 */
1394
1395 static const uschar*
1396 first_significant_code(const uschar *code, int *options, int optbit,
1397 BOOL skipassert)
1398 {
1399 for (;;)
1400 {
1401 switch ((int)*code)
1402 {
1403 case OP_OPT:
1404 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1405 *options = (int)code[1];
1406 code += 2;
1407 break;
1408
1409 case OP_ASSERT_NOT:
1410 case OP_ASSERTBACK:
1411 case OP_ASSERTBACK_NOT:
1412 if (!skipassert) return code;
1413 do code += GET(code, 1); while (*code == OP_ALT);
1414 code += _pcre_OP_lengths[*code];
1415 break;
1416
1417 case OP_WORD_BOUNDARY:
1418 case OP_NOT_WORD_BOUNDARY:
1419 if (!skipassert) return code;
1420 /* Fall through */
1421
1422 case OP_CALLOUT:
1423 case OP_CREF:
1424 case OP_NCREF:
1425 case OP_RREF:
1426 case OP_NRREF:
1427 case OP_DEF:
1428 code += _pcre_OP_lengths[*code];
1429 break;
1430
1431 default:
1432 return code;
1433 }
1434 }
1435 /* Control never reaches here */
1436 }
1437
1438
1439
1440
1441 /*************************************************
1442 * Find the fixed length of a branch *
1443 *************************************************/
1444
1445 /* Scan a branch and compute the fixed length of subject that will match it,
1446 if the length is fixed. This is needed for dealing with backward assertions.
1447 In UTF8 mode, the result is in characters rather than bytes. The branch is
1448 temporarily terminated with OP_END when this function is called.
1449
1450 This function is called when a backward assertion is encountered, so that if it
1451 fails, the error message can point to the correct place in the pattern.
1452 However, we cannot do this when the assertion contains subroutine calls,
1453 because they can be forward references. We solve this by remembering this case
1454 and doing the check at the end; a flag specifies which mode we are running in.
1455
1456 Arguments:
1457 code points to the start of the pattern (the bracket)
1458 options the compiling options
1459 atend TRUE if called when the pattern is complete
1460 cd the "compile data" structure
1461
1462 Returns: the fixed length,
1463 or -1 if there is no fixed length,
1464 or -2 if \C was encountered
1465 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1466 */
1467
1468 static int
1469 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1470 {
1471 int length = -1;
1472
1473 register int branchlength = 0;
1474 register uschar *cc = code + 1 + LINK_SIZE;
1475
1476 /* Scan along the opcodes for this branch. If we get to the end of the
1477 branch, check the length against that of the other branches. */
1478
1479 for (;;)
1480 {
1481 int d;
1482 uschar *ce, *cs;
1483 register int op = *cc;
1484 switch (op)
1485 {
1486 case OP_CBRA:
1487 case OP_BRA:
1488 case OP_ONCE:
1489 case OP_COND:
1490 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1491 if (d < 0) return d;
1492 branchlength += d;
1493 do cc += GET(cc, 1); while (*cc == OP_ALT);
1494 cc += 1 + LINK_SIZE;
1495 break;
1496
1497 /* Reached end of a branch; if it's a ket it is the end of a nested
1498 call. If it's ALT it is an alternation in a nested call. If it is
1499 END it's the end of the outer call. All can be handled by the same code. */
1500
1501 case OP_ALT:
1502 case OP_KET:
1503 case OP_KETRMAX:
1504 case OP_KETRMIN:
1505 case OP_END:
1506 if (length < 0) length = branchlength;
1507 else if (length != branchlength) return -1;
1508 if (*cc != OP_ALT) return length;
1509 cc += 1 + LINK_SIZE;
1510 branchlength = 0;
1511 break;
1512
1513 /* A true recursion implies not fixed length, but a subroutine call may
1514 be OK. If the subroutine is a forward reference, we can't deal with
1515 it until the end of the pattern, so return -3. */
1516
1517 case OP_RECURSE:
1518 if (!atend) return -3;
1519 cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1520 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1521 if (cc > cs && cc < ce) return -1; /* Recursion */
1522 d = find_fixedlength(cs + 2, options, atend, cd);
1523 if (d < 0) return d;
1524 branchlength += d;
1525 cc += 1 + LINK_SIZE;
1526 break;
1527
1528 /* Skip over assertive subpatterns */
1529
1530 case OP_ASSERT:
1531 case OP_ASSERT_NOT:
1532 case OP_ASSERTBACK:
1533 case OP_ASSERTBACK_NOT:
1534 do cc += GET(cc, 1); while (*cc == OP_ALT);
1535 /* Fall through */
1536
1537 /* Skip over things that don't match chars */
1538
1539 case OP_REVERSE:
1540 case OP_CREF:
1541 case OP_NCREF:
1542 case OP_RREF:
1543 case OP_NRREF:
1544 case OP_DEF:
1545 case OP_OPT:
1546 case OP_CALLOUT:
1547 case OP_SOD:
1548 case OP_SOM:
1549 case OP_SET_SOM:
1550 case OP_EOD:
1551 case OP_EODN:
1552 case OP_CIRC:
1553 case OP_DOLL:
1554 case OP_NOT_WORD_BOUNDARY:
1555 case OP_WORD_BOUNDARY:
1556 cc += _pcre_OP_lengths[*cc];
1557 break;
1558
1559 /* Handle literal characters */
1560
1561 case OP_CHAR:
1562 case OP_CHARNC:
1563 case OP_NOT:
1564 branchlength++;
1565 cc += 2;
1566 #ifdef SUPPORT_UTF8
1567 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1568 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1569 #endif
1570 break;
1571
1572 /* Handle exact repetitions. The count is already in characters, but we
1573 need to skip over a multibyte character in UTF8 mode. */
1574
1575 case OP_EXACT:
1576 branchlength += GET2(cc,1);
1577 cc += 4;
1578 #ifdef SUPPORT_UTF8
1579 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1580 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1581 #endif
1582 break;
1583
1584 case OP_TYPEEXACT:
1585 branchlength += GET2(cc,1);
1586 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1587 cc += 4;
1588 break;
1589
1590 /* Handle single-char matchers */
1591
1592 case OP_PROP:
1593 case OP_NOTPROP:
1594 cc += 2;
1595 /* Fall through */
1596
1597 case OP_NOT_DIGIT:
1598 case OP_DIGIT:
1599 case OP_NOT_WHITESPACE:
1600 case OP_WHITESPACE:
1601 case OP_NOT_WORDCHAR:
1602 case OP_WORDCHAR:
1603 case OP_ANY:
1604 case OP_ALLANY:
1605 branchlength++;
1606 cc++;
1607 break;
1608
1609 /* The single-byte matcher isn't allowed */
1610
1611 case OP_ANYBYTE:
1612 return -2;
1613
1614 /* Check a class for variable quantification */
1615
1616 #ifdef SUPPORT_UTF8
1617 case OP_XCLASS:
1618 cc += GET(cc, 1) - 33;
1619 /* Fall through */
1620 #endif
1621
1622 case OP_CLASS:
1623 case OP_NCLASS:
1624 cc += 33;
1625
1626 switch (*cc)
1627 {
1628 case OP_CRSTAR:
1629 case OP_CRMINSTAR:
1630 case OP_CRQUERY:
1631 case OP_CRMINQUERY:
1632 return -1;
1633
1634 case OP_CRRANGE:
1635 case OP_CRMINRANGE:
1636 if (GET2(cc,1) != GET2(cc,3)) return -1;
1637 branchlength += GET2(cc,1);
1638 cc += 5;
1639 break;
1640
1641 default:
1642 branchlength++;
1643 }
1644 break;
1645
1646 /* Anything else is variable length */
1647
1648 default:
1649 return -1;
1650 }
1651 }
1652 /* Control never gets here */
1653 }
1654
1655
1656
1657
1658 /*************************************************
1659 * Scan compiled regex for specific bracket *
1660 *************************************************/
1661
1662 /* This little function scans through a compiled pattern until it finds a
1663 capturing bracket with the given number, or, if the number is negative, an
1664 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1665 so that it can be called from pcre_study() when finding the minimum matching
1666 length.
1667
1668 Arguments:
1669 code points to start of expression
1670 utf8 TRUE in UTF-8 mode
1671 number the required bracket number or negative to find a lookbehind
1672
1673 Returns: pointer to the opcode for the bracket, or NULL if not found
1674 */
1675
1676 const uschar *
1677 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1678 {
1679 for (;;)
1680 {
1681 register int c = *code;
1682 if (c == OP_END) return NULL;
1683
1684 /* XCLASS is used for classes that cannot be represented just by a bit
1685 map. This includes negated single high-valued characters. The length in
1686 the table is zero; the actual length is stored in the compiled code. */
1687
1688 if (c == OP_XCLASS) code += GET(code, 1);
1689
1690 /* Handle recursion */
1691
1692 else if (c == OP_REVERSE)
1693 {
1694 if (number < 0) return (uschar *)code;
1695 code += _pcre_OP_lengths[c];
1696 }
1697
1698 /* Handle capturing bracket */
1699
1700 else if (c == OP_CBRA)
1701 {
1702 int n = GET2(code, 1+LINK_SIZE);
1703 if (n == number) return (uschar *)code;
1704 code += _pcre_OP_lengths[c];
1705 }
1706
1707 /* Otherwise, we can get the item's length from the table, except that for
1708 repeated character types, we have to test for \p and \P, which have an extra
1709 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1710 must add in its length. */
1711
1712 else
1713 {
1714 switch(c)
1715 {
1716 case OP_TYPESTAR:
1717 case OP_TYPEMINSTAR:
1718 case OP_TYPEPLUS:
1719 case OP_TYPEMINPLUS:
1720 case OP_TYPEQUERY:
1721 case OP_TYPEMINQUERY:
1722 case OP_TYPEPOSSTAR:
1723 case OP_TYPEPOSPLUS:
1724 case OP_TYPEPOSQUERY:
1725 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1726 break;
1727
1728 case OP_TYPEUPTO:
1729 case OP_TYPEMINUPTO:
1730 case OP_TYPEEXACT:
1731 case OP_TYPEPOSUPTO:
1732 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1733 break;
1734
1735 case OP_MARK:
1736 case OP_PRUNE_ARG:
1737 case OP_SKIP_ARG:
1738 code += code[1];
1739 break;
1740
1741 case OP_THEN_ARG:
1742 code += code[1+LINK_SIZE];
1743 break;
1744 }
1745
1746 /* Add in the fixed length from the table */
1747
1748 code += _pcre_OP_lengths[c];
1749
1750 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1751 a multi-byte character. The length in the table is a minimum, so we have to
1752 arrange to skip the extra bytes. */
1753
1754 #ifdef SUPPORT_UTF8
1755 if (utf8) switch(c)
1756 {
1757 case OP_CHAR:
1758 case OP_CHARNC:
1759 case OP_EXACT:
1760 case OP_UPTO:
1761 case OP_MINUPTO:
1762 case OP_POSUPTO:
1763 case OP_STAR:
1764 case OP_MINSTAR:
1765 case OP_POSSTAR:
1766 case OP_PLUS:
1767 case OP_MINPLUS:
1768 case OP_POSPLUS:
1769 case OP_QUERY:
1770 case OP_MINQUERY:
1771 case OP_POSQUERY:
1772 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1773 break;
1774 }
1775 #else
1776 (void)(utf8); /* Keep compiler happy by referencing function argument */
1777 #endif
1778 }
1779 }
1780 }
1781
1782
1783
1784 /*************************************************
1785 * Scan compiled regex for recursion reference *
1786 *************************************************/
1787
1788 /* This little function scans through a compiled pattern until it finds an
1789 instance of OP_RECURSE.
1790
1791 Arguments:
1792 code points to start of expression
1793 utf8 TRUE in UTF-8 mode
1794
1795 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1796 */
1797
1798 static const uschar *
1799 find_recurse(const uschar *code, BOOL utf8)
1800 {
1801 for (;;)
1802 {
1803 register int c = *code;
1804 if (c == OP_END) return NULL;
1805 if (c == OP_RECURSE) return code;
1806
1807 /* XCLASS is used for classes that cannot be represented just by a bit
1808 map. This includes negated single high-valued characters. The length in
1809 the table is zero; the actual length is stored in the compiled code. */
1810
1811 if (c == OP_XCLASS) code += GET(code, 1);
1812
1813 /* Otherwise, we can get the item's length from the table, except that for
1814 repeated character types, we have to test for \p and \P, which have an extra
1815 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1816 must add in its length. */
1817
1818 else
1819 {
1820 switch(c)
1821 {
1822 case OP_TYPESTAR:
1823 case OP_TYPEMINSTAR:
1824 case OP_TYPEPLUS:
1825 case OP_TYPEMINPLUS:
1826 case OP_TYPEQUERY:
1827 case OP_TYPEMINQUERY:
1828 case OP_TYPEPOSSTAR:
1829 case OP_TYPEPOSPLUS:
1830 case OP_TYPEPOSQUERY:
1831 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1832 break;
1833
1834 case OP_TYPEPOSUPTO:
1835 case OP_TYPEUPTO:
1836 case OP_TYPEMINUPTO:
1837 case OP_TYPEEXACT:
1838 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1839 break;
1840
1841 case OP_MARK:
1842 case OP_PRUNE_ARG:
1843 case OP_SKIP_ARG:
1844 code += code[1];
1845 break;
1846
1847 case OP_THEN_ARG:
1848 code += code[1+LINK_SIZE];
1849 break;
1850 }
1851
1852 /* Add in the fixed length from the table */
1853
1854 code += _pcre_OP_lengths[c];
1855
1856 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1857 by a multi-byte character. The length in the table is a minimum, so we have
1858 to arrange to skip the extra bytes. */
1859
1860 #ifdef SUPPORT_UTF8
1861 if (utf8) switch(c)
1862 {
1863 case OP_CHAR:
1864 case OP_CHARNC:
1865 case OP_EXACT:
1866 case OP_UPTO:
1867 case OP_MINUPTO:
1868 case OP_POSUPTO:
1869 case OP_STAR:
1870 case OP_MINSTAR:
1871 case OP_POSSTAR:
1872 case OP_PLUS:
1873 case OP_MINPLUS:
1874 case OP_POSPLUS:
1875 case OP_QUERY:
1876 case OP_MINQUERY:
1877 case OP_POSQUERY:
1878 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1879 break;
1880 }
1881 #else
1882 (void)(utf8); /* Keep compiler happy by referencing function argument */
1883 #endif
1884 }
1885 }
1886 }
1887
1888
1889
1890 /*************************************************
1891 * Scan compiled branch for non-emptiness *
1892 *************************************************/
1893
1894 /* This function scans through a branch of a compiled pattern to see whether it
1895 can match the empty string or not. It is called from could_be_empty()
1896 below and from compile_branch() when checking for an unlimited repeat of a
1897 group that can match nothing. Note that first_significant_code() skips over
1898 backward and negative forward assertions when its final argument is TRUE. If we
1899 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1900 bracket whose current branch will already have been scanned.
1901
1902 Arguments:
1903 code points to start of search
1904 endcode points to where to stop
1905 utf8 TRUE if in UTF8 mode
1906 cd contains pointers to tables etc.
1907
1908 Returns: TRUE if what is matched could be empty
1909 */
1910
1911 static BOOL
1912 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1913 compile_data *cd)
1914 {
1915 register int c;
1916 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1917 code < endcode;
1918 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1919 {
1920 const uschar *ccode;
1921
1922 c = *code;
1923
1924 /* Skip over forward assertions; the other assertions are skipped by
1925 first_significant_code() with a TRUE final argument. */
1926
1927 if (c == OP_ASSERT)
1928 {
1929 do code += GET(code, 1); while (*code == OP_ALT);
1930 c = *code;
1931 continue;
1932 }
1933
1934 /* Groups with zero repeats can of course be empty; skip them. */
1935
1936 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1937 {
1938 code += _pcre_OP_lengths[c];
1939 do code += GET(code, 1); while (*code == OP_ALT);
1940 c = *code;
1941 continue;
1942 }
1943
1944 /* For a recursion/subroutine call, if its end has been reached, which
1945 implies a subroutine call, we can scan it. */
1946
1947 if (c == OP_RECURSE)
1948 {
1949 BOOL empty_branch = FALSE;
1950 const uschar *scode = cd->start_code + GET(code, 1);
1951 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1952 do
1953 {
1954 if (could_be_empty_branch(scode, endcode, utf8, cd))
1955 {
1956 empty_branch = TRUE;
1957 break;
1958 }
1959 scode += GET(scode, 1);
1960 }
1961 while (*scode == OP_ALT);
1962 if (!empty_branch) return FALSE; /* All branches are non-empty */
1963 continue;
1964 }
1965
1966 /* For other groups, scan the branches. */
1967
1968 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1969 {
1970 BOOL empty_branch;
1971 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1972
1973 /* If a conditional group has only one branch, there is a second, implied,
1974 empty branch, so just skip over the conditional, because it could be empty.
1975 Otherwise, scan the individual branches of the group. */
1976
1977 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1978 code += GET(code, 1);
1979 else
1980 {
1981 empty_branch = FALSE;
1982 do
1983 {
1984 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1985 empty_branch = TRUE;
1986 code += GET(code, 1);
1987 }
1988 while (*code == OP_ALT);
1989 if (!empty_branch) return FALSE; /* All branches are non-empty */
1990 }
1991
1992 c = *code;
1993 continue;
1994 }
1995
1996 /* Handle the other opcodes */
1997
1998 switch (c)
1999 {
2000 /* Check for quantifiers after a class. XCLASS is used for classes that
2001 cannot be represented just by a bit map. This includes negated single
2002 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2003 actual length is stored in the compiled code, so we must update "code"
2004 here. */
2005
2006 #ifdef SUPPORT_UTF8
2007 case OP_XCLASS:
2008 ccode = code += GET(code, 1);
2009 goto CHECK_CLASS_REPEAT;
2010 #endif
2011
2012 case OP_CLASS:
2013 case OP_NCLASS:
2014 ccode = code + 33;
2015
2016 #ifdef SUPPORT_UTF8
2017 CHECK_CLASS_REPEAT:
2018 #endif
2019
2020 switch (*ccode)
2021 {
2022 case OP_CRSTAR: /* These could be empty; continue */
2023 case OP_CRMINSTAR:
2024 case OP_CRQUERY:
2025 case OP_CRMINQUERY:
2026 break;
2027
2028 default: /* Non-repeat => class must match */
2029 case OP_CRPLUS: /* These repeats aren't empty */
2030 case OP_CRMINPLUS:
2031 return FALSE;
2032
2033 case OP_CRRANGE:
2034 case OP_CRMINRANGE:
2035 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2036 break;
2037 }
2038 break;
2039
2040 /* Opcodes that must match a character */
2041
2042 case OP_PROP:
2043 case OP_NOTPROP:
2044 case OP_EXTUNI:
2045 case OP_NOT_DIGIT:
2046 case OP_DIGIT:
2047 case OP_NOT_WHITESPACE:
2048 case OP_WHITESPACE:
2049 case OP_NOT_WORDCHAR:
2050 case OP_WORDCHAR:
2051 case OP_ANY:
2052 case OP_ALLANY:
2053 case OP_ANYBYTE:
2054 case OP_CHAR:
2055 case OP_CHARNC:
2056 case OP_NOT:
2057 case OP_PLUS:
2058 case OP_MINPLUS:
2059 case OP_POSPLUS:
2060 case OP_EXACT:
2061 case OP_NOTPLUS:
2062 case OP_NOTMINPLUS:
2063 case OP_NOTPOSPLUS:
2064 case OP_NOTEXACT:
2065 case OP_TYPEPLUS:
2066 case OP_TYPEMINPLUS:
2067 case OP_TYPEPOSPLUS:
2068 case OP_TYPEEXACT:
2069 return FALSE;
2070
2071 /* These are going to continue, as they may be empty, but we have to
2072 fudge the length for the \p and \P cases. */
2073
2074 case OP_TYPESTAR:
2075 case OP_TYPEMINSTAR:
2076 case OP_TYPEPOSSTAR:
2077 case OP_TYPEQUERY:
2078 case OP_TYPEMINQUERY:
2079 case OP_TYPEPOSQUERY:
2080 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2081 break;
2082
2083 /* Same for these */
2084
2085 case OP_TYPEUPTO:
2086 case OP_TYPEMINUPTO:
2087 case OP_TYPEPOSUPTO:
2088 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2089 break;
2090
2091 /* End of branch */
2092
2093 case OP_KET:
2094 case OP_KETRMAX:
2095 case OP_KETRMIN:
2096 case OP_ALT:
2097 return TRUE;
2098
2099 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2100 MINUPTO, and POSUPTO may be followed by a multibyte character */
2101
2102 #ifdef SUPPORT_UTF8
2103 case OP_STAR:
2104 case OP_MINSTAR:
2105 case OP_POSSTAR:
2106 case OP_QUERY:
2107 case OP_MINQUERY:
2108 case OP_POSQUERY:
2109 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2110 break;
2111
2112 case OP_UPTO:
2113 case OP_MINUPTO:
2114 case OP_POSUPTO:
2115 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2116 break;
2117 #endif
2118
2119 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2120 string. */
2121
2122 case OP_MARK:
2123 case OP_PRUNE_ARG:
2124 case OP_SKIP_ARG:
2125 code += code[1];
2126 break;
2127
2128 case OP_THEN_ARG:
2129 code += code[1+LINK_SIZE];
2130 break;
2131
2132 /* None of the remaining opcodes are required to match a character. */
2133
2134 default:
2135 break;
2136 }
2137 }
2138
2139 return TRUE;
2140 }
2141
2142
2143
2144 /*************************************************
2145 * Scan compiled regex for non-emptiness *
2146 *************************************************/
2147
2148 /* This function is called to check for left recursive calls. We want to check
2149 the current branch of the current pattern to see if it could match the empty
2150 string. If it could, we must look outwards for branches at other levels,
2151 stopping when we pass beyond the bracket which is the subject of the recursion.
2152
2153 Arguments:
2154 code points to start of the recursion
2155 endcode points to where to stop (current RECURSE item)
2156 bcptr points to the chain of current (unclosed) branch starts
2157 utf8 TRUE if in UTF-8 mode
2158 cd pointers to tables etc
2159
2160 Returns: TRUE if what is matched could be empty
2161 */
2162
2163 static BOOL
2164 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2165 BOOL utf8, compile_data *cd)
2166 {
2167 while (bcptr != NULL && bcptr->current_branch >= code)
2168 {
2169 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2170 return FALSE;
2171 bcptr = bcptr->outer;
2172 }
2173 return TRUE;
2174 }
2175
2176
2177
2178 /*************************************************
2179 * Check for POSIX class syntax *
2180 *************************************************/
2181
2182 /* This function is called when the sequence "[:" or "[." or "[=" is
2183 encountered in a character class. It checks whether this is followed by a
2184 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2185 reach an unescaped ']' without the special preceding character, return FALSE.
2186
2187 Originally, this function only recognized a sequence of letters between the
2188 terminators, but it seems that Perl recognizes any sequence of characters,
2189 though of course unknown POSIX names are subsequently rejected. Perl gives an
2190 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2191 didn't consider this to be a POSIX class. Likewise for [:1234:].
2192
2193 The problem in trying to be exactly like Perl is in the handling of escapes. We
2194 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2195 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2196 below handles the special case of \], but does not try to do any other escape
2197 processing. This makes it different from Perl for cases such as [:l\ower:]
2198 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2199 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2200 I think.
2201
2202 Arguments:
2203 ptr pointer to the initial [
2204 endptr where to return the end pointer
2205
2206 Returns: TRUE or FALSE
2207 */
2208
2209 static BOOL
2210 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2211 {
2212 int terminator; /* Don't combine these lines; the Solaris cc */
2213 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2214 for (++ptr; *ptr != 0; ptr++)
2215 {
2216 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2217 {
2218 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2219 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2220 {
2221 *endptr = ptr;
2222 return TRUE;
2223 }
2224 }
2225 }
2226 return FALSE;
2227 }
2228
2229
2230
2231
2232 /*************************************************
2233 * Check POSIX class name *
2234 *************************************************/
2235
2236 /* This function is called to check the name given in a POSIX-style class entry
2237 such as [:alnum:].
2238
2239 Arguments:
2240 ptr points to the first letter
2241 len the length of the name
2242
2243 Returns: a value representing the name, or -1 if unknown
2244 */
2245
2246 static int
2247 check_posix_name(const uschar *ptr, int len)
2248 {
2249 const char *pn = posix_names;
2250 register int yield = 0;
2251 while (posix_name_lengths[yield] != 0)
2252 {
2253 if (len == posix_name_lengths[yield] &&
2254 strncmp((const char *)ptr, pn, len) == 0) return yield;
2255 pn += posix_name_lengths[yield] + 1;
2256 yield++;
2257 }
2258 return -1;
2259 }
2260
2261
2262 /*************************************************
2263 * Adjust OP_RECURSE items in repeated group *
2264 *************************************************/
2265
2266 /* OP_RECURSE items contain an offset from the start of the regex to the group
2267 that is referenced. This means that groups can be replicated for fixed
2268 repetition simply by copying (because the recursion is allowed to refer to
2269 earlier groups that are outside the current group). However, when a group is
2270 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2271 inserted before it, after it has been compiled. This means that any OP_RECURSE
2272 items within it that refer to the group itself or any contained groups have to
2273 have their offsets adjusted. That one of the jobs of this function. Before it
2274 is called, the partially compiled regex must be temporarily terminated with
2275 OP_END.
2276
2277 This function has been extended with the possibility of forward references for
2278 recursions and subroutine calls. It must also check the list of such references
2279 for the group we are dealing with. If it finds that one of the recursions in
2280 the current group is on this list, it adjusts the offset in the list, not the
2281 value in the reference (which is a group number).
2282
2283 Arguments:
2284 group points to the start of the group
2285 adjust the amount by which the group is to be moved
2286 utf8 TRUE in UTF-8 mode
2287 cd contains pointers to tables etc.
2288 save_hwm the hwm forward reference pointer at the start of the group
2289
2290 Returns: nothing
2291 */
2292
2293 static void
2294 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2295 uschar *save_hwm)
2296 {
2297 uschar *ptr = group;
2298
2299 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2300 {
2301 int offset;
2302 uschar *hc;
2303
2304 /* See if this recursion is on the forward reference list. If so, adjust the
2305 reference. */
2306
2307 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2308 {
2309 offset = GET(hc, 0);
2310 if (cd->start_code + offset == ptr + 1)
2311 {
2312 PUT(hc, 0, offset + adjust);
2313 break;
2314 }
2315 }
2316
2317 /* Otherwise, adjust the recursion offset if it's after the start of this
2318 group. */
2319
2320 if (hc >= cd->hwm)
2321 {
2322 offset = GET(ptr, 1);
2323 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2324 }
2325
2326 ptr += 1 + LINK_SIZE;
2327 }
2328 }
2329
2330
2331
2332 /*************************************************
2333 * Insert an automatic callout point *
2334 *************************************************/
2335
2336 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2337 callout points before each pattern item.
2338
2339 Arguments:
2340 code current code pointer
2341 ptr current pattern pointer
2342 cd pointers to tables etc
2343
2344 Returns: new code pointer
2345 */
2346
2347 static uschar *
2348 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2349 {
2350 *code++ = OP_CALLOUT;
2351 *code++ = 255;
2352 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2353 PUT(code, LINK_SIZE, 0); /* Default length */
2354 return code + 2*LINK_SIZE;
2355 }
2356
2357
2358
2359 /*************************************************
2360 * Complete a callout item *
2361 *************************************************/
2362
2363 /* A callout item contains the length of the next item in the pattern, which
2364 we can't fill in till after we have reached the relevant point. This is used
2365 for both automatic and manual callouts.
2366
2367 Arguments:
2368 previous_callout points to previous callout item
2369 ptr current pattern pointer
2370 cd pointers to tables etc
2371
2372 Returns: nothing
2373 */
2374
2375 static void
2376 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2377 {
2378 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2379 PUT(previous_callout, 2 + LINK_SIZE, length);
2380 }
2381
2382
2383
2384 #ifdef SUPPORT_UCP
2385 /*************************************************
2386 * Get othercase range *
2387 *************************************************/
2388
2389 /* This function is passed the start and end of a class range, in UTF-8 mode
2390 with UCP support. It searches up the characters, looking for internal ranges of
2391 characters in the "other" case. Each call returns the next one, updating the
2392 start address.
2393
2394 Arguments:
2395 cptr points to starting character value; updated
2396 d end value
2397 ocptr where to put start of othercase range
2398 odptr where to put end of othercase range
2399
2400 Yield: TRUE when range returned; FALSE when no more
2401 */
2402
2403 static BOOL
2404 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2405 unsigned int *odptr)
2406 {
2407 unsigned int c, othercase, next;
2408
2409 for (c = *cptr; c <= d; c++)
2410 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2411
2412 if (c > d) return FALSE;
2413
2414 *ocptr = othercase;
2415 next = othercase + 1;
2416
2417 for (++c; c <= d; c++)
2418 {
2419 if (UCD_OTHERCASE(c) != next) break;
2420 next++;
2421 }
2422
2423 *odptr = next - 1;
2424 *cptr = c;
2425
2426 return TRUE;
2427 }
2428
2429
2430
2431 /*************************************************
2432 * Check a character and a property *
2433 *************************************************/
2434
2435 /* This function is called by check_auto_possessive() when a property item
2436 is adjacent to a fixed character.
2437
2438 Arguments:
2439 c the character
2440 ptype the property type
2441 pdata the data for the type
2442 negated TRUE if it's a negated property (\P or \p{^)
2443
2444 Returns: TRUE if auto-possessifying is OK
2445 */
2446
2447 static BOOL
2448 check_char_prop(int c, int ptype, int pdata, BOOL negated)
2449 {
2450 const ucd_record *prop = GET_UCD(c);
2451 switch(ptype)
2452 {
2453 case PT_LAMP:
2454 return (prop->chartype == ucp_Lu ||
2455 prop->chartype == ucp_Ll ||
2456 prop->chartype == ucp_Lt) == negated;
2457
2458 case PT_GC:
2459 return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2460
2461 case PT_PC:
2462 return (pdata == prop->chartype) == negated;
2463
2464 case PT_SC:
2465 return (pdata == prop->script) == negated;
2466
2467 /* These are specials */
2468
2469 case PT_ALNUM:
2470 return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2471 _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2472
2473 case PT_SPACE: /* Perl space */
2474 return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2475 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2476 == negated;
2477
2478 case PT_PXSPACE: /* POSIX space */
2479 return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2480 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2481 c == CHAR_FF || c == CHAR_CR)
2482 == negated;
2483
2484 case PT_WORD:
2485 return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2486 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2487 c == CHAR_UNDERSCORE) == negated;
2488 }
2489 return FALSE;
2490 }
2491 #endif /* SUPPORT_UCP */
2492
2493
2494
2495 /*************************************************
2496 * Check if auto-possessifying is possible *
2497 *************************************************/
2498
2499 /* This function is called for unlimited repeats of certain items, to see
2500 whether the next thing could possibly match the repeated item. If not, it makes
2501 sense to automatically possessify the repeated item.
2502
2503 Arguments:
2504 previous pointer to the repeated opcode
2505 utf8 TRUE in UTF-8 mode
2506 ptr next character in pattern
2507 options options bits
2508 cd contains pointers to tables etc.
2509
2510 Returns: TRUE if possessifying is wanted
2511 */
2512
2513 static BOOL
2514 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2515 int options, compile_data *cd)
2516 {
2517 int c, next;
2518 int op_code = *previous++;
2519
2520 /* Skip whitespace and comments in extended mode */
2521
2522 if ((options & PCRE_EXTENDED) != 0)
2523 {
2524 for (;;)
2525 {
2526 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2527 if (*ptr == CHAR_NUMBER_SIGN)
2528 {
2529 ptr++;
2530 while (*ptr != 0)
2531 {
2532 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2533 ptr++;
2534 #ifdef SUPPORT_UTF8
2535 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2536 #endif
2537 }
2538 }
2539 else break;
2540 }
2541 }
2542
2543 /* If the next item is one that we can handle, get its value. A non-negative
2544 value is a character, a negative value is an escape value. */
2545
2546 if (*ptr == CHAR_BACKSLASH)
2547 {
2548 int temperrorcode = 0;
2549 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2550 if (temperrorcode != 0) return FALSE;
2551 ptr++; /* Point after the escape sequence */
2552 }
2553
2554 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2555 {
2556 #ifdef SUPPORT_UTF8
2557 if (utf8) { GETCHARINC(next, ptr); } else
2558 #endif
2559 next = *ptr++;
2560 }
2561
2562 else return FALSE;
2563
2564 /* Skip whitespace and comments in extended mode */
2565
2566 if ((options & PCRE_EXTENDED) != 0)
2567 {
2568 for (;;)
2569 {
2570 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2571 if (*ptr == CHAR_NUMBER_SIGN)
2572 {
2573 ptr++;
2574 while (*ptr != 0)
2575 {
2576 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2577 ptr++;
2578 #ifdef SUPPORT_UTF8
2579 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2580 #endif
2581 }
2582 }
2583 else break;
2584 }
2585 }
2586
2587 /* If the next thing is itself optional, we have to give up. */
2588
2589 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2590 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2591 return FALSE;
2592
2593 /* Now compare the next item with the previous opcode. First, handle cases when
2594 the next item is a character. */
2595
2596 if (next >= 0) switch(op_code)
2597 {
2598 case OP_CHAR:
2599 #ifdef SUPPORT_UTF8
2600 GETCHARTEST(c, previous);
2601 #else
2602 c = *previous;
2603 #endif
2604 return c != next;
2605
2606 /* For CHARNC (caseless character) we must check the other case. If we have
2607 Unicode property support, we can use it to test the other case of
2608 high-valued characters. */
2609
2610 case OP_CHARNC:
2611 #ifdef SUPPORT_UTF8
2612 GETCHARTEST(c, previous);
2613 #else
2614 c = *previous;
2615 #endif
2616 if (c == next) return FALSE;
2617 #ifdef SUPPORT_UTF8
2618 if (utf8)
2619 {
2620 unsigned int othercase;
2621 if (next < 128) othercase = cd->fcc[next]; else
2622 #ifdef SUPPORT_UCP
2623 othercase = UCD_OTHERCASE((unsigned int)next);
2624 #else
2625 othercase = NOTACHAR;
2626 #endif
2627 return (unsigned int)c != othercase;
2628 }
2629 else
2630 #endif /* SUPPORT_UTF8 */
2631 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2632
2633 /* For OP_NOT, its data is always a single-byte character. */
2634
2635 case OP_NOT:
2636 if ((c = *previous) == next) return TRUE;
2637 if ((options & PCRE_CASELESS) == 0) return FALSE;
2638 #ifdef SUPPORT_UTF8
2639 if (utf8)
2640 {
2641 unsigned int othercase;
2642 if (next < 128) othercase = cd->fcc[next]; else
2643 #ifdef SUPPORT_UCP
2644 othercase = UCD_OTHERCASE(next);
2645 #else
2646 othercase = NOTACHAR;
2647 #endif
2648 return (unsigned int)c == othercase;
2649 }
2650 else
2651 #endif /* SUPPORT_UTF8 */
2652 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2653
2654 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2655 When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2656
2657 case OP_DIGIT:
2658 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2659
2660 case OP_NOT_DIGIT:
2661 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2662
2663 case OP_WHITESPACE:
2664 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2665
2666 case OP_NOT_WHITESPACE:
2667 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2668
2669 case OP_WORDCHAR:
2670 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2671
2672 case OP_NOT_WORDCHAR:
2673 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2674
2675 case OP_HSPACE:
2676 case OP_NOT_HSPACE:
2677 switch(next)
2678 {
2679 case 0x09:
2680 case 0x20:
2681 case 0xa0:
2682 case 0x1680:
2683 case 0x180e:
2684 case 0x2000:
2685 case 0x2001:
2686 case 0x2002:
2687 case 0x2003:
2688 case 0x2004:
2689 case 0x2005:
2690 case 0x2006:
2691 case 0x2007:
2692 case 0x2008:
2693 case 0x2009:
2694 case 0x200A:
2695 case 0x202f:
2696 case 0x205f:
2697 case 0x3000:
2698 return op_code == OP_NOT_HSPACE;
2699 default:
2700 return op_code != OP_NOT_HSPACE;
2701 }
2702
2703 case OP_ANYNL:
2704 case OP_VSPACE:
2705 case OP_NOT_VSPACE:
2706 switch(next)
2707 {
2708 case 0x0a:
2709 case 0x0b:
2710 case 0x0c:
2711 case 0x0d:
2712 case 0x85:
2713 case 0x2028:
2714 case 0x2029:
2715 return op_code == OP_NOT_VSPACE;
2716 default:
2717 return op_code != OP_NOT_VSPACE;
2718 }
2719
2720 #ifdef SUPPORT_UCP
2721 case OP_PROP:
2722 return check_char_prop(next, previous[0], previous[1], FALSE);
2723
2724 case OP_NOTPROP:
2725 return check_char_prop(next, previous[0], previous[1], TRUE);
2726 #endif
2727
2728 default:
2729 return FALSE;
2730 }
2731
2732
2733 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2734 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2735 generated only when PCRE_UCP is *not* set, that is, when only ASCII
2736 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2737 replaced by OP_PROP codes when PCRE_UCP is set. */
2738
2739 switch(op_code)
2740 {
2741 case OP_CHAR:
2742 case OP_CHARNC:
2743 #ifdef SUPPORT_UTF8
2744 GETCHARTEST(c, previous);
2745 #else
2746 c = *previous;
2747 #endif
2748 switch(-next)
2749 {
2750 case ESC_d:
2751 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
2752
2753 case ESC_D:
2754 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
2755
2756 case ESC_s:
2757 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
2758
2759 case ESC_S:
2760 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
2761
2762 case ESC_w:
2763 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
2764
2765 case ESC_W:
2766 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
2767
2768 case ESC_h:
2769 case ESC_H:
2770 switch(c)
2771 {
2772 case 0x09:
2773 case 0x20:
2774 case 0xa0:
2775 case 0x1680:
2776 case 0x180e:
2777 case 0x2000:
2778 case 0x2001:
2779 case 0x2002:
2780 case 0x2003:
2781 case 0x2004:
2782 case 0x2005:
2783 case 0x2006:
2784 case 0x2007:
2785 case 0x2008:
2786 case 0x2009:
2787 case 0x200A:
2788 case 0x202f:
2789 case 0x205f:
2790 case 0x3000:
2791 return -next != ESC_h;
2792 default:
2793 return -next == ESC_h;
2794 }
2795
2796 case ESC_v:
2797 case ESC_V:
2798 switch(c)
2799 {
2800 case 0x0a:
2801 case 0x0b:
2802 case 0x0c:
2803 case 0x0d:
2804 case 0x85:
2805 case 0x2028:
2806 case 0x2029:
2807 return -next != ESC_v;
2808 default:
2809 return -next == ESC_v;
2810 }
2811
2812 /* When PCRE_UCP is set, these values get generated for \d etc. Find
2813 their substitutions and process them. The result will always be either
2814 -ESC_p or -ESC_P. Then fall through to process those values. */
2815
2816 #ifdef SUPPORT_UCP
2817 case ESC_du:
2818 case ESC_DU:
2819 case ESC_wu:
2820 case ESC_WU:
2821 case ESC_su:
2822 case ESC_SU:
2823 {
2824 int temperrorcode = 0;
2825 ptr = substitutes[-next - ESC_DU];
2826 next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
2827 if (temperrorcode != 0) return FALSE;
2828 ptr++; /* For compatibility */
2829 }
2830 /* Fall through */
2831
2832 case ESC_p:
2833 case ESC_P:
2834 {
2835 int ptype, pdata, errorcodeptr;
2836 BOOL negated;
2837
2838 ptr--; /* Make ptr point at the p or P */
2839 ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2840 if (ptype < 0) return FALSE;
2841 ptr++; /* Point past the final curly ket */
2842
2843 /* If the property item is optional, we have to give up. (When generated
2844 from \d etc by PCRE_UCP, this test will have been applied much earlier,
2845 to the original \d etc. At this point, ptr will point to a zero byte. */
2846
2847 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2848 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2849 return FALSE;
2850
2851 /* Do the property check. */
2852
2853 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2854 }
2855 #endif
2856
2857 default:
2858 return FALSE;
2859 }
2860
2861 /* In principle, support for Unicode properties should be integrated here as
2862 well. It means re-organizing the above code so as to get hold of the property
2863 values before switching on the op-code. However, I wonder how many patterns
2864 combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2865 these op-codes are never generated.) */
2866
2867 case OP_DIGIT:
2868 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2869 next == -ESC_h || next == -ESC_v || next == -ESC_R;
2870
2871 case OP_NOT_DIGIT:
2872 return next == -ESC_d;
2873
2874 case OP_WHITESPACE:
2875 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
2876
2877 case OP_NOT_WHITESPACE:
2878 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2879
2880 case OP_HSPACE:
2881 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2882 next == -ESC_w || next == -ESC_v || next == -ESC_R;
2883
2884 case OP_NOT_HSPACE:
2885 return next == -ESC_h;
2886
2887 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2888 case OP_ANYNL:
2889 case OP_VSPACE:
2890 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2891
2892 case OP_NOT_VSPACE:
2893 return next == -ESC_v || next == -ESC_R;
2894
2895 case OP_WORDCHAR:
2896 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
2897 next == -ESC_v || next == -ESC_R;
2898
2899 case OP_NOT_WORDCHAR:
2900 return next == -ESC_w || next == -ESC_d;
2901
2902 default:
2903 return FALSE;
2904 }
2905
2906 /* Control does not reach here */
2907 }
2908
2909
2910
2911 /*************************************************
2912 * Compile one branch *
2913 *************************************************/
2914
2915 /* Scan the pattern, compiling it into a vector. If the options are
2916 changed during the branch, the pointer is used to change the external options
2917 bits. This function is used during the pre-compile phase when we are trying
2918 to find out the amount of memory needed, as well as during the real compile
2919 phase. The value of lengthptr distinguishes the two phases.
2920
2921 Arguments:
2922 optionsptr pointer to the option bits
2923 codeptr points to the pointer to the current code point
2924 ptrptr points to the current pattern pointer
2925 errorcodeptr points to error code variable
2926 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2927 reqbyteptr set to the last literal character required, else < 0
2928 bcptr points to current branch chain
2929 cd contains pointers to tables etc.
2930 lengthptr NULL during the real compile phase
2931 points to length accumulator during pre-compile phase
2932
2933 Returns: TRUE on success
2934 FALSE, with *errorcodeptr set non-zero on error
2935 */
2936
2937 static BOOL
2938 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2939 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2940 compile_data *cd, int *lengthptr)
2941 {
2942 int repeat_type, op_type;
2943 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2944 int bravalue = 0;
2945 int greedy_default, greedy_non_default;
2946 int firstbyte, reqbyte;
2947 int zeroreqbyte, zerofirstbyte;
2948 int req_caseopt, reqvary, tempreqvary;
2949 int options = *optionsptr;
2950 int after_manual_callout = 0;
2951 int length_prevgroup = 0;
2952 register int c;
2953 register uschar *code = *codeptr;
2954 uschar *last_code = code;
2955 uschar *orig_code = code;
2956 uschar *tempcode;
2957 BOOL inescq = FALSE;
2958 BOOL groupsetfirstbyte = FALSE;
2959 const uschar *ptr = *ptrptr;
2960 const uschar *tempptr;
2961 const uschar *nestptr = NULL;
2962 uschar *previous = NULL;
2963 uschar *previous_callout = NULL;
2964 uschar *save_hwm = NULL;
2965 uschar classbits[32];
2966
2967 #ifdef SUPPORT_UTF8
2968 BOOL class_utf8;
2969 BOOL utf8 = (options & PCRE_UTF8) != 0;
2970 uschar *class_utf8data;
2971 uschar *class_utf8data_base;
2972 uschar utf8_char[6];
2973 #else
2974 BOOL utf8 = FALSE;
2975 uschar *utf8_char = NULL;
2976 #endif
2977
2978 #ifdef PCRE_DEBUG
2979 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2980 #endif
2981
2982 /* Set up the default and non-default settings for greediness */
2983
2984 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2985 greedy_non_default = greedy_default ^ 1;
2986
2987 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2988 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2989 matches a non-fixed char first char; reqbyte just remains unset if we never
2990 find one.
2991
2992 When we hit a repeat whose minimum is zero, we may have to adjust these values
2993 to take the zero repeat into account. This is implemented by setting them to
2994 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2995 item types that can be repeated set these backoff variables appropriately. */
2996
2997 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2998
2999 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3000 according to the current setting of the caseless flag. REQ_CASELESS is a bit
3001 value > 255. It is added into the firstbyte or reqbyte variables to record the
3002 case status of the value. This is used only for ASCII characters. */
3003
3004 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3005
3006 /* Switch on next character until the end of the branch */
3007
3008 for (;; ptr++)
3009 {
3010 BOOL negate_class;
3011 BOOL should_flip_negation;
3012 BOOL possessive_quantifier;
3013 BOOL is_quantifier;
3014 BOOL is_recurse;
3015 BOOL reset_bracount;
3016 int class_charcount;
3017 int class_lastchar;
3018 int newoptions;
3019 int recno;
3020 int refsign;
3021 int skipbytes;
3022 int subreqbyte;
3023 int subfirstbyte;
3024 int terminator;
3025 int mclength;
3026 uschar mcbuffer[8];
3027
3028 /* Get next byte in the pattern */
3029
3030 c = *ptr;
3031
3032 /* If we are at the end of a nested substitution, revert to the outer level
3033 string. Nesting only happens one level deep. */
3034
3035 if (c == 0 && nestptr != NULL)
3036 {
3037 ptr = nestptr;
3038 nestptr = NULL;
3039 c = *ptr;
3040 }
3041
3042 /* If we are in the pre-compile phase, accumulate the length used for the
3043 previous cycle of this loop. */
3044
3045 if (lengthptr != NULL)
3046 {
3047 #ifdef PCRE_DEBUG
3048 if (code > cd->hwm) cd->hwm = code; /* High water info */
3049 #endif
3050 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3051 {
3052 *errorcodeptr = ERR52;
3053 goto FAILED;
3054 }
3055
3056 /* There is at least one situation where code goes backwards: this is the
3057 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3058 the class is simply eliminated. However, it is created first, so we have to
3059 allow memory for it. Therefore, don't ever reduce the length at this point.
3060 */
3061
3062 if (code < last_code) code = last_code;
3063
3064 /* Paranoid check for integer overflow */
3065
3066 if (OFLOW_MAX - *lengthptr < code - last_code)
3067 {
3068 *errorcodeptr = ERR20;
3069 goto FAILED;
3070 }
3071
3072 *lengthptr += (int)(code - last_code);
3073 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
3074
3075 /* If "previous" is set and it is not at the start of the work space, move
3076 it back to there, in order to avoid filling up the work space. Otherwise,
3077 if "previous" is NULL, reset the current code pointer to the start. */
3078
3079 if (previous != NULL)
3080 {
3081 if (previous > orig_code)
3082 {
3083 memmove(orig_code, previous, code - previous);
3084 code -= previous - orig_code;
3085 previous = orig_code;
3086 }
3087 }
3088 else code = orig_code;
3089
3090 /* Remember where this code item starts so we can pick up the length
3091 next time round. */
3092
3093 last_code = code;
3094 }
3095
3096 /* In the real compile phase, just check the workspace used by the forward
3097 reference list. */
3098
3099 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3100 {
3101 *errorcodeptr = ERR52;
3102 goto FAILED;
3103 }
3104
3105 /* If in \Q...\E, check for the end; if not, we have a literal */
3106
3107 if (inescq && c != 0)
3108 {
3109 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3110 {
3111 inescq = FALSE;
3112 ptr++;
3113 continue;
3114 }
3115 else
3116 {
3117 if (previous_callout != NULL)
3118 {
3119 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3120 complete_callout(previous_callout, ptr, cd);
3121 previous_callout = NULL;
3122 }
3123 if ((options & PCRE_AUTO_CALLOUT) != 0)
3124 {
3125 previous_callout = code;
3126 code = auto_callout(code, ptr, cd);
3127 }
3128 goto NORMAL_CHAR;
3129 }
3130 }
3131
3132 /* Fill in length of a previous callout, except when the next thing is
3133 a quantifier. */
3134
3135 is_quantifier =
3136 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3137 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3138
3139 if (!is_quantifier && previous_callout != NULL &&
3140 after_manual_callout-- <= 0)
3141 {
3142 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3143 complete_callout(previous_callout, ptr, cd);
3144 previous_callout = NULL;
3145 }
3146
3147 /* In extended mode, skip white space and comments */
3148
3149 if ((options & PCRE_EXTENDED) != 0)
3150 {
3151 if ((cd->ctypes[c] & ctype_space) != 0) continue;
3152 if (c == CHAR_NUMBER_SIGN)
3153 {
3154 ptr++;
3155 while (*ptr != 0)
3156 {
3157 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3158 ptr++;
3159 #ifdef SUPPORT_UTF8
3160 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3161 #endif
3162 }
3163 if (*ptr != 0) continue;
3164
3165 /* Else fall through to handle end of string */
3166 c = 0;
3167 }
3168 }
3169
3170 /* No auto callout for quantifiers. */
3171
3172 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3173 {
3174 previous_callout = code;
3175 code = auto_callout(code, ptr, cd);
3176 }
3177
3178 switch(c)
3179 {
3180 /* ===================================================================*/
3181 case 0: /* The branch terminates at string end */
3182 case CHAR_VERTICAL_LINE: /* or | or ) */
3183 case CHAR_RIGHT_PARENTHESIS:
3184 *firstbyteptr = firstbyte;
3185 *reqbyteptr = reqbyte;
3186 *codeptr = code;
3187 *ptrptr = ptr;
3188 if (lengthptr != NULL)
3189 {
3190 if (OFLOW_MAX - *lengthptr < code - last_code)
3191 {
3192 *errorcodeptr = ERR20;
3193 goto FAILED;
3194 }
3195 *lengthptr += (int)(code - last_code); /* To include callout length */
3196 DPRINTF((">> end branch\n"));
3197 }
3198 return TRUE;
3199
3200
3201 /* ===================================================================*/
3202 /* Handle single-character metacharacters. In multiline mode, ^ disables
3203 the setting of any following char as a first character. */
3204
3205 case CHAR_CIRCUMFLEX_ACCENT:
3206 if ((options & PCRE_MULTILINE) != 0)
3207 {
3208 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3209 }
3210 previous = NULL;
3211 *code++ = OP_CIRC;
3212 break;
3213
3214 case CHAR_DOLLAR_SIGN:
3215 previous = NULL;
3216 *code++ = OP_DOLL;
3217 break;
3218
3219 /* There can never be a first char if '.' is first, whatever happens about
3220 repeats. The value of reqbyte doesn't change either. */
3221
3222 case CHAR_DOT:
3223 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3224 zerofirstbyte = firstbyte;
3225 zeroreqbyte = reqbyte;
3226 previous = code;
3227 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3228 break;
3229
3230
3231 /* ===================================================================*/
3232 /* Character classes. If the included characters are all < 256, we build a
3233 32-byte bitmap of the permitted characters, except in the special case
3234 where there is only one such character. For negated classes, we build the
3235 map as usual, then invert it at the end. However, we use a different opcode
3236 so that data characters > 255 can be handled correctly.
3237
3238 If the class contains characters outside the 0-255 range, a different
3239 opcode is compiled. It may optionally have a bit map for characters < 256,
3240 but those above are explicitly listed afterwards. A flag byte tells
3241 whether the bitmap is present, and whether this is a negated class or not.
3242
3243 In JavaScript compatibility mode, an isolated ']' causes an error. In
3244 default (Perl) mode, it is treated as a data character. */
3245
3246 case CHAR_RIGHT_SQUARE_BRACKET:
3247 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3248 {
3249 *errorcodeptr = ERR64;
3250 goto FAILED;
3251 }
3252 goto NORMAL_CHAR;
3253
3254 case CHAR_LEFT_SQUARE_BRACKET:
3255 previous = code;
3256
3257 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3258 they are encountered at the top level, so we'll do that too. */
3259
3260 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3261 ptr[1] == CHAR_EQUALS_SIGN) &&
3262 check_posix_syntax(ptr, &tempptr))
3263 {
3264 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3265 goto FAILED;
3266 }
3267
3268 /* If the first character is '^', set the negation flag and skip it. Also,
3269 if the first few characters (either before or after ^) are \Q\E or \E we
3270 skip them too. This makes for compatibility with Perl. */
3271
3272 negate_class = FALSE;
3273 for (;;)
3274 {
3275 c = *(++ptr);
3276 if (c == CHAR_BACKSLASH)
3277 {
3278 if (ptr[1] == CHAR_E)
3279 ptr++;
3280 else if (strncmp((const char *)ptr+1,
3281 STR_Q STR_BACKSLASH STR_E, 3) == 0)
3282 ptr += 3;
3283 else
3284 break;
3285 }
3286 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3287 negate_class = TRUE;
3288 else break;
3289 }
3290
3291 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3292 an initial ']' is taken as a data character -- the code below handles
3293 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3294 [^] must match any character, so generate OP_ALLANY. */
3295
3296 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3297 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3298 {
3299 *code++ = negate_class? OP_ALLANY : OP_FAIL;
3300 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3301 zerofirstbyte = firstbyte;
3302 break;
3303 }
3304
3305 /* If a class contains a negative special such as \S, we need to flip the
3306 negation flag at the end, so that support for characters > 255 works
3307 correctly (they are all included in the class). */
3308
3309 should_flip_negation = FALSE;
3310
3311 /* Keep a count of chars with values < 256 so that we can optimize the case
3312 of just a single character (as long as it's < 256). However, for higher
3313 valued UTF-8 characters, we don't yet do any optimization. */
3314
3315 class_charcount = 0;
3316 class_lastchar = -1;
3317
3318 /* Initialize the 32-char bit map to all zeros. We build the map in a
3319 temporary bit of memory, in case the class contains only 1 character (less
3320 than 256), because in that case the compiled code doesn't use the bit map.
3321 */
3322
3323 memset(classbits, 0, 32 * sizeof(uschar));
3324
3325 #ifdef SUPPORT_UTF8
3326 class_utf8 = FALSE; /* No chars >= 256 */
3327 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3328 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3329 #endif
3330
3331 /* Process characters until ] is reached. By writing this as a "do" it
3332 means that an initial ] is taken as a data character. At the start of the
3333 loop, c contains the first byte of the character. */
3334
3335 if (c != 0) do
3336 {
3337 const uschar *oldptr;
3338
3339 #ifdef SUPPORT_UTF8
3340 if (utf8 && c > 127)
3341 { /* Braces are required because the */
3342 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3343 }
3344
3345 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3346 data and reset the pointer. This is so that very large classes that
3347 contain a zillion UTF-8 characters no longer overwrite the work space
3348 (which is on the stack). */
3349
3350 if (lengthptr != NULL)
3351 {
3352 *lengthptr += class_utf8data - class_utf8data_base;
3353 class_utf8data = class_utf8data_base;
3354 }
3355
3356 #endif
3357
3358 /* Inside \Q...\E everything is literal except \E */
3359
3360 if (inescq)
3361 {
3362 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3363 {
3364 inescq = FALSE; /* Reset literal state */
3365 ptr++; /* Skip the 'E' */
3366 continue; /* Carry on with next */
3367 }
3368 goto CHECK_RANGE; /* Could be range if \E follows */
3369 }
3370
3371 /* Handle POSIX class names. Perl allows a negation extension of the
3372 form [:^name:]. A square bracket that doesn't match the syntax is
3373 treated as a literal. We also recognize the POSIX constructions
3374 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3375 5.6 and 5.8 do. */
3376
3377 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3378 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3379 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3380 {
3381 BOOL local_negate = FALSE;
3382 int posix_class, taboffset, tabopt;
3383 register const uschar *cbits = cd->cbits;
3384 uschar pbits[32];
3385
3386 if (ptr[1] != CHAR_COLON)
3387 {
3388 *errorcodeptr = ERR31;
3389 goto FAILED;
3390 }
3391
3392 ptr += 2;
3393 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3394 {
3395 local_negate = TRUE;
3396 should_flip_negation = TRUE; /* Note negative special */
3397 ptr++;
3398 }
3399
3400 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3401 if (posix_class < 0)
3402 {
3403 *errorcodeptr = ERR30;
3404 goto FAILED;
3405 }
3406
3407 /* If matching is caseless, upper and lower are converted to
3408 alpha. This relies on the fact that the class table starts with
3409 alpha, lower, upper as the first 3 entries. */
3410
3411 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3412 posix_class = 0;
3413
3414 /* When PCRE_UCP is set, some of the POSIX classes are converted to
3415 different escape sequences that use Unicode properties. */
3416
3417 #ifdef SUPPORT_UCP
3418 if ((options & PCRE_UCP) != 0)
3419 {
3420 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3421 if (posix_substitutes[pc] != NULL)
3422 {
3423 nestptr = tempptr + 1;
3424 ptr = posix_substitutes[pc] - 1;
3425 continue;
3426 }
3427 }
3428 #endif
3429 /* In the non-UCP case, we build the bit map for the POSIX class in a
3430 chunk of local store because we may be adding and subtracting from it,
3431 and we don't want to subtract bits that may be in the main map already.
3432 At the end we or the result into the bit map that is being built. */
3433
3434 posix_class *= 3;
3435
3436 /* Copy in the first table (always present) */
3437
3438 memcpy(pbits, cbits + posix_class_maps[posix_class],
3439 32 * sizeof(uschar));
3440
3441 /* If there is a second table, add or remove it as required. */
3442
3443 taboffset = posix_class_maps[posix_class + 1];
3444 tabopt = posix_class_maps[posix_class + 2];
3445
3446 if (taboffset >= 0)
3447 {
3448 if (tabopt >= 0)
3449 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3450 else
3451 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3452 }
3453
3454 /* Now see if we need to remove any special characters. An option
3455 value of 1 removes vertical space and 2 removes underscore. */
3456
3457 if (tabopt < 0) tabopt = -tabopt;
3458 if (tabopt == 1) pbits[1] &= ~0x3c;
3459 else if (tabopt == 2) pbits[11] &= 0x7f;
3460
3461 /* Add the POSIX table or its complement into the main table that is
3462 being built and we are done. */
3463
3464 if (local_negate)
3465 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3466 else
3467 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3468
3469 ptr = tempptr + 1;
3470 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3471 continue; /* End of POSIX syntax handling */
3472 }
3473
3474 /* Backslash may introduce a single character, or it may introduce one
3475 of the specials, which just set a flag. The sequence \b is a special
3476 case. Inside a class (and only there) it is treated as backspace. We
3477 assume that other escapes have more than one character in them, so set
3478 class_charcount bigger than one. Unrecognized escapes fall through and
3479 are either treated as literal characters (by default), or are faulted if
3480 PCRE_EXTRA is set. */
3481
3482 if (c == CHAR_BACKSLASH)
3483 {
3484 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3485 if (*errorcodeptr != 0) goto FAILED;
3486
3487 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3488 else if (-c == ESC_Q) /* Handle start of quoted string */
3489 {
3490 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3491 {
3492 ptr += 2; /* avoid empty string */
3493 }
3494 else inescq = TRUE;
3495 continue;
3496 }
3497 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3498
3499 if (c < 0)
3500 {
3501 register const uschar *cbits = cd->cbits;
3502 class_charcount += 2; /* Greater than 1 is what matters */
3503
3504 switch (-c)
3505 {
3506 #ifdef SUPPORT_UCP
3507 case ESC_du: /* These are the values given for \d etc */
3508 case ESC_DU: /* when PCRE_UCP is set. We replace the */
3509 case ESC_wu: /* escape sequence with an appropriate \p */
3510 case ESC_WU: /* or \P to test Unicode properties instead */
3511 case ESC_su: /* of the default ASCII testing. */
3512 case ESC_SU:
3513 nestptr = ptr;
3514 ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3515 class_charcount -= 2; /* Undo! */
3516 continue;
3517 #endif
3518 case ESC_d:
3519 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3520 continue;
3521
3522 case ESC_D:
3523 should_flip_negation = TRUE;
3524 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3525 continue;
3526
3527 case ESC_w:
3528 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3529 continue;
3530
3531 case ESC_W:
3532 should_flip_negation = TRUE;
3533 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3534 continue;
3535
3536 /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3537 if it was previously set by something earlier in the character
3538 class. */
3539
3540 case ESC_s:
3541 classbits[0] |= cbits[cbit_space];
3542 classbits[1] |= cbits[cbit_space+1] & ~0x08;
3543 for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3544 continue;
3545
3546 case ESC_S:
3547 should_flip_negation = TRUE;
3548 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3549 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3550 continue;
3551
3552 case ESC_h:
3553 SETBIT(classbits, 0x09); /* VT */
3554 SETBIT(classbits, 0x20); /* SPACE */
3555 SETBIT(classbits, 0xa0); /* NSBP */
3556 #ifdef SUPPORT_UTF8
3557 if (utf8)
3558 {
3559 class_utf8 = TRUE;
3560 *class_utf8data++ = XCL_SINGLE;
3561 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3562 *class_utf8data++ = XCL_SINGLE;
3563 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3564 *class_utf8data++ = XCL_RANGE;
3565 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3566 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3567 *class_utf8data++ = XCL_SINGLE;
3568 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3569 *class_utf8data++ = XCL_SINGLE;
3570 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3571 *class_utf8data++ = XCL_SINGLE;
3572 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3573 }
3574 #endif
3575 continue;
3576
3577 case ESC_H:
3578 for (c = 0; c < 32; c++)
3579 {
3580 int x = 0xff;
3581 switch (c)
3582 {
3583 case 0x09/8: x ^= 1 << (0x09%8); break;
3584 case 0x20/8: x ^= 1 << (0x20%8); break;
3585 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3586 default: break;
3587 }
3588 classbits[c] |= x;
3589 }
3590
3591 #ifdef SUPPORT_UTF8
3592 if (utf8)
3593 {
3594 class_utf8 = TRUE;
3595 *class_utf8data++ = XCL_RANGE;
3596 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3597 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3598 *class_utf8data++ = XCL_RANGE;
3599 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3600 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3601 *class_utf8data++ = XCL_RANGE;
3602 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3603 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3604 *class_utf8data++ = XCL_RANGE;
3605 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3606 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3607 *class_utf8data++ = XCL_RANGE;
3608 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3609 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3610 *class_utf8data++ = XCL_RANGE;
3611 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3612 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3613 *class_utf8data++ = XCL_RANGE;
3614 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3615 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3616 }
3617 #endif
3618 continue;
3619
3620 case ESC_v:
3621 SETBIT(classbits, 0x0a); /* LF */
3622 SETBIT(classbits, 0x0b); /* VT */
3623 SETBIT(classbits, 0x0c); /* FF */
3624 SETBIT(classbits, 0x0d); /* CR */
3625 SETBIT(classbits, 0x85); /* NEL */
3626 #ifdef SUPPORT_UTF8
3627 if (utf8)
3628 {
3629 class_utf8 = TRUE;
3630 *class_utf8data++ = XCL_RANGE;
3631 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3632 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3633 }
3634 #endif
3635 continue;
3636
3637 case ESC_V:
3638 for (c = 0; c < 32; c++)
3639 {
3640 int x = 0xff;
3641 switch (c)
3642 {
3643 case 0x0a/8: x ^= 1 << (0x0a%8);
3644 x ^= 1 << (0x0b%8);
3645 x ^= 1 << (0x0c%8);
3646 x ^= 1 << (0x0d%8);
3647 break;
3648 case 0x85/8: x ^= 1 << (0x85%8); break;
3649 default: break;
3650 }
3651 classbits[c] |= x;
3652 }
3653
3654 #ifdef SUPPORT_UTF8
3655 if (utf8)
3656 {
3657 class_utf8 = TRUE;
3658 *class_utf8data++ = XCL_RANGE;
3659 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3660 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3661 *class_utf8data++ = XCL_RANGE;
3662 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3663 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3664 }
3665 #endif
3666 continue;
3667
3668 #ifdef SUPPORT_UCP
3669 case ESC_p:
3670 case ESC_P:
3671 {
3672 BOOL negated;
3673 int pdata;
3674 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3675 if (ptype < 0) goto FAILED;
3676 class_utf8 = TRUE;
3677 *class_utf8data++ = ((-c == ESC_p) != negated)?
3678 XCL_PROP : XCL_NOTPROP;
3679 *class_utf8data++ = ptype;
3680 *class_utf8data++ = pdata;
3681 class_charcount -= 2; /* Not a < 256 character */
3682 continue;
3683 }
3684 #endif
3685 /* Unrecognized escapes are faulted if PCRE is running in its
3686 strict mode. By default, for compatibility with Perl, they are
3687 treated as literals. */
3688
3689 default:
3690 if ((options & PCRE_EXTRA) != 0)
3691 {
3692 *errorcodeptr = ERR7;
3693 goto FAILED;
3694 }
3695 class_charcount -= 2; /* Undo the default count from above */
3696 c = *ptr; /* Get the final character and fall through */
3697 break;
3698 }
3699 }
3700
3701 /* Fall through if we have a single character (c >= 0). This may be
3702 greater than 256 in UTF-8 mode. */
3703
3704 } /* End of backslash handling */
3705
3706 /* A single character may be followed by '-' to form a range. However,
3707 Perl does not permit ']' to be the end of the range. A '-' character
3708 at the end is treated as a literal. Perl ignores orphaned \E sequences
3709 entirely. The code for handling \Q and \E is messy. */
3710
3711 CHECK_RANGE:
3712 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3713 {
3714 inescq = FALSE;
3715 ptr += 2;
3716 }
3717
3718 oldptr = ptr;
3719
3720 /* Remember \r or \n */
3721
3722 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3723
3724 /* Check for range */
3725
3726 if (!inescq && ptr[1] == CHAR_MINUS)
3727 {
3728 int d;
3729 ptr += 2;
3730 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3731
3732 /* If we hit \Q (not followed by \E) at this point, go into escaped
3733 mode. */
3734
3735 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3736 {
3737 ptr += 2;
3738 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3739 { ptr += 2; continue; }
3740 inescq = TRUE;
3741 break;
3742 }
3743
3744 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3745 {
3746 ptr = oldptr;
3747 goto LONE_SINGLE_CHARACTER;
3748 }
3749
3750 #ifdef SUPPORT_UTF8
3751 if (utf8)
3752 { /* Braces are required because the */
3753 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3754 }
3755 else
3756 #endif
3757 d = *ptr; /* Not UTF-8 mode */
3758
3759 /* The second part of a range can be a single-character escape, but
3760 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3761 in such circumstances. */
3762
3763 if (!inescq && d == CHAR_BACKSLASH)
3764 {
3765 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3766 if (*errorcodeptr != 0) goto FAILED;
3767
3768 /* \b is backspace; any other special means the '-' was literal */
3769
3770 if (d < 0)
3771 {
3772 if (d == -ESC_b) d = CHAR_BS; else
3773 {
3774 ptr = oldptr;
3775 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3776 }
3777 }
3778 }
3779
3780 /* Check that the two values are in the correct order. Optimize
3781 one-character ranges */
3782
3783 if (d < c)
3784 {
3785 *errorcodeptr = ERR8;
3786 goto FAILED;
3787 }
3788
3789 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3790
3791 /* Remember \r or \n */
3792
3793 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3794
3795 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3796 matching, we have to use an XCLASS with extra data items. Caseless
3797 matching for characters > 127 is available only if UCP support is
3798 available. */
3799
3800 #ifdef SUPPORT_UTF8
3801 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3802 {
3803 class_utf8 = TRUE;
3804
3805 /* With UCP support, we can find the other case equivalents of
3806 the relevant characters. There may be several ranges. Optimize how
3807 they fit with the basic range. */
3808
3809 #ifdef SUPPORT_UCP
3810 if ((options & PCRE_CASELESS) != 0)
3811 {
3812 unsigned int occ, ocd;
3813 unsigned int cc = c;
3814 unsigned int origd = d;
3815 while (get_othercase_range(&cc, origd, &occ, &ocd))
3816 {
3817 if (occ >= (unsigned int)c &&
3818 ocd <= (unsigned int)d)
3819 continue; /* Skip embedded ranges */
3820
3821 if (occ < (unsigned int)c &&
3822 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3823 { /* if there is overlap, */
3824 c = occ; /* noting that if occ < c */
3825 continue; /* we can't have ocd > d */
3826 } /* because a subrange is */
3827 if (ocd > (unsigned int)d &&
3828 occ <= (unsigned int)d + 1) /* always shorter than */
3829 { /* the basic range. */
3830 d = ocd;
3831 continue;
3832 }
3833
3834 if (occ == ocd)
3835 {
3836 *class_utf8data++ = XCL_SINGLE;
3837 }
3838 else
3839 {
3840 *class_utf8data++ = XCL_RANGE;
3841 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3842 }
3843 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3844 }
3845 }
3846 #endif /* SUPPORT_UCP */
3847
3848 /* Now record the original range, possibly modified for UCP caseless
3849 overlapping ranges. */
3850
3851 *class_utf8data++ = XCL_RANGE;
3852 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3853 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3854
3855 /* With UCP support, we are done. Without UCP support, there is no
3856 caseless matching for UTF-8 characters > 127; we can use the bit map
3857 for the smaller ones. */
3858
3859 #ifdef SUPPORT_UCP
3860 continue; /* With next character in the class */
3861 #else
3862 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3863
3864 /* Adjust upper limit and fall through to set up the map */
3865
3866 d = 127;
3867
3868 #endif /* SUPPORT_UCP */
3869 }
3870 #endif /* SUPPORT_UTF8 */
3871
3872 /* We use the bit map for all cases when not in UTF-8 mode; else
3873 ranges that lie entirely within 0-127 when there is UCP support; else
3874 for partial ranges without UCP support. */
3875
3876 class_charcount += d - c + 1;
3877 class_lastchar = d;
3878
3879 /* We can save a bit of time by skipping this in the pre-compile. */
3880
3881 if (lengthptr == NULL) for (; c <= d; c++)
3882 {
3883 classbits[c/8] |= (1 << (c&7));
3884 if ((options & PCRE_CASELESS) != 0)
3885 {
3886 int uc = cd->fcc[c]; /* flip case */
3887 classbits[uc/8] |= (1 << (uc&7));
3888 }
3889 }
3890
3891 continue; /* Go get the next char in the class */
3892 }
3893
3894 /* Handle a lone single character - we can get here for a normal
3895 non-escape char, or after \ that introduces a single character or for an
3896 apparent range that isn't. */
3897
3898 LONE_SINGLE_CHARACTER:
3899
3900 /* Handle a character that cannot go in the bit map */
3901
3902 #ifdef SUPPORT_UTF8
3903 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3904 {
3905 class_utf8 = TRUE;
3906 *class_utf8data++ = XCL_SINGLE;
3907 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3908
3909 #ifdef SUPPORT_UCP
3910 if ((options & PCRE_CASELESS) != 0)
3911 {
3912 unsigned int othercase;
3913 if ((othercase = UCD_OTHERCASE(c)) != c)
3914 {
3915 *class_utf8data++ = XCL_SINGLE;
3916 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3917 }
3918 }
3919 #endif /* SUPPORT_UCP */
3920
3921 }
3922 else
3923 #endif /* SUPPORT_UTF8 */
3924
3925 /* Handle a single-byte character */
3926 {
3927 classbits[c/8] |= (1 << (c&7));
3928 if ((options & PCRE_CASELESS) != 0)
3929 {
3930 c = cd->fcc[c]; /* flip case */
3931 classbits[c/8] |= (1 << (c&7));
3932 }
3933 class_charcount++;
3934 class_lastchar = c;
3935 }
3936 }
3937
3938 /* Loop until ']' reached. This "while" is the end of the "do" far above.
3939 If we are at the end of an internal nested string, revert to the outer
3940 string. */
3941
3942 while (((c = *(++ptr)) != 0 ||
3943 (nestptr != NULL &&
3944 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
3945 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3946
3947 /* Check for missing terminating ']' */
3948
3949 if (c == 0)
3950 {
3951 *errorcodeptr = ERR6;
3952 goto FAILED;
3953 }
3954
3955 /* If class_charcount is 1, we saw precisely one character whose value is
3956 less than 256. As long as there were no characters >= 128 and there was no
3957 use of \p or \P, in other words, no use of any XCLASS features, we can
3958 optimize.
3959
3960 In UTF-8 mode, we can optimize the negative case only if there were no
3961 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3962 operate on single-bytes only. This is an historical hangover. Maybe one day
3963 we can tidy these opcodes to handle multi-byte characters.
3964
3965 The optimization throws away the bit map. We turn the item into a
3966 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3967 that OP_NOT does not support multibyte characters. In the positive case, it
3968 can cause firstbyte to be set. Otherwise, there can be no first char if
3969 this item is first, whatever repeat count may follow. In the case of
3970 reqbyte, save the previous value for reinstating. */
3971
3972 #ifdef SUPPORT_UTF8
3973 if (class_charcount == 1 && !class_utf8 &&
3974 (!utf8 || !negate_class || class_lastchar < 128))
3975 #else
3976 if (class_charcount == 1)
3977 #endif
3978 {
3979 zeroreqbyte = reqbyte;
3980
3981 /* The OP_NOT opcode works on one-byte characters only. */
3982
3983 if (negate_class)
3984 {
3985 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3986 zerofirstbyte = firstbyte;
3987 *code++ = OP_NOT;
3988 *code++ = class_lastchar;
3989 break;
3990 }
3991
3992 /* For a single, positive character, get the value into mcbuffer, and
3993 then we can handle this with the normal one-character code. */
3994
3995 #ifdef SUPPORT_UTF8
3996 if (utf8 && class_lastchar > 127)
3997 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3998 else
3999 #endif
4000 {
4001 mcbuffer[0] = class_lastchar;
4002 mclength = 1;
4003 }
4004 goto ONE_CHAR;
4005 } /* End of 1-char optimization */
4006
4007 /* The general case - not the one-char optimization. If this is the first
4008 thing in the branch, there can be no first char setting, whatever the
4009 repeat count. Any reqbyte setting must remain unchanged after any kind of
4010 repeat. */
4011
4012 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4013 zerofirstbyte = firstbyte;
4014 zeroreqbyte = reqbyte;
4015
4016 /* If there are characters with values > 255, we have to compile an
4017 extended class, with its own opcode, unless there was a negated special
4018 such as \S in the class, and PCRE_UCP is not set, because in that case all
4019 characters > 255 are in the class, so any that were explicitly given as
4020 well can be ignored. If (when there are explicit characters > 255 that must
4021 be listed) there are no characters < 256, we can omit the bitmap in the
4022 actual compiled code. */
4023
4024 #ifdef SUPPORT_UTF8
4025 if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
4026 {
4027 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
4028 *code++ = OP_XCLASS;
4029 code += LINK_SIZE;
4030 *code = negate_class? XCL_NOT : 0;
4031
4032 /* If the map is required, move up the extra data to make room for it;
4033 otherwise just move the code pointer to the end of the extra data. */
4034
4035 if (class_charcount > 0)
4036 {
4037 *code++ |= XCL_MAP;
4038 memmove(code + 32, code, class_utf8data - code);
4039 memcpy(code, classbits, 32);
4040 code = class_utf8data + 32;
4041 }
4042 else code = class_utf8data;
4043
4044 /* Now fill in the complete length of the item */
4045
4046 PUT(previous, 1, code - previous);
4047 break; /* End of class handling */
4048 }
4049 #endif
4050
4051 /* If there are no characters > 255, or they are all to be included or
4052 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4053 whole class was negated and whether there were negative specials such as \S
4054 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4055 negating it if necessary. */
4056
4057 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4058 if (negate_class)
4059 {
4060 if (lengthptr == NULL) /* Save time in the pre-compile phase */
4061 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
4062 }
4063 else
4064 {
4065 memcpy(code, classbits, 32);
4066 }
4067 code += 32;
4068 break;
4069
4070
4071 /* ===================================================================*/
4072 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4073 has been tested above. */
4074
4075 case CHAR_LEFT_CURLY_BRACKET:
4076 if (!is_quantifier) goto NORMAL_CHAR;
4077 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4078 if (*errorcodeptr != 0) goto FAILED;
4079 goto REPEAT;
4080
4081 case CHAR_ASTERISK:
4082 repeat_min = 0;
4083 repeat_max = -1;
4084 goto REPEAT;
4085
4086 case CHAR_PLUS:
4087 repeat_min = 1;
4088 repeat_max = -1;
4089 goto REPEAT;
4090
4091 case CHAR_QUESTION_MARK:
4092 repeat_min = 0;
4093 repeat_max = 1;
4094
4095 REPEAT:
4096 if (previous == NULL)
4097 {
4098 *errorcodeptr = ERR9;
4099 goto FAILED;
4100 }
4101
4102 if (repeat_min == 0)
4103 {
4104 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
4105 reqbyte = zeroreqbyte; /* Ditto */
4106 }
4107
4108 /* Remember whether this is a variable length repeat */
4109
4110 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4111
4112 op_type = 0; /* Default single-char op codes */
4113 possessive_quantifier = FALSE; /* Default not possessive quantifier */
4114
4115 /* Save start of previous item, in case we have to move it up to make space
4116 for an inserted OP_ONCE for the additional '+' extension. */
4117
4118 tempcode = previous;
4119
4120 /* If the next character is '+', we have a possessive quantifier. This
4121 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4122 If the next character is '?' this is a minimizing repeat, by default,
4123 but if PCRE_UNGREEDY is set, it works the other way round. We change the
4124 repeat type to the non-default. */
4125
4126 if (ptr[1] == CHAR_PLUS)
4127 {
4128 repeat_type = 0; /* Force greedy */
4129 possessive_quantifier = TRUE;
4130 ptr++;
4131 }
4132 else if (ptr[1] == CHAR_QUESTION_MARK)
4133 {
4134 repeat_type = greedy_non_default;
4135 ptr++;
4136 }
4137 else repeat_type = greedy_default;
4138
4139 /* If previous was a character match, abolish the item and generate a
4140 repeat item instead. If a char item has a minimum of more than one, ensure
4141 that it is set in reqbyte - it might not be if a sequence such as x{3} is
4142 the first thing in a branch because the x will have gone into firstbyte
4143 instead. */
4144
4145 if (*previous == OP_CHAR || *previous == OP_CHARNC)
4146 {
4147 /* Deal with UTF-8 characters that take up more than one byte. It's
4148 easier to write this out separately than try to macrify it. Use c to
4149 hold the length of the character in bytes, plus 0x80 to flag that it's a
4150 length rather than a small character. */
4151
4152 #ifdef SUPPORT_UTF8
4153 if (utf8 && (code[-1] & 0x80) != 0)
4154 {
4155 uschar *lastchar = code - 1;
4156 while((*lastchar & 0xc0) == 0x80) lastchar--;
4157 c = code - lastchar; /* Length of UTF-8 character */
4158 memcpy(utf8_char, lastchar, c); /* Save the char */
4159 c |= 0x80; /* Flag c as a length */
4160 }
4161 else
4162 #endif
4163
4164 /* Handle the case of a single byte - either with no UTF8 support, or
4165 with UTF-8 disabled, or for a UTF-8 character < 128. */
4166
4167 {
4168 c = code[-1];
4169 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
4170 }
4171
4172 /* If the repetition is unlimited, it pays to see if the next thing on
4173 the line is something that cannot possibly match this character. If so,
4174 automatically possessifying this item gains some performance in the case
4175 where the match fails. */
4176
4177 if (!possessive_quantifier &&
4178 repeat_max < 0 &&
4179 check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4180 {
4181 repeat_type = 0; /* Force greedy */
4182 possessive_quantifier = TRUE;
4183 }
4184
4185 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
4186 }
4187
4188 /* If previous was a single negated character ([^a] or similar), we use
4189 one of the special opcodes, replacing it. The code is shared with single-
4190 character repeats by setting op_type to add a suitable offset into
4191 repeat_type. We can also test for auto-possessification. OP_NOT is
4192 currently used only for single-byte chars. */
4193
4194 else if (*previous == OP_NOT)
4195 {
4196 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
4197 c = previous[1];
4198 if (!possessive_quantifier &&
4199 repeat_max < 0 &&
4200 check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4201 {
4202 repeat_type = 0; /* Force greedy */
4203 possessive_quantifier = TRUE;
4204 }
4205 goto OUTPUT_SINGLE_REPEAT;
4206 }
4207
4208 /* If previous was a character type match (\d or similar), abolish it and
4209 create a suitable repeat item. The code is shared with single-character
4210 repeats by setting op_type to add a suitable offset into repeat_type. Note
4211 that the Unicode property types will be present only when SUPPORT_UCP is
4212 defined, but we don't wrap the little bits of code here because it just
4213 makes it horribly messy. */
4214
4215 else if (*previous < OP_EODN)
4216 {
4217 uschar *oldcode;
4218 int prop_type, prop_value;
4219 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
4220 c = *previous;
4221
4222 if (!possessive_quantifier &&
4223 repeat_max < 0 &&
4224 check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4225 {
4226 repeat_type = 0; /* Force greedy */
4227 possessive_quantifier = TRUE;
4228 }
4229
4230 OUTPUT_SINGLE_REPEAT:
4231 if (*previous == OP_PROP || *previous == OP_NOTPROP)
4232 {
4233 prop_type = previous[1];
4234 prop_value = previous[2];
4235 }
4236 else prop_type = prop_value = -1;
4237
4238 oldcode = code;
4239 code = previous; /* Usually overwrite previous item */
4240
4241 /* If the maximum is zero then the minimum must also be zero; Perl allows
4242 this case, so we do too - by simply omitting the item altogether. */
4243
4244 if (repeat_max == 0) goto END_REPEAT;
4245
4246 /*--------------------------------------------------------------------*/
4247 /* This code is obsolete from release 8.00; the restriction was finally
4248 removed: */
4249
4250 /* All real repeats make it impossible to handle partial matching (maybe
4251 one day we will be able to remove this restriction). */
4252
4253 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4254 /*--------------------------------------------------------------------*/
4255
4256 /* Combine the op_type with the repeat_type */
4257
4258 repeat_type += op_type;
4259
4260 /* A minimum of zero is handled either as the special case * or ?, or as
4261 an UPTO, with the maximum given. */
4262
4263 if (repeat_min == 0)
4264 {
4265 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4266 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4267 else
4268 {
4269 *code++ = OP_UPTO + repeat_type;
4270 PUT2INC(code, 0, repeat_max);
4271 }
4272 }
4273
4274 /* A repeat minimum of 1 is optimized into some special cases. If the
4275 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4276 left in place and, if the maximum is greater than 1, we use OP_UPTO with
4277 one less than the maximum. */
4278
4279 else if (repeat_min == 1)
4280 {
4281 if (repeat_max == -1)
4282 *code++ = OP_PLUS + repeat_type;
4283 else
4284 {
4285 code = oldcode; /* leave previous item in place */
4286 if (repeat_max == 1) goto END_REPEAT;
4287 *code++ = OP_UPTO + repeat_type;
4288 PUT2INC(code, 0, repeat_max - 1);
4289 }
4290 }
4291
4292 /* The case {n,n} is just an EXACT, while the general case {n,m} is
4293 handled as an EXACT followed by an UPTO. */
4294
4295 else
4296 {
4297 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
4298 PUT2INC(code, 0, repeat_min);
4299
4300 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4301 we have to insert the character for the previous code. For a repeated
4302 Unicode property match, there are two extra bytes that define the
4303 required property. In UTF-8 mode, long characters have their length in
4304 c, with the 0x80 bit as a flag. */
4305
4306 if (repeat_max < 0)
4307 {
4308 #ifdef SUPPORT_UTF8
4309 if (utf8 && c >= 128)
4310 {
4311 memcpy(code, utf8_char, c & 7);
4312 code += c & 7;
4313 }
4314 else
4315 #endif
4316 {
4317 *code++ = c;
4318 if (prop_type >= 0)
4319 {
4320 *code++ = prop_type;
4321 *code++ = prop_value;
4322 }
4323 }
4324 *code++ = OP_STAR + repeat_type;
4325 }
4326
4327 /* Else insert an UPTO if the max is greater than the min, again
4328 preceded by the character, for the previously inserted code. If the
4329 UPTO is just for 1 instance, we can use QUERY instead. */
4330
4331 else if (repeat_max != repeat_min)
4332 {
4333 #ifdef SUPPORT_UTF8
4334 if (utf8 && c >= 128)
4335 {
4336 memcpy(code, utf8_char, c & 7);
4337 code += c & 7;
4338 }
4339 else
4340 #endif
4341 *code++ = c;
4342 if (prop_type >= 0)
4343 {
4344 *code++ = prop_type;
4345 *code++ = prop_value;
4346 }
4347 repeat_max -= repeat_min;
4348
4349 if (repeat_max == 1)
4350 {
4351 *code++ = OP_QUERY + repeat_type;
4352 }
4353 else
4354 {
4355 *code++ = OP_UPTO + repeat_type;
4356 PUT2INC(code, 0, repeat_max);
4357 }
4358 }
4359 }
4360
4361 /* The character or character type itself comes last in all cases. */
4362
4363 #ifdef SUPPORT_UTF8
4364 if (utf8 && c >= 128)
4365 {
4366 memcpy(code, utf8_char, c & 7);
4367 code += c & 7;
4368 }
4369 else
4370 #endif
4371 *code++ = c;
4372
4373 /* For a repeated Unicode property match, there are two extra bytes that
4374 define the required property. */
4375
4376 #ifdef SUPPORT_UCP
4377 if (prop_type >= 0)
4378 {
4379 *code++ = prop_type;
4380 *code++ = prop_value;
4381 }
4382 #endif
4383 }
4384
4385 /* If previous was a character class or a back reference, we put the repeat
4386 stuff after it, but just skip the item if the repeat was {0,0}. */
4387
4388 else if (*previous == OP_CLASS ||
4389 *previous == OP_NCLASS ||
4390 #ifdef SUPPORT_UTF8
4391 *previous == OP_XCLASS ||
4392 #endif
4393 *previous == OP_REF)
4394 {
4395 if (repeat_max == 0)
4396 {
4397 code = previous;
4398 goto END_REPEAT;
4399 }
4400
4401 /*--------------------------------------------------------------------*/
4402 /* This code is obsolete from release 8.00; the restriction was finally
4403 removed: */
4404
4405 /* All real repeats make it impossible to handle partial matching (maybe
4406 one day we will be able to remove this restriction). */
4407
4408 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4409 /*--------------------------------------------------------------------*/
4410
4411 if (repeat_min == 0 && repeat_max == -1)
4412 *code++ = OP_CRSTAR + repeat_type;
4413 else if (repeat_min == 1 && repeat_max == -1)
4414 *code++ = OP_CRPLUS + repeat_type;
4415 else if (repeat_min == 0 && repeat_max == 1)
4416 *code++ = OP_CRQUERY + repeat_type;
4417 else
4418 {
4419 *code++ = OP_CRRANGE + repeat_type;
4420 PUT2INC(code, 0, repeat_min);
4421 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4422 PUT2INC(code, 0, repeat_max);
4423 }
4424 }
4425
4426 /* If previous was a bracket group, we may have to replicate it in certain
4427 cases. */
4428
4429 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4430 *previous == OP_ONCE || *previous == OP_COND)
4431 {
4432 register int i;
4433 int ketoffset = 0;
4434 int len = (int)(code - previous);
4435 uschar *bralink = NULL;
4436
4437 /* Repeating a DEFINE group is pointless */
4438
4439 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4440 {
4441 *errorcodeptr = ERR55;
4442 goto FAILED;
4443 }
4444
4445 /* If the maximum repeat count is unlimited, find the end of the bracket
4446 by scanning through from the start, and compute the offset back to it
4447 from the current code pointer. There may be an OP_OPT setting following
4448 the final KET, so we can't find the end just by going back from the code
4449 pointer. */
4450
4451 if (repeat_max == -1)
4452 {
4453 register uschar *ket = previous;
4454 do ket += GET(ket, 1); while (*ket != OP_KET);
4455 ketoffset = (int)(code - ket);
4456 }
4457
4458 /* The case of a zero minimum is special because of the need to stick
4459 OP_BRAZERO in front of it, and because the group appears once in the
4460 data, whereas in other cases it appears the minimum number of times. For
4461 this reason, it is simplest to treat this case separately, as otherwise
4462 the code gets far too messy. There are several special subcases when the
4463 minimum is zero. */
4464
4465 if (repeat_min == 0)
4466 {
4467 /* If the maximum is also zero, we used to just omit the group from the
4468 output altogether, like this:
4469
4470 ** if (repeat_max == 0)
4471 ** {
4472 ** code = previous;
4473 ** goto END_REPEAT;
4474 ** }
4475
4476 However, that fails when a group is referenced as a subroutine from
4477 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4478 so that it is skipped on execution. As we don't have a list of which
4479 groups are referenced, we cannot do this selectively.
4480
4481 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4482 and do no more at this point. However, we do need to adjust any
4483 OP_RECURSE calls inside the group that refer to the group itself or any
4484 internal or forward referenced group, because the offset is from the
4485 start of the whole regex. Temporarily terminate the pattern while doing
4486 this. */
4487
4488 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4489 {
4490 *code = OP_END;
4491 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4492 memmove(previous+1, previous, len);
4493 code++;
4494 if (repeat_max == 0)
4495 {
4496 *previous++ = OP_SKIPZERO;
4497 goto END_REPEAT;
4498 }
4499 *previous++ = OP_BRAZERO + repeat_type;
4500 }
4501
4502 /* If the maximum is greater than 1 and limited, we have to replicate
4503 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4504 The first one has to be handled carefully because it's the original
4505 copy, which has to be moved up. The remainder can be handled by code
4506 that is common with the non-zero minimum case below. We have to
4507 adjust the value of repeat_max, since one less copy is required. Once
4508 again, we may have to adjust any OP_RECURSE calls inside the group. */
4509
4510 else
4511 {
4512 int offset;
4513 *code = OP_END;
4514 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4515 memmove(previous + 2 + LINK_SIZE, previous, len);
4516 code += 2 + LINK_SIZE;
4517 *previous++ = OP_BRAZERO + repeat_type;
4518 *previous++ = OP_BRA;
4519
4520 /* We chain together the bracket offset fields that have to be
4521 filled in later when the ends of the brackets are reached. */
4522
4523 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
4524 bralink = previous;
4525 PUTINC(previous, 0, offset);
4526 }
4527
4528 repeat_max--;
4529 }
4530
4531 /* If the minimum is greater than zero, replicate the group as many
4532 times as necessary, and adjust the maximum to the number of subsequent
4533 copies that we need. If we set a first char from the group, and didn't
4534 set a required char, copy the latter from the former. If there are any
4535 forward reference subroutine calls in the group, there will be entries on
4536 the workspace list; replicate these with an appropriate increment. */
4537
4538 else
4539 {
4540 if (repeat_min > 1)
4541 {
4542 /* In the pre-compile phase, we don't actually do the replication. We
4543 just adjust the length as if we had. Do some paranoid checks for
4544 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4545 integer type when available, otherwise double. */
4546
4547 if (lengthptr != NULL)
4548 {
4549 int delta = (repeat_min - 1)*length_prevgroup;
4550 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4551 (INT64_OR_DOUBLE)length_prevgroup >
4552 (INT64_OR_DOUBLE)INT_MAX ||
4553 OFLOW_MAX - *lengthptr < delta)
4554 {
4555 *errorcodeptr = ERR20;
4556 goto FAILED;
4557 }
4558 *lengthptr += delta;
4559 }
4560
4561 /* This is compiling for real */
4562
4563 else
4564 {
4565 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4566 for (i = 1; i < repeat_min; i++)
4567 {
4568 uschar *hc;
4569 uschar *this_hwm = cd->hwm;
4570 memcpy(code, previous, len);
4571 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4572 {
4573 PUT(cd->hwm, 0, GET(hc, 0) + len);
4574 cd->hwm += LINK_SIZE;
4575 }
4576 save_hwm = this_hwm;
4577 code += len;
4578 }
4579 }
4580 }
4581
4582 if (repeat_max > 0) repeat_max -= repeat_min;
4583 }
4584
4585 /* This code is common to both the zero and non-zero minimum cases. If
4586 the maximum is limited, it replicates the group in a nested fashion,
4587 remembering the bracket starts on a stack. In the case of a zero minimum,
4588 the first one was set up above. In all cases the repeat_max now specifies
4589 the number of additional copies needed. Again, we must remember to
4590 replicate entries on the forward reference list. */
4591
4592 if (repeat_max >= 0)
4593 {
4594 /* In the pre-compile phase, we don't actually do the replication. We
4595 just adjust the length as if we had. For each repetition we must add 1
4596 to the length for BRAZERO and for all but the last repetition we must
4597 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4598 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4599 a 64-bit integer type when available, otherwise double. */
4600
4601 if (lengthptr != NULL && repeat_max > 0)
4602 {
4603 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4604 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4605 if ((INT64_OR_DOUBLE)repeat_max *
4606 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4607 > (INT64_OR_DOUBLE)INT_MAX ||
4608 OFLOW_MAX - *lengthptr < delta)
4609 {
4610 *errorcodeptr = ERR20;
4611 goto FAILED;
4612 }
4613 *lengthptr += delta;
4614 }
4615
4616 /* This is compiling for real */
4617
4618 else for (i = repeat_max - 1; i >= 0; i--)
4619 {
4620 uschar *hc;
4621 uschar *this_hwm = cd->hwm;
4622
4623 *code++ = OP_BRAZERO + repeat_type;
4624
4625 /* All but the final copy start a new nesting, maintaining the
4626 chain of brackets outstanding. */
4627
4628 if (i != 0)
4629 {
4630 int offset;
4631 *code++ = OP_BRA;
4632 offset = (bralink == NULL)? 0 : (int)(code - bralink);
4633 bralink = code;
4634 PUTINC(code, 0, offset);
4635 }
4636
4637 memcpy(code, previous, len);
4638 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4639 {
4640 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4641 cd->hwm += LINK_SIZE;
4642 }
4643 save_hwm = this_hwm;
4644 code += len;
4645 }
4646
4647 /* Now chain through the pending brackets, and fill in their length
4648 fields (which are holding the chain links pro tem). */
4649
4650 while (bralink != NULL)
4651 {
4652 int oldlinkoffset;
4653 int offset = (int)(code - bralink + 1);
4654 uschar *bra = code - offset;
4655 oldlinkoffset = GET(bra, 1);
4656 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4657 *code++ = OP_KET;
4658 PUTINC(code, 0, offset);
4659 PUT(bra, 1, offset);
4660 }
4661 }
4662
4663 /* If the maximum is unlimited, set a repeater in the final copy. We
4664 can't just offset backwards from the current code point, because we
4665 don't know if there's been an options resetting after the ket. The
4666 correct offset was computed above.
4667
4668 Then, when we are doing the actual compile phase, check to see whether
4669 this group is a non-atomic one that could match an empty string. If so,
4670 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4671 that runtime checking can be done. [This check is also applied to
4672 atomic groups at runtime, but in a different way.] */
4673
4674 else
4675 {
4676 uschar *ketcode = code - ketoffset;
4677 uschar *bracode = ketcode - GET(ketcode, 1);
4678 *ketcode = OP_KETRMAX + repeat_type;
4679 if (lengthptr == NULL && *bracode != OP_ONCE)
4680 {
4681 uschar *scode = bracode;
4682 do
4683 {
4684 if (could_be_empty_branch(scode, ketcode, utf8, cd))
4685 {
4686 *bracode += OP_SBRA - OP_BRA;
4687 break;
4688 }
4689 scode += GET(scode, 1);
4690 }
4691 while (*scode == OP_ALT);
4692 }
4693 }
4694 }
4695
4696 /* If previous is OP_FAIL, it was generated by an empty class [] in
4697 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4698 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4699 error above. We can just ignore the repeat in JS case. */
4700
4701 else if (*previous == OP_FAIL) goto END_REPEAT;
4702
4703 /* Else there's some kind of shambles */
4704
4705 else
4706 {
4707 *errorcodeptr = ERR11;
4708 goto FAILED;
4709 }
4710
4711 /* If the character following a repeat is '+', or if certain optimization
4712 tests above succeeded, possessive_quantifier is TRUE. For some of the
4713 simpler opcodes, there is a special alternative opcode for this. For
4714 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4715 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4716 but the special opcodes can optimize it a bit. The repeated item starts at
4717 tempcode, not at previous, which might be the first part of a string whose
4718 (former) last char we repeated.
4719
4720 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4721 an 'upto' may follow. We skip over an 'exact' item, and then test the
4722 length of what remains before proceeding. */
4723
4724 if (possessive_quantifier)
4725 {
4726 int len;
4727
4728 if (*tempcode == OP_TYPEEXACT)
4729 tempcode += _pcre_OP_lengths[*tempcode] +
4730 ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4731
4732 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4733 {
4734 tempcode += _pcre_OP_lengths[*tempcode];
4735 #ifdef SUPPORT_UTF8
4736 if (utf8 && tempcode[-1] >= 0xc0)
4737 tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4738 #endif
4739 }
4740
4741 len = (int)(code - tempcode);
4742 if (len > 0) switch (*tempcode)
4743 {
4744 case OP_STAR: *tempcode = OP_POSSTAR; break;
4745 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4746 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4747 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4748
4749 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4750 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4751 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4752 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4753
4754 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4755 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4756 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4757 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4758
4759 /* Because we are moving code along, we must ensure that any
4760 pending recursive references are updated. */
4761
4762 default:
4763 *code = OP_END;
4764 adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4765 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4766 code += 1 + LINK_SIZE;
4767 len += 1 + LINK_SIZE;
4768 tempcode[0] = OP_ONCE;
4769 *code++ = OP_KET;
4770 PUTINC(code, 0, len);
4771 PUT(tempcode, 1, len);
4772 break;
4773 }
4774 }
4775
4776 /* In all cases we no longer have a previous item. We also set the
4777 "follows varying string" flag for subsequently encountered reqbytes if
4778 it isn't already set and we have just passed a varying length item. */
4779
4780 END_REPEAT:
4781 previous = NULL;
4782 cd->req_varyopt |= reqvary;
4783 break;
4784
4785
4786 /* ===================================================================*/
4787 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4788 lookbehind or option setting or condition or all the other extended
4789 parenthesis forms. */
4790
4791 case CHAR_LEFT_PARENTHESIS:
4792 newoptions = options;
4793 skipbytes = 0;
4794 bravalue = OP_CBRA;
4795 save_hwm = cd->hwm;
4796 reset_bracount = FALSE;
4797
4798 /* First deal with various "verbs" that can be introduced by '*'. */
4799
4800 if (*(++ptr) == CHAR_ASTERISK &&
4801 ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
4802 {
4803 int i, namelen;
4804 int arglen = 0;
4805 const char *vn = verbnames;
4806 const uschar *name = ptr + 1;
4807 const uschar *arg = NULL;
4808 previous = NULL;
4809 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4810 namelen = (int)(ptr - name);
4811
4812 if (*ptr == CHAR_COLON)
4813 {
4814 arg = ++ptr;
4815 while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0
4816 || *ptr == '_') ptr++;
4817 arglen = (int)(ptr - arg);
4818 }
4819
4820 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4821 {
4822 *errorcodeptr = ERR60;
4823 goto FAILED;
4824 }
4825
4826 /* Scan the table of verb names */
4827
4828 for (i = 0; i < verbcount; i++)
4829 {
4830 if (namelen == verbs[i].len &&
4831 strncmp((char *)name, vn, namelen) == 0)
4832 {
4833 /* Check for open captures before ACCEPT */
4834
4835 if (verbs[i].op == OP_ACCEPT)
4836 {
4837 open_capitem *oc;
4838 cd->had_accept = TRUE;
4839 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4840 {
4841 *code++ = OP_CLOSE;
4842 PUT2INC(code, 0, oc->number);
4843 }
4844 }
4845
4846 /* Handle the cases with/without an argument */
4847
4848 if (arglen == 0)
4849 {
4850 if (verbs[i].op < 0) /* Argument is mandatory */
4851 {
4852 *errorcodeptr = ERR66;
4853 goto FAILED;
4854 }
4855 *code = verbs[i].op;
4856 if (*code++ == OP_THEN)
4857 {
4858 PUT(code, 0, code - bcptr->current_branch - 1);
4859 code += LINK_SIZE;
4860 }
4861 }
4862
4863 else
4864 {
4865 if (verbs[i].op_arg < 0) /* Argument is forbidden */
4866 {
4867 *errorcodeptr = ERR59;
4868 goto FAILED;
4869 }
4870 *code = verbs[i].op_arg;
4871 if (*code++ == OP_THEN_ARG)
4872 {
4873 PUT(code, 0, code - bcptr->current_branch - 1);
4874 code += LINK_SIZE;
4875 }
4876 *code++ = arglen;
4877 memcpy(code, arg, arglen);
4878 code += arglen;
4879 *code++ = 0;
4880 }
4881
4882 break; /* Found verb, exit loop */
4883 }
4884
4885 vn += verbs[i].len + 1;
4886 }
4887
4888 if (i < verbcount) continue; /* Successfully handled a verb */
4889 *errorcodeptr = ERR60; /* Verb not recognized */
4890 goto FAILED;
4891 }
4892
4893 /* Deal with the extended parentheses; all are introduced by '?', and the
4894 appearance of any of them means that this is not a capturing group. */
4895
4896 else if (*ptr == CHAR_QUESTION_MARK)
4897 {
4898 int i, set, unset, namelen;
4899 int *optset;
4900 const uschar *name;
4901 uschar *slot;
4902
4903 switch (*(++ptr))
4904 {
4905 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4906 ptr++;
4907 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4908 if (*ptr == 0)
4909 {
4910 *errorcodeptr = ERR18;
4911 goto FAILED;
4912 }
4913 continue;
4914
4915
4916 /* ------------------------------------------------------------ */
4917 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4918 reset_bracount = TRUE;
4919 /* Fall through */
4920
4921 /* ------------------------------------------------------------ */
4922 case CHAR_COLON: /* Non-capturing bracket */
4923 bravalue = OP_BRA;
4924 ptr++;
4925 break;
4926
4927
4928 /* ------------------------------------------------------------ */
4929 case CHAR_LEFT_PARENTHESIS:
4930 bravalue = OP_COND; /* Conditional group */
4931
4932 /* A condition can be an assertion, a number (referring to a numbered
4933 group), a name (referring to a named group), or 'R', referring to
4934 recursion. R<digits> and R&name are also permitted for recursion tests.
4935
4936 There are several syntaxes for testing a named group: (?(name)) is used
4937 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4938
4939 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4940 be the recursive thing or the name 'R' (and similarly for 'R' followed
4941 by digits), and (b) a number could be a name that consists of digits.
4942 In both cases, we look for a name first; if not found, we try the other
4943 cases. */
4944
4945 /* For conditions that are assertions, check the syntax, and then exit
4946 the switch. This will take control down to where bracketed groups,
4947 including assertions, are processed. */
4948
4949 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4950 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4951 break;
4952
4953 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4954 below), and all need to skip 3 bytes at the start of the group. */
4955
4956 code[1+LINK_SIZE] = OP_CREF;
4957 skipbytes = 3;
4958 refsign = -1;
4959
4960 /* Check for a test for recursion in a named group. */
4961
4962 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4963 {
4964 terminator = -1;
4965 ptr += 2;
4966 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4967 }
4968
4969 /* Check for a test for a named group's having been set, using the Perl
4970 syntax (?(<name>) or (?('name') */
4971
4972 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4973 {
4974 terminator = CHAR_GREATER_THAN_SIGN;
4975 ptr++;
4976 }
4977 else if (ptr[1] == CHAR_APOSTROPHE)
4978 {
4979 terminator = CHAR_APOSTROPHE;
4980 ptr++;
4981 }
4982 else
4983 {
4984 terminator = 0;
4985 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4986 }
4987
4988 /* We now expect to read a name; anything else is an error */
4989
4990 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4991 {
4992 ptr += 1; /* To get the right offset */
4993 *errorcodeptr = ERR28;
4994 goto FAILED;
4995 }
4996
4997 /* Read the name, but also get it as a number if it's all digits */
4998
4999 recno = 0;
5000 name = ++ptr;
5001 while ((cd->ctypes[*ptr] & ctype_word) != 0)
5002 {
5003 if (recno >= 0)
5004 recno = ((digitab[*ptr] & ctype_digit) != 0)?
5005 recno * 10 + *ptr - CHAR_0 : -1;
5006 ptr++;
5007 }
5008 namelen = (int)(ptr - name);
5009
5010 if ((terminator > 0 && *ptr++ != terminator) ||
5011 *ptr++ != CHAR_RIGHT_PARENTHESIS)
5012 {
5013 ptr--; /* Error offset */
5014 *errorcodeptr = ERR26;
5015 goto FAILED;
5016 }
5017
5018 /* Do no further checking in the pre-compile phase. */
5019
5020 if (lengthptr != NULL) break;
5021
5022 /* In the real compile we do the work of looking for the actual
5023 reference. If the string started with "+" or "-" we require the rest to
5024 be digits, in which case recno will be set. */
5025
5026 if (refsign > 0)
5027 {
5028 if (recno <= 0)
5029 {
5030 *errorcodeptr = ERR58;
5031 goto FAILED;
5032 }
5033 recno = (refsign == CHAR_MINUS)?
5034 cd->bracount - recno + 1 : recno +cd->bracount;
5035 if (recno <= 0 || recno > cd->final_bracount)
5036 {
5037 *errorcodeptr = ERR15;
5038 goto FAILED;
5039 }
5040 PUT2(code, 2+LINK_SIZE, recno);
5041 break;
5042 }
5043
5044 /* Otherwise (did not start with "+" or "-"), start by looking for the
5045 name. If we find a name, add one to the opcode to change OP_CREF or
5046 OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5047 except they record that the reference was originally to a name. The
5048 information is used to check duplicate names. */
5049
5050 slot = cd->name_table;
5051 for (i = 0; i < cd->names_found; i++)
5052 {
5053 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
5054 slot += cd->name_entry_size;
5055 }
5056
5057 /* Found a previous named subpattern */
5058
5059 if (i < cd->names_found)
5060 {
5061 recno = GET2(slot, 0);
5062 PUT2(code, 2+LINK_SIZE, recno);
5063 code[1+LINK_SIZE]++;
5064 }
5065
5066 /* Search the pattern for a forward reference */
5067
5068 else if ((i = find_parens(cd, name, namelen,
5069 (options & PCRE_EXTENDED) != 0, utf8)) > 0)
5070 {
5071 PUT2(code, 2+LINK_SIZE, i);
5072 code[1+LINK_SIZE]++;
5073 }
5074
5075 /* If terminator == 0 it means that the name followed directly after
5076 the opening parenthesis [e.g. (?(abc)...] and in this case there are
5077 some further alternatives to try. For the cases where terminator != 0
5078 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
5079 now checked all the possibilities, so give an error. */
5080
5081 else if (terminator != 0)
5082 {
5083 *errorcodeptr = ERR15;
5084 goto FAILED;
5085 }
5086
5087 /* Check for (?(R) for recursion. Allow digits after R to specify a
5088 specific group number. */
5089
5090 else if (*name == CHAR_R)
5091 {
5092 recno = 0;
5093 for (i = 1; i < namelen; i++)
5094 {
5095 if ((digitab[name[i]] & ctype_digit) == 0)
5096 {
5097 *errorcodeptr = ERR15;
5098 goto FAILED;
5099 }
5100 recno = recno * 10 + name[i] - CHAR_0;
5101 }
5102 if (recno == 0) recno = RREF_ANY;
5103 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
5104 PUT2(code, 2+LINK_SIZE, recno);
5105 }
5106
5107 /* Similarly, check for the (?(DEFINE) "condition", which is always
5108 false. */
5109
5110 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
5111 {
5112 code[1+LINK_SIZE] = OP_DEF;
5113 skipbytes = 1;
5114 }
5115
5116 /* Check for the "name" actually being a subpattern number. We are
5117 in the second pass here, so final_bracount is set. */
5118
5119 else if (recno > 0 && recno <= cd->final_bracount)
5120 {
5121 PUT2(code, 2+LINK_SIZE, recno);
5122 }
5123
5124 /* Either an unidentified subpattern, or a reference to (?(0) */
5125
5126 else
5127 {
5128 *errorcodeptr = (recno == 0)? ERR35: ERR15;
5129 goto FAILED;
5130 }
5131 break;
5132
5133
5134 /* ------------------------------------------------------------ */
5135 case CHAR_EQUALS_SIGN: /* Positive lookahead */
5136 bravalue = OP_ASSERT;
5137 ptr++;
5138 break;
5139
5140
5141 /* ------------------------------------------------------------ */
5142 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
5143 ptr++;
5144 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
5145 {
5146 *code++ = OP_FAIL;
5147 previous = NULL;
5148 continue;
5149 }
5150 bravalue = OP_ASSERT_NOT;
5151 break;
5152
5153
5154 /* ------------------------------------------------------------ */
5155 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
5156 switch (ptr[1])
5157 {
5158 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
5159 bravalue = OP_ASSERTBACK;
5160 ptr += 2;
5161 break;
5162
5163 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
5164 bravalue = OP_ASSERTBACK_NOT;
5165 ptr += 2;
5166 break;
5167
5168 default: /* Could be name define, else bad */
5169 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
5170 ptr++; /* Correct offset for error */
5171 *errorcodeptr = ERR24;
5172 goto FAILED;
5173 }
5174 break;
5175
5176
5177 /* ------------------------------------------------------------ */
5178 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
5179 bravalue = OP_ONCE;
5180 ptr++;
5181 break;
5182
5183
5184 /* ------------------------------------------------------------ */
5185 case CHAR_C: /* Callout - may be followed by digits; */
5186 previous_callout = code; /* Save for later completion */
5187 after_manual_callout = 1; /* Skip one item before completing */
5188 *code++ = OP_CALLOUT;
5189 {
5190 int n = 0;
5191 while ((digitab[*(++ptr)] & ctype_digit) != 0)
5192 n = n * 10 + *ptr - CHAR_0;
5193 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5194 {
5195 *errorcodeptr = ERR39;
5196 goto FAILED;
5197 }
5198 if (n > 255)
5199 {
5200 *errorcodeptr = ERR38;
5201 goto FAILED;
5202 }
5203 *code++ = n;
5204 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
5205 PUT(code, LINK_SIZE, 0); /* Default length */
5206 code += 2 * LINK_SIZE;
5207 }
5208 previous = NULL;
5209 continue;
5210
5211
5212 /* ------------------------------------------------------------ */
5213 case CHAR_P: /* Python-style named subpattern handling */
5214 if (*(++ptr) == CHAR_EQUALS_SIGN ||
5215 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
5216 {
5217 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
5218 terminator = CHAR_RIGHT_PARENTHESIS;
5219 goto NAMED_REF_OR_RECURSE;
5220 }
5221 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
5222 {
5223 *errorcodeptr = ERR41;
5224 goto FAILED;
5225 }
5226 /* Fall through to handle (?P< as (?< is handled */
5227
5228
5229 /* ------------------------------------------------------------ */
5230 DEFINE_NAME: /* Come here from (?< handling */
5231 case CHAR_APOSTROPHE:
5232 {
5233 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
5234 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5235 name = ++ptr;
5236
5237 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5238 namelen = (int)(ptr - name);
5239
5240 /* In the pre-compile phase, just do a syntax check. */
5241
5242 if (lengthptr != NULL)
5243 {
5244 if (*ptr != terminator)
5245 {
5246 *errorcodeptr = ERR42;
5247 goto FAILED;
5248 }
5249 if (cd->names_found >= MAX_NAME_COUNT)
5250 {
5251 *errorcodeptr = ERR49;
5252 goto FAILED;
5253 }
5254 if (namelen + 3 > cd->name_entry_size)
5255 {
5256 cd->name_entry_size = namelen + 3;
5257 if (namelen > MAX_NAME_SIZE)
5258 {
5259 *errorcodeptr = ERR48;
5260 goto FAILED;
5261 }
5262 }
5263 }
5264
5265 /* In the real compile, create the entry in the table, maintaining
5266 alphabetical order. Duplicate names for different numbers are
5267 permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
5268 number are always OK. (An existing number can be re-used if (?|
5269 appears in the pattern.) In either event, a duplicate name results in
5270 a duplicate entry in the table, even if the number is the same. This
5271 is because the number of names, and hence the table size, is computed
5272 in the pre-compile, and it affects various numbers and pointers which
5273 would all have to be modified, and the compiled code moved down, if
5274 duplicates with the same number were omitted from the table. This
5275 doesn't seem worth the hassle. However, *different* names for the
5276 same number are not permitted. */
5277
5278 else
5279 {
5280 BOOL dupname = FALSE;
5281 slot = cd->name_table;
5282
5283 for (i = 0; i < cd->names_found; i++)
5284 {
5285 int crc = memcmp(name, slot+2, namelen);
5286 if (crc == 0)
5287 {
5288 if (slot[2+namelen] == 0)
5289 {
5290 if (GET2(slot, 0) != cd->bracount + 1 &&
5291 (options & PCRE_DUPNAMES) == 0)
5292 {
5293 *errorcodeptr = ERR43;
5294 goto FAILED;
5295 }
5296 else dupname = TRUE;
5297 }
5298 else crc = -1; /* Current name is a substring */
5299 }
5300
5301 /* Make space in the table and break the loop for an earlier
5302 name. For a duplicate or later name, carry on. We do this for
5303 duplicates so that in the simple case (when (?| is not used) they
5304 are in order of their numbers. */
5305
5306 if (crc < 0)
5307 {
5308 memmove(slot + cd->name_entry_size, slot,
5309 (cd->names_found - i) * cd->name_entry_size);
5310 break;
5311 }
5312
5313 /* Continue the loop for a later or duplicate name */
5314
5315 slot += cd->name_entry_size;
5316 }
5317
5318 /* For non-duplicate names, check for a duplicate number before
5319 adding the new name. */
5320
5321 if (!dupname)
5322 {
5323 uschar *cslot = cd->name_table;
5324 for (i = 0; i < cd->names_found; i++)
5325 {
5326 if (cslot != slot)
5327 {
5328 if (GET2(cslot, 0) == cd->bracount + 1)
5329 {
5330 *errorcodeptr = ERR65;
5331 goto FAILED;
5332 }
5333 }
5334 else i--;
5335 cslot += cd->name_entry_size;
5336 }
5337 }
5338
5339 PUT2(slot, 0, cd->bracount + 1);
5340 memcpy(slot + 2, name, namelen);
5341 slot[2+namelen] = 0;
5342 }
5343 }
5344
5345 /* In both pre-compile and compile, count the number of names we've
5346 encountered. */
5347
5348 cd->names_found++;
5349 ptr++; /* Move past > or ' */
5350 goto NUMBERED_GROUP;
5351
5352
5353 /* ------------------------------------------------------------ */
5354 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
5355 terminator = CHAR_RIGHT_PARENTHESIS;
5356 is_recurse = TRUE;
5357 /* Fall through */
5358
5359 /* We come here from the Python syntax above that handles both
5360 references (?P=name) and recursion (?P>name), as well as falling
5361 through from the Perl recursion syntax (?&name). We also come here from
5362 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
5363 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
5364
5365 NAMED_REF_OR_RECURSE:
5366 name = ++ptr;
5367 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5368 namelen = (int)(ptr - name);
5369
5370 /* In the pre-compile phase, do a syntax check and set a dummy
5371 reference number. */
5372
5373 if (lengthptr != NULL)
5374 {
5375 if (namelen == 0)
5376 {
5377 *errorcodeptr = ERR62;
5378 goto FAILED;
5379 }
5380 if (*ptr != terminator)
5381 {
5382 *errorcodeptr = ERR42;
5383 goto FAILED;
5384 }
5385 if (namelen > MAX_NAME_SIZE)
5386 {
5387 *errorcodeptr = ERR48;
5388 goto FAILED;
5389 }
5390 recno = 0;
5391 }
5392
5393 /* In the real compile, seek the name in the table. We check the name
5394 first, and then check that we have reached the end of the name in the
5395 table. That way, if the name is longer than any in the table,
5396 the comparison will fail without reading beyond the table entry. */
5397
5398 else
5399 {
5400 slot = cd->name_table;
5401 for (i = 0; i < cd->names_found; i++)
5402 {
5403 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
5404 slot[2+namelen] == 0)
5405 break;
5406 slot += cd->name_entry_size;
5407 }
5408
5409 if (i < cd->names_found) /* Back reference */
5410 {
5411 recno = GET2(slot, 0);
5412 }
5413 else if ((recno = /* Forward back reference */
5414 find_parens(cd, name, namelen,
5415 (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
5416 {
5417 *errorcodeptr = ERR15;
5418 goto FAILED;
5419 }
5420 }
5421
5422 /* In both phases, we can now go to the code that handles numerical
5423 recursion or backreferences. */
5424
5425 if (is_recurse) goto HANDLE_RECURSION;
5426 else goto HANDLE_REFERENCE;
5427
5428
5429 /* ------------------------------------------------------------ */
5430 case CHAR_R: /* Recursion */
5431 ptr++; /* Same as (?0) */
5432 /* Fall through */
5433
5434
5435 /* ------------------------------------------------------------ */
5436 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
5437 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5438 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5439 {
5440 const uschar *called;
5441 terminator = CHAR_RIGHT_PARENTHESIS;
5442
5443 /* Come here from the \g<...> and \g'...' code (Oniguruma
5444 compatibility). However, the syntax has been checked to ensure that
5445 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5446 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5447 ever be taken. */
5448
5449 HANDLE_NUMERICAL_RECURSION:
5450
5451 if ((refsign = *ptr) == CHAR_PLUS)
5452 {
5453 ptr++;
5454 if ((digitab[*ptr] & ctype_digit) == 0)
5455 {
5456 *errorcodeptr = ERR63;
5457 goto FAILED;
5458 }
5459 }
5460 else if (refsign == CHAR_MINUS)
5461 {
5462 if ((digitab[ptr[1]] & ctype_digit) == 0)
5463 goto OTHER_CHAR_AFTER_QUERY;
5464 ptr++;
5465 }
5466
5467 recno = 0;
5468 while((digitab[*ptr] & ctype_digit) != 0)
5469 recno = recno * 10 + *ptr++ - CHAR_0;
5470
5471 if (*ptr != terminator)
5472 {
5473 *errorcodeptr = ERR29;
5474 goto FAILED;
5475 }
5476
5477 if (refsign == CHAR_MINUS)
5478 {
5479 if (recno == 0)
5480 {
5481 *errorcodeptr = ERR58;
5482 goto FAILED;
5483 }
5484 recno = cd->bracount - recno + 1;
5485 if (recno <= 0)
5486 {
5487 *errorcodeptr = ERR15;
5488 goto FAILED;
5489 }
5490 }
5491 else if (refsign == CHAR_PLUS)
5492 {
5493 if (recno == 0)
5494 {
5495 *errorcodeptr = ERR58;
5496 goto FAILED;
5497 }
5498 recno += cd->bracount;
5499 }
5500
5501 /* Come here from code above that handles a named recursion */
5502
5503 HANDLE_RECURSION:
5504
5505 previous = code;
5506 called = cd->start_code;
5507
5508 /* When we are actually compiling, find the bracket that is being
5509 referenced. Temporarily end the regex in case it doesn't exist before
5510 this point. If we end up with a forward reference, first check that
5511 the bracket does occur later so we can give the error (and position)
5512 now. Then remember this forward reference in the workspace so it can
5513 be filled in at the end. */
5514
5515 if (lengthptr == NULL)
5516 {
5517 *code = OP_END;
5518 if (recno != 0)
5519 called = _pcre_find_bracket(cd->start_code, utf8, recno);
5520
5521 /* Forward reference */
5522
5523 if (called == NULL)
5524 {
5525 if (find_parens(cd, NULL, recno,
5526 (options & PCRE_EXTENDED) != 0, utf8) < 0)
5527 {
5528 *errorcodeptr = ERR15;
5529 goto FAILED;
5530 }
5531
5532 /* Fudge the value of "called" so that when it is inserted as an
5533 offset below, what is actually inserted is the reference number
5534 of the group. */
5535
5536 called = cd->start_code + recno;
5537 PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));
5538 }
5539
5540 /* If not a forward reference, and the subpattern is still open,
5541 this is a recursive call. We check to see if this is a left
5542 recursion that could loop for ever, and diagnose that case. */
5543
5544 else if (GET(called, 1) == 0 &&
5545 could_be_empty(called, code, bcptr, utf8, cd))
5546 {
5547 *errorcodeptr = ERR40;
5548 goto FAILED;
5549 }
5550 }
5551
5552 /* Insert the recursion/subroutine item, automatically wrapped inside
5553 "once" brackets. Set up a "previous group" length so that a
5554 subsequent quantifier will work. */
5555
5556 *code = OP_ONCE;
5557 PUT(code, 1, 2 + 2*LINK_SIZE);
5558 code += 1 + LINK_SIZE;
5559
5560 *code = OP_RECURSE;
5561 PUT(code, 1, (int)(called - cd->start_code));
5562 code += 1 + LINK_SIZE;
5563
5564 *code = OP_KET;
5565 PUT(code, 1, 2 + 2*LINK_SIZE);
5566 code += 1 + LINK_SIZE;
5567
5568 length_prevgroup = 3 + 3*LINK_SIZE;
5569 }
5570
5571 /* Can't determine a first byte now */
5572
5573 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5574 continue;
5575
5576
5577 /* ------------------------------------------------------------ */
5578 default: /* Other characters: check option setting */
5579 OTHER_CHAR_AFTER_QUERY:
5580 set = unset = 0;
5581 optset = &set;
5582
5583 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5584 {
5585 switch (*ptr++)
5586 {
5587 case CHAR_MINUS: optset = &unset; break;
5588
5589 case CHAR_J: /* Record that it changed in the external options */
5590 *optset |= PCRE_DUPNAMES;
5591 cd->external_flags |= PCRE_JCHANGED;
5592 break;
5593
5594 case CHAR_i: *optset |= PCRE_CASELESS; break;
5595 case CHAR_m: *optset |= PCRE_MULTILINE; break;
5596 case CHAR_s: *optset |= PCRE_DOTALL; break;
5597 case CHAR_x: *optset |= PCRE_EXTENDED; break;
5598 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5599 case CHAR_X: *optset |= PCRE_EXTRA; break;
5600
5601 default: *errorcodeptr = ERR12;
5602 ptr--; /* Correct the offset */
5603 goto FAILED;
5604 }
5605 }
5606
5607 /* Set up the changed option bits, but don't change anything yet. */
5608
5609 newoptions = (options | set) & (~unset);
5610
5611 /* If the options ended with ')' this is not the start of a nested
5612 group with option changes, so the options change at this level. If this
5613 item is right at the start of the pattern, the options can be
5614 abstracted and made external in the pre-compile phase, and ignored in
5615 the compile phase. This can be helpful when matching -- for instance in
5616 caseless checking of required bytes.
5617
5618 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5619 definitely *not* at the start of the pattern because something has been
5620 compiled. In the pre-compile phase, however, the code pointer can have
5621 that value after the start, because it gets reset as code is discarded
5622 during the pre-compile. However, this can happen only at top level - if
5623 we are within parentheses, the starting BRA will still be present. At
5624 any parenthesis level, the length value can be used to test if anything
5625 has been compiled at that level. Thus, a test for both these conditions
5626 is necessary to ensure we correctly detect the start of the pattern in
5627 both phases.
5628
5629 If we are not at the pattern start, compile code to change the ims
5630 options if this setting actually changes any of them, and reset the
5631 greedy defaults and the case value for firstbyte and reqbyte. */
5632
5633 if (*ptr == CHAR_RIGHT_PARENTHESIS)
5634 {
5635 if (code == cd->start_code + 1 + LINK_SIZE &&
5636 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5637 {
5638 cd->external_options = newoptions;
5639 }
5640 else
5641 {
5642 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5643 {
5644 *code++ = OP_OPT;
5645 *code++ = newoptions & PCRE_IMS;
5646 }
5647 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5648 greedy_non_default = greedy_default ^ 1;
5649 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5650 }
5651
5652 /* Change options at this level, and pass them back for use
5653 in subsequent branches. When not at the start of the pattern, this
5654 information is also necessary so that a resetting item can be
5655 compiled at the end of a group (if we are in a group). */
5656
5657 *optionsptr = options = newoptions;
5658 previous = NULL; /* This item can't be repeated */
5659 continue; /* It is complete */
5660 }
5661
5662 /* If the options ended with ':' we are heading into a nested group
5663 with possible change of options. Such groups are non-capturing and are
5664 not assertions of any kind. All we need to do is skip over the ':';
5665 the newoptions value is handled below. */
5666
5667 bravalue = OP_BRA;
5668 ptr++;
5669 } /* End of switch for character following (? */
5670 } /* End of (? handling */
5671
5672 /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
5673 is set, all unadorned brackets become non-capturing and behave like (?:...)
5674 brackets. */
5675
5676 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5677 {
5678 bravalue = OP_BRA;
5679 }
5680
5681 /* Else we have a capturing group. */
5682
5683 else
5684 {
5685 NUMBERED_GROUP:
5686 cd->bracount += 1;
5687 PUT2(code, 1+LINK_SIZE, cd->bracount);
5688 skipbytes = 2;
5689 }
5690
5691 /* Process nested bracketed regex. Assertions may not be repeated, but
5692 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5693 non-register variable in order to be able to pass its address because some
5694 compilers complain otherwise. Pass in a new setting for the ims options if
5695 they have changed. */
5696
5697 previous = (bravalue >= OP_ONCE)? code : NULL;
5698 *code = bravalue;
5699 tempcode = code;
5700 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5701 length_prevgroup = 0; /* Initialize for pre-compile phase */
5702
5703 if (!compile_regex(
5704 newoptions, /* The complete new option state */
5705 options & PCRE_IMS, /* The previous ims option state */
5706 &tempcode, /* Where to put code (updated) */
5707 &ptr, /* Input pointer (updated) */
5708 errorcodeptr, /* Where to put an error message */
5709 (bravalue == OP_ASSERTBACK ||
5710 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5711 reset_bracount, /* True if (?| group */
5712 skipbytes, /* Skip over bracket number */
5713 &subfirstbyte, /* For possible first char */
5714 &subreqbyte, /* For possible last char */
5715 bcptr, /* Current branch chain */
5716 cd, /* Tables block */
5717 (lengthptr == NULL)? NULL : /* Actual compile phase */
5718 &length_prevgroup /* Pre-compile phase */
5719 ))
5720 goto FAILED;
5721
5722 /* At the end of compiling, code is still pointing to the start of the
5723 group, while tempcode has been updated to point past the end of the group
5724 and any option resetting that may follow it. The pattern pointer (ptr)
5725 is on the bracket. */
5726
5727 /* If this is a conditional bracket, check that there are no more than
5728 two branches in the group, or just one if it's a DEFINE group. We do this
5729 in the real compile phase, not in the pre-pass, where the whole group may
5730 not be available. */
5731
5732 if (bravalue == OP_COND && lengthptr == NULL)
5733 {
5734 uschar *tc = code;
5735 int condcount = 0;
5736
5737 do {
5738 condcount++;
5739 tc += GET(tc,1);
5740 }
5741 while (*tc != OP_KET);
5742
5743 /* A DEFINE group is never obeyed inline (the "condition" is always
5744 false). It must have only one branch. */
5745
5746 if (code[LINK_SIZE+1] == OP_DEF)
5747 {
5748 if (condcount > 1)
5749 {
5750 *errorcodeptr = ERR54;
5751 goto FAILED;
5752 }
5753 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5754 }
5755
5756 /* A "normal" conditional group. If there is just one branch, we must not
5757 make use of its firstbyte or reqbyte, because this is equivalent to an
5758 empty second branch. */
5759
5760 else
5761 {
5762 if (condcount > 2)
5763 {
5764 *errorcodeptr = ERR27;
5765 goto FAILED;
5766 }
5767 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5768 }
5769 }
5770
5771 /* Error if hit end of pattern */
5772
5773 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5774 {
5775 *errorcodeptr = ERR14;
5776 goto FAILED;
5777 }
5778
5779 /* In the pre-compile phase, update the length by the length of the group,
5780 less the brackets at either end. Then reduce the compiled code to just a
5781 set of non-capturing brackets so that it doesn't use much memory if it is
5782 duplicated by a quantifier.*/
5783
5784 if (lengthptr != NULL)
5785 {
5786 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5787 {
5788 *errorcodeptr = ERR20;
5789 goto FAILED;
5790 }
5791 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5792 *code++ = OP_BRA;
5793 PUTINC(code, 0, 1 + LINK_SIZE);
5794 *code++ = OP_KET;
5795 PUTINC(code, 0, 1 + LINK_SIZE);
5796 break; /* No need to waste time with special character handling */
5797 }
5798
5799 /* Otherwise update the main code pointer to the end of the group. */
5800
5801 code = tempcode;
5802
5803 /* For a DEFINE group, required and first character settings are not
5804 relevant. */
5805
5806 if (bravalue == OP_DEF) break;
5807
5808 /* Handle updating of the required and first characters for other types of
5809 group. Update for normal brackets of all kinds, and conditions with two
5810 branches (see code above). If the bracket is followed by a quantifier with
5811 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5812 zerofirstbyte outside the main loop so that they can be accessed for the
5813 back off. */
5814
5815 zeroreqbyte = reqbyte;
5816 zerofirstbyte = firstbyte;
5817 groupsetfirstbyte = FALSE;
5818
5819 if (bravalue >= OP_ONCE)
5820 {
5821 /* If we have not yet set a firstbyte in this branch, take it from the
5822 subpattern, remembering that it was set here so that a repeat of more
5823 than one can replicate it as reqbyte if necessary. If the subpattern has
5824 no firstbyte, set "none" for the whole branch. In both cases, a zero
5825 repeat forces firstbyte to "none". */
5826
5827 if (firstbyte == REQ_UNSET)
5828 {
5829 if (subfirstbyte >= 0)
5830 {
5831 firstbyte = subfirstbyte;
5832 groupsetfirstbyte = TRUE;
5833 }
5834 else firstbyte = REQ_NONE;
5835 zerofirstbyte = REQ_NONE;
5836 }
5837
5838 /* If firstbyte was previously set, convert the subpattern's firstbyte
5839 into reqbyte if there wasn't one, using the vary flag that was in
5840 existence beforehand. */
5841
5842 else if (subfirstbyte >= 0 && subreqbyte < 0)
5843 subreqbyte = subfirstbyte | tempreqvary;
5844
5845 /* If the subpattern set a required byte (or set a first byte that isn't
5846 really the first byte - see above), set it. */
5847
5848 if (subreqbyte >= 0) reqbyte = subreqbyte;
5849 }
5850
5851 /* For a forward assertion, we take the reqbyte, if set. This can be
5852 helpful if the pattern that follows the assertion doesn't set a different
5853 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5854 for an assertion, however because it leads to incorrect effect for patterns
5855 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5856 of a firstbyte. This is overcome by a scan at the end if there's no
5857 firstbyte, looking for an asserted first char. */
5858
5859 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5860 break; /* End of processing '(' */
5861
5862
5863 /* ===================================================================*/
5864 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5865 are arranged to be the negation of the corresponding OP_values in the
5866 default case when PCRE_UCP is not set. For the back references, the values
5867 are ESC_REF plus the reference number. Only back references and those types
5868 that consume a character may be repeated. We can test for values between
5869 ESC_b and ESC_Z for the latter; this may have to change if any new ones are
5870 ever created. */
5871
5872 case CHAR_BACKSLASH:
5873 tempptr = ptr;
5874 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5875 if (*errorcodeptr != 0) goto FAILED;
5876
5877 if (c < 0)
5878 {
5879 if (-c == ESC_Q) /* Handle start of quoted string */
5880 {
5881 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5882 ptr += 2; /* avoid empty string */
5883 else inescq = TRUE;
5884 continue;
5885 }
5886
5887 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5888
5889 /* For metasequences that actually match a character, we disable the
5890 setting of a first character if it hasn't already been set. */
5891
5892 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5893 firstbyte = REQ_NONE;
5894
5895 /* Set values to reset to if this is followed by a zero repeat. */
5896
5897 zerofirstbyte = firstbyte;
5898 zeroreqbyte = reqbyte;
5899
5900 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5901 is a subroutine call by number (Oniguruma syntax). In fact, the value
5902 -ESC_g is returned only for these cases. So we don't need to check for <
5903 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5904 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5905 that is a synonym for a named back reference). */
5906
5907 if (-c == ESC_g)
5908 {
5909 const uschar *p;
5910 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5911 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5912 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5913
5914 /* These two statements stop the compiler from warning about possibly
5915 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5916 fact, because we actually check for a number below, the paths that
5917 would actually be in error are never taken. */
5918
5919 skipbytes = 0;
5920 reset_bracount = FALSE;
5921
5922 /* Test for a name */
5923
5924 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5925 {
5926 BOOL isnumber = TRUE;
5927 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5928 {
5929 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5930 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5931 }
5932 if (*p != terminator)
5933 {
5934 *errorcodeptr = ERR57;
5935 break;
5936 }
5937 if (isnumber)
5938 {
5939 ptr++;
5940 goto HANDLE_NUMERICAL_RECURSION;
5941 }
5942 is_recurse = TRUE;
5943 goto NAMED_REF_OR_RECURSE;
5944 }
5945
5946 /* Test a signed number in angle brackets or quotes. */
5947
5948 p = ptr + 2;
5949 while ((digitab[*p] & ctype_digit) != 0) p++;
5950 if (*p != terminator)
5951 {
5952 *errorcodeptr = ERR57;
5953 break;
5954 }
5955 ptr++;
5956 goto HANDLE_NUMERICAL_RECURSION;
5957 }
5958
5959 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5960 We also support \k{name} (.NET syntax) */
5961
5962 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5963 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5964 {
5965 is_recurse = FALSE;
5966 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5967 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5968 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5969 goto NAMED_REF_OR_RECURSE;
5970 }
5971
5972 /* Back references are handled specially; must disable firstbyte if
5973 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5974 ':' later. */
5975
5976 if (-c >= ESC_REF)
5977 {
5978 open_capitem *oc;
5979 recno = -c - ESC_REF;
5980
5981 HANDLE_REFERENCE: /* Come here from named backref handling */
5982 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5983 previous = code;
5984 *code++ = OP_REF;
5985 PUT2INC(code, 0, recno);
5986 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5987 if (recno > cd->top_backref) cd->top_backref = recno;
5988
5989 /* Check to see if this back reference is recursive, that is, it
5990 is inside the group that it references. A flag is set so that the
5991 group can be made atomic. */
5992
5993 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5994 {
5995 if (oc->number == recno)
5996 {
5997 oc->flag = TRUE;
5998 break;
5999 }
6000 }
6001 }
6002
6003 /* So are Unicode property matches, if supported. */
6004
6005 #ifdef SUPPORT_UCP
6006 else if (-c == ESC_P || -c == ESC_p)
6007 {
6008 BOOL negated;
6009 int pdata;
6010 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
6011 if (ptype < 0) goto FAILED;
6012 previous = code;
6013 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
6014 *code++ = ptype;
6015 *code++ = pdata;
6016 }
6017 #else
6018
6019 /* If Unicode properties are not supported, \X, \P, and \p are not
6020 allowed. */
6021
6022 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
6023 {
6024 *errorcodeptr = ERR45;
6025 goto FAILED;
6026 }
6027 #endif
6028
6029 /* For the rest (including \X when Unicode properties are supported), we
6030 can obtain the OP value by negating the escape value in the default
6031 situation when PCRE_UCP is not set. When it *is* set, we substitute
6032 Unicode property tests. */
6033
6034 else
6035 {
6036 #ifdef SUPPORT_UCP
6037 if (-c >= ESC_DU && -c <= ESC_wu)
6038 {
6039 nestptr = ptr + 1; /* Where to resume */
6040 ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
6041 }
6042 else
6043 #endif
6044 {
6045 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6046 *code++ = -c;
6047 }
6048 }
6049 continue;
6050 }
6051
6052 /* We have a data character whose value is in c. In UTF-8 mode it may have
6053 a value > 127. We set its representation in the length/buffer, and then
6054 handle it as a data character. */
6055
6056 #ifdef SUPPORT_UTF8
6057 if (utf8 && c > 127)
6058 mclength = _pcre_ord2utf8(c, mcbuffer);
6059 else
6060 #endif
6061
6062 {
6063 mcbuffer[0] = c;
6064 mclength = 1;
6065 }
6066 goto ONE_CHAR;
6067
6068
6069 /* ===================================================================*/
6070 /* Handle a literal character. It is guaranteed not to be whitespace or #
6071 when the extended flag is set. If we are in UTF-8 mode, it may be a
6072 multi-byte literal character. */
6073
6074 default:
6075 NORMAL_CHAR:
6076 mclength = 1;
6077 mcbuffer[0] = c;
6078
6079 #ifdef SUPPORT_UTF8
6080 if (utf8 && c >= 0xc0)
6081 {
6082 while ((ptr[1] & 0xc0) == 0x80)
6083 mcbuffer[mclength++] = *(++ptr);
6084 }
6085 #endif
6086
6087 /* At this point we have the character's bytes in mcbuffer, and the length
6088 in mclength. When not in UTF-8 mode, the length is always 1. */
6089
6090 ONE_CHAR:
6091 previous = code;
6092 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
6093 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
6094
6095 /* Remember if \r or \n were seen */
6096
6097 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
6098 cd->external_flags |= PCRE_HASCRORLF;
6099
6100 /* Set the first and required bytes appropriately. If no previous first
6101 byte, set it from this character, but revert to none on a zero repeat.
6102 Otherwise, leave the firstbyte value alone, and don't change it on a zero
6103 repeat. */
6104
6105 if (firstbyte == REQ_UNSET)
6106 {
6107 zerofirstbyte = REQ_NONE;
6108 zeroreqbyte = reqbyte;
6109
6110 /* If the character is more than one byte long, we can set firstbyte
6111 only if it is not to be matched caselessly. */
6112
6113 if (mclength == 1 || req_caseopt == 0)
6114 {
6115 firstbyte = mcbuffer[0] | req_caseopt;
6116 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
6117 }
6118 else firstbyte = reqbyte = REQ_NONE;
6119 }
6120
6121 /* firstbyte was previously set; we can set reqbyte only if the length is
6122 1 or the matching is caseful. */
6123
6124 else
6125 {
6126 zerofirstbyte = firstbyte;
6127 zeroreqbyte = reqbyte;
6128 if (mclength == 1 || req_caseopt == 0)
6129 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
6130 }
6131
6132 break; /* End of literal character handling */
6133 }
6134 } /* end of big loop */
6135
6136
6137 /* Control never reaches here by falling through, only by a goto for all the
6138 error states. Pass back the position in the pattern so that it can be displayed
6139 to the user for diagnosing the error. */
6140
6141 FAILED:
6142 *ptrptr = ptr;
6143 return FALSE;
6144 }
6145
6146
6147
6148
6149 /*************************************************
6150 * Compile sequence of alternatives *
6151 *************************************************/
6152
6153 /* On entry, ptr is pointing past the bracket character, but on return it
6154 points to the closing bracket, or vertical bar, or end of string. The code
6155 variable is pointing at the byte into which the BRA operator has been stored.
6156 If the ims options are changed at the start (for a (?ims: group) or during any
6157 branch, we need to insert an OP_OPT item at the start of every following branch
6158 to ensure they get set correctly at run time, and also pass the new options
6159 into every subsequent branch compile.
6160
6161 This function is used during the pre-compile phase when we are trying to find
6162 out the amount of memory needed, as well as during the real compile phase. The
6163 value of lengthptr distinguishes the two phases.
6164
6165 Arguments:
6166 options option bits, including any changes for this subpattern
6167 oldims previous settings of ims option bits
6168 codeptr -> the address of the current code pointer
6169 ptrptr -> the address of the current pattern pointer
6170 errorcodeptr -> pointer to error code variable
6171 lookbehind TRUE if this is a lookbehind assertion
6172 reset_bracount TRUE to reset the count for each branch
6173 skipbytes skip this many bytes at start (for brackets and OP_COND)
6174 firstbyteptr place to put the first required character, or a negative number
6175 reqbyteptr place to put the last required character, or a negative number
6176 bcptr pointer to the chain of currently open branches
6177 cd points to the data block with tables pointers etc.
6178 lengthptr NULL during the real compile phase
6179 points to length accumulator during pre-compile phase
6180
6181 Returns: TRUE on success
6182 */
6183
6184 static BOOL
6185 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
6186 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
6187 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
6188 int *lengthptr)
6189 {
6190 const uschar *ptr = *ptrptr;
6191 uschar *code = *codeptr;
6192 uschar *last_branch = code;
6193 uschar *start_bracket = code;
6194 uschar *reverse_count = NULL;
6195 open_capitem capitem;
6196 int capnumber = 0;
6197 int firstbyte, reqbyte;
6198 int branchfirstbyte, branchreqbyte;
6199 int length;
6200 int orig_bracount;
6201 int max_bracount;
6202 int old_external_options = cd->external_options;
6203 branch_chain bc;
6204
6205 bc.outer = bcptr;
6206 bc.current_branch = code;
6207
6208 firstbyte = reqbyte = REQ_UNSET;
6209
6210 /* Accumulate the length for use in the pre-compile phase. Start with the
6211 length of the BRA and KET and any extra bytes that are required at the
6212 beginning. We accumulate in a local variable to save frequent testing of
6213 lenthptr for NULL. We cannot do this by looking at the value of code at the
6214 start and end of each alternative, because compiled items are discarded during
6215 the pre-compile phase so that the work space is not exceeded. */
6216
6217 length = 2 + 2*LINK_SIZE + skipbytes;
6218
6219 /* WARNING: If the above line is changed for any reason, you must also change
6220 the code that abstracts option settings at the start of the pattern and makes
6221 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
6222 pre-compile phase to find out whether anything has yet been compiled or not. */
6223
6224 /* If this is a capturing subpattern, add to the chain of open capturing items
6225 so that we can detect them if (*ACCEPT) is encountered. This is also used to
6226 detect groups that contain recursive back references to themselves. */
6227
6228 if (*code == OP_CBRA)
6229 {
6230 capnumber = GET2(code, 1 + LINK_SIZE);
6231 capitem.number = capnumber;
6232 capitem.next = cd->open_caps;
6233 capitem.flag = FALSE;
6234 cd->open_caps = &capitem;
6235 }
6236
6237 /* Offset is set zero to mark that this bracket is still open */
6238
6239 PUT(code, 1, 0);
6240 code += 1 + LINK_SIZE + skipbytes;
6241
6242 /* Loop for each alternative branch */
6243
6244 orig_bracount = max_bracount = cd->bracount;
6245 for (;;)
6246 {
6247 /* For a (?| group, reset the capturing bracket count so that each branch
6248 uses the same numbers. */
6249
6250 if (reset_bracount) cd->bracount = orig_bracount;
6251
6252 /* Handle a change of ims options at the start of the branch */
6253
6254 if ((options & PCRE_IMS) != oldims)
6255 {
6256 *code++ = OP_OPT;
6257 *code++ = options & PCRE_IMS;
6258 length += 2;
6259 }
6260
6261 /* Set up dummy OP_REVERSE if lookbehind assertion */
6262
6263 if (lookbehind)
6264 {
6265 *code++ = OP_REVERSE;
6266 reverse_count = code;
6267 PUTINC(code, 0, 0);
6268 length += 1 + LINK_SIZE;
6269 }
6270
6271 /* Now compile the branch; in the pre-compile phase its length gets added
6272 into the length. */
6273
6274 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
6275 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
6276 {
6277 *ptrptr = ptr;
6278 return FALSE;
6279 }
6280
6281 /* If the external options have changed during this branch, it means that we
6282 are at the top level, and a leading option setting has been encountered. We
6283 need to re-set the original option values to take account of this so that,
6284 during the pre-compile phase, we know to allow for a re-set at the start of
6285 subsequent branches. */
6286
6287 if (old_external_options != cd->external_options)
6288 oldims = cd->external_options & PCRE_IMS;
6289
6290 /* Keep the highest bracket count in case (?| was used and some branch
6291 has fewer than the rest. */
6292
6293 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
6294
6295 /* In the real compile phase, there is some post-processing to be done. */
6296
6297 if (lengthptr == NULL)
6298 {
6299 /* If this is the first branch, the firstbyte and reqbyte values for the
6300 branch become the values for the regex. */
6301
6302 if (*last_branch != OP_ALT)
6303 {
6304 firstbyte = branchfirstbyte;
6305 reqbyte = branchreqbyte;
6306 }
6307
6308 /* If this is not the first branch, the first char and reqbyte have to
6309 match the values from all the previous branches, except that if the
6310 previous value for reqbyte didn't have REQ_VARY set, it can still match,
6311 and we set REQ_VARY for the regex. */
6312
6313 else
6314 {
6315 /* If we previously had a firstbyte, but it doesn't match the new branch,
6316 we have to abandon the firstbyte for the regex, but if there was
6317 previously no reqbyte, it takes on the value of the old firstbyte. */
6318
6319 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
6320 {
6321 if (reqbyte < 0) reqbyte = firstbyte;
6322 firstbyte = REQ_NONE;
6323 }
6324
6325 /* If we (now or from before) have no firstbyte, a firstbyte from the
6326 branch becomes a reqbyte if there isn't a branch reqbyte. */
6327
6328 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
6329 branchreqbyte = branchfirstbyte;
6330
6331 /* Now ensure that the reqbytes match */
6332
6333 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
6334 reqbyte = REQ_NONE;
6335 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
6336 }
6337
6338 /* If lookbehind, check that this branch matches a fixed-length string, and
6339 put the length into the OP_REVERSE item. Temporarily mark the end of the
6340 branch with OP_END. If the branch contains OP_RECURSE, the result is -3
6341 because there may be forward references that we can't check here. Set a
6342 flag to cause another lookbehind check at the end. Why not do it all at the
6343 end? Because common, erroneous checks are picked up here and the offset of
6344 the problem can be shown. */
6345
6346 if (lookbehind)
6347 {
6348 int fixed_length;
6349 *code = OP_END;
6350 fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
6351 DPRINTF(("fixed length = %d\n", fixed_length));
6352 if (fixed_length == -3)
6353 {
6354 cd->check_lookbehind = TRUE;
6355 }
6356 else if (fixed_length < 0)
6357 {
6358 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
6359 *ptrptr = ptr;
6360 return FALSE;
6361 }
6362 else { PUT(reverse_count, 0, fixed_length); }
6363 }
6364 }
6365
6366 /* Reached end of expression, either ')' or end of pattern. In the real
6367 compile phase, go back through the alternative branches and reverse the chain
6368 of offsets, with the field in the BRA item now becoming an offset to the
6369 first alternative. If there are no alternatives, it points to the end of the
6370 group. The length in the terminating ket is always the length of the whole
6371 bracketed item. If any of the ims options were changed inside the group,
6372 compile a resetting op-code following, except at the very end of the pattern.
6373 Return leaving the pointer at the terminating char. */
6374
6375 if (*ptr != CHAR_VERTICAL_LINE)
6376 {
6377 if (lengthptr == NULL)
6378 {
6379 int branch_length = (int)(code - last_branch);
6380 do
6381 {
6382 int prev_length = GET(last_branch, 1);
6383 PUT(last_branch, 1, branch_length);
6384 branch_length = prev_length;
6385 last_branch -= branch_length;
6386 }
6387 while (branch_length > 0);
6388 }
6389
6390 /* Fill in the ket */
6391
6392 *code = OP_KET;
6393 PUT(code, 1, (int)(code - start_bracket));
6394 code += 1 + LINK_SIZE;
6395
6396 /* If it was a capturing subpattern, check to see if it contained any
6397 recursive back references. If so, we must wrap it in atomic brackets.
6398 In any event, remove the block from the chain. */
6399
6400 if (capnumber > 0)
6401 {
6402 if (cd->open_caps->flag)
6403 {
6404 memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
6405 code - start_bracket);
6406 *start_bracket = OP_ONCE;
6407 code += 1 + LINK_SIZE;
6408 PUT(start_bracket, 1, (int)(code - start_bracket));
6409 *code = OP_KET;
6410 PUT(code, 1, (int)(code - start_bracket));
6411 code += 1 + LINK_SIZE;
6412 length += 2 + 2*LINK_SIZE;
6413 }
6414 cd->open_caps = cd->open_caps->next;
6415 }
6416
6417 /* Reset options if needed. */
6418
6419 if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
6420 {
6421 *code++ = OP_OPT;
6422 *code++ = oldims;
6423 length += 2;
6424 }
6425
6426 /* Retain the highest bracket number, in case resetting was used. */
6427
6428 cd->bracount = max_bracount;
6429
6430 /* Set values to pass back */
6431
6432 *codeptr = code;
6433 *ptrptr = ptr;
6434 *firstbyteptr = firstbyte;
6435 *reqbyteptr = reqbyte;
6436 if (lengthptr != NULL)
6437 {
6438 if (OFLOW_MAX - *lengthptr < length)
6439 {
6440 *errorcodeptr = ERR20;
6441 return FALSE;
6442 }
6443 *lengthptr += length;
6444 }
6445 return TRUE;
6446 }
6447
6448 /* Another branch follows. In the pre-compile phase, we can move the code
6449 pointer back to where it was for the start of the first branch. (That is,
6450 pretend that each branch is the only one.)
6451
6452 In the real compile phase, insert an ALT node. Its length field points back
6453 to the previous branch while the bracket remains open. At the end the chain
6454 is reversed. It's done like this so that the start of the bracket has a
6455 zero offset until it is closed, making it possible to detect recursion. */
6456
6457 if (lengthptr != NULL)
6458 {
6459 code = *codeptr + 1 + LINK_SIZE + skipbytes;
6460 length += 1 + LINK_SIZE;
6461 }
6462 else
6463 {
6464 *code = OP_ALT;
6465 PUT(code, 1, (int)(code - last_branch));
6466 bc.current_branch = last_branch = code;
6467 code += 1 + LINK_SIZE;
6468 }
6469
6470 ptr++;
6471 }
6472 /* Control never reaches here */
6473 }
6474
6475
6476
6477
6478 /*************************************************
6479 * Check for anchored expression *
6480 *************************************************/
6481
6482 /* Try to find out if this is an anchored regular expression. Consider each
6483 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6484 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6485 it's anchored. However, if this is a multiline pattern, then only OP_SOD
6486 counts, since OP_CIRC can match in the middle.
6487
6488 We can also consider a regex to be anchored if OP_SOM starts all its branches.
6489 This is the code for \G, which means "match at start of match position, taking
6490 into account the match offset".
6491
6492 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6493 because that will try the rest of the pattern at all possible matching points,
6494 so there is no point trying again.... er ....
6495
6496 .... except when the .* appears inside capturing parentheses, and there is a
6497 subsequent back reference to those parentheses. We haven't enough information
6498 to catch that case precisely.
6499
6500 At first, the best we could do was to detect when .* was in capturing brackets
6501 and the highest back reference was greater than or equal to that level.
6502 However, by keeping a bitmap of the first 31 back references, we can catch some
6503 of the more common cases more precisely.
6504
6505 Arguments:
6506 code points to start of expression (the bracket)
6507 options points to the options setting
6508 bracket_map a bitmap of which brackets we are inside while testing; this
6509 handles up to substring 31; after that we just have to take
6510 the less precise approach
6511 backref_map the back reference bitmap
6512
6513 Returns: TRUE or FALSE
6514 */
6515
6516 static BOOL
6517 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
6518 unsigned int backref_map)
6519 {
6520 do {
6521 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6522 options, PCRE_MULTILINE, FALSE);
6523 register int op = *scode;
6524
6525 /* Non-capturing brackets */
6526
6527 if (op == OP_BRA)
6528 {
6529 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6530 }
6531
6532 /* Capturing brackets */
6533
6534 else if (op == OP_CBRA)
6535 {
6536 int n = GET2(scode, 1+LINK_SIZE);
6537 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6538 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
6539 }
6540
6541 /* Other brackets */
6542
6543 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6544 {
6545 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6546 }
6547
6548 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6549 it isn't in brackets that are or may be referenced. */
6550
6551 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6552 op == OP_TYPEPOSSTAR))
6553 {
6554 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6555 return FALSE;
6556 }
6557
6558 /* Check for explicit anchoring */
6559
6560 else if (op != OP_SOD && op != OP_SOM &&
6561 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6562 return FALSE;
6563 code += GET(code, 1);
6564 }
6565 while (*code == OP_ALT); /* Loop for each alternative */
6566 return TRUE;
6567 }
6568
6569
6570
6571 /*************************************************
6572 * Check for starting with ^ or .* *
6573 *************************************************/
6574
6575 /* This is called to find out if every branch starts with ^ or .* so that
6576 "first char" processing can be done to speed things up in multiline
6577 matching and for non-DOTALL patterns that start with .* (which must start at
6578 the beginning or after \n). As in the case of is_anchored() (see above), we
6579 have to take account of back references to capturing brackets that contain .*
6580 because in that case we can't make the assumption.
6581
6582 Arguments:
6583 code points to start of expression (the bracket)
6584 bracket_map a bitmap of which brackets we are inside while testing; this
6585 handles up to substring 31; after that we just have to take
6586 the less precise approach
6587 backref_map the back reference bitmap
6588
6589 Returns: TRUE or FALSE
6590 */
6591
6592 static BOOL
6593 is_startline(const uschar *code, unsigned int bracket_map,
6594 unsigned int backref_map)
6595 {
6596 do {
6597 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6598 NULL, 0, FALSE);
6599 register int op = *scode;
6600
6601 /* If we are at the start of a conditional assertion group, *both* the
6602 conditional assertion *and* what follows the condition must satisfy the test
6603 for start of line. Other kinds of condition fail. Note that there may be an
6604 auto-callout at the start of a condition. */
6605
6606 if (op == OP_COND)
6607 {
6608 scode += 1 + LINK_SIZE;
6609 if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6610 switch (*scode)
6611 {
6612 case OP_CREF:
6613 case OP_NCREF:
6614 case OP_RREF:
6615 case OP_NRREF:
6616 case OP_DEF:
6617 return FALSE;
6618
6619 default: /* Assertion */
6620 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6621 do scode += GET(scode, 1); while (*scode == OP_ALT);
6622 scode += 1 + LINK_SIZE;
6623 break;
6624 }
6625 scode = first_significant_code(scode, NULL, 0, FALSE);
6626 op = *scode;
6627 }
6628
6629 /* Non-capturing brackets */
6630
6631 if (op == OP_BRA)
6632 {
6633 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6634 }
6635
6636 /* Capturing brackets */
6637
6638 else if (op == OP_CBRA)
6639 {
6640 int n = GET2(scode, 1+LINK_SIZE);
6641 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6642 if (!is_startline(scode, new_map, backref_map)) return FALSE;
6643 }
6644
6645 /* Other brackets */
6646
6647 else if (op == OP_ASSERT || op == OP_ONCE)
6648 {
6649 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6650 }
6651
6652 /* .* means "start at start or after \n" if it isn't in brackets that
6653 may be referenced. */
6654
6655 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6656 {
6657 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6658 }
6659
6660 /* Check for explicit circumflex */
6661
6662 else if (op != OP_CIRC) return FALSE;
6663
6664 /* Move on to the next alternative */
6665
6666 code += GET(code, 1);
6667 }
6668 while (*code == OP_ALT); /* Loop for each alternative */