/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory | Revision Log


Revision 461 - (show annotations) (download)
Mon Oct 5 10:59:35 2009 UTC (4 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 219677 byte(s)
Tidy up, remove trailing spaces, etc. for 8.00-RC1.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: bit (b % 8) of byte
(b / 8) in the map a is turned on. Both arguments are parenthesized in the
expansion so that an expression argument, for example SETBIT(map, c + 1), is
indexed and shifted as a whole. Note that b is evaluated twice, so it must not
have side effects. */

#define SETBIT(a,b) (a)[(b)/8] |= (1 << ((b)%8))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC
101
102 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103 in UTF-8 mode. */
104
105 static const short int escapes[] = { /* indexed by (character - '0'), covering '0' to 'z' in ASCII order */
106 0, 0, /* '0' '1' */
107 0, 0, /* '2' '3' */
108 0, 0, /* '4' '5' */
109 0, 0, /* '6' '7' */
110 0, 0, /* '8' '9' */
111 CHAR_COLON, CHAR_SEMICOLON, /* ':' ';' */
112 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* '<' '=' */
113 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* '>' '?' */
114 CHAR_COMMERCIAL_AT, -ESC_A, /* '@' 'A' */
115 -ESC_B, -ESC_C, /* 'B' 'C' */
116 -ESC_D, -ESC_E, /* 'D' 'E' */
117 0, -ESC_G, /* 'F' 'G' */
118 -ESC_H, 0, /* 'H' 'I' */
119 0, -ESC_K, /* 'J' 'K' */
120 0, 0, /* 'L' 'M' */
121 0, 0, /* 'N' 'O' */
122 -ESC_P, -ESC_Q, /* 'P' 'Q' */
123 -ESC_R, -ESC_S, /* 'R' 'S' */
124 0, 0, /* 'T' 'U' */
125 -ESC_V, -ESC_W, /* 'V' 'W' */
126 -ESC_X, 0, /* 'X' 'Y' */
127 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* 'Z' '[' */
128 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ']' */
129 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* '^' '_' */
130 CHAR_GRAVE_ACCENT, 7, /* '`' 'a' (the data value 7 is returned for 'a') */
131 -ESC_b, 0, /* 'b' 'c' */
132 -ESC_d, ESC_e, /* 'd' 'e' */
133 ESC_f, 0, /* 'f' 'g' */
134 -ESC_h, 0, /* 'h' 'i' */
135 0, -ESC_k, /* 'j' 'k' */
136 0, 0, /* 'l' 'm' */
137 ESC_n, 0, /* 'n' 'o' */
138 -ESC_p, 0, /* 'p' 'q' */
139 ESC_r, -ESC_s, /* 'r' 's' */
140 ESC_tee, 0, /* 't' 'u' */
141 -ESC_v, -ESC_w, /* 'v' 'w' */
142 0, 0, /* 'x' 'y' */
143 -ESC_z /* 'z' */
144 };
145
146 #else
147
148 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149
150 static const short int escapes[] = {
151 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
152 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
153 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
154 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
155 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
156 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
157 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
158 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
159 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
160 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
161 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
162 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
163 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
164 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
165 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
166 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
167 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
168 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
169 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
170 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
171 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
172 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
173 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
174 };
175 #endif
176
177
178 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179 searched linearly. Put all the names into a single string, in order to reduce
180 the number of relocations when a shared library is dynamically linked. The
181 string is built from string macros so that it works in UTF-8 mode on EBCDIC
182 platforms. */
183
184 typedef struct verbitem {
185 int len;
186 int op;
187 } verbitem;
188
189 static const char verbnames[] =
190 STRING_ACCEPT0
191 STRING_COMMIT0
192 STRING_F0
193 STRING_FAIL0
194 STRING_PRUNE0
195 STRING_SKIP0
196 STRING_THEN;
197
198 static const verbitem verbs[] = {
199 { 6, OP_ACCEPT },
200 { 6, OP_COMMIT },
201 { 1, OP_FAIL },
202 { 4, OP_FAIL },
203 { 5, OP_PRUNE },
204 { 4, OP_SKIP },
205 { 4, OP_THEN }
206 };
207
208 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
209
210
211 /* Tables of names of POSIX character classes and their lengths. The names are
212 now all in a single string, to reduce the number of relocations when a shared
213 library is dynamically loaded. The list of lengths is terminated by a zero
214 length entry. The first three must be alpha, lower, upper, as this is assumed
215 for handling case independence. */
216
217 static const char posix_names[] =
218 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221 STRING_word0 STRING_xdigit;
222
223 static const uschar posix_name_lengths[] = {
224 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
225
226 /* Table of class bit maps for each POSIX class. Each class is formed from a
227 base map, with an optional addition or removal of another map. Then, for some
228 classes, there is some additional tweaking: for [:blank:] the vertical space
229 characters are removed, and for [:alpha:] and [:alnum:] the underscore
230 character is removed. The triples in the table consist of the base map offset,
231 second map offset or -1 if no second map, and a non-negative value for map
232 addition or a negative value for map subtraction (if there are two maps). The
233 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
234 remove vertical space characters, 2 => remove underscore. */
235
236 static const int posix_class_maps[] = {
237 cbit_word, cbit_digit, -2, /* alpha */
238 cbit_lower, -1, 0, /* lower */
239 cbit_upper, -1, 0, /* upper */
240 cbit_word, -1, 2, /* alnum - word without underscore */
241 cbit_print, cbit_cntrl, 0, /* ascii */
242 cbit_space, -1, 1, /* blank - a GNU extension */
243 cbit_cntrl, -1, 0, /* cntrl */
244 cbit_digit, -1, 0, /* digit */
245 cbit_graph, -1, 0, /* graph */
246 cbit_print, -1, 0, /* print */
247 cbit_punct, -1, 0, /* punct */
248 cbit_space, -1, 0, /* space */
249 cbit_word, -1, 0, /* word - a Perl extension */
250 cbit_xdigit,-1, 0 /* xdigit */
251 };
252
253
254 #define STRING(a) # a
255 #define XSTRING(s) STRING(s)
256
257 /* The texts of compile-time error messages. These are "char *" because they
258 are passed to the outside world. Do not ever re-use any error number, because
259 they are documented. Always add a new error instead. Messages marked DEAD below
260 are no longer used. This used to be a table of strings, but in order to reduce
261 the number of relocations needed when a shared library is loaded dynamically,
262 it is now one long string. We cannot use a table of offsets, because the
263 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264 simply count through to the one we want - this isn't a performance issue
265 because these strings are used only when there is a compilation error. */
266
267 static const char error_texts[] =
268 "no error\0"
269 "\\ at end of pattern\0"
270 "\\c at end of pattern\0"
271 "unrecognized character follows \\\0"
272 "numbers out of order in {} quantifier\0"
273 /* 5 */
274 "number too big in {} quantifier\0"
275 "missing terminating ] for character class\0"
276 "invalid escape sequence in character class\0"
277 "range out of order in character class\0"
278 "nothing to repeat\0"
279 /* 10 */
280 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
281 "internal error: unexpected repeat\0"
282 "unrecognized character after (? or (?-\0"
283 "POSIX named classes are supported only within a class\0"
284 "missing )\0"
285 /* 15 */
286 "reference to non-existent subpattern\0"
287 "erroffset passed as NULL\0"
288 "unknown option bit(s) set\0"
289 "missing ) after comment\0"
290 "parentheses nested too deeply\0" /** DEAD **/
291 /* 20 */
292 "regular expression is too large\0"
293 "failed to get memory\0"
294 "unmatched parentheses\0"
295 "internal error: code overflow\0"
296 "unrecognized character after (?<\0"
297 /* 25 */
298 "lookbehind assertion is not fixed length\0"
299 "malformed number or name after (?(\0"
300 "conditional group contains more than two branches\0"
301 "assertion expected after (?(\0"
302 "(?R or (?[+-]digits must be followed by )\0"
303 /* 30 */
304 "unknown POSIX class name\0"
305 "POSIX collating elements are not supported\0"
306 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307 "spare error\0" /** DEAD **/
308 "character value in \\x{...} sequence is too large\0"
309 /* 35 */
310 "invalid condition (?(0)\0"
311 "\\C not allowed in lookbehind assertion\0"
312 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313 "number after (?C is > 255\0"
314 "closing ) for (?C expected\0"
315 /* 40 */
316 "recursive call could loop indefinitely\0"
317 "unrecognized character after (?P\0"
318 "syntax error in subpattern name (missing terminator)\0"
319 "two named subpatterns have the same name\0"
320 "invalid UTF-8 string\0"
321 /* 45 */
322 "support for \\P, \\p, and \\X has not been compiled\0"
323 "malformed \\P or \\p sequence\0"
324 "unknown property name after \\P or \\p\0"
325 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327 /* 50 */
328 "repeated subpattern is too long\0" /** DEAD **/
329 "octal value is greater than \\377 (not in UTF-8 mode)\0"
330 "internal error: overran compiling workspace\0"
331 "internal error: previously-checked referenced subpattern not found\0"
332 "DEFINE group contains more than one branch\0"
333 /* 55 */
334 "repeating a DEFINE group is not allowed\0"
335 "inconsistent NEWLINE options\0"
336 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337 "a numbered reference must not be zero\0"
338 "(*VERB) with an argument is not supported\0"
339 /* 60 */
340 "(*VERB) not recognized\0"
341 "number is too big\0"
342 "subpattern name expected\0"
343 "digit expected after (?+\0"
344 "] is an invalid data character in JavaScript compatibility mode\0"
345 /* 65 */
346 "different names for subpatterns of the same number are not allowed";
347
348
349 /* Table to identify digits and hex digits. This is used when compiling
350 patterns. Note that the tables in chartables are dependent on the locale, and
351 may mark arbitrary characters as digits - but the PCRE compiling code expects
352 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
353 a private table here. It costs 256 bytes, but it is a lot faster than doing
354 character value tests (at least in some simple cases I timed), and in some
355 applications one wants PCRE to compile efficiently as well as match
356 efficiently.
357
358 For convenience, we use the same bit definitions as in chartables:
359
360 0x04 decimal digit
361 0x08 hexadecimal digit
362
363 Then we can use ctype_digit and ctype_xdigit in the code. */
364
365 #ifndef EBCDIC
366
367 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
368 UTF-8 mode. */
369
370 static const unsigned char digitab[] =
371 {
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
378 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
379 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
380 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
384 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
395 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
396 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
397 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
399 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
404
405 #else
406
407 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
408
409 static const unsigned char digitab[] =
410 {
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
413 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
414 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
415 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
416 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
417 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
418 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
419 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
420 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
421 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
422 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
423 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
424 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
425 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
426 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
427 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
428 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
429 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
430 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
431 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
432 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
433 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
434 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
435 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
436 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
437 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
438 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
439 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
440 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
441 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
442 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
443
444 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
445 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
446 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
447 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
448 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
449 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
450 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
451 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
452 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
453 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
454 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
455 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
456 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
457 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
458 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
459 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
460 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
461 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
462 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
463 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
464 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
465 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
466 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
467 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
468 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
469 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
470 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
471 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
472 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
473 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
474 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
475 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
476 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
477 #endif
478
479
480 /* Definition to allow mutual recursion */
481
482 static BOOL
483 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
484 int *, int *, branch_chain *, compile_data *, int *);
485
486
487
488 /*************************************************
489 * Find an error text *
490 *************************************************/
491
492 /* The error texts are now all in one long string, to save on relocations. As
493 some of the text is of unknown length, we can't use a table of offsets.
494 Instead, just count through the strings. This is not a performance issue
495 because it happens only when there has been a compilation error.
496
497 Argument: the error number
498 Returns: pointer to the error string
499 */
500
501 static const char *
502 find_error_text(int n)
503 {
504 const char *s = error_texts;
505 for (; n > 0; n--) while (*s++ != 0) {};
506 return s;
507 }
508
509
510 /*************************************************
511 * Handle escapes *
512 *************************************************/
513
514 /* This function is called when a \ has been encountered. It either returns a
515 positive value for a simple escape such as \n, or a negative value which
516 encodes one of the more complicated things such as \d. A backreference to group
517 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
518 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
519 ptr is pointing at the \. On exit, it is on the final character of the escape
520 sequence.
521
522 Arguments:
523 ptrptr points to the pattern position pointer
524 errorcodeptr points to the errorcode variable
525 bracount number of previous extracting brackets
526 options the options bits
527 isclass TRUE if inside a character class
528
529 Returns: zero or positive => a data character
530 negative => a special escape sequence
531 on error, errorcodeptr is set
532 */
533
534 static int
535 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
536 int options, BOOL isclass)
537 {
538 BOOL utf8 = (options & PCRE_UTF8) != 0;
539 const uschar *ptr = *ptrptr + 1;
540 int c, i;
541
542 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
543 ptr--; /* Set pointer back to the last byte */
544
545 /* If backslash is at the end of the pattern, it's an error. */
546
547 if (c == 0) *errorcodeptr = ERR1;
548
549 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
550 in a table. A non-zero result is something that can be returned immediately.
551 Otherwise further processing may be required. */
552
553 #ifndef EBCDIC /* ASCII/UTF-8 coding */
554 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
555 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
556
557 #else /* EBCDIC coding */
558 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
559 else if ((i = escapes[c - 0x48]) != 0) c = i;
560 #endif
561
562 /* Escapes that need further processing, or are illegal. */
563
564 else
565 {
566 const uschar *oldptr;
567 BOOL braced, negated;
568
569 switch (c)
570 {
571 /* A number of Perl escapes are not handled by PCRE. We give an explicit
572 error. */
573
574 case CHAR_l:
575 case CHAR_L:
576 case CHAR_N:
577 case CHAR_u:
578 case CHAR_U:
579 *errorcodeptr = ERR37;
580 break;
581
582 /* \g must be followed by one of a number of specific things:
583
584 (1) A number, either plain or braced. If positive, it is an absolute
585 backreference. If negative, it is a relative backreference. This is a Perl
586 5.10 feature.
587
588 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
589 is part of Perl's movement towards a unified syntax for back references. As
590 this is synonymous with \k{name}, we fudge it up by pretending it really
591 was \k.
592
593 (3) For Oniguruma compatibility we also support \g followed by a name or a
594 number either in angle brackets or in single quotes. However, these are
595 (possibly recursive) subroutine calls, _not_ backreferences. Just return
596 the -ESC_g code (cf \k). */
597
598 case CHAR_g:
599 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
600 {
601 c = -ESC_g;
602 break;
603 }
604
605 /* Handle the Perl-compatible cases */
606
607 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
608 {
609 const uschar *p;
610 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
611 if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
612 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
613 {
614 c = -ESC_k;
615 break;
616 }
617 braced = TRUE;
618 ptr++;
619 }
620 else braced = FALSE;
621
622 if (ptr[1] == CHAR_MINUS)
623 {
624 negated = TRUE;
625 ptr++;
626 }
627 else negated = FALSE;
628
629 c = 0;
630 while ((digitab[ptr[1]] & ctype_digit) != 0)
631 c = c * 10 + *(++ptr) - CHAR_0;
632
633 if (c < 0) /* Integer overflow */
634 {
635 *errorcodeptr = ERR61;
636 break;
637 }
638
639 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
640 {
641 *errorcodeptr = ERR57;
642 break;
643 }
644
645 if (c == 0)
646 {
647 *errorcodeptr = ERR58;
648 break;
649 }
650
651 if (negated)
652 {
653 if (c > bracount)
654 {
655 *errorcodeptr = ERR15;
656 break;
657 }
658 c = bracount - (c - 1);
659 }
660
661 c = -(ESC_REF + c);
662 break;
663
664 /* The handling of escape sequences consisting of a string of digits
665 starting with one that is not zero is not straightforward. By experiment,
666 the way Perl works seems to be as follows:
667
668 Outside a character class, the digits are read as a decimal number. If the
669 number is less than 10, or if there are that many previous extracting
670 left brackets, then it is a back reference. Otherwise, up to three octal
671 digits are read to form an escaped byte. Thus \123 is likely to be octal
672 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
673 value is greater than 377, the least significant 8 bits are taken. Inside a
674 character class, \ followed by a digit is always an octal number. */
675
676 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
677 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
678
679 if (!isclass)
680 {
681 oldptr = ptr;
682 c -= CHAR_0;
683 while ((digitab[ptr[1]] & ctype_digit) != 0)
684 c = c * 10 + *(++ptr) - CHAR_0;
685 if (c < 0) /* Integer overflow */
686 {
687 *errorcodeptr = ERR61;
688 break;
689 }
690 if (c < 10 || c <= bracount)
691 {
692 c = -(ESC_REF + c);
693 break;
694 }
695 ptr = oldptr; /* Put the pointer back and fall through */
696 }
697
698 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
699 generates a binary zero byte and treats the digit as a following literal.
700 Thus we have to pull back the pointer by one. */
701
702 if ((c = *ptr) >= CHAR_8)
703 {
704 ptr--;
705 c = 0;
706 break;
707 }
708
709 /* \0 always starts an octal number, but we may drop through to here with a
710 larger first octal digit. The original code used just to take the least
711 significant 8 bits of octal numbers (I think this is what early Perls used
712 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
713 than 3 octal digits. */
714
715 case CHAR_0:
716 c -= CHAR_0;
717 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
718 c = c * 8 + *(++ptr) - CHAR_0;
719 if (!utf8 && c > 255) *errorcodeptr = ERR51;
720 break;
721
722 /* \x is complicated. \x{ddd} is a character number which can be greater
723 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
724 treated as a data character. */
725
726 case CHAR_x:
727 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
728 {
729 const uschar *pt = ptr + 2;
730 int count = 0;
731
732 c = 0;
733 while ((digitab[*pt] & ctype_xdigit) != 0)
734 {
735 register int cc = *pt++;
736 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
737 count++;
738
739 #ifndef EBCDIC /* ASCII/UTF-8 coding */
740 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
741 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
742 #else /* EBCDIC coding */
743 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
744 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
745 #endif
746 }
747
748 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
749 {
750 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
751 ptr = pt;
752 break;
753 }
754
755 /* If the sequence of hex digits does not end with '}', then we don't
756 recognize this construct; fall through to the normal \x handling. */
757 }
758
759 /* Read just a single-byte hex-defined char */
760
761 c = 0;
762 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
763 {
764 int cc; /* Some compilers don't like */
765 cc = *(++ptr); /* ++ in initializers */
766 #ifndef EBCDIC /* ASCII/UTF-8 coding */
767 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
768 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
769 #else /* EBCDIC coding */
770 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
771 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
772 #endif
773 }
774 break;
775
776 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
777 This coding is ASCII-specific, but then the whole concept of \cx is
778 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
779
780 case CHAR_c:
781 c = *(++ptr);
782 if (c == 0)
783 {
784 *errorcodeptr = ERR2;
785 break;
786 }
787
788 #ifndef EBCDIC /* ASCII/UTF-8 coding */
789 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
790 c ^= 0x40;
791 #else /* EBCDIC coding */
792 if (c >= CHAR_a && c <= CHAR_z) c += 64;
793 c ^= 0xC0;
794 #endif
795 break;
796
797 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
798 other alphanumeric following \ is an error if PCRE_EXTRA was set;
799 otherwise, for Perl compatibility, it is a literal. This code looks a bit
800 odd, but there used to be some cases other than the default, and there may
801 be again in future, so I haven't "optimized" it. */
802
803 default:
804 if ((options & PCRE_EXTRA) != 0) switch(c)
805 {
806 default:
807 *errorcodeptr = ERR3;
808 break;
809 }
810 break;
811 }
812 }
813
814 *ptrptr = ptr;
815 return c;
816 }
817
818
819
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the table
of property names. This function is only compiled when PCRE is built with
support for Unicode properties. On entry, *ptrptr points at the P or p. On
exit, it points at the final character of the escape sequence, or at the
character where parsing stopped if the sequence is malformed.

Arguments:
  ptrptr        points to the pattern position pointer (updated)
  negptr        points to a boolean that is set TRUE for negation else FALSE
  dptr          points to an int that is set to the detailed property value
  errorcodeptr  points to the error code variable; set to ERR46 when the
                  sequence is malformed, ERR47 when the name is not in the table

Returns:        type value from ucp_type_table, or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
  const uschar *p = *ptrptr;
  char propname[32];
  int ch, idx, lo, hi;

  ch = *(++p);
  if (ch == 0) goto ERROR_RETURN;

  *negptr = FALSE;

  if (ch != CHAR_LEFT_CURLY_BRACKET) {
    /* No braces: the name is just the single following character. */
    propname[0] = ch;
    propname[1] = 0;
  } else {
    /* A name in {}, optionally preceded by ^ for negation. */
    if (p[1] == CHAR_CIRCUMFLEX_ACCENT) {
      *negptr = TRUE;
      p++;
    }

    /* Copy characters up to the closing brace, leaving room for the
    terminating zero. Running out of pattern or out of buffer is an error. */
    idx = 0;
    while (idx < (int)sizeof(propname) - 1) {
      ch = *(++p);
      if (ch == 0) goto ERROR_RETURN;
      if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
      propname[idx++] = ch;
    }
    if (ch != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
    propname[idx] = 0;
  }

  *ptrptr = p;

  /* Binary chop through the table of property names. */
  lo = 0;
  hi = _pcre_utt_size;
  while (lo < hi) {
    int cmp;
    idx = (lo + hi) / 2;
    cmp = strcmp(propname, _pcre_utt_names + _pcre_utt[idx].name_offset);
    if (cmp == 0) {
      *dptr = _pcre_utt[idx].value;
      return _pcre_utt[idx].type;
    }
    if (cmp > 0)
      lo = idx + 1;
    else
      hi = idx;
  }

  /* The name is well formed but is not in the table. */
  *errorcodeptr = ERR47;
  *ptrptr = p;
  return -1;

ERROR_RETURN:
  *errorcodeptr = ERR46;
  *ptrptr = p;
  return -1;
}
#endif
909
910
911
912
913 /*************************************************
914 * Check for counted repeat *
915 *************************************************/
916
917 /* This function is called when a '{' is encountered in a place where it might
918 start a quantifier. It looks ahead to see if it really is a quantifier or not.
919 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
920 where the ddds are digits.
921
922 Arguments:
923 p pointer to the first char after '{'
924
925 Returns: TRUE or FALSE
926 */
927
928 static BOOL
929 is_counted_repeat(const uschar *p)
930 {
931 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
932 while ((digitab[*p] & ctype_digit) != 0) p++;
933 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
934
935 if (*p++ != CHAR_COMMA) return FALSE;
936 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
937
938 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
939 while ((digitab[*p] & ctype_digit) != 0) p++;
940
941 return (*p == CHAR_RIGHT_CURLY_BRACKET);
942 }
943
944
945
946 /*************************************************
947 * Read repeat counts *
948 *************************************************/
949
950 /* Read an item of the form {n,m} and return the values. This is called only
951 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
952 so the syntax is guaranteed to be correct, but we need to check the values.
953
954 Arguments:
955 p pointer to first char after '{'
956 minp pointer to int for min
957 maxp pointer to int for max
958 returned as -1 if no max
959 errorcodeptr points to error code variable
960
961 Returns: pointer to '}' on success;
962 current ptr on error, with errorcodeptr set non-zero
963 */
964
965 static const uschar *
966 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
967 {
968 int min = 0;
969 int max = -1;
970
971 /* Read the minimum value and do a paranoid check: a negative value indicates
972 an integer overflow. */
973
974 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
975 if (min < 0 || min > 65535)
976 {
977 *errorcodeptr = ERR5;
978 return p;
979 }
980
981 /* Read the maximum value if there is one, and again do a paranoid on its size.
982 Also, max must not be less than min. */
983
984 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
985 {
986 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
987 {
988 max = 0;
989 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
990 if (max < 0 || max > 65535)
991 {
992 *errorcodeptr = ERR5;
993 return p;
994 }
995 if (max < min)
996 {
997 *errorcodeptr = ERR4;
998 return p;
999 }
1000 }
1001 }
1002
1003 /* Fill in the required variables, and pass back the pointer to the terminating
1004 '}'. */
1005
1006 *minp = min;
1007 *maxp = max;
1008 return p;
1009 }
1010
1011
1012
1013 /*************************************************
1014 * Subroutine for finding forward reference *
1015 *************************************************/
1016
1017 /* This recursive function is called only from find_parens() below. The
1018 top-level call starts at the beginning of the pattern. All other calls must
1019 start at a parenthesis. It scans along a pattern's text looking for capturing
1020 subpatterns, and counting them. If it finds a named pattern that matches the
1021 name it is given, it returns its number. Alternatively, if the name is NULL, it
1022 returns when it reaches a given numbered subpattern. We know that if (?P< is
1023 encountered, the name will be terminated by '>' because that is checked in the
1024 first pass. Recursion is used to keep track of subpatterns that reset the
1025 capturing group numbers - the (?| feature.
1026
1027 Arguments:
1028 ptrptr address of the current character pointer (updated)
1029 cd compile background data
1030 name name to seek, or NULL if seeking a numbered subpattern
1031 lorn name length, or subpattern number if name is NULL
1032 xmode TRUE if we are in /x mode
1033 count pointer to the current capturing subpattern number (updated)
1034
1035 Returns: the number of the named subpattern, or -1 if not found
1036 */
1037
1038 static int
1039 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1040 BOOL xmode, int *count)
1041 {
1042 uschar *ptr = *ptrptr;
1043 int start_count = *count;
1044 int hwm_count = start_count;
1045 BOOL dup_parens = FALSE;
1046
1047 /* If the first character is a parenthesis, check on the type of group we are
1048 dealing with. The very first call may not start with a parenthesis. */
1049
1050 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1051 {
1052 if (ptr[1] == CHAR_QUESTION_MARK &&
1053 ptr[2] == CHAR_VERTICAL_LINE)
1054 {
1055 ptr += 3;
1056 dup_parens = TRUE;
1057 }
1058
1059 /* Handle a normal, unnamed capturing parenthesis */
1060
1061 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1062 {
1063 *count += 1;
1064 if (name == NULL && *count == lorn) return *count;
1065 ptr++;
1066 }
1067
1068 /* Handle a condition. If it is an assertion, just carry on so that it
1069 is processed as normal. If not, skip to the closing parenthesis of the
1070 condition (there can't be any nested parens. */
1071
1072 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1073 {
1074 ptr += 2;
1075 if (ptr[1] != CHAR_QUESTION_MARK)
1076 {
1077 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1078 if (*ptr != 0) ptr++;
1079 }
1080 }
1081
1082 /* We have either (? or (* and not a condition */
1083
1084 else
1085 {
1086 ptr += 2;
1087 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1088
1089 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1090
1091 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1092 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1093 {
1094 int term;
1095 const uschar *thisname;
1096 *count += 1;
1097 if (name == NULL && *count == lorn) return *count;
1098 term = *ptr++;
1099 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1100 thisname = ptr;
1101 while (*ptr != term) ptr++;
1102 if (name != NULL && lorn == ptr - thisname &&
1103 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1104 return *count;
1105 term++;
1106 }
1107 }
1108 }
1109
1110 /* Past any initial parenthesis handling, scan for parentheses or vertical
1111 bars. */
1112
1113 for (; *ptr != 0; ptr++)
1114 {
1115 /* Skip over backslashed characters and also entire \Q...\E */
1116
1117 if (*ptr == CHAR_BACKSLASH)
1118 {
1119 if (*(++ptr) == 0) goto FAIL_EXIT;
1120 if (*ptr == CHAR_Q) for (;;)
1121 {
1122 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1123 if (*ptr == 0) goto FAIL_EXIT;
1124 if (*(++ptr) == CHAR_E) break;
1125 }
1126 continue;
1127 }
1128
1129 /* Skip over character classes; this logic must be similar to the way they
1130 are handled for real. If the first character is '^', skip it. Also, if the
1131 first few characters (either before or after ^) are \Q\E or \E we skip them
1132 too. This makes for compatibility with Perl. Note the use of STR macros to
1133 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1134
1135 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1136 {
1137 BOOL negate_class = FALSE;
1138 for (;;)
1139 {
1140 if (ptr[1] == CHAR_BACKSLASH)
1141 {
1142 if (ptr[2] == CHAR_E)
1143 ptr+= 2;
1144 else if (strncmp((const char *)ptr+2,
1145 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1146 ptr += 4;
1147 else
1148 break;
1149 }
1150 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1151 {
1152 negate_class = TRUE;
1153 ptr++;
1154 }
1155 else break;
1156 }
1157
1158 /* If the next character is ']', it is a data character that must be
1159 skipped, except in JavaScript compatibility mode. */
1160
1161 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1162 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1163 ptr++;
1164
1165 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1166 {
1167 if (*ptr == 0) return -1;
1168 if (*ptr == CHAR_BACKSLASH)
1169 {
1170 if (*(++ptr) == 0) goto FAIL_EXIT;
1171 if (*ptr == CHAR_Q) for (;;)
1172 {
1173 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1174 if (*ptr == 0) goto FAIL_EXIT;
1175 if (*(++ptr) == CHAR_E) break;
1176 }
1177 continue;
1178 }
1179 }
1180 continue;
1181 }
1182
1183 /* Skip comments in /x mode */
1184
1185 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1186 {
1187 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1188 if (*ptr == 0) goto FAIL_EXIT;
1189 continue;
1190 }
1191
1192 /* Check for the special metacharacters */
1193
1194 if (*ptr == CHAR_LEFT_PARENTHESIS)
1195 {
1196 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1197 if (rc > 0) return rc;
1198 if (*ptr == 0) goto FAIL_EXIT;
1199 }
1200
1201 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1202 {
1203 if (dup_parens && *count < hwm_count) *count = hwm_count;
1204 *ptrptr = ptr;
1205 return -1;
1206 }
1207
1208 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1209 {
1210 if (*count > hwm_count) hwm_count = *count;
1211 *count = start_count;
1212 }
1213 }
1214
1215 FAIL_EXIT:
1216 *ptrptr = ptr;
1217 return -1;
1218 }
1219
1220
1221
1222
1223 /*************************************************
1224 * Find forward referenced subpattern *
1225 *************************************************/
1226
1227 /* This function scans along a pattern's text looking for capturing
1228 subpatterns, and counting them. If it finds a named pattern that matches the
1229 name it is given, it returns its number. Alternatively, if the name is NULL, it
1230 returns when it reaches a given numbered subpattern. This is used for forward
1231 references to subpatterns. We used to be able to start this scan from the
1232 current compiling point, using the current count value from cd->bracount, and
1233 do it all in a single loop, but the addition of the possibility of duplicate
1234 subpattern numbers means that we have to scan from the very start, in order to
1235 take account of such duplicates, and to use a recursive function to keep track
1236 of the different types of group.
1237
1238 Arguments:
1239 cd compile background data
1240 name name to seek, or NULL if seeking a numbered subpattern
1241 lorn name length, or subpattern number if name is NULL
1242 xmode TRUE if we are in /x mode
1243
1244 Returns: the number of the found subpattern, or -1 if not found
1245 */
1246
1247 static int
1248 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1249 {
1250 uschar *ptr = (uschar *)cd->start_pattern;
1251 int count = 0;
1252 int rc;
1253
1254 /* If the pattern does not start with an opening parenthesis, the first call
1255 to find_parens_sub() will scan right to the end (if necessary). However, if it
1256 does start with a parenthesis, find_parens_sub() will return when it hits the
1257 matching closing parens. That is why we have to have a loop. */
1258
1259 for (;;)
1260 {
1261 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1262 if (rc > 0 || *ptr++ == 0) break;
1263 }
1264
1265 return rc;
1266 }
1267
1268
1269
1270
1271 /*************************************************
1272 * Find first significant op code *
1273 *************************************************/
1274
1275 /* This is called by several functions that scan a compiled expression looking
1276 for a fixed first character, or an anchoring op code etc. It skips over things
1277 that do not influence this. For some calls, a change of option is important.
1278 For some calls, it makes sense to skip negative forward and all backward
1279 assertions, and also the \b assertion; for others it does not.
1280
1281 Arguments:
1282 code pointer to the start of the group
1283 options pointer to external options
1284 optbit the option bit whose changing is significant, or
1285 zero if none are
1286 skipassert TRUE if certain assertions are to be skipped
1287
1288 Returns: pointer to the first significant opcode
1289 */
1290
1291 static const uschar*
1292 first_significant_code(const uschar *code, int *options, int optbit,
1293 BOOL skipassert)
1294 {
1295 for (;;)
1296 {
1297 switch ((int)*code)
1298 {
1299 case OP_OPT:
1300 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1301 *options = (int)code[1];
1302 code += 2;
1303 break;
1304
1305 case OP_ASSERT_NOT:
1306 case OP_ASSERTBACK:
1307 case OP_ASSERTBACK_NOT:
1308 if (!skipassert) return code;
1309 do code += GET(code, 1); while (*code == OP_ALT);
1310 code += _pcre_OP_lengths[*code];
1311 break;
1312
1313 case OP_WORD_BOUNDARY:
1314 case OP_NOT_WORD_BOUNDARY:
1315 if (!skipassert) return code;
1316 /* Fall through */
1317
1318 case OP_CALLOUT:
1319 case OP_CREF:
1320 case OP_NCREF:
1321 case OP_RREF:
1322 case OP_NRREF:
1323 case OP_DEF:
1324 code += _pcre_OP_lengths[*code];
1325 break;
1326
1327 default:
1328 return code;
1329 }
1330 }
1331 /* Control never reaches here */
1332 }
1333
1334
1335
1336
1337 /*************************************************
1338 * Find the fixed length of a branch *
1339 *************************************************/
1340
1341 /* Scan a branch and compute the fixed length of subject that will match it,
1342 if the length is fixed. This is needed for dealing with backward assertions.
1343 In UTF8 mode, the result is in characters rather than bytes. The branch is
1344 temporarily terminated with OP_END when this function is called.
1345
1346 This function is called when a backward assertion is encountered, so that if it
1347 fails, the error message can point to the correct place in the pattern.
1348 However, we cannot do this when the assertion contains subroutine calls,
1349 because they can be forward references. We solve this by remembering this case
1350 and doing the check at the end; a flag specifies which mode we are running in.
1351
1352 Arguments:
1353 code points to the start of the pattern (the bracket)
1354 options the compiling options
1355 atend TRUE if called when the pattern is complete
1356 cd the "compile data" structure
1357
1358 Returns: the fixed length,
1359 or -1 if there is no fixed length,
1360 or -2 if \C was encountered
1361 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1362 */
1363
1364 static int
1365 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1366 {
1367 int length = -1;
1368
1369 register int branchlength = 0;
1370 register uschar *cc = code + 1 + LINK_SIZE;
1371
1372 /* Scan along the opcodes for this branch. If we get to the end of the
1373 branch, check the length against that of the other branches. */
1374
1375 for (;;)
1376 {
1377 int d;
1378 uschar *ce, *cs;
1379 register int op = *cc;
1380 switch (op)
1381 {
1382 case OP_CBRA:
1383 case OP_BRA:
1384 case OP_ONCE:
1385 case OP_COND:
1386 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1387 if (d < 0) return d;
1388 branchlength += d;
1389 do cc += GET(cc, 1); while (*cc == OP_ALT);
1390 cc += 1 + LINK_SIZE;
1391 break;
1392
1393 /* Reached end of a branch; if it's a ket it is the end of a nested
1394 call. If it's ALT it is an alternation in a nested call. If it is
1395 END it's the end of the outer call. All can be handled by the same code. */
1396
1397 case OP_ALT:
1398 case OP_KET:
1399 case OP_KETRMAX:
1400 case OP_KETRMIN:
1401 case OP_END:
1402 if (length < 0) length = branchlength;
1403 else if (length != branchlength) return -1;
1404 if (*cc != OP_ALT) return length;
1405 cc += 1 + LINK_SIZE;
1406 branchlength = 0;
1407 break;
1408
1409 /* A true recursion implies not fixed length, but a subroutine call may
1410 be OK. If the subroutine is a forward reference, we can't deal with
1411 it until the end of the pattern, so return -3. */
1412
1413 case OP_RECURSE:
1414 if (!atend) return -3;
1415 cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1416 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1417 if (cc > cs && cc < ce) return -1; /* Recursion */
1418 d = find_fixedlength(cs + 2, options, atend, cd);
1419 if (d < 0) return d;
1420 branchlength += d;
1421 cc += 1 + LINK_SIZE;
1422 break;
1423
1424 /* Skip over assertive subpatterns */
1425
1426 case OP_ASSERT:
1427 case OP_ASSERT_NOT:
1428 case OP_ASSERTBACK:
1429 case OP_ASSERTBACK_NOT:
1430 do cc += GET(cc, 1); while (*cc == OP_ALT);
1431 /* Fall through */
1432
1433 /* Skip over things that don't match chars */
1434
1435 case OP_REVERSE:
1436 case OP_CREF:
1437 case OP_NCREF:
1438 case OP_RREF:
1439 case OP_NRREF:
1440 case OP_DEF:
1441 case OP_OPT:
1442 case OP_CALLOUT:
1443 case OP_SOD:
1444 case OP_SOM:
1445 case OP_EOD:
1446 case OP_EODN:
1447 case OP_CIRC:
1448 case OP_DOLL:
1449 case OP_NOT_WORD_BOUNDARY:
1450 case OP_WORD_BOUNDARY:
1451 cc += _pcre_OP_lengths[*cc];
1452 break;
1453
1454 /* Handle literal characters */
1455
1456 case OP_CHAR:
1457 case OP_CHARNC:
1458 case OP_NOT:
1459 branchlength++;
1460 cc += 2;
1461 #ifdef SUPPORT_UTF8
1462 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1463 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1464 #endif
1465 break;
1466
1467 /* Handle exact repetitions. The count is already in characters, but we
1468 need to skip over a multibyte character in UTF8 mode. */
1469
1470 case OP_EXACT:
1471 branchlength += GET2(cc,1);
1472 cc += 4;
1473 #ifdef SUPPORT_UTF8
1474 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1475 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1476 #endif
1477 break;
1478
1479 case OP_TYPEEXACT:
1480 branchlength += GET2(cc,1);
1481 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1482 cc += 4;
1483 break;
1484
1485 /* Handle single-char matchers */
1486
1487 case OP_PROP:
1488 case OP_NOTPROP:
1489 cc += 2;
1490 /* Fall through */
1491
1492 case OP_NOT_DIGIT:
1493 case OP_DIGIT:
1494 case OP_NOT_WHITESPACE:
1495 case OP_WHITESPACE:
1496 case OP_NOT_WORDCHAR:
1497 case OP_WORDCHAR:
1498 case OP_ANY:
1499 case OP_ALLANY:
1500 branchlength++;
1501 cc++;
1502 break;
1503
1504 /* The single-byte matcher isn't allowed */
1505
1506 case OP_ANYBYTE:
1507 return -2;
1508
1509 /* Check a class for variable quantification */
1510
1511 #ifdef SUPPORT_UTF8
1512 case OP_XCLASS:
1513 cc += GET(cc, 1) - 33;
1514 /* Fall through */
1515 #endif
1516
1517 case OP_CLASS:
1518 case OP_NCLASS:
1519 cc += 33;
1520
1521 switch (*cc)
1522 {
1523 case OP_CRSTAR:
1524 case OP_CRMINSTAR:
1525 case OP_CRQUERY:
1526 case OP_CRMINQUERY:
1527 return -1;
1528
1529 case OP_CRRANGE:
1530 case OP_CRMINRANGE:
1531 if (GET2(cc,1) != GET2(cc,3)) return -1;
1532 branchlength += GET2(cc,1);
1533 cc += 5;
1534 break;
1535
1536 default:
1537 branchlength++;
1538 }
1539 break;
1540
1541 /* Anything else is variable length */
1542
1543 default:
1544 return -1;
1545 }
1546 }
1547 /* Control never gets here */
1548 }
1549
1550
1551
1552
1553 /*************************************************
1554 * Scan compiled regex for specific bracket *
1555 *************************************************/
1556
1557 /* This little function scans through a compiled pattern until it finds a
1558 capturing bracket with the given number, or, if the number is negative, an
1559 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1560 so that it can be called from pcre_study() when finding the minimum matching
1561 length.
1562
1563 Arguments:
1564 code points to start of expression
1565 utf8 TRUE in UTF-8 mode
1566 number the required bracket number or negative to find a lookbehind
1567
1568 Returns: pointer to the opcode for the bracket, or NULL if not found
1569 */
1570
1571 const uschar *
1572 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1573 {
1574 for (;;)
1575 {
1576 register int c = *code;
1577 if (c == OP_END) return NULL;
1578
1579 /* XCLASS is used for classes that cannot be represented just by a bit
1580 map. This includes negated single high-valued characters. The length in
1581 the table is zero; the actual length is stored in the compiled code. */
1582
1583 if (c == OP_XCLASS) code += GET(code, 1);
1584
1585 /* Handle recursion */
1586
1587 else if (c == OP_REVERSE)
1588 {
1589 if (number < 0) return (uschar *)code;
1590 code += _pcre_OP_lengths[c];
1591 }
1592
1593 /* Handle capturing bracket */
1594
1595 else if (c == OP_CBRA)
1596 {
1597 int n = GET2(code, 1+LINK_SIZE);
1598 if (n == number) return (uschar *)code;
1599 code += _pcre_OP_lengths[c];
1600 }
1601
1602 /* Otherwise, we can get the item's length from the table, except that for
1603 repeated character types, we have to test for \p and \P, which have an extra
1604 two bytes of parameters. */
1605
1606 else
1607 {
1608 switch(c)
1609 {
1610 case OP_TYPESTAR:
1611 case OP_TYPEMINSTAR:
1612 case OP_TYPEPLUS:
1613 case OP_TYPEMINPLUS:
1614 case OP_TYPEQUERY:
1615 case OP_TYPEMINQUERY:
1616 case OP_TYPEPOSSTAR:
1617 case OP_TYPEPOSPLUS:
1618 case OP_TYPEPOSQUERY:
1619 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1620 break;
1621
1622 case OP_TYPEUPTO:
1623 case OP_TYPEMINUPTO:
1624 case OP_TYPEEXACT:
1625 case OP_TYPEPOSUPTO:
1626 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1627 break;
1628 }
1629
1630 /* Add in the fixed length from the table */
1631
1632 code += _pcre_OP_lengths[c];
1633
1634 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1635 a multi-byte character. The length in the table is a minimum, so we have to
1636 arrange to skip the extra bytes. */
1637
1638 #ifdef SUPPORT_UTF8
1639 if (utf8) switch(c)
1640 {
1641 case OP_CHAR:
1642 case OP_CHARNC:
1643 case OP_EXACT:
1644 case OP_UPTO:
1645 case OP_MINUPTO:
1646 case OP_POSUPTO:
1647 case OP_STAR:
1648 case OP_MINSTAR:
1649 case OP_POSSTAR:
1650 case OP_PLUS:
1651 case OP_MINPLUS:
1652 case OP_POSPLUS:
1653 case OP_QUERY:
1654 case OP_MINQUERY:
1655 case OP_POSQUERY:
1656 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1657 break;
1658 }
1659 #else
1660 (void)(utf8); /* Keep compiler happy by referencing function argument */
1661 #endif
1662 }
1663 }
1664 }
1665
1666
1667
1668 /*************************************************
1669 * Scan compiled regex for recursion reference *
1670 *************************************************/
1671
1672 /* This little function scans through a compiled pattern until it finds an
1673 instance of OP_RECURSE.
1674
1675 Arguments:
1676 code points to start of expression
1677 utf8 TRUE in UTF-8 mode
1678
1679 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1680 */
1681
1682 static const uschar *
1683 find_recurse(const uschar *code, BOOL utf8)
1684 {
1685 for (;;)
1686 {
1687 register int c = *code;
1688 if (c == OP_END) return NULL;
1689 if (c == OP_RECURSE) return code;
1690
1691 /* XCLASS is used for classes that cannot be represented just by a bit
1692 map. This includes negated single high-valued characters. The length in
1693 the table is zero; the actual length is stored in the compiled code. */
1694
1695 if (c == OP_XCLASS) code += GET(code, 1);
1696
1697 /* Otherwise, we can get the item's length from the table, except that for
1698 repeated character types, we have to test for \p and \P, which have an extra
1699 two bytes of parameters. */
1700
1701 else
1702 {
1703 switch(c)
1704 {
1705 case OP_TYPESTAR:
1706 case OP_TYPEMINSTAR:
1707 case OP_TYPEPLUS:
1708 case OP_TYPEMINPLUS:
1709 case OP_TYPEQUERY:
1710 case OP_TYPEMINQUERY:
1711 case OP_TYPEPOSSTAR:
1712 case OP_TYPEPOSPLUS:
1713 case OP_TYPEPOSQUERY:
1714 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1715 break;
1716
1717 case OP_TYPEPOSUPTO:
1718 case OP_TYPEUPTO:
1719 case OP_TYPEMINUPTO:
1720 case OP_TYPEEXACT:
1721 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1722 break;
1723 }
1724
1725 /* Add in the fixed length from the table */
1726
1727 code += _pcre_OP_lengths[c];
1728
1729 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1730 by a multi-byte character. The length in the table is a minimum, so we have
1731 to arrange to skip the extra bytes. */
1732
1733 #ifdef SUPPORT_UTF8
1734 if (utf8) switch(c)
1735 {
1736 case OP_CHAR:
1737 case OP_CHARNC:
1738 case OP_EXACT:
1739 case OP_UPTO:
1740 case OP_MINUPTO:
1741 case OP_POSUPTO:
1742 case OP_STAR:
1743 case OP_MINSTAR:
1744 case OP_POSSTAR:
1745 case OP_PLUS:
1746 case OP_MINPLUS:
1747 case OP_POSPLUS:
1748 case OP_QUERY:
1749 case OP_MINQUERY:
1750 case OP_POSQUERY:
1751 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1752 break;
1753 }
1754 #else
1755 (void)(utf8); /* Keep compiler happy by referencing function argument */
1756 #endif
1757 }
1758 }
1759 }
1760
1761
1762
1763 /*************************************************
1764 * Scan compiled branch for non-emptiness *
1765 *************************************************/
1766
1767 /* This function scans through a branch of a compiled pattern to see whether it
1768 can match the empty string or not. It is called from could_be_empty()
1769 below and from compile_branch() when checking for an unlimited repeat of a
1770 group that can match nothing. Note that first_significant_code() skips over
1771 backward and negative forward assertions when its final argument is TRUE. If we
1772 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1773 bracket whose current branch will already have been scanned.
1774
1775 Arguments:
1776 code points to start of search
1777 endcode points to where to stop
1778 utf8 TRUE if in UTF8 mode
1779
1780 Returns: TRUE if what is matched could be empty
1781 */
1782
1783 static BOOL
1784 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1785 {
1786 register int c;
1787 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1788 code < endcode;
1789 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1790 {
1791 const uschar *ccode;
1792
1793 c = *code;
1794
1795 /* Skip over forward assertions; the other assertions are skipped by
1796 first_significant_code() with a TRUE final argument. */
1797
1798 if (c == OP_ASSERT)
1799 {
1800 do code += GET(code, 1); while (*code == OP_ALT);
1801 c = *code;
1802 continue;
1803 }
1804
1805 /* Groups with zero repeats can of course be empty; skip them. */
1806
1807 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1808 {
1809 code += _pcre_OP_lengths[c];
1810 do code += GET(code, 1); while (*code == OP_ALT);
1811 c = *code;
1812 continue;
1813 }
1814
1815 /* For other groups, scan the branches. */
1816
1817 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1818 {
1819 BOOL empty_branch;
1820 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1821
1822 /* If a conditional group has only one branch, there is a second, implied,
1823 empty branch, so just skip over the conditional, because it could be empty.
1824 Otherwise, scan the individual branches of the group. */
1825
1826 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1827 code += GET(code, 1);
1828 else
1829 {
1830 empty_branch = FALSE;
1831 do
1832 {
1833 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1834 empty_branch = TRUE;
1835 code += GET(code, 1);
1836 }
1837 while (*code == OP_ALT);
1838 if (!empty_branch) return FALSE; /* All branches are non-empty */
1839 }
1840
1841 c = *code;
1842 continue;
1843 }
1844
1845 /* Handle the other opcodes */
1846
1847 switch (c)
1848 {
1849 /* Check for quantifiers after a class. XCLASS is used for classes that
1850 cannot be represented just by a bit map. This includes negated single
1851 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1852 actual length is stored in the compiled code, so we must update "code"
1853 here. */
1854
1855 #ifdef SUPPORT_UTF8
1856 case OP_XCLASS:
1857 ccode = code += GET(code, 1);
1858 goto CHECK_CLASS_REPEAT;
1859 #endif
1860
1861 case OP_CLASS:
1862 case OP_NCLASS:
1863 ccode = code + 33;
1864
1865 #ifdef SUPPORT_UTF8
1866 CHECK_CLASS_REPEAT:
1867 #endif
1868
1869 switch (*ccode)
1870 {
1871 case OP_CRSTAR: /* These could be empty; continue */
1872 case OP_CRMINSTAR:
1873 case OP_CRQUERY:
1874 case OP_CRMINQUERY:
1875 break;
1876
1877 default: /* Non-repeat => class must match */
1878 case OP_CRPLUS: /* These repeats aren't empty */
1879 case OP_CRMINPLUS:
1880 return FALSE;
1881
1882 case OP_CRRANGE:
1883 case OP_CRMINRANGE:
1884 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1885 break;
1886 }
1887 break;
1888
1889 /* Opcodes that must match a character */
1890
1891 case OP_PROP:
1892 case OP_NOTPROP:
1893 case OP_EXTUNI:
1894 case OP_NOT_DIGIT:
1895 case OP_DIGIT:
1896 case OP_NOT_WHITESPACE:
1897 case OP_WHITESPACE:
1898 case OP_NOT_WORDCHAR:
1899 case OP_WORDCHAR:
1900 case OP_ANY:
1901 case OP_ALLANY:
1902 case OP_ANYBYTE:
1903 case OP_CHAR:
1904 case OP_CHARNC:
1905 case OP_NOT:
1906 case OP_PLUS:
1907 case OP_MINPLUS:
1908 case OP_POSPLUS:
1909 case OP_EXACT:
1910 case OP_NOTPLUS:
1911 case OP_NOTMINPLUS:
1912 case OP_NOTPOSPLUS:
1913 case OP_NOTEXACT:
1914 case OP_TYPEPLUS:
1915 case OP_TYPEMINPLUS:
1916 case OP_TYPEPOSPLUS:
1917 case OP_TYPEEXACT:
1918 return FALSE;
1919
1920 /* These are going to continue, as they may be empty, but we have to
1921 fudge the length for the \p and \P cases. */
1922
1923 case OP_TYPESTAR:
1924 case OP_TYPEMINSTAR:
1925 case OP_TYPEPOSSTAR:
1926 case OP_TYPEQUERY:
1927 case OP_TYPEMINQUERY:
1928 case OP_TYPEPOSQUERY:
1929 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1930 break;
1931
1932 /* Same for these */
1933
1934 case OP_TYPEUPTO:
1935 case OP_TYPEMINUPTO:
1936 case OP_TYPEPOSUPTO:
1937 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1938 break;
1939
1940 /* End of branch */
1941
1942 case OP_KET:
1943 case OP_KETRMAX:
1944 case OP_KETRMIN:
1945 case OP_ALT:
1946 return TRUE;
1947
1948 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1949 MINUPTO, and POSUPTO may be followed by a multibyte character */
1950
1951 #ifdef SUPPORT_UTF8
1952 case OP_STAR:
1953 case OP_MINSTAR:
1954 case OP_POSSTAR:
1955 case OP_QUERY:
1956 case OP_MINQUERY:
1957 case OP_POSQUERY:
1958 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1959 break;
1960
1961 case OP_UPTO:
1962 case OP_MINUPTO:
1963 case OP_POSUPTO:
1964 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1965 break;
1966 #endif
1967 }
1968 }
1969
1970 return TRUE;
1971 }
1972
1973
1974
1975 /*************************************************
1976 * Scan compiled regex for non-emptiness *
1977 *************************************************/
1978
1979 /* This function is called to check for left recursive calls. We want to check
1980 the current branch of the current pattern to see if it could match the empty
1981 string. If it could, we must look outwards for branches at other levels,
1982 stopping when we pass beyond the bracket which is the subject of the recursion.
1983
1984 Arguments:
1985 code points to start of the recursion
1986 endcode points to where to stop (current RECURSE item)
1987 bcptr points to the chain of current (unclosed) branch starts
1988 utf8 TRUE if in UTF-8 mode
1989
1990 Returns: TRUE if what is matched could be empty
1991 */
1992
1993 static BOOL
1994 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1995 BOOL utf8)
1996 {
1997 while (bcptr != NULL && bcptr->current >= code)
1998 {
1999 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
2000 bcptr = bcptr->outer;
2001 }
2002 return TRUE;
2003 }
2004
2005
2006
2007 /*************************************************
2008 * Check for POSIX class syntax *
2009 *************************************************/
2010
2011 /* This function is called when the sequence "[:" or "[." or "[=" is
2012 encountered in a character class. It checks whether this is followed by a
2013 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2014 reach an unescaped ']' without the special preceding character, return FALSE.
2015
2016 Originally, this function only recognized a sequence of letters between the
2017 terminators, but it seems that Perl recognizes any sequence of characters,
2018 though of course unknown POSIX names are subsequently rejected. Perl gives an
2019 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2020 didn't consider this to be a POSIX class. Likewise for [:1234:].
2021
2022 The problem in trying to be exactly like Perl is in the handling of escapes. We
2023 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2024 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2025 below handles the special case of \], but does not try to do any other escape
2026 processing. This makes it different from Perl for cases such as [:l\ower:]
2027 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2028 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2029 I think.
2030
2031 Arguments:
2032 ptr pointer to the initial [
2033 endptr where to return the end pointer
2034
2035 Returns: TRUE or FALSE
2036 */
2037
2038 static BOOL
2039 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2040 {
2041 int terminator; /* Don't combine these lines; the Solaris cc */
2042 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2043 for (++ptr; *ptr != 0; ptr++)
2044 {
2045 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2046 {
2047 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2048 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2049 {
2050 *endptr = ptr;
2051 return TRUE;
2052 }
2053 }
2054 }
2055 return FALSE;
2056 }
2057
2058
2059
2060
2061 /*************************************************
2062 * Check POSIX class name *
2063 *************************************************/
2064
2065 /* This function is called to check the name given in a POSIX-style class entry
2066 such as [:alnum:].
2067
2068 Arguments:
2069 ptr points to the first letter
2070 len the length of the name
2071
2072 Returns: a value representing the name, or -1 if unknown
2073 */
2074
2075 static int
2076 check_posix_name(const uschar *ptr, int len)
2077 {
2078 const char *pn = posix_names;
2079 register int yield = 0;
2080 while (posix_name_lengths[yield] != 0)
2081 {
2082 if (len == posix_name_lengths[yield] &&
2083 strncmp((const char *)ptr, pn, len) == 0) return yield;
2084 pn += posix_name_lengths[yield] + 1;
2085 yield++;
2086 }
2087 return -1;
2088 }
2089
2090
2091 /*************************************************
2092 * Adjust OP_RECURSE items in repeated group *
2093 *************************************************/
2094
2095 /* OP_RECURSE items contain an offset from the start of the regex to the group
2096 that is referenced. This means that groups can be replicated for fixed
2097 repetition simply by copying (because the recursion is allowed to refer to
2098 earlier groups that are outside the current group). However, when a group is
2099 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2100 inserted before it, after it has been compiled. This means that any OP_RECURSE
2101 items within it that refer to the group itself or any contained groups have to
2102 have their offsets adjusted. That is one of the jobs of this function. Before it
2103 is called, the partially compiled regex must be temporarily terminated with
2104 OP_END.
2105
2106 This function has been extended with the possibility of forward references for
2107 recursions and subroutine calls. It must also check the list of such references
2108 for the group we are dealing with. If it finds that one of the recursions in
2109 the current group is on this list, it adjusts the offset in the list, not the
2110 value in the reference (which is a group number).
2111
2112 Arguments:
2113 group points to the start of the group
2114 adjust the amount by which the group is to be moved
2115 utf8 TRUE in UTF-8 mode
2116 cd contains pointers to tables etc.
2117 save_hwm the hwm forward reference pointer at the start of the group
2118
2119 Returns: nothing
2120 */
2121
2122 static void
2123 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2124 uschar *save_hwm)
2125 {
2126 uschar *ptr = group;
2127
2128 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2129 {
2130 int offset;
2131 uschar *hc;
2132
2133 /* See if this recursion is on the forward reference list. If so, adjust the
2134 reference. */
2135
2136 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2137 {
2138 offset = GET(hc, 0);
2139 if (cd->start_code + offset == ptr + 1)
2140 {
2141 PUT(hc, 0, offset + adjust);
2142 break;
2143 }
2144 }
2145
2146 /* Otherwise, adjust the recursion offset if it's after the start of this
2147 group. */
2148
2149 if (hc >= cd->hwm)
2150 {
2151 offset = GET(ptr, 1);
2152 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2153 }
2154
2155 ptr += 1 + LINK_SIZE;
2156 }
2157 }
2158
2159
2160
2161 /*************************************************
2162 * Insert an automatic callout point *
2163 *************************************************/
2164
2165 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2166 callout points before each pattern item.
2167
2168 Arguments:
2169 code current code pointer
2170 ptr current pattern pointer
2171 cd pointers to tables etc
2172
2173 Returns: new code pointer
2174 */
2175
2176 static uschar *
2177 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2178 {
2179 *code++ = OP_CALLOUT;
2180 *code++ = 255;
2181 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2182 PUT(code, LINK_SIZE, 0); /* Default length */
2183 return code + 2*LINK_SIZE;
2184 }
2185
2186
2187
2188 /*************************************************
2189 * Complete a callout item *
2190 *************************************************/
2191
2192 /* A callout item contains the length of the next item in the pattern, which
2193 we can't fill in till after we have reached the relevant point. This is used
2194 for both automatic and manual callouts.
2195
2196 Arguments:
2197 previous_callout points to previous callout item
2198 ptr current pattern pointer
2199 cd pointers to tables etc
2200
2201 Returns: nothing
2202 */
2203
2204 static void
2205 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2206 {
2207 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2208 PUT(previous_callout, 2 + LINK_SIZE, length);
2209 }
2210
2211
2212
2213 #ifdef SUPPORT_UCP
2214 /*************************************************
2215 * Get othercase range *
2216 *************************************************/
2217
2218 /* This function is passed the start and end of a class range, in UTF-8 mode
2219 with UCP support. It searches up the characters, looking for internal ranges of
2220 characters in the "other" case. Each call returns the next one, updating the
2221 start address.
2222
2223 Arguments:
2224 cptr points to starting character value; updated
2225 d end value
2226 ocptr where to put start of othercase range
2227 odptr where to put end of othercase range
2228
2229 Yield: TRUE when range returned; FALSE when no more
2230 */
2231
2232 static BOOL
2233 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2234 unsigned int *odptr)
2235 {
2236 unsigned int c, othercase, next;
2237
2238 for (c = *cptr; c <= d; c++)
2239 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2240
2241 if (c > d) return FALSE;
2242
2243 *ocptr = othercase;
2244 next = othercase + 1;
2245
2246 for (++c; c <= d; c++)
2247 {
2248 if (UCD_OTHERCASE(c) != next) break;
2249 next++;
2250 }
2251
2252 *odptr = next - 1;
2253 *cptr = c;
2254
2255 return TRUE;
2256 }
2257 #endif /* SUPPORT_UCP */
2258
2259
2260
2261 /*************************************************
2262 * Check if auto-possessifying is possible *
2263 *************************************************/
2264
2265 /* This function is called for unlimited repeats of certain items, to see
2266 whether the next thing could possibly match the repeated item. If not, it makes
2267 sense to automatically possessify the repeated item.
2268
2269 Arguments:
2270 op_code the repeated op code
2271 this data for this item, depends on the opcode
2272 utf8 TRUE in UTF-8 mode
2273 utf8_char used for utf8 character bytes, NULL if not relevant
2274 ptr next character in pattern
2275 options options bits
2276 cd contains pointers to tables etc.
2277
2278 Returns: TRUE if possessifying is wanted
2279 */
2280
2281 static BOOL
2282 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2283 const uschar *ptr, int options, compile_data *cd)
2284 {
2285 int next;
2286
2287 /* Skip whitespace and comments in extended mode */
2288
2289 if ((options & PCRE_EXTENDED) != 0)
2290 {
2291 for (;;)
2292 {
2293 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2294 if (*ptr == CHAR_NUMBER_SIGN)
2295 {
2296 while (*(++ptr) != 0)
2297 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2298 }
2299 else break;
2300 }
2301 }
2302
2303 /* If the next item is one that we can handle, get its value. A non-negative
2304 value is a character, a negative value is an escape value. */
2305
2306 if (*ptr == CHAR_BACKSLASH)
2307 {
2308 int temperrorcode = 0;
2309 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2310 if (temperrorcode != 0) return FALSE;
2311 ptr++; /* Point after the escape sequence */
2312 }
2313
2314 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2315 {
2316 #ifdef SUPPORT_UTF8
2317 if (utf8) { GETCHARINC(next, ptr); } else
2318 #endif
2319 next = *ptr++;
2320 }
2321
2322 else return FALSE;
2323
2324 /* Skip whitespace and comments in extended mode */
2325
2326 if ((options & PCRE_EXTENDED) != 0)
2327 {
2328 for (;;)
2329 {
2330 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2331 if (*ptr == CHAR_NUMBER_SIGN)
2332 {
2333 while (*(++ptr) != 0)
2334 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2335 }
2336 else break;
2337 }
2338 }
2339
2340 /* If the next thing is itself optional, we have to give up. */
2341
2342 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2343 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2344 return FALSE;
2345
2346 /* Now compare the next item with the previous opcode. If the previous is a
2347 positive single character match, "item" either contains the character or, if
2348 "item" is greater than 127 in utf8 mode, the character's bytes are in
2349 utf8_char. */
2350
2351
2352 /* Handle cases when the next item is a character. */
2353
2354 if (next >= 0) switch(op_code)
2355 {
2356 case OP_CHAR:
2357 #ifdef SUPPORT_UTF8
2358 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2359 #else
2360 (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2361 #endif
2362 return item != next;
2363
2364 /* For CHARNC (caseless character) we must check the other case. If we have
2365 Unicode property support, we can use it to test the other case of
2366 high-valued characters. */
2367
2368 case OP_CHARNC:
2369 #ifdef SUPPORT_UTF8
2370 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2371 #endif
2372 if (item == next) return FALSE;
2373 #ifdef SUPPORT_UTF8
2374 if (utf8)
2375 {
2376 unsigned int othercase;
2377 if (next < 128) othercase = cd->fcc[next]; else
2378 #ifdef SUPPORT_UCP
2379 othercase = UCD_OTHERCASE((unsigned int)next);
2380 #else
2381 othercase = NOTACHAR;
2382 #endif
2383 return (unsigned int)item != othercase;
2384 }
2385 else
2386 #endif /* SUPPORT_UTF8 */
2387 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2388
2389 /* For OP_NOT, "item" must be a single-byte character. */
2390
2391 case OP_NOT:
2392 if (item == next) return TRUE;
2393 if ((options & PCRE_CASELESS) == 0) return FALSE;
2394 #ifdef SUPPORT_UTF8
2395 if (utf8)
2396 {
2397 unsigned int othercase;
2398 if (next < 128) othercase = cd->fcc[next]; else
2399 #ifdef SUPPORT_UCP
2400 othercase = UCD_OTHERCASE(next);
2401 #else
2402 othercase = NOTACHAR;
2403 #endif
2404 return (unsigned int)item == othercase;
2405 }
2406 else
2407 #endif /* SUPPORT_UTF8 */
2408 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2409
2410 case OP_DIGIT:
2411 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2412
2413 case OP_NOT_DIGIT:
2414 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2415
2416 case OP_WHITESPACE:
2417 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2418
2419 case OP_NOT_WHITESPACE:
2420 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2421
2422 case OP_WORDCHAR:
2423 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2424
2425 case OP_NOT_WORDCHAR:
2426 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2427
2428 case OP_HSPACE:
2429 case OP_NOT_HSPACE:
2430 switch(next)
2431 {
2432 case 0x09:
2433 case 0x20:
2434 case 0xa0:
2435 case 0x1680:
2436 case 0x180e:
2437 case 0x2000:
2438 case 0x2001:
2439 case 0x2002:
2440 case 0x2003:
2441 case 0x2004:
2442 case 0x2005:
2443 case 0x2006:
2444 case 0x2007:
2445 case 0x2008:
2446 case 0x2009:
2447 case 0x200A:
2448 case 0x202f:
2449 case 0x205f:
2450 case 0x3000:
2451 return op_code != OP_HSPACE;
2452 default:
2453 return op_code == OP_HSPACE;
2454 }
2455
2456 case OP_VSPACE:
2457 case OP_NOT_VSPACE:
2458 switch(next)
2459 {
2460 case 0x0a:
2461 case 0x0b:
2462 case 0x0c:
2463 case 0x0d:
2464 case 0x85:
2465 case 0x2028:
2466 case 0x2029:
2467 return op_code != OP_VSPACE;
2468 default:
2469 return op_code == OP_VSPACE;
2470 }
2471
2472 default:
2473 return FALSE;
2474 }
2475
2476
2477 /* Handle the case when the next item is \d, \s, etc. */
2478
2479 switch(op_code)
2480 {
2481 case OP_CHAR:
2482 case OP_CHARNC:
2483 #ifdef SUPPORT_UTF8
2484 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2485 #endif
2486 switch(-next)
2487 {
2488 case ESC_d:
2489 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2490
2491 case ESC_D:
2492 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2493
2494 case ESC_s:
2495 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2496
2497 case ESC_S:
2498 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2499
2500 case ESC_w:
2501 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2502
2503 case ESC_W:
2504 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2505
2506 case ESC_h:
2507 case ESC_H:
2508 switch(item)
2509 {
2510 case 0x09:
2511 case 0x20:
2512 case 0xa0:
2513 case 0x1680:
2514 case 0x180e:
2515 case 0x2000:
2516 case 0x2001:
2517 case 0x2002:
2518 case 0x2003:
2519 case 0x2004:
2520 case 0x2005:
2521 case 0x2006:
2522 case 0x2007:
2523 case 0x2008:
2524 case 0x2009:
2525 case 0x200A:
2526 case 0x202f:
2527 case 0x205f:
2528 case 0x3000:
2529 return -next != ESC_h;
2530 default:
2531 return -next == ESC_h;
2532 }
2533
2534 case ESC_v:
2535 case ESC_V:
2536 switch(item)
2537 {
2538 case 0x0a:
2539 case 0x0b:
2540 case 0x0c:
2541 case 0x0d:
2542 case 0x85:
2543 case 0x2028:
2544 case 0x2029:
2545 return -next != ESC_v;
2546 default:
2547 return -next == ESC_v;
2548 }
2549
2550 default:
2551 return FALSE;
2552 }
2553
2554 case OP_DIGIT:
2555 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2556 next == -ESC_h || next == -ESC_v;
2557
2558 case OP_NOT_DIGIT:
2559 return next == -ESC_d;
2560
2561 case OP_WHITESPACE:
2562 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2563
2564 case OP_NOT_WHITESPACE:
2565 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2566
2567 case OP_HSPACE:
2568 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2569
2570 case OP_NOT_HSPACE:
2571 return next == -ESC_h;
2572
2573 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2574 case OP_VSPACE:
2575 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2576
2577 case OP_NOT_VSPACE:
2578 return next == -ESC_v;
2579
2580 case OP_WORDCHAR:
2581 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2582
2583 case OP_NOT_WORDCHAR:
2584 return next == -ESC_w || next == -ESC_d;
2585
2586 default:
2587 return FALSE;
2588 }
2589
2590 /* Control does not reach here */
2591 }
2592
2593
2594
2595 /*************************************************
2596 * Compile one branch *
2597 *************************************************/
2598
2599 /* Scan the pattern, compiling it into a vector. If the options are
2600 changed during the branch, the pointer is used to change the external options
2601 bits. This function is used during the pre-compile phase when we are trying
2602 to find out the amount of memory needed, as well as during the real compile
2603 phase. The value of lengthptr distinguishes the two phases.
2604
2605 Arguments:
2606 optionsptr pointer to the option bits
2607 codeptr points to the pointer to the current code point
2608 ptrptr points to the current pattern pointer
2609 errorcodeptr points to error code variable
2610 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2611 reqbyteptr set to the last literal character required, else < 0
2612 bcptr points to current branch chain
2613 cd contains pointers to tables etc.
2614 lengthptr NULL during the real compile phase
2615 points to length accumulator during pre-compile phase
2616
2617 Returns: TRUE on success
2618 FALSE, with *errorcodeptr set non-zero on error
2619 */
2620
2621 static BOOL
2622 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2623 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2624 compile_data *cd, int *lengthptr)
2625 {
2626 int repeat_type, op_type;
2627 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2628 int bravalue = 0;
2629 int greedy_default, greedy_non_default;
2630 int firstbyte, reqbyte;
2631 int zeroreqbyte, zerofirstbyte;
2632 int req_caseopt, reqvary, tempreqvary;
2633 int options = *optionsptr;
2634 int after_manual_callout = 0;
2635 int length_prevgroup = 0;
2636 register int c;
2637 register uschar *code = *codeptr;
2638 uschar *last_code = code;
2639 uschar *orig_code = code;
2640 uschar *tempcode;
2641 BOOL inescq = FALSE;
2642 BOOL groupsetfirstbyte = FALSE;
2643 const uschar *ptr = *ptrptr;
2644 const uschar *tempptr;
2645 uschar *previous = NULL;
2646 uschar *previous_callout = NULL;
2647 uschar *save_hwm = NULL;
2648 uschar classbits[32];
2649
2650 #ifdef SUPPORT_UTF8
2651 BOOL class_utf8;
2652 BOOL utf8 = (options & PCRE_UTF8) != 0;
2653 uschar *class_utf8data;
2654 uschar *class_utf8data_base;
2655 uschar utf8_char[6];
2656 #else
2657 BOOL utf8 = FALSE;
2658 uschar *utf8_char = NULL;
2659 #endif
2660
2661 #ifdef DEBUG
2662 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2663 #endif
2664
2665 /* Set up the default and non-default settings for greediness */
2666
2667 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2668 greedy_non_default = greedy_default ^ 1;
2669
2670 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2671 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2672 matches a non-fixed char first char; reqbyte just remains unset if we never
2673 find one.
2674
2675 When we hit a repeat whose minimum is zero, we may have to adjust these values
2676 to take the zero repeat into account. This is implemented by setting them to
2677 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2678 item types that can be repeated set these backoff variables appropriately. */
2679
2680 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2681
2682 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2683 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2684 value > 255. It is added into the firstbyte or reqbyte variables to record the
2685 case status of the value. This is used only for ASCII characters. */
2686
2687 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2688
2689 /* Switch on next character until the end of the branch */
2690
2691 for (;; ptr++)
2692 {
2693 BOOL negate_class;
2694 BOOL should_flip_negation;
2695 BOOL possessive_quantifier;
2696 BOOL is_quantifier;
2697 BOOL is_recurse;
2698 BOOL reset_bracount;
2699 int class_charcount;
2700 int class_lastchar;
2701 int newoptions;
2702 int recno;
2703 int refsign;
2704 int skipbytes;
2705 int subreqbyte;
2706 int subfirstbyte;
2707 int terminator;
2708 int mclength;
2709 uschar mcbuffer[8];
2710
2711 /* Get next byte in the pattern */
2712
2713 c = *ptr;
2714
2715 /* If we are in the pre-compile phase, accumulate the length used for the
2716 previous cycle of this loop. */
2717
2718 if (lengthptr != NULL)
2719 {
2720 #ifdef DEBUG
2721 if (code > cd->hwm) cd->hwm = code; /* High water info */
2722 #endif
2723 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2724 {
2725 *errorcodeptr = ERR52;
2726 goto FAILED;
2727 }
2728
2729 /* There is at least one situation where code goes backwards: this is the
2730 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2731 the class is simply eliminated. However, it is created first, so we have to
2732 allow memory for it. Therefore, don't ever reduce the length at this point.
2733 */
2734
2735 if (code < last_code) code = last_code;
2736
2737 /* Paranoid check for integer overflow */
2738
2739 if (OFLOW_MAX - *lengthptr < code - last_code)
2740 {
2741 *errorcodeptr = ERR20;
2742 goto FAILED;
2743 }
2744
2745 *lengthptr += code - last_code;
2746 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2747
2748 /* If "previous" is set and it is not at the start of the work space, move
2749 it back to there, in order to avoid filling up the work space. Otherwise,
2750 if "previous" is NULL, reset the current code pointer to the start. */
2751
2752 if (previous != NULL)
2753 {
2754 if (previous > orig_code)
2755 {
2756 memmove(orig_code, previous, code - previous);
2757 code -= previous - orig_code;
2758 previous = orig_code;
2759 }
2760 }
2761 else code = orig_code;
2762
2763 /* Remember where this code item starts so we can pick up the length
2764 next time round. */
2765
2766 last_code = code;
2767 }
2768
2769 /* In the real compile phase, just check the workspace used by the forward
2770 reference list. */
2771
2772 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2773 {
2774 *errorcodeptr = ERR52;
2775 goto FAILED;
2776 }
2777
2778 /* If in \Q...\E, check for the end; if not, we have a literal */
2779
2780 if (inescq && c != 0)
2781 {
2782 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2783 {
2784 inescq = FALSE;
2785 ptr++;
2786 continue;
2787 }
2788 else
2789 {
2790 if (previous_callout != NULL)
2791 {
2792 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2793 complete_callout(previous_callout, ptr, cd);
2794 previous_callout = NULL;
2795 }
2796 if ((options & PCRE_AUTO_CALLOUT) != 0)
2797 {
2798 previous_callout = code;
2799 code = auto_callout(code, ptr, cd);
2800 }
2801 goto NORMAL_CHAR;
2802 }
2803 }
2804
2805 /* Fill in length of a previous callout, except when the next thing is
2806 a quantifier. */
2807
2808 is_quantifier =
2809 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2810 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2811
2812 if (!is_quantifier && previous_callout != NULL &&
2813 after_manual_callout-- <= 0)
2814 {
2815 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2816 complete_callout(previous_callout, ptr, cd);
2817 previous_callout = NULL;
2818 }
2819
2820 /* In extended mode, skip white space and comments */
2821
2822 if ((options & PCRE_EXTENDED) != 0)
2823 {
2824 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2825 if (c == CHAR_NUMBER_SIGN)
2826 {
2827 while (*(++ptr) != 0)
2828 {
2829 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2830 }
2831 if (*ptr != 0) continue;
2832
2833 /* Else fall through to handle end of string */
2834 c = 0;
2835 }
2836 }
2837
2838 /* No auto callout for quantifiers. */
2839
2840 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2841 {
2842 previous_callout = code;
2843 code = auto_callout(code, ptr, cd);
2844 }
2845
2846 switch(c)
2847 {
2848 /* ===================================================================*/
2849 case 0: /* The branch terminates at string end */
2850 case CHAR_VERTICAL_LINE: /* or | or ) */
2851 case CHAR_RIGHT_PARENTHESIS:
2852 *firstbyteptr = firstbyte;
2853 *reqbyteptr = reqbyte;
2854 *codeptr = code;
2855 *ptrptr = ptr;
2856 if (lengthptr != NULL)
2857 {
2858 if (OFLOW_MAX - *lengthptr < code - last_code)
2859 {
2860 *errorcodeptr = ERR20;
2861 goto FAILED;
2862 }
2863 *lengthptr += code - last_code; /* To include callout length */
2864 DPRINTF((">> end branch\n"));
2865 }
2866 return TRUE;
2867
2868
2869 /* ===================================================================*/
2870 /* Handle single-character metacharacters. In multiline mode, ^ disables
2871 the setting of any following char as a first character. */
2872
2873 case CHAR_CIRCUMFLEX_ACCENT:
2874 if ((options & PCRE_MULTILINE) != 0)
2875 {
2876 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2877 }
2878 previous = NULL;
2879 *code++ = OP_CIRC;
2880 break;
2881
2882 case CHAR_DOLLAR_SIGN:
2883 previous = NULL;
2884 *code++ = OP_DOLL;
2885 break;
2886
2887 /* There can never be a first char if '.' is first, whatever happens about
2888 repeats. The value of reqbyte doesn't change either. */
2889
2890 case CHAR_DOT:
2891 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2892 zerofirstbyte = firstbyte;
2893 zeroreqbyte = reqbyte;
2894 previous = code;
2895 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2896 break;
2897
2898
2899 /* ===================================================================*/
2900 /* Character classes. If the included characters are all < 256, we build a
2901 32-byte bitmap of the permitted characters, except in the special case
2902 where there is only one such character. For negated classes, we build the
2903 map as usual, then invert it at the end. However, we use a different opcode
2904 so that data characters > 255 can be handled correctly.
2905
2906 If the class contains characters outside the 0-255 range, a different
2907 opcode is compiled. It may optionally have a bit map for characters < 256,
2908 but those above are explicitly listed afterwards. A flag byte tells
2909 whether the bitmap is present, and whether this is a negated class or not.
2910
2911 In JavaScript compatibility mode, an isolated ']' causes an error. In
2912 default (Perl) mode, it is treated as a data character. */
2913
2914 case CHAR_RIGHT_SQUARE_BRACKET:
2915 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2916 {
2917 *errorcodeptr = ERR64;
2918 goto FAILED;
2919 }
2920 goto NORMAL_CHAR;
2921
2922 case CHAR_LEFT_SQUARE_BRACKET:
2923 previous = code;
2924
2925 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2926 they are encountered at the top level, so we'll do that too. */
2927
2928 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2929 ptr[1] == CHAR_EQUALS_SIGN) &&
2930 check_posix_syntax(ptr, &tempptr))
2931 {
2932 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2933 goto FAILED;
2934 }
2935
2936 /* If the first character is '^', set the negation flag and skip it. Also,
2937 if the first few characters (either before or after ^) are \Q\E or \E we
2938 skip them too. This makes for compatibility with Perl. */
2939
2940 negate_class = FALSE;
2941 for (;;)
2942 {
2943 c = *(++ptr);
2944 if (c == CHAR_BACKSLASH)
2945 {
2946 if (ptr[1] == CHAR_E)
2947 ptr++;
2948 else if (strncmp((const char *)ptr+1,
2949 STR_Q STR_BACKSLASH STR_E, 3) == 0)
2950 ptr += 3;
2951 else
2952 break;
2953 }
2954 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2955 negate_class = TRUE;
2956 else break;
2957 }
2958
2959 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2960 an initial ']' is taken as a data character -- the code below handles
2961 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2962 [^] must match any character, so generate OP_ALLANY. */
2963
2964 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2965 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2966 {
2967 *code++ = negate_class? OP_ALLANY : OP_FAIL;
2968 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2969 zerofirstbyte = firstbyte;
2970 break;
2971 }
2972
2973 /* If a class contains a negative special such as \S, we need to flip the
2974 negation flag at the end, so that support for characters > 255 works
2975 correctly (they are all included in the class). */
2976
2977 should_flip_negation = FALSE;
2978
2979 /* Keep a count of chars with values < 256 so that we can optimize the case
2980 of just a single character (as long as it's < 256). However, For higher
2981 valued UTF-8 characters, we don't yet do any optimization. */
2982
2983 class_charcount = 0;
2984 class_lastchar = -1;
2985
2986 /* Initialize the 32-char bit map to all zeros. We build the map in a
2987 temporary bit of memory, in case the class contains only 1 character (less
2988 than 256), because in that case the compiled code doesn't use the bit map.
2989 */
2990
2991 memset(classbits, 0, 32 * sizeof(uschar));
2992
2993 #ifdef SUPPORT_UTF8
2994 class_utf8 = FALSE; /* No chars >= 256 */
2995 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2996 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2997 #endif
2998
2999 /* Process characters until ] is reached. By writing this as a "do" it
3000 means that an initial ] is taken as a data character. At the start of the
3001 loop, c contains the first byte of the character. */
3002
3003 if (c != 0) do
3004 {
3005 const uschar *oldptr;
3006
3007 #ifdef SUPPORT_UTF8
3008 if (utf8 && c > 127)
3009 { /* Braces are required because the */
3010 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3011 }
3012
3013 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3014 data and reset the pointer. This is so that very large classes that
3015 contain a zillion UTF-8 characters no longer overwrite the work space
3016 (which is on the stack). */
3017
3018 if (lengthptr != NULL)
3019 {
3020 *lengthptr += class_utf8data - class_utf8data_base;
3021 class_utf8data = class_utf8data_base;
3022 }
3023
3024 #endif
3025
3026 /* Inside \Q...\E everything is literal except \E */
3027
3028 if (inescq)
3029 {
3030 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3031 {
3032 inescq = FALSE; /* Reset literal state */
3033 ptr++; /* Skip the 'E' */
3034 continue; /* Carry on with next */
3035 }
3036 goto CHECK_RANGE; /* Could be range if \E follows */
3037 }
3038
3039 /* Handle POSIX class names. Perl allows a negation extension of the
3040 form [:^name:]. A square bracket that doesn't match the syntax is
3041 treated as a literal. We also recognize the POSIX constructions
3042 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3043 5.6 and 5.8 do. */
3044
3045 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3046 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3047 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3048 {
3049 BOOL local_negate = FALSE;
3050 int posix_class, taboffset, tabopt;
3051 register const uschar *cbits = cd->cbits;
3052 uschar pbits[32];
3053
3054 if (ptr[1] != CHAR_COLON)
3055 {
3056 *errorcodeptr = ERR31;
3057 goto FAILED;
3058 }
3059
3060 ptr += 2;
3061 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3062 {
3063 local_negate = TRUE;
3064 should_flip_negation = TRUE; /* Note negative special */
3065 ptr++;
3066 }
3067
3068 posix_class = check_posix_name(ptr, tempptr - ptr);
3069 if (posix_class < 0)
3070 {
3071 *errorcodeptr = ERR30;
3072 goto FAILED;
3073 }
3074
3075 /* If matching is caseless, upper and lower are converted to
3076 alpha. This relies on the fact that the class table starts with
3077 alpha, lower, upper as the first 3 entries. */
3078
3079 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3080 posix_class = 0;
3081
3082 /* We build the bit map for the POSIX class in a chunk of local store
3083 because we may be adding and subtracting from it, and we don't want to
3084 subtract bits that may be in the main map already. At the end we or the
3085 result into the bit map that is being built. */
3086
3087 posix_class *= 3;
3088
3089 /* Copy in the first table (always present) */
3090
3091 memcpy(pbits, cbits + posix_class_maps[posix_class],
3092 32 * sizeof(uschar));
3093
3094 /* If there is a second table, add or remove it as required. */
3095
3096 taboffset = posix_class_maps[posix_class + 1];
3097 tabopt = posix_class_maps[posix_class + 2];
3098
3099 if (taboffset >= 0)
3100 {
3101 if (tabopt >= 0)
3102 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3103 else
3104 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3105 }
3106
3107 /* Now see if we need to remove any special characters. An option
3108 value of 1 removes vertical space and 2 removes underscore. */
3109
3110 if (tabopt < 0) tabopt = -tabopt;
3111 if (tabopt == 1) pbits[1] &= ~0x3c;
3112 else if (tabopt == 2) pbits[11] &= 0x7f;
3113
3114 /* Add the POSIX table or its complement into the main table that is
3115 being built and we are done. */
3116
3117 if (local_negate)
3118 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3119 else
3120 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3121
3122 ptr = tempptr + 1;
3123 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3124 continue; /* End of POSIX syntax handling */
3125 }
3126
3127 /* Backslash may introduce a single character, or it may introduce one
3128 of the specials, which just set a flag. The sequence \b is a special
3129 case. Inside a class (and only there) it is treated as backspace.
3130 Elsewhere it marks a word boundary. Other escapes have preset maps ready
3131 to 'or' into the one we are building. We assume they have more than one
3132 character in them, so set class_charcount bigger than one. */
3133
3134 if (c == CHAR_BACKSLASH)
3135 {
3136 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3137 if (*errorcodeptr != 0) goto FAILED;
3138
3139 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3140 else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3141 else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3142 else if (-c == ESC_Q) /* Handle start of quoted string */
3143 {
3144 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3145 {
3146 ptr += 2; /* avoid empty string */
3147 }
3148 else inescq = TRUE;
3149 continue;
3150 }
3151 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3152
3153 if (c < 0)
3154 {
3155 register const uschar *cbits = cd->cbits;
3156 class_charcount += 2; /* Greater than 1 is what matters */
3157
3158 /* Save time by not doing this in the pre-compile phase. */
3159
3160 if (lengthptr == NULL) switch (-c)
3161 {
3162 case ESC_d:
3163 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3164 continue;
3165
3166 case ESC_D:
3167 should_flip_negation = TRUE;
3168 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3169 continue;
3170
3171 case ESC_w:
3172 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3173 continue;
3174
3175 case ESC_W:
3176 should_flip_negation = TRUE;
3177 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3178 continue;
3179
3180 case ESC_s:
3181 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3182 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3183 continue;
3184
3185 case ESC_S:
3186 should_flip_negation = TRUE;
3187 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3188 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3189 continue;
3190
3191 default: /* Not recognized; fall through */
3192 break; /* Need "default" setting to stop compiler warning. */
3193 }
3194
3195 /* In the pre-compile phase, just do the recognition. */
3196
3197 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3198 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3199
3200 /* We need to deal with \H, \h, \V, and \v in both phases because
3201 they use extra memory. */
3202
3203 if (-c == ESC_h)
3204 {
3205 SETBIT(classbits, 0x09); /* HT */
3206 SETBIT(classbits, 0x20); /* SPACE */
3207 SETBIT(classbits, 0xa0); /* NBSP */
3208 #ifdef SUPPORT_UTF8
3209 if (utf8)
3210 {
3211 class_utf8 = TRUE;
3212 *class_utf8data++ = XCL_SINGLE;
3213 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3214 *class_utf8data++ = XCL_SINGLE;
3215 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3216 *class_utf8data++ = XCL_RANGE;
3217 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3218 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3219 *class_utf8data++ = XCL_SINGLE;
3220 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3221 *class_utf8data++ = XCL_SINGLE;
3222 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3223 *class_utf8data++ = XCL_SINGLE;
3224 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3225 }
3226 #endif
3227 continue;
3228 }
3229
3230 if (-c == ESC_H)
3231 {
3232 for (c = 0; c < 32; c++)
3233 {
3234 int x = 0xff;
3235 switch (c)
3236 {
3237 case 0x09/8: x ^= 1 << (0x09%8); break;
3238 case 0x20/8: x ^= 1 << (0x20%8); break;
3239 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3240 default: break;
3241 }
3242 classbits[c] |= x;
3243 }
3244
3245 #ifdef SUPPORT_UTF8
3246 if (utf8)
3247 {
3248 class_utf8 = TRUE;
3249 *class_utf8data++ = XCL_RANGE;
3250 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3251 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3252 *class_utf8data++ = XCL_RANGE;
3253 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3254 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3255 *class_utf8data++ = XCL_RANGE;
3256 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3257 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3258 *class_utf8data++ = XCL_RANGE;
3259 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3260 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3261 *class_utf8data++ = XCL_RANGE;
3262 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3263 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3264 *class_utf8data++ = XCL_RANGE;
3265 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3266 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3267 *class_utf8data++ = XCL_RANGE;
3268 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3269 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3270 }
3271 #endif
3272 continue;
3273 }
3274
3275 if (-c == ESC_v)
3276 {
3277 SETBIT(classbits, 0x0a); /* LF */
3278 SETBIT(classbits, 0x0b); /* VT */
3279 SETBIT(classbits, 0x0c); /* FF */
3280 SETBIT(classbits, 0x0d); /* CR */
3281 SETBIT(classbits, 0x85); /* NEL */
3282 #ifdef SUPPORT_UTF8
3283 if (utf8)
3284 {
3285 class_utf8 = TRUE;
3286 *class_utf8data++ = XCL_RANGE;
3287 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3288 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3289 }
3290 #endif
3291 continue;
3292 }
3293
3294 if (-c == ESC_V)
3295 {
3296 for (c = 0; c < 32; c++)
3297 {
3298 int x = 0xff;
3299 switch (c)
3300 {
3301 case 0x0a/8: x ^= 1 << (0x0a%8);
3302 x ^= 1 << (0x0b%8);
3303 x ^= 1 << (0x0c%8);
3304 x ^= 1 << (0x0d%8);
3305 break;
3306 case 0x85/8: x ^= 1 << (0x85%8); break;
3307 default: break;
3308 }
3309 classbits[c] |= x;
3310 }
3311
3312 #ifdef SUPPORT_UTF8
3313 if (utf8)
3314 {
3315 class_utf8 = TRUE;
3316 *class_utf8data++ = XCL_RANGE;
3317 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3318 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3319 *class_utf8data++ = XCL_RANGE;
3320 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3321 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3322 }
3323 #endif
3324 continue;
3325 }
3326
3327 /* We need to deal with \P and \p in both phases. */
3328
3329 #ifdef SUPPORT_UCP
3330 if (-c == ESC_p || -c == ESC_P)
3331 {
3332 BOOL negated;
3333 int pdata;
3334 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3335 if (ptype < 0) goto FAILED;
3336 class_utf8 = TRUE;
3337 *class_utf8data++ = ((-c == ESC_p) != negated)?
3338 XCL_PROP : XCL_NOTPROP;
3339 *class_utf8data++ = ptype;
3340 *class_utf8data++ = pdata;
3341 class_charcount -= 2; /* Not a < 256 character */
3342 continue;
3343 }
3344 #endif
3345 /* Unrecognized escapes are faulted if PCRE is running in its
3346 strict mode. By default, for compatibility with Perl, they are
3347 treated as literals. */
3348
3349 if ((options & PCRE_EXTRA) != 0)
3350 {
3351 *errorcodeptr = ERR7;
3352 goto FAILED;
3353 }
3354
3355 class_charcount -= 2; /* Undo the default count from above */
3356 c = *ptr; /* Get the final character and fall through */
3357 }
3358
3359 /* Fall through if we have a single character (c >= 0). This may be
3360 greater than 256 in UTF-8 mode. */
3361
3362 } /* End of backslash handling */
3363
3364 /* A single character may be followed by '-' to form a range. However,
3365 Perl does not permit ']' to be the end of the range. A '-' character
3366 at the end is treated as a literal. Perl ignores orphaned \E sequences
3367 entirely. The code for handling \Q and \E is messy. */
3368
3369 CHECK_RANGE:
3370 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3371 {
3372 inescq = FALSE;
3373 ptr += 2;
3374 }
3375
3376 oldptr = ptr;
3377
3378 /* Remember \r or \n */
3379
3380 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3381
3382 /* Check for range */
3383
3384 if (!inescq && ptr[1] == CHAR_MINUS)
3385 {
3386 int d;
3387 ptr += 2;
3388 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3389
3390 /* If we hit \Q (not followed by \E) at this point, go into escaped
3391 mode. */
3392
3393 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3394 {
3395 ptr += 2;
3396 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3397 { ptr += 2; continue; }
3398 inescq = TRUE;
3399 break;
3400 }
3401
3402 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3403 {
3404 ptr = oldptr;
3405 goto LONE_SINGLE_CHARACTER;
3406 }
3407
3408 #ifdef SUPPORT_UTF8
3409 if (utf8)
3410 { /* Braces are required because the */
3411 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3412 }
3413 else
3414 #endif
3415 d = *ptr; /* Not UTF-8 mode */
3416
3417 /* The second part of a range can be a single-character escape, but
3418 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3419 in such circumstances. */
3420
3421 if (!inescq && d == CHAR_BACKSLASH)
3422 {
3423 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3424 if (*errorcodeptr != 0) goto FAILED;
3425
3426 /* \b is backspace; \X is literal X; \R is literal R; any other
3427 special means the '-' was literal */
3428
3429 if (d < 0)
3430 {
3431 if (d == -ESC_b) d = CHAR_BS;
3432 else if (d == -ESC_X) d = CHAR_X;
3433 else if (d == -ESC_R) d = CHAR_R; else
3434 {
3435 ptr = oldptr;
3436 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3437 }
3438 }
3439 }
3440
3441 /* Check that the two values are in the correct order. Optimize
3442 one-character ranges */
3443
3444 if (d < c)
3445 {
3446 *errorcodeptr = ERR8;
3447 goto FAILED;
3448 }
3449
3450 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3451
3452 /* Remember \r or \n */
3453
3454 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3455
3456 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3457 matching, we have to use an XCLASS with extra data items. Caseless
3458 matching for characters > 127 is available only if UCP support is
3459 available. */
3460
3461 #ifdef SUPPORT_UTF8
3462 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3463 {
3464 class_utf8 = TRUE;
3465
3466 /* With UCP support, we can find the other case equivalents of
3467 the relevant characters. There may be several ranges. Optimize how
3468 they fit with the basic range. */
3469
3470 #ifdef SUPPORT_UCP
3471 if ((options & PCRE_CASELESS) != 0)
3472 {
3473 unsigned int occ, ocd;
3474 unsigned int cc = c;
3475 unsigned int origd = d;
3476 while (get_othercase_range(&cc, origd, &occ, &ocd))
3477 {
3478 if (occ >= (unsigned int)c &&
3479 ocd <= (unsigned int)d)
3480 continue; /* Skip embedded ranges */
3481
3482 if (occ < (unsigned int)c &&
3483 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3484 { /* if there is overlap, */
3485 c = occ; /* noting that if occ < c */
3486 continue; /* we can't have ocd > d */
3487 } /* because a subrange is */
3488 if (ocd > (unsigned int)d &&
3489 occ <= (unsigned int)d + 1) /* always shorter than */
3490 { /* the basic range. */
3491 d = ocd;
3492 continue;
3493 }
3494
3495 if (occ == ocd)
3496 {
3497 *class_utf8data++ = XCL_SINGLE;
3498 }
3499 else
3500 {
3501 *class_utf8data++ = XCL_RANGE;
3502 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3503 }
3504 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3505 }
3506 }
3507 #endif /* SUPPORT_UCP */
3508
3509 /* Now record the original range, possibly modified for UCP caseless
3510 overlapping ranges. */
3511
3512 *class_utf8data++ = XCL_RANGE;
3513 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3514 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3515
3516 /* With UCP support, we are done. Without UCP support, there is no
3517 caseless matching for UTF-8 characters > 127; we can use the bit map
3518 for the smaller ones. */
3519
3520 #ifdef SUPPORT_UCP
3521 continue; /* With next character in the class */
3522 #else
3523 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3524
3525 /* Adjust upper limit and fall through to set up the map */
3526
3527 d = 127;
3528
3529 #endif /* SUPPORT_UCP */
3530 }
3531 #endif /* SUPPORT_UTF8 */
3532
3533 /* We use the bit map for all cases when not in UTF-8 mode; else
3534 ranges that lie entirely within 0-127 when there is UCP support; else
3535 for partial ranges without UCP support. */
3536
3537 class_charcount += d - c + 1;
3538 class_lastchar = d;
3539
3540 /* We can save a bit of time by skipping this in the pre-compile. */
3541
3542 if (lengthptr == NULL) for (; c <= d; c++)
3543 {
3544 classbits[c/8] |= (1 << (c&7));
3545 if ((options & PCRE_CASELESS) != 0)
3546 {
3547 int uc = cd->fcc[c]; /* flip case */
3548 classbits[uc/8] |= (1 << (uc&7));
3549 }
3550 }
3551
3552 continue; /* Go get the next char in the class */
3553 }
3554
3555 /* Handle a lone single character - we can get here for a normal
3556 non-escape char, or after \ that introduces a single character or for an
3557 apparent range that isn't. */
3558
3559 LONE_SINGLE_CHARACTER:
3560
3561 /* Handle a character that cannot go in the bit map */
3562
3563 #ifdef SUPPORT_UTF8
3564 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3565 {
3566 class_utf8 = TRUE;
3567 *class_utf8data++ = XCL_SINGLE;
3568 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3569
3570 #ifdef SUPPORT_UCP
3571 if ((options & PCRE_CASELESS) != 0)
3572 {
3573 unsigned int othercase;
3574 if ((othercase = UCD_OTHERCASE(c)) != c)
3575 {
3576 *class_utf8data++ = XCL_SINGLE;
3577 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3578 }
3579 }
3580 #endif /* SUPPORT_UCP */
3581
3582 }
3583 else
3584 #endif /* SUPPORT_UTF8 */
3585
3586 /* Handle a single-byte character */
3587 {
3588 classbits[c/8] |= (1 << (c&7));
3589 if ((options & PCRE_CASELESS) != 0)
3590 {
3591 c = cd->fcc[c]; /* flip case */
3592 classbits[c/8] |= (1 << (c&7));
3593 }
3594 class_charcount++;
3595 class_lastchar = c;
3596 }
3597 }
3598
3599 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3600
3601 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3602
3603 if (c == 0) /* Missing terminating ']' */
3604 {
3605 *errorcodeptr = ERR6;
3606 goto FAILED;
3607 }
3608
3609
3610 /* This code has been disabled because it would mean that \s counts as
3611 an explicit \r or \n reference, and that's not really what is wanted. Now
3612 we set the flag only if there is a literal "\r" or "\n" in the class. */
3613
3614 #if 0
3615 /* Remember whether \r or \n are in this class */
3616
3617 if (negate_class)
3618 {
3619 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3620 }
3621 else
3622 {
3623 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3624 }
3625 #endif
3626
3627
3628 /* If class_charcount is 1, we saw precisely one character whose value is
3629 less than 256. As long as there were no characters >= 128 and there was no
3630 use of \p or \P, in other words, no use of any XCLASS features, we can
3631 optimize.
3632
3633 In UTF-8 mode, we can optimize the negative case only if there were no
3634 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3635 operate on single-bytes only. This is an historical hangover. Maybe one day
3636 we can tidy these opcodes to handle multi-byte characters.
3637
3638 The optimization throws away the bit map. We turn the item into a
3639 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3640 that OP_NOT does not support multibyte characters. In the positive case, it
3641 can cause firstbyte to be set. Otherwise, there can be no first char if
3642 this item is first, whatever repeat count may follow. In the case of
3643 reqbyte, save the previous value for reinstating. */
3644
3645 #ifdef SUPPORT_UTF8
3646 if (class_charcount == 1 && !class_utf8 &&
3647 (!utf8 || !negate_class || class_lastchar < 128))
3648 #else
3649 if (class_charcount == 1)
3650 #endif
3651 {
3652 zeroreqbyte = reqbyte;
3653
3654 /* The OP_NOT opcode works on one-byte characters only. */
3655
3656 if (negate_class)
3657 {
3658 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3659 zerofirstbyte = firstbyte;
3660 *code++ = OP_NOT;
3661 *code++ = class_lastchar;
3662 break;
3663 }
3664
3665 /* For a single, positive character, get the value into mcbuffer, and
3666 then we can handle this with the normal one-character code. */
3667
3668 #ifdef SUPPORT_UTF8
3669 if (utf8 && class_lastchar > 127)
3670 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3671 else
3672 #endif
3673 {
3674 mcbuffer[0] = class_lastchar;
3675 mclength = 1;
3676 }
3677 goto ONE_CHAR;
3678 } /* End of 1-char optimization */
3679
3680 /* The general case - not the one-char optimization. If this is the first
3681 thing in the branch, there can be no first char setting, whatever the
3682 repeat count. Any reqbyte setting must remain unchanged after any kind of
3683 repeat. */
3684
3685 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3686 zerofirstbyte = firstbyte;
3687 zeroreqbyte = reqbyte;
3688
3689 /* If there are characters with values > 255, we have to compile an
3690 extended class, with its own opcode, unless there was a negated special
3691 such as \S in the class, because in that case all characters > 255 are in
3692 the class, so any that were explicitly given as well can be ignored. If
3693 (when there are explicit characters > 255 that must be listed) there are no
3694 characters < 256, we can omit the bitmap in the actual compiled code. */
3695
3696 #ifdef SUPPORT_UTF8
3697 if (class_utf8 && !should_flip_negation)
3698 {
3699 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3700 *code++ = OP_XCLASS;
3701 code += LINK_SIZE;
3702 *code = negate_class? XCL_NOT : 0;
3703
3704 /* If the map is required, move up the extra data to make room for it;
3705 otherwise just move the code pointer to the end of the extra data. */
3706
3707 if (class_charcount > 0)
3708 {
3709 *code++ |= XCL_MAP;
3710 memmove(code + 32, code, class_utf8data - code);
3711 memcpy(code, classbits, 32);
3712 code = class_utf8data + 32;
3713 }
3714 else code = class_utf8data;
3715
3716 /* Now fill in the complete length of the item */
3717
3718 PUT(previous, 1, code - previous);
3719 break; /* End of class handling */
3720 }
3721 #endif
3722
3723 /* If there are no characters > 255, set the opcode to OP_CLASS or
3724 OP_NCLASS, depending on whether the whole class was negated and whether
3725 there were negative specials such as \S in the class. Then copy the 32-byte
3726 map into the code vector, negating it if necessary. */
3727
3728 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3729 if (negate_class)
3730 {
3731 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3732 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3733 }
3734 else
3735 {
3736 memcpy(code, classbits, 32);
3737 }
3738 code += 32;
3739 break;
3740
3741
3742 /* ===================================================================*/
3743 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3744 has been tested above. */
3745
3746 case CHAR_LEFT_CURLY_BRACKET:
3747 if (!is_quantifier) goto NORMAL_CHAR;
3748 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3749 if (*errorcodeptr != 0) goto FAILED;
3750 goto REPEAT;
3751
3752 case CHAR_ASTERISK:
3753 repeat_min = 0;
3754 repeat_max = -1;
3755 goto REPEAT;
3756
3757 case CHAR_PLUS:
3758 repeat_min = 1;
3759 repeat_max = -1;
3760 goto REPEAT;
3761
3762 case CHAR_QUESTION_MARK:
3763 repeat_min = 0;
3764 repeat_max = 1;
3765
3766 REPEAT:
3767 if (previous == NULL)
3768 {
3769 *errorcodeptr = ERR9;
3770 goto FAILED;
3771 }
3772
3773 if (repeat_min == 0)
3774 {
3775 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3776 reqbyte = zeroreqbyte; /* Ditto */
3777 }
3778
3779 /* Remember whether this is a variable length repeat */
3780
3781 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3782
3783 op_type = 0; /* Default single-char op codes */
3784 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3785
3786 /* Save start of previous item, in case we have to move it up to make space
3787 for an inserted OP_ONCE for the additional '+' extension. */
3788
3789 tempcode = previous;
3790
3791 /* If the next character is '+', we have a possessive quantifier. This
3792 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3793 If the next character is '?' this is a minimizing repeat, by default,
3794 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3795 repeat type to the non-default. */
3796
3797 if (ptr[1] == CHAR_PLUS)
3798 {
3799 repeat_type = 0; /* Force greedy */
3800 possessive_quantifier = TRUE;
3801 ptr++;
3802 }
3803 else if (ptr[1] == CHAR_QUESTION_MARK)
3804 {
3805 repeat_type = greedy_non_default;
3806 ptr++;
3807 }
3808 else repeat_type = greedy_default;
3809
3810 /* If previous was a character match, abolish the item and generate a
3811 repeat item instead. If a char item has a minimum of more than one, ensure
3812 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3813 the first thing in a branch because the x will have gone into firstbyte
3814 instead. */
3815
3816 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3817 {
3818 /* Deal with UTF-8 characters that take up more than one byte. It's
3819 easier to write this out separately than try to macrify it. Use c to
3820 hold the length of the character in bytes, plus 0x80 to flag that it's a
3821 length rather than a small character. */
3822
3823 #ifdef SUPPORT_UTF8
3824 if (utf8 && (code[-1] & 0x80) != 0)
3825 {
3826 uschar *lastchar = code - 1;
3827 while((*lastchar & 0xc0) == 0x80) lastchar--;
3828 c = code - lastchar; /* Length of UTF-8 character */
3829 memcpy(utf8_char, lastchar, c); /* Save the char */
3830 c |= 0x80; /* Flag c as a length */
3831 }
3832 else
3833 #endif
3834
3835 /* Handle the case of a single byte - either with no UTF8 support, or
3836 with UTF-8 disabled, or for a UTF-8 character < 128. */
3837
3838 {
3839 c = code[-1];
3840 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3841 }
3842
3843 /* If the repetition is unlimited, it pays to see if the next thing on
3844 the line is something that cannot possibly match this character. If so,
3845 automatically possessifying this item gains some performance in the case
3846 where the match fails. */
3847
3848 if (!possessive_quantifier &&
3849 repeat_max < 0 &&
3850 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3851 options, cd))
3852 {
3853 repeat_type = 0; /* Force greedy */
3854 possessive_quantifier = TRUE;
3855 }
3856
3857 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3858 }
3859
3860 /* If previous was a single negated character ([^a] or similar), we use
3861 one of the special opcodes, replacing it. The code is shared with single-
3862 character repeats by setting op_type to add a suitable offset into
3863 repeat_type. We can also test for auto-possessification. OP_NOT is
3864 currently used only for single-byte chars. */
3865
3866 else if (*previous == OP_NOT)
3867 {
3868 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3869 c = previous[1];
3870 if (!possessive_quantifier &&
3871 repeat_max < 0 &&
3872 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3873 {
3874 repeat_type = 0; /* Force greedy */
3875 possessive_quantifier = TRUE;
3876 }
3877 goto OUTPUT_SINGLE_REPEAT;
3878 }
3879
3880 /* If previous was a character type match (\d or similar), abolish it and
3881 create a suitable repeat item. The code is shared with single-character
3882 repeats by setting op_type to add a suitable offset into repeat_type. Note
3883 that the Unicode property types will be present only when SUPPORT_UCP is
3884 defined, but we don't wrap the little bits of code here because it just
3885 makes it horribly messy. */
3886
3887 else if (*previous < OP_EODN)
3888 {
3889 uschar *oldcode;
3890 int prop_type, prop_value;
3891 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3892 c = *previous;
3893
3894 if (!possessive_quantifier &&
3895 repeat_max < 0 &&
3896 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3897 {
3898 repeat_type = 0; /* Force greedy */
3899 possessive_quantifier = TRUE;
3900 }
3901
3902 OUTPUT_SINGLE_REPEAT:
3903 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3904 {
3905 prop_type = previous[1];
3906 prop_value = previous[2];
3907 }
3908 else prop_type = prop_value = -1;
3909
3910 oldcode = code;
3911 code = previous; /* Usually overwrite previous item */
3912
3913 /* If the maximum is zero then the minimum must also be zero; Perl allows
3914 this case, so we do too - by simply omitting the item altogether. */
3915
3916 if (repeat_max == 0) goto END_REPEAT;
3917
3918 /*--------------------------------------------------------------------*/
3919 /* This code is obsolete from release 8.00; the restriction was finally
3920 removed: */
3921
3922 /* All real repeats make it impossible to handle partial matching (maybe
3923 one day we will be able to remove this restriction). */
3924
3925 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3926 /*--------------------------------------------------------------------*/
3927
3928 /* Combine the op_type with the repeat_type */
3929
3930 repeat_type += op_type;
3931
3932 /* A minimum of zero is handled either as the special case * or ?, or as
3933 an UPTO, with the maximum given. */
3934
3935 if (repeat_min == 0)
3936 {
3937 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3938 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3939 else
3940 {
3941 *code++ = OP_UPTO + repeat_type;
3942 PUT2INC(code, 0, repeat_max);
3943 }
3944 }
3945
3946 /* A repeat minimum of 1 is optimized into some special cases. If the
3947 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3948 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3949 one less than the maximum. */
3950
3951 else if (repeat_min == 1)
3952 {
3953 if (repeat_max == -1)
3954 *code++ = OP_PLUS + repeat_type;
3955 else
3956 {
3957 code = oldcode; /* leave previous item in place */
3958 if (repeat_max == 1) goto END_REPEAT;
3959 *code++ = OP_UPTO + repeat_type;
3960 PUT2INC(code, 0, repeat_max - 1);
3961 }
3962 }
3963
3964 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3965 handled as an EXACT followed by an UPTO. */
3966
3967 else
3968 {
3969 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3970 PUT2INC(code, 0, repeat_min);
3971
3972 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3973 we have to insert the character for the previous code. For a repeated
3974 Unicode property match, there are two extra bytes that define the
3975 required property. In UTF-8 mode, long characters have their length in
3976 c, with the 0x80 bit as a flag. */
3977
3978 if (repeat_max < 0)
3979 {
3980 #ifdef SUPPORT_UTF8
3981 if (utf8 && c >= 128)
3982 {
3983 memcpy(code, utf8_char, c & 7);
3984 code += c & 7;
3985 }
3986 else
3987 #endif
3988 {
3989 *code++ = c;
3990 if (prop_type >= 0)
3991 {
3992 *code++ = prop_type;
3993 *code++ = prop_value;
3994 }
3995 }
3996 *code++ = OP_STAR + repeat_type;
3997 }
3998
3999 /* Else insert an UPTO if the max is greater than the min, again
4000 preceded by the character, for the previously inserted code. If the
4001 UPTO is just for 1 instance, we can use QUERY instead. */
4002
4003 else if (repeat_max != repeat_min)
4004 {
4005 #ifdef SUPPORT_UTF8
4006 if (utf8 && c >= 128)
4007 {
4008 memcpy(code, utf8_char, c & 7);
4009 code += c & 7;
4010 }
4011 else
4012 #endif
4013 *code++ = c;
4014 if (prop_type >= 0)
4015 {
4016 *code++ = prop_type;
4017 *code++ = prop_value;
4018 }
4019 repeat_max -= repeat_min;
4020
4021 if (repeat_max == 1)
4022 {
4023 *code++ = OP_QUERY + repeat_type;
4024 }
4025 else
4026 {
4027 *code++ = OP_UPTO + repeat_type;
4028 PUT2INC(code, 0, repeat_max);
4029 }
4030 }
4031 }
4032
4033 /* The character or character type itself comes last in all cases. */
4034
4035 #ifdef SUPPORT_UTF8
4036 if (utf8 && c >= 128)
4037 {
4038 memcpy(code, utf8_char, c & 7);
4039 code += c & 7;
4040 }
4041 else
4042 #endif
4043 *code++ = c;
4044
4045 /* For a repeated Unicode property match, there are two extra bytes that
4046 define the required property. */
4047
4048 #ifdef SUPPORT_UCP
4049 if (prop_type >= 0)
4050 {
4051 *code++ = prop_type;
4052 *code++ = prop_value;
4053 }
4054 #endif
4055 }
4056
4057 /* If previous was a character class or a back reference, we put the repeat
4058 stuff after it, but just skip the item if the repeat was {0,0}. */
4059
4060 else if (*previous == OP_CLASS ||
4061 *previous == OP_NCLASS ||
4062 #ifdef SUPPORT_UTF8
4063 *previous == OP_XCLASS ||
4064 #endif
4065 *previous == OP_REF)
4066 {
4067 if (repeat_max == 0)
4068 {
4069 code = previous;
4070 goto END_REPEAT;
4071 }
4072
4073 /*--------------------------------------------------------------------*/
4074 /* This code is obsolete from release 8.00; the restriction was finally
4075 removed: */
4076
4077 /* All real repeats make it impossible to handle partial matching (maybe
4078 one day we will be able to remove this restriction). */
4079
4080 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4081 /*--------------------------------------------------------------------*/
4082
4083 if (repeat_min == 0 && repeat_max == -1)
4084 *code++ = OP_CRSTAR + repeat_type;
4085 else if (repeat_min == 1 && repeat_max == -1)
4086 *code++ = OP_CRPLUS + repeat_type;
4087 else if (repeat_min == 0 && repeat_max == 1)
4088 *code++ = OP_CRQUERY + repeat_type;
4089 else
4090 {
4091 *code++ = OP_CRRANGE + repeat_type;
4092 PUT2INC(code, 0, repeat_min);
4093 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4094 PUT2INC(code, 0, repeat_max);
4095 }
4096 }
4097
4098 /* If previous was a bracket group, we may have to replicate it in certain
4099 cases. */
4100
4101 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4102 *previous == OP_ONCE || *previous == OP_COND)
4103 {
4104 register int i;
4105 int ketoffset = 0;
4106 int len = code - previous;
4107 uschar *bralink = NULL;
4108
4109 /* Repeating a DEFINE group is pointless */
4110
4111 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4112 {
4113 *errorcodeptr = ERR55;
4114 goto FAILED;
4115 }
4116
4117 /* If the maximum repeat count is unlimited, find the end of the bracket
4118 by scanning through from the start, and compute the offset back to it
4119 from the current code pointer. There may be an OP_OPT setting following
4120 the final KET, so we can't find the end just by going back from the code
4121 pointer. */
4122
4123 if (repeat_max == -1)
4124 {
4125 register uschar *ket = previous;
4126 do ket += GET(ket, 1); while (*ket != OP_KET);
4127 ketoffset = code - ket;
4128 }
4129
4130 /* The case of a zero minimum is special because of the need to stick
4131 OP_BRAZERO in front of it, and because the group appears once in the
4132 data, whereas in other cases it appears the minimum number of times. For
4133 this reason, it is simplest to treat this case separately, as otherwise
4134 the code gets far too messy. There are several special subcases when the
4135 minimum is zero. */
4136
4137 if (repeat_min == 0)
4138 {
4139 /* If the maximum is also zero, we used to just omit the group from the
4140 output altogether, like this:
4141
4142 ** if (repeat_max == 0)
4143 ** {
4144 ** code = previous;
4145 ** goto END_REPEAT;
4146 ** }
4147
4148 However, that fails when a group is referenced as a subroutine from
4149 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4150 so that it is skipped on execution. As we don't have a list of which
4151 groups are referenced, we cannot do this selectively.
4152
4153 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4154 and do no more at this point. However, we do need to adjust any
4155 OP_RECURSE calls inside the group that refer to the group itself or any
4156 internal or forward referenced group, because the offset is from the
4157 start of the whole regex. Temporarily terminate the pattern while doing
4158 this. */
4159
4160 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4161 {
4162 *code = OP_END;
4163 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4164 memmove(previous+1, previous, len);
4165 code++;
4166 if (repeat_max == 0)
4167 {
4168 *previous++ = OP_SKIPZERO;
4169 goto END_REPEAT;
4170 }
4171 *previous++ = OP_BRAZERO + repeat_type;
4172 }
4173
4174 /* If the maximum is greater than 1 and limited, we have to replicate
4175 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4176 The first one has to be handled carefully because it's the original
4177 copy, which has to be moved up. The remainder can be handled by code
4178 that is common with the non-zero minimum case below. We have to
4179 adjust the value of repeat_max, since one less copy is required. Once
4180 again, we may have to adjust any OP_RECURSE calls inside the group. */
4181
4182 else
4183 {
4184 int offset;
4185 *code = OP_END;
4186 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4187 memmove(previous + 2 + LINK_SIZE, previous, len);
4188 code += 2 + LINK_SIZE;
4189 *previous++ = OP_BRAZERO + repeat_type;
4190 *previous++ = OP_BRA;
4191
4192 /* We chain together the bracket offset fields that have to be
4193 filled in later when the ends of the brackets are reached. */
4194
4195 offset = (bralink == NULL)? 0 : previous - bralink;
4196 bralink = previous;
4197 PUTINC(previous, 0, offset);
4198 }
4199
4200 repeat_max--;
4201 }
4202
4203 /* If the minimum is greater than zero, replicate the group as many
4204 times as necessary, and adjust the maximum to the number of subsequent
4205 copies that we need. If we set a first char from the group, and didn't
4206 set a required char, copy the latter from the former. If there are any
4207 forward reference subroutine calls in the group, there will be entries on
4208 the workspace list; replicate these with an appropriate increment. */
4209
4210 else
4211 {
4212 if (repeat_min > 1)
4213 {
4214 /* In the pre-compile phase, we don't actually do the replication. We
4215 just adjust the length as if we had. Do some paranoid checks for
4216 potential integer overflow. */
4217
4218 if (lengthptr != NULL)
4219 {
4220 int delta = (repeat_min - 1)*length_prevgroup;
4221 if ((double)(repeat_min - 1)*(double)length_prevgroup >
4222 (double)INT_MAX ||
4223 OFLOW_MAX - *lengthptr < delta)
4224 {
4225 *errorcodeptr = ERR20;
4226 goto FAILED;
4227 }
4228 *lengthptr += delta;
4229 }
4230
4231 /* This is compiling for real */
4232
4233 else
4234 {
4235 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4236 for (i = 1; i < repeat_min; i++)
4237 {
4238 uschar *hc;
4239 uschar *this_hwm = cd->hwm;
4240 memcpy(code, previous, len);
4241 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4242 {
4243 PUT(cd->hwm, 0, GET(hc, 0) + len);
4244 cd->hwm += LINK_SIZE;
4245 }
4246 save_hwm = this_hwm;
4247 code += len;
4248 }
4249 }
4250 }
4251
4252 if (repeat_max > 0) repeat_max -= repeat_min;
4253 }
4254
4255 /* This code is common to both the zero and non-zero minimum cases. If
4256 the maximum is limited, it replicates the group in a nested fashion,
4257 remembering the bracket starts on a stack. In the case of a zero minimum,
4258 the first one was set up above. In all cases the repeat_max now specifies
4259 the number of additional copies needed. Again, we must remember to
4260 replicate entries on the forward reference list. */
4261
4262 if (repeat_max >= 0)
4263 {
4264 /* In the pre-compile phase, we don't actually do the replication. We
4265 just adjust the length as if we had. For each repetition we must add 1
4266 to the length for BRAZERO and for all but the last repetition we must
4267 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
4268 paranoid checks to avoid integer overflow. */
4269
4270 if (lengthptr != NULL && repeat_max > 0)
4271 {
4272 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4273 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4274 if ((double)repeat_max *
4275 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4276 > (double)INT_MAX ||
4277 OFLOW_MAX - *lengthptr < delta)
4278 {
4279 *errorcodeptr = ERR20;
4280 goto FAILED;
4281 }
4282 *lengthptr += delta;
4283 }
4284
4285 /* This is compiling for real */
4286
4287 else for (i = repeat_max - 1; i >= 0; i--)
4288 {
4289 uschar *hc;
4290 uschar *this_hwm = cd->hwm;
4291
4292 *code++ = OP_BRAZERO + repeat_type;
4293
4294 /* All but the final copy start a new nesting, maintaining the
4295 chain of brackets outstanding. */
4296
4297 if (i != 0)
4298 {
4299 int offset;
4300 *code++ = OP_BRA;
4301 offset = (bralink == NULL)? 0 : code - bralink;
4302 bralink = code;
4303 PUTINC(code, 0, offset);
4304 }
4305
4306 memcpy(code, previous, len);
4307 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4308 {
4309 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4310 cd->hwm += LINK_SIZE;
4311 }
4312 save_hwm = this_hwm;
4313 code += len;
4314 }
4315
4316 /* Now chain through the pending brackets, and fill in their length
4317 fields (which are holding the chain links pro tem). */
4318
4319 while (bralink != NULL)
4320 {
4321 int oldlinkoffset;
4322 int offset = code - bralink + 1;
4323 uschar *bra = code - offset;
4324 oldlinkoffset = GET(bra, 1);
4325 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4326 *code++ = OP_KET;
4327 PUTINC(code, 0, offset);
4328 PUT(bra, 1, offset);
4329 }
4330 }
4331
4332 /* If the maximum is unlimited, set a repeater in the final copy. We
4333 can't just offset backwards from the current code point, because we
4334 don't know if there's been an options resetting after the ket. The
4335 correct offset was computed above.
4336
4337 Then, when we are doing the actual compile phase, check to see whether
4338 this group is a non-atomic one that could match an empty string. If so,
4339 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4340 that runtime checking can be done. [This check is also applied to
4341 atomic groups at runtime, but in a different way.] */
4342
4343 else
4344 {
4345 uschar *ketcode = code - ketoffset;
4346 uschar *bracode = ketcode - GET(ketcode, 1);
4347 *ketcode = OP_KETRMAX + repeat_type;
4348 if (lengthptr == NULL && *bracode != OP_ONCE)
4349 {
4350 uschar *scode = bracode;
4351 do
4352 {
4353 if (could_be_empty_branch(scode, ketcode, utf8))
4354 {
4355 *bracode += OP_SBRA - OP_BRA;
4356 break;
4357 }
4358 scode += GET(scode, 1);
4359 }
4360 while (*scode == OP_ALT);
4361 }
4362 }
4363 }
4364
4365 /* If previous is OP_FAIL, it was generated by an empty class [] in
4366 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4367 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4368 error above. We can just ignore the repeat in JS case. */
4369
4370 else if (*previous == OP_FAIL) goto END_REPEAT;
4371
4372 /* Else there's some kind of shambles */
4373
4374 else
4375 {
4376 *errorcodeptr = ERR11;
4377 goto FAILED;
4378 }
4379
4380 /* If the character following a repeat is '+', or if certain optimization
4381 tests above succeeded, possessive_quantifier is TRUE. For some of the
4382 simpler opcodes, there is a special alternative opcode for this. For
4383 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4384 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4385 but the special opcodes can optimize it a bit. The repeated item starts at
4386 tempcode, not at previous, which might be the first part of a string whose
4387 (former) last char we repeated.
4388
4389 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4390 an 'upto' may follow. We skip over an 'exact' item, and then test the
4391 length of what remains before proceeding. */
4392
4393 if (possessive_quantifier)
4394 {
4395 int len;
4396
4397 if (*tempcode == OP_TYPEEXACT)
4398 tempcode += _pcre_OP_lengths[*tempcode] +
4399 ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4400
4401 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4402 {
4403 tempcode += _pcre_OP_lengths[*tempcode];
4404 #ifdef SUPPORT_UTF8
4405 if (utf8 && tempcode[-1] >= 0xc0)
4406 tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4407 #endif
4408 }
4409
4410 len = code - tempcode;
4411 if (len > 0) switch (*tempcode)
4412 {
4413 case OP_STAR: *tempcode = OP_POSSTAR; break;
4414 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4415 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4416 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4417
4418 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4419 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4420 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4421 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4422
4423 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4424 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4425 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4426 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4427
4428 default:
4429 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4430 code += 1 + LINK_SIZE;
4431 len += 1 + LINK_SIZE;
4432 tempcode[0] = OP_ONCE;
4433 *code++ = OP_KET;
4434 PUTINC(code, 0, len);
4435 PUT(tempcode, 1, len);
4436 break;
4437 }
4438 }
4439
4440 /* In all cases we no longer have a previous item. We also set the
4441 "follows varying string" flag for subsequently encountered reqbytes if
4442 it isn't already set and we have just passed a varying length item. */
4443
4444 END_REPEAT:
4445 previous = NULL;
4446 cd->req_varyopt |= reqvary;
4447 break;
4448
4449
4450 /* ===================================================================*/
4451 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4452 lookbehind or option setting or condition or all the other extended
4453 parenthesis forms. */
4454
4455 case CHAR_LEFT_PARENTHESIS:
4456 newoptions = options;
4457 skipbytes = 0;
4458 bravalue = OP_CBRA;
4459 save_hwm = cd->hwm;
4460 reset_bracount = FALSE;
4461
4462 /* First deal with various "verbs" that can be introduced by '*'. */
4463
4464 if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4465 {
4466 int i, namelen;
4467 const char *vn = verbnames;
4468 const uschar *name = ++ptr;
4469 previous = NULL;
4470 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4471 if (*ptr == CHAR_COLON)
4472 {
4473 *errorcodeptr = ERR59; /* Not supported */
4474 goto FAILED;
4475 }
4476 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4477 {
4478 *errorcodeptr = ERR60;
4479 goto FAILED;
4480 }
4481 namelen = ptr - name;
4482 for (i = 0; i < verbcount; i++)
4483 {
4484 if (namelen == verbs[i].len &&
4485 strncmp((char *)name, vn, namelen) == 0)
4486 {
4487 /* Check for open captures before ACCEPT */
4488
4489 if (verbs[i].op == OP_ACCEPT)
4490 {
4491 open_capitem *oc;
4492 cd->had_accept = TRUE;
4493 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4494 {
4495 *code++ = OP_CLOSE;
4496 PUT2INC(code, 0, oc->number);
4497 }
4498 }
4499 *code++ = verbs[i].op;
4500 break;
4501 }
4502 vn += verbs[i].len + 1;
4503 }
4504 if (i < verbcount) continue;
4505 *errorcodeptr = ERR60;
4506 goto FAILED;
4507 }
4508
4509 /* Deal with the extended parentheses; all are introduced by '?', and the
4510 appearance of any of them means that this is not a capturing group. */
4511
4512 else if (*ptr == CHAR_QUESTION_MARK)
4513 {
4514 int i, set, unset, namelen;
4515 int *optset;
4516 const uschar *name;
4517 uschar *slot;
4518
4519 switch (*(++ptr))
4520 {
4521 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4522 ptr++;
4523 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4524 if (*ptr == 0)
4525 {
4526 *errorcodeptr = ERR18;
4527 goto FAILED;
4528 }
4529 continue;
4530
4531
4532 /* ------------------------------------------------------------ */
4533 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4534 reset_bracount = TRUE;
4535 /* Fall through */
4536
4537 /* ------------------------------------------------------------ */
4538 case CHAR_COLON: /* Non-capturing bracket */
4539 bravalue = OP_BRA;
4540 ptr++;
4541 break;
4542
4543
4544 /* ------------------------------------------------------------ */
4545 case CHAR_LEFT_PARENTHESIS:
4546 bravalue = OP_COND; /* Conditional group */
4547
4548 /* A condition can be an assertion, a number (referring to a numbered
4549 group), a name (referring to a named group), or 'R', referring to
4550 recursion. R<digits> and R&name are also permitted for recursion tests.
4551
4552 There are several syntaxes for testing a named group: (?(name)) is used
4553 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4554
4555 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4556 be the recursive thing or the name 'R' (and similarly for 'R' followed
4557 by digits), and (b) a number could be a name that consists of digits.
4558 In both cases, we look for a name first; if not found, we try the other
4559 cases. */
4560
4561 /* For conditions that are assertions, check the syntax, and then exit
4562 the switch. This will take control down to where bracketed groups,
4563 including assertions, are processed. */
4564
4565 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4566 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4567 break;
4568
4569 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4570 below), and all need to skip 3 bytes at the start of the group. */
4571
4572 code[1+LINK_SIZE] = OP_CREF;
4573 skipbytes = 3;
4574 refsign = -1;
4575
4576 /* Check for a test for recursion in a named group. */
4577
4578 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4579 {
4580 terminator = -1;
4581 ptr += 2;
4582 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4583 }
4584
4585 /* Check for a test for a named group's having been set, using the Perl
4586 syntax (?(<name>) or (?('name') */
4587
4588 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4589 {
4590 terminator = CHAR_GREATER_THAN_SIGN;
4591 ptr++;
4592 }
4593 else if (ptr[1] == CHAR_APOSTROPHE)
4594 {
4595 terminator = CHAR_APOSTROPHE;
4596 ptr++;
4597 }
4598 else
4599 {
4600 terminator = 0;
4601 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4602 }
4603
4604 /* We now expect to read a name; anything else is an error */
4605
4606 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4607 {
4608 ptr += 1; /* To get the right offset */
4609 *errorcodeptr = ERR28;
4610 goto FAILED;
4611 }
4612
4613 /* Read the name, but also get it as a number if it's all digits */
4614
4615 recno = 0;
4616 name = ++ptr;
4617 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4618 {
4619 if (recno >= 0)
4620 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4621 recno * 10 + *ptr - CHAR_0 : -1;
4622 ptr++;
4623 }
4624 namelen = ptr - name;
4625
4626 if ((terminator > 0 && *ptr++ != terminator) ||
4627 *ptr++ != CHAR_RIGHT_PARENTHESIS)
4628 {
4629 ptr--; /* Error offset */
4630 *errorcodeptr = ERR26;
4631 goto FAILED;
4632 }
4633
4634 /* Do no further checking in the pre-compile phase. */
4635
4636 if (lengthptr != NULL) break;
4637
4638 /* In the real compile we do the work of looking for the actual
4639 reference. If the string started with "+" or "-" we require the rest to
4640 be digits, in which case recno will be set. */
4641
4642 if (refsign > 0)
4643 {
4644 if (recno <= 0)
4645 {
4646 *errorcodeptr = ERR58;
4647 goto FAILED;
4648 }
4649 recno = (refsign == CHAR_MINUS)?
4650 cd->bracount - recno + 1 : recno +cd->bracount;
4651 if (recno <= 0 || recno > cd->final_bracount)
4652 {
4653 *errorcodeptr = ERR15;
4654 goto FAILED;
4655 }
4656 PUT2(code, 2+LINK_SIZE, recno);
4657 break;
4658 }
4659
4660 /* Otherwise (did not start with "+" or "-"), start by looking for the
4661 name. If we find a name, add one to the opcode to change OP_CREF or
4662 OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
4663 except they record that the reference was originally to a name. The
4664 information is used to check duplicate names. */
4665
4666 slot = cd->name_table;
4667 for (i = 0; i < cd->names_found; i++)
4668 {
4669 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4670 slot += cd->name_entry_size;
4671 }
4672
4673 /* Found a previous named subpattern */
4674
4675 if (i < cd->names_found)
4676 {
4677 recno = GET2(slot, 0);
4678 PUT2(code, 2+LINK_SIZE, recno);
4679 code[1+LINK_SIZE]++;
4680 }
4681
4682 /* Search the pattern for a forward reference */
4683
4684 else if ((i = find_parens(cd, name, namelen,
4685 (options & PCRE_EXTENDED) != 0)) > 0)
4686 {
4687 PUT2(code, 2+LINK_SIZE, i);
4688 code[1+LINK_SIZE]++;
4689 }
4690
4691 /* If terminator == 0 it means that the name followed directly after
4692 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4693 some further alternatives to try. For the cases where terminator != 0
4694 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4695 now checked all the possibilities, so give an error. */
4696
4697 else if (terminator != 0)
4698 {
4699 *errorcodeptr = ERR15;
4700 goto FAILED;
4701 }
4702
4703 /* Check for (?(R) for recursion. Allow digits after R to specify a
4704 specific group number. */
4705
4706 else if (*name == CHAR_R)
4707 {
4708 recno = 0;
4709 for (i = 1; i < namelen; i++)
4710 {
4711 if ((digitab[name[i]] & ctype_digit) == 0)
4712 {
4713 *errorcodeptr = ERR15;
4714 goto FAILED;
4715 }
4716 recno = recno * 10 + name[i] - CHAR_0;
4717 }
4718 if (recno == 0) recno = RREF_ANY;
4719 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4720 PUT2(code, 2+LINK_SIZE, recno);
4721 }
4722
4723 /* Similarly, check for the (?(DEFINE) "condition", which is always
4724 false. */
4725
4726 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4727 {
4728 code[1+LINK_SIZE] = OP_DEF;
4729 skipbytes = 1;
4730 }
4731
4732 /* Check for the "name" actually being a subpattern number. We are
4733 in the second pass here, so final_bracount is set. */
4734
4735 else if (recno > 0 && recno <= cd->final_bracount)
4736 {
4737 PUT2(code, 2+LINK_SIZE, recno);
4738 }
4739
4740 /* Either an unidentified subpattern, or a reference to (?(0) */
4741
4742 else
4743 {
4744 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4745 goto FAILED;
4746 }
4747 break;
4748
4749
4750 /* ------------------------------------------------------------ */
4751 case CHAR_EQUALS_SIGN: /* Positive lookahead */
4752 bravalue = OP_ASSERT;
4753 ptr++;
4754 break;
4755
4756
4757 /* ------------------------------------------------------------ */
4758 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
4759 ptr++;
4760 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
4761 {
4762 *code++ = OP_FAIL;
4763 previous = NULL;
4764 continue;
4765 }
4766 bravalue = OP_ASSERT_NOT;
4767 break;
4768
4769
4770 /* ------------------------------------------------------------ */
4771 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
4772 switch (ptr[1])
4773 {
4774 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
4775 bravalue = OP_ASSERTBACK;
4776 ptr += 2;
4777 break;
4778
4779 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
4780 bravalue = OP_ASSERTBACK_NOT;
4781 ptr += 2;
4782 break;
4783
4784 default: /* Could be name define, else bad */
4785 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4786 ptr++; /* Correct offset for error */
4787 *errorcodeptr = ERR24;
4788 goto FAILED;
4789 }
4790 break;
4791
4792
4793 /* ------------------------------------------------------------ */
4794 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
4795 bravalue = OP_ONCE;
4796 ptr++;
4797 break;
4798
4799
4800 /* ------------------------------------------------------------ */
4801 case CHAR_C: /* Callout - may be followed by digits; */
4802 previous_callout = code; /* Save for later completion */
4803 after_manual_callout = 1; /* Skip one item before completing */
4804 *code++ = OP_CALLOUT;
4805 {
4806 int n = 0;
4807 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4808 n = n * 10 + *ptr - CHAR_0;
4809 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4810 {
4811 *errorcodeptr = ERR39;
4812 goto FAILED;
4813 }
4814 if (n > 255)
4815 {
4816 *errorcodeptr = ERR38;
4817 goto FAILED;
4818 }
4819 *code++ = n;
4820 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4821 PUT(code, LINK_SIZE, 0); /* Default length */
4822 code += 2 * LINK_SIZE;
4823 }
4824 previous = NULL;
4825 continue;
4826
4827
4828 /* ------------------------------------------------------------ */
4829 case CHAR_P: /* Python-style named subpattern handling */
4830 if (*(++ptr) == CHAR_EQUALS_SIGN ||
4831 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
4832 {
4833 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4834 terminator = CHAR_RIGHT_PARENTHESIS;
4835 goto NAMED_REF_OR_RECURSE;
4836 }
4837 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
4838 {
4839 *errorcodeptr = ERR41;
4840 goto FAILED;
4841 }
4842 /* Fall through to handle (?P< as (?< is handled */
4843
4844
4845 /* ------------------------------------------------------------ */
4846 DEFINE_NAME: /* Come here from (?< handling */
4847 case CHAR_APOSTROPHE:
4848 {
4849 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4850 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4851 name = ++ptr;
4852
4853 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4854 namelen = ptr - name;
4855
4856 /* In the pre-compile phase, just do a syntax check. */
4857
4858 if (lengthptr != NULL)
4859 {
4860 if (*ptr != terminator)
4861 {
4862 *errorcodeptr = ERR42;
4863 goto FAILED;
4864 }
4865 if (cd->names_found >= MAX_NAME_COUNT)
4866 {
4867 *errorcodeptr = ERR49;
4868 goto FAILED;
4869 }
4870 if (namelen + 3 > cd->name_entry_size)
4871 {
4872 cd->name_entry_size = namelen + 3;
4873 if (namelen > MAX_NAME_SIZE)
4874 {
4875 *errorcodeptr = ERR48;
4876 goto FAILED;
4877 }
4878 }
4879 }
4880
4881 /* In the real compile, create the entry in the table, maintaining
4882 alphabetical order. Duplicate names for different numbers are
4883 permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
4884 number are always OK. (An existing number can be re-used if (?|
4885 appears in the pattern.) In either event, a duplicate name results in
4886 a duplicate entry in the table, even if the number is the same. This
4887 is because the number of names, and hence the table size, is computed
4888 in the pre-compile, and it affects various numbers and pointers which
4889 would all have to be modified, and the compiled code moved down, if
4890 duplicates with the same number were omitted from the table. This
4891 doesn't seem worth the hassle. However, *different* names for the
4892 same number are not permitted. */
4893
4894 else
4895 {
4896 BOOL dupname = FALSE;
4897 slot = cd->name_table;
4898
4899 for (i = 0; i < cd->names_found; i++)
4900 {
4901 int crc = memcmp(name, slot+2, namelen);
4902 if (crc == 0)
4903 {
4904 if (slot[2+namelen] == 0)
4905 {
4906 if (GET2(slot, 0) != cd->bracount + 1 &&
4907 (options & PCRE_DUPNAMES) == 0)
4908 {
4909 *errorcodeptr = ERR43;
4910 goto FAILED;
4911 }
4912 else dupname = TRUE;
4913 }
4914 else crc = -1; /* Current name is a substring */
4915 }
4916
4917 /* Make space in the table and break the loop for an earlier
4918 name. For a duplicate or later name, carry on. We do this for
4919 duplicates so that in the simple case (when (?| is not used) they
4920 are in order of their numbers. */
4921
4922 if (crc < 0)
4923 {
4924 memmove(slot + cd->name_entry_size, slot,
4925 (cd->names_found - i) * cd->name_entry_size);
4926 break;
4927 }
4928
4929 /* Continue the loop for a later or duplicate name */
4930
4931 slot += cd->name_entry_size;
4932 }
4933
4934 /* For non-duplicate names, check for a duplicate number before
4935 adding the new name. */
4936
4937 if (!dupname)
4938 {
4939 uschar *cslot = cd->name_table;
4940 for (i = 0; i < cd->names_found; i++)
4941 {
4942 if (cslot != slot)
4943 {
4944 if (GET2(cslot, 0) == cd->bracount + 1)
4945 {
4946 *errorcodeptr = ERR65;
4947 goto FAILED;
4948 }
4949 }
4950 else i--;
4951 cslot += cd->name_entry_size;
4952 }
4953 }
4954
4955 PUT2(slot, 0, cd->bracount + 1);
4956 memcpy(slot + 2, name, namelen);
4957 slot[2+namelen] = 0;
4958 }
4959 }
4960
4961 /* In both pre-compile and compile, count the number of names we've
4962 encountered. */
4963
4964 cd->names_found++;
4965 ptr++; /* Move past > or ' */
4966 goto NUMBERED_GROUP;
4967
4968
4969 /* ------------------------------------------------------------ */
4970 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
4971 terminator = CHAR_RIGHT_PARENTHESIS;
4972 is_recurse = TRUE;
4973 /* Fall through */
4974
4975 /* We come here from the Python syntax above that handles both
4976 references (?P=name) and recursion (?P>name), as well as falling
4977 through from the Perl recursion syntax (?&name). We also come here from
4978 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4979 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4980
4981 NAMED_REF_OR_RECURSE:
4982 name = ++ptr;
4983 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4984 namelen = ptr - name;
4985
4986 /* In the pre-compile phase, do a syntax check and set a dummy
4987 reference number. */
4988
4989 if (lengthptr != NULL)
4990 {
4991 if (namelen == 0)
4992 {
4993 *errorcodeptr = ERR62;
4994 goto FAILED;
4995 }
4996 if (*ptr != terminator)
4997 {
4998 *errorcodeptr = ERR42;
4999 goto FAILED;
5000 }
5001 if (namelen > MAX_NAME_SIZE)
5002 {
5003 *errorcodeptr = ERR48;
5004 goto FAILED;
5005 }
5006 recno = 0;
5007 }
5008
5009 /* In the real compile, seek the name in the table. We check the name
5010 first, and then check that we have reached the end of the name in the
5011 table. That way, if the name is longer than any in the table,
5012 the comparison will fail without reading beyond the table entry. */
5013
5014 else
5015 {
5016 slot = cd->name_table;
5017 for (i = 0; i < cd->names_found; i++)
5018 {
5019 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
5020 slot[2+namelen] == 0)
5021 break;
5022 slot += cd->name_entry_size;
5023 }
5024
5025 if (i < cd->names_found) /* Back reference */
5026 {
5027 recno = GET2(slot, 0);
5028 }
5029 else if ((recno = /* Forward back reference */
5030 find_parens(cd, name, namelen,
5031 (options & PCRE_EXTENDED) != 0)) <= 0)
5032 {
5033 *errorcodeptr = ERR15;
5034 goto FAILED;
5035 }
5036 }
5037
5038 /* In both phases, we can now go to the code that handles numerical
5039 recursion or backreferences. */
5040
5041 if (is_recurse) goto HANDLE_RECURSION;
5042 else goto HANDLE_REFERENCE;
5043
5044
5045 /* ------------------------------------------------------------ */
5046 case CHAR_R: /* Recursion */
5047 ptr++; /* Same as (?0) */
5048 /* Fall through */
5049
5050
5051 /* ------------------------------------------------------------ */
5052 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
5053 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5054 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5055 {
5056 const uschar *called;
5057 terminator = CHAR_RIGHT_PARENTHESIS;
5058
5059 /* Come here from the \g<...> and \g'...' code (Oniguruma
5060 compatibility). However, the syntax has been checked to ensure that
5061 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5062 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5063 ever be taken. */
5064
5065 HANDLE_NUMERICAL_RECURSION:
5066
5067 if ((refsign = *ptr) == CHAR_PLUS)
5068 {
5069 ptr++;
5070 if ((digitab[*ptr] & ctype_digit) == 0)
5071 {
5072 *errorcodeptr = ERR63;
5073 goto FAILED;
5074 }
5075 }
5076 else if (refsign == CHAR_MINUS)
5077 {
5078 if ((digitab[ptr[1]] & ctype_digit) == 0)
5079 goto OTHER_CHAR_AFTER_QUERY;
5080 ptr++;
5081 }
5082
5083 recno = 0;
5084 while((digitab[*ptr] & ctype_digit) != 0)
5085 recno = recno * 10 + *ptr++ - CHAR_0;
5086
5087 if (*ptr != terminator)
5088 {
5089 *errorcodeptr = ERR29;
5090 goto FAILED;
5091 }
5092
5093 if (refsign == CHAR_MINUS)
5094 {
5095 if (recno == 0)
5096 {
5097 *errorcodeptr = ERR58;
5098 goto FAILED;
5099 }
5100 recno = cd->bracount - recno + 1;
5101 if (recno <= 0)
5102 {
5103 *errorcodeptr = ERR15;
5104 goto FAILED;
5105 }
5106 }
5107 else if (refsign == CHAR_PLUS)
5108 {
5109 if (recno == 0)
5110 {
5111 *errorcodeptr = ERR58;
5112 goto FAILED;
5113 }
5114 recno += cd->bracount;
5115 }
5116
5117 /* Come here from code above that handles a named recursion */
5118
5119 HANDLE_RECURSION:
5120
5121 previous = code;
5122 called = cd->start_code;
5123
5124 /* When we are actually compiling, find the bracket that is being
5125 referenced. Temporarily end the regex in case it doesn't exist before
5126 this point. If we end up with a forward reference, first check that
5127 the bracket does occur later so we can give the error (and position)
5128 now. Then remember this forward reference in the workspace so it can
5129 be filled in at the end. */
5130
5131 if (lengthptr == NULL)
5132 {
5133 *code = OP_END;
5134 if (recno != 0)
5135 called = _pcre_find_bracket(cd->start_code, utf8, recno);
5136
5137 /* Forward reference */
5138
5139 if (called == NULL)
5140 {
5141 if (find_parens(cd, NULL, recno,
5142 (options & PCRE_EXTENDED) != 0) < 0)
5143 {
5144 *errorcodeptr = ERR15;
5145 goto FAILED;
5146 }
5147 called = cd->start_code + recno;
5148 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5149 }
5150
5151 /* If not a forward reference, and the subpattern is still open,
5152 this is a recursive call. We check to see if this is a left
5153 recursion that could loop for ever, and diagnose that case. */
5154
5155 else if (GET(called, 1) == 0 &&
5156 could_be_empty(called, code, bcptr, utf8))
5157 {
5158 *errorcodeptr = ERR40;
5159 goto FAILED;
5160 }
5161 }
5162
5163 /* Insert the recursion/subroutine item, automatically wrapped inside
5164 "once" brackets. Set up a "previous group" length so that a
5165 subsequent quantifier will work. */
5166
5167 *code = OP_ONCE;
5168 PUT(code, 1, 2 + 2*LINK_SIZE);
5169 code += 1 + LINK_SIZE;
5170
5171 *code = OP_RECURSE;
5172 PUT(code, 1, called - cd->start_code);
5173 code += 1 + LINK_SIZE;
5174
5175 *code = OP_KET;
5176 PUT(code, 1, 2 + 2*LINK_SIZE);
5177 code += 1 + LINK_SIZE;
5178
5179 length_prevgroup = 3 + 3*LINK_SIZE;
5180 }
5181
5182 /* Can't determine a first byte now */
5183
5184 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5185 continue;
5186
5187
5188 /* ------------------------------------------------------------ */
5189 default: /* Other characters: check option setting */
5190 OTHER_CHAR_AFTER_QUERY:
5191 set = unset = 0;
5192 optset = &set;
5193
5194 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5195 {
5196 switch (*ptr++)
5197 {
5198 case CHAR_MINUS: optset = &unset; break;
5199
5200 case CHAR_J: /* Record that it changed in the external options */
5201 *optset |= PCRE_DUPNAMES;
5202 cd->external_flags |= PCRE_JCHANGED;
5203 break;
5204
5205 case CHAR_i: *optset |= PCRE_CASELESS; break;
5206 case CHAR_m: *optset |= PCRE_MULTILINE; break;
5207 case CHAR_s: *optset |= PCRE_DOTALL; break;
5208 case CHAR_x: *optset |= PCRE_EXTENDED; break;
5209 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5210 case CHAR_X: *optset |= PCRE_EXTRA; break;
5211
5212 default: *errorcodeptr = ERR12;
5213 ptr--; /* Correct the offset */
5214 goto FAILED;
5215 }
5216 }
5217
5218 /* Set up the changed option bits, but don't change anything yet. */
5219
5220 newoptions = (options | set) & (~unset);
5221
5222 /* If the options ended with ')' this is not the start of a nested
5223 group with option changes, so the options change at this level. If this
5224 item is right at the start of the pattern, the options can be
5225 abstracted and made external in the pre-compile phase, and ignored in
5226 the compile phase. This can be helpful when matching -- for instance in
5227 caseless checking of required bytes.
5228
5229 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5230 definitely *not* at the start of the pattern because something has been
5231 compiled. In the pre-compile phase, however, the code pointer can have
5232 that value after the start, because it gets reset as code is discarded
5233 during the pre-compile. However, this can happen only at top level - if
5234 we are within parentheses, the starting BRA will still be present. At
5235 any parenthesis level, the length value can be used to test if anything
5236 has been compiled at that level. Thus, a test for both these conditions
5237 is necessary to ensure we correctly detect the start of the pattern in
5238 both phases.
5239
5240 If we are not at the pattern start, compile code to change the ims
5241 options if this setting actually changes any of them, and reset the
5242 greedy defaults and the case value for firstbyte and reqbyte. */
5243
5244 if (*ptr == CHAR_RIGHT_PARENTHESIS)
5245 {
5246 if (code == cd->start_code + 1 + LINK_SIZE &&
5247 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5248 {
5249 cd->external_options = newoptions;
5250 }
5251 else
5252 {
5253 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5254 {
5255 *code++ = OP_OPT;
5256 *code++ = newoptions & PCRE_IMS;
5257 }
5258 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5259 greedy_non_default = greedy_default ^ 1;
5260 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5261 }
5262
5263 /* Change options at this level, and pass them back for use
5264 in subsequent branches. When not at the start of the pattern, this
5265 information is also necessary so that a resetting item can be
5266 compiled at the end of a group (if we are in a group). */
5267
5268 *optionsptr = options = newoptions;
5269 previous = NULL; /* This item can't be repeated */
5270 continue; /* It is complete */
5271 }
5272
5273 /* If the options ended with ':' we are heading into a nested group
5274 with possible change of options. Such groups are non-capturing and are
5275 not assertions of any kind. All we need to do is skip over the ':';
5276 the newoptions value is handled below. */
5277
5278 bravalue = OP_BRA;
5279 ptr++;
5280 } /* End of switch for character following (? */
5281 } /* End of (? handling */
5282
5283 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5284 all unadorned brackets become non-capturing and behave like (?:...)
5285 brackets. */
5286
5287 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5288 {
5289 bravalue = OP_BRA;
5290 }
5291
5292 /* Else we have a capturing group. */
5293
5294 else
5295 {
5296 NUMBERED_GROUP:
5297 cd->bracount += 1;
5298 PUT2(code, 1+LINK_SIZE, cd->bracount);
5299 skipbytes = 2;
5300 }
5301
5302 /* Process nested bracketed regex. Assertions may not be repeated, but
5303 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5304 non-register variable in order to be able to pass its address because some
5305 compilers complain otherwise. Pass in a new setting for the ims options if
5306 they have changed. */
5307
5308 previous = (bravalue >= OP_ONCE)? code : NULL;
5309 *code = bravalue;
5310 tempcode = code;
5311 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5312 length_prevgroup = 0; /* Initialize for pre-compile phase */
5313
5314 if (!compile_regex(
5315 newoptions, /* The complete new option state */
5316 options & PCRE_IMS, /* The previous ims option state */
5317 &tempcode, /* Where to put code (updated) */
5318 &ptr, /* Input pointer (updated) */
5319 errorcodeptr, /* Where to put an error message */
5320 (bravalue == OP_ASSERTBACK ||
5321 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5322 reset_bracount, /* True if (?| group */
5323 skipbytes, /* Skip over bracket number */
5324 &subfirstbyte, /* For possible first char */
5325 &subreqbyte, /* For possible last char */
5326 bcptr, /* Current branch chain */
5327 cd, /* Tables block */
5328 (lengthptr == NULL)? NULL : /* Actual compile phase */
5329 &length_prevgroup /* Pre-compile phase */
5330 ))
5331 goto FAILED;
5332
5333 /* At the end of compiling, code is still pointing to the start of the
5334 group, while tempcode has been updated to point past the end of the group
5335 and any option resetting that may follow it. The pattern pointer (ptr)
5336 is on the bracket. */
5337
5338 /* If this is a conditional bracket, check that there are no more than
5339 two branches in the group, or just one if it's a DEFINE group. We do this
5340 in the real compile phase, not in the pre-pass, where the whole group may
5341 not be available. */
5342
5343 if (bravalue == OP_COND && lengthptr == NULL)
5344 {
5345 uschar *tc = code;
5346 int condcount = 0;
5347
5348 do {
5349 condcount++;
5350 tc += GET(tc,1);
5351 }
5352 while (*tc != OP_KET);
5353
5354 /* A DEFINE group is never obeyed inline (the "condition" is always
5355 false). It must have only one branch. */
5356
5357 if (code[LINK_SIZE+1] == OP_DEF)
5358 {
5359 if (condcount > 1)
5360 {
5361 *errorcodeptr = ERR54;
5362 goto FAILED;
5363 }
5364 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5365 }
5366
5367 /* A "normal" conditional group. If there is just one branch, we must not
5368 make use of its firstbyte or reqbyte, because this is equivalent to an
5369 empty second branch. */
5370
5371 else
5372 {
5373 if (condcount > 2)
5374 {
5375 *errorcodeptr = ERR27;
5376 goto FAILED;
5377 }
5378 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5379 }
5380 }
5381
5382 /* Error if hit end of pattern */
5383
5384 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5385 {
5386 *errorcodeptr = ERR14;
5387 goto FAILED;
5388 }
5389
5390 /* In the pre-compile phase, update the length by the length of the group,
5391 less the brackets at either end. Then reduce the compiled code to just a
5392 set of non-capturing brackets so that it doesn't use much memory if it is
5393 duplicated by a quantifier.*/
5394
5395 if (lengthptr != NULL)
5396 {
5397 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5398 {
5399 *errorcodeptr = ERR20;
5400 goto FAILED;
5401 }
5402 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5403 *code++ = OP_BRA;
5404 PUTINC(code, 0, 1 + LINK_SIZE);
5405 *code++ = OP_KET;
5406 PUTINC(code, 0, 1 + LINK_SIZE);
5407 break; /* No need to waste time with special character handling */
5408 }
5409
5410 /* Otherwise update the main code pointer to the end of the group. */
5411
5412 code = tempcode;
5413
5414 /* For a DEFINE group, required and first character settings are not
5415 relevant. */
5416
5417 if (bravalue == OP_DEF) break;
5418
5419 /* Handle updating of the required and first characters for other types of
5420 group. Update for normal brackets of all kinds, and conditions with two
5421 branches (see code above). If the bracket is followed by a quantifier with
5422 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5423 zerofirstbyte outside the main loop so that they can be accessed for the
5424 back off. */
5425
5426 zeroreqbyte = reqbyte;
5427 zerofirstbyte = firstbyte;
5428 groupsetfirstbyte = FALSE;
5429
5430 if (bravalue >= OP_ONCE)
5431 {
5432 /* If we have not yet set a firstbyte in this branch, take it from the
5433 subpattern, remembering that it was set here so that a repeat of more
5434 than one can replicate it as reqbyte if necessary. If the subpattern has
5435 no firstbyte, set "none" for the whole branch. In both cases, a zero
5436 repeat forces firstbyte to "none". */
5437
5438 if (firstbyte == REQ_UNSET)
5439 {
5440 if (subfirstbyte >= 0)
5441 {
5442 firstbyte = subfirstbyte;
5443 groupsetfirstbyte = TRUE;
5444 }
5445 else firstbyte = REQ_NONE;
5446 zerofirstbyte = REQ_NONE;
5447 }
5448
5449 /* If firstbyte was previously set, convert the subpattern's firstbyte
5450 into reqbyte if there wasn't one, using the vary flag that was in
5451 existence beforehand. */
5452
5453 else if (subfirstbyte >= 0 && subreqbyte < 0)
5454 subreqbyte = subfirstbyte | tempreqvary;
5455
5456 /* If the subpattern set a required byte (or set a first byte that isn't
5457 really the first byte - see above), set it. */
5458
5459 if (subreqbyte >= 0) reqbyte = subreqbyte;
5460 }
5461
5462 /* For a forward assertion, we take the reqbyte, if set. This can be
5463 helpful if the pattern that follows the assertion doesn't set a different
5464 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5465 for an assertion, however because it leads to incorrect effect for patterns
5466 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5467 of a firstbyte. This is overcome by a scan at the end if there's no
5468 firstbyte, looking for an asserted first char. */
5469
5470 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5471 break; /* End of processing '(' */
5472
5473
5474 /* ===================================================================*/
5475 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5476 are arranged to be the negation of the corresponding OP_values. For the
5477 back references, the values are ESC_REF plus the reference number. Only
5478 back references and those types that consume a character may be repeated.
5479 We can test for values between ESC_b and ESC_Z for the latter; this may
5480 have to change if any new ones are ever created. */
5481
5482 case CHAR_BACKSLASH:
5483 tempptr = ptr;
5484 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5485 if (*errorcodeptr != 0) goto FAILED;
5486
5487 if (c < 0)
5488 {
5489 if (-c == ESC_Q) /* Handle start of quoted string */
5490 {
5491 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5492 ptr += 2; /* avoid empty string */
5493 else inescq = TRUE;
5494 continue;
5495 }
5496
5497 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5498
5499 /* For metasequences that actually match a character, we disable the
5500 setting of a first character if it hasn't already been set. */
5501
5502 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5503 firstbyte = REQ_NONE;
5504
5505 /* Set values to reset to if this is followed by a zero repeat. */
5506
5507 zerofirstbyte = firstbyte;
5508 zeroreqbyte = reqbyte;
5509
5510 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5511 is a subroutine call by number (Oniguruma syntax). In fact, the value
5512 -ESC_g is returned only for these cases. So we don't need to check for <
5513 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5514 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5515 that is a synonym for a named back reference). */
5516
5517 if (-c == ESC_g)
5518 {
5519 const uschar *p;
5520 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5521 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5522 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5523
5524 /* These two statements stop the compiler from warning about possibly
5525 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5526 fact, because we actually check for a number below, the paths that
5527 would actually be in error are never taken. */
5528
5529 skipbytes = 0;
5530 reset_bracount = FALSE;
5531
5532 /* Test for a name */
5533
5534 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5535 {
5536 BOOL isnumber = TRUE;
5537 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5538 {
5539 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5540 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5541 }
5542 if (*p != terminator)
5543 {
5544 *errorcodeptr = ERR57;
5545 break;
5546 }
5547 if (isnumber)
5548 {
5549 ptr++;
5550 goto HANDLE_NUMERICAL_RECURSION;
5551 }
5552 is_recurse = TRUE;
5553 goto NAMED_REF_OR_RECURSE;
5554 }
5555
5556 /* Test a signed number in angle brackets or quotes. */
5557
5558 p = ptr + 2;
5559 while ((digitab[*p] & ctype_digit) != 0) p++;
5560 if (*p != terminator)
5561 {
5562 *errorcodeptr = ERR57;
5563 break;
5564 }
5565 ptr++;
5566 goto HANDLE_NUMERICAL_RECURSION;
5567 }
5568
5569 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5570 We also support \k{name} (.NET syntax) */
5571
5572 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5573 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5574 {
5575 is_recurse = FALSE;
5576 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5577 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5578 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5579 goto NAMED_REF_OR_RECURSE;
5580 }
5581
5582 /* Back references are handled specially; must disable firstbyte if
5583 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5584 ':' later. */
5585
5586 if (-c >= ESC_REF)
5587 {
5588 recno = -c - ESC_REF;
5589
5590 HANDLE_REFERENCE: /* Come here from named backref handling */
5591 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5592 previous = code;
5593 *code++ = OP_REF;
5594 PUT2INC(code, 0, recno);
5595 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5596 if (recno > cd->top_backref) cd->top_backref = recno;
5597 }
5598
5599 /* So are Unicode property matches, if supported. */
5600
5601 #ifdef SUPPORT_UCP
5602 else if (-c == ESC_P || -c == ESC_p)
5603 {
5604 BOOL negated;
5605 int pdata;
5606 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5607 if (ptype < 0) goto FAILED;
5608 previous = code;
5609 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5610 *code++ = ptype;
5611 *code++ = pdata;
5612 }
5613 #else
5614
5615 /* If Unicode properties are not supported, \X, \P, and \p are not
5616 allowed. */
5617
5618 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5619 {
5620 *errorcodeptr = ERR45;
5621 goto FAILED;
5622 }
5623 #endif
5624
5625 /* For the rest (including \X when Unicode properties are supported), we
5626 can obtain the OP value by negating the escape value. */
5627
5628 else
5629 {
5630 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5631 *code++ = -c;
5632 }
5633 continue;
5634 }
5635
5636 /* We have a data character whose value is in c. In UTF-8 mode it may have
5637 a value > 127. We set its representation in the length/buffer, and then
5638 handle it as a data character. */
5639
5640 #ifdef SUPPORT_UTF8
5641 if (utf8 && c > 127)
5642 mclength = _pcre_ord2utf8(c, mcbuffer);
5643 else
5644 #endif
5645
5646 {
5647 mcbuffer[0] = c;
5648 mclength = 1;
5649 }
5650 goto ONE_CHAR;
5651
5652
5653 /* ===================================================================*/
5654 /* Handle a literal character. It is guaranteed not to be whitespace or #
5655 when the extended flag is set. If we are in UTF-8 mode, it may be a
5656 multi-byte literal character. */
5657
5658 default:
5659 NORMAL_CHAR:
5660 mclength = 1;
5661 mcbuffer[0] = c;
5662
5663 #ifdef SUPPORT_UTF8
5664 if (utf8 && c >= 0xc0)
5665 {
5666 while ((ptr[1] & 0xc0) == 0x80)
5667 mcbuffer[mclength++] = *(++ptr);
5668 }
5669 #endif
5670
5671 /* At this point we have the character's bytes in mcbuffer, and the length
5672 in mclength. When not in UTF-8 mode, the length is always 1. */
5673
5674 ONE_CHAR:
5675 previous = code;
5676 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5677 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5678
5679 /* Remember if \r or \n were seen */
5680
5681 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5682 cd->external_flags |= PCRE_HASCRORLF;
5683
5684 /* Set the first and required bytes appropriately. If no previous first
5685 byte, set it from this character, but revert to none on a zero repeat.
5686 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5687 repeat. */
5688
5689 if (firstbyte == REQ_UNSET)
5690 {
5691 zerofirstbyte = REQ_NONE;
5692 zeroreqbyte = reqbyte;
5693
5694 /* If the character is more than one byte long, we can set firstbyte
5695 only if it is not to be matched caselessly. */
5696
5697 if (mclength == 1 || req_caseopt == 0)
5698 {
5699 firstbyte = mcbuffer[0] | req_caseopt;
5700 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5701 }
5702 else firstbyte = reqbyte = REQ_NONE;
5703 }
5704
5705 /* firstbyte was previously set; we can set reqbyte only if the length is
5706 1 or the matching is caseful. */
5707
5708 else
5709 {
5710 zerofirstbyte = firstbyte;
5711 zeroreqbyte = reqbyte;
5712 if (mclength == 1 || req_caseopt == 0)
5713 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5714 }
5715
5716 break; /* End of literal character handling */
5717 }
5718 } /* end of big loop */
5719
5720
5721 /* Control never reaches here by falling through, only by a goto for all the
5722 error states. Pass back the position in the pattern so that it can be displayed
5723 to the user for diagnosing the error. */
5724
5725 FAILED:
5726 *ptrptr = ptr;
5727 return FALSE;
5728 }
5729
5730
5731
5732
5733 /*************************************************
5734 * Compile sequence of alternatives *
5735 *************************************************/
5736
5737 /* On entry, ptr is pointing past the bracket character, but on return it
5738 points to the closing bracket, or vertical bar, or end of string. The code
5739 variable is pointing at the byte into which the BRA operator has been stored.
5740 If the ims options are changed at the start (for a (?ims: group) or during any
5741 branch, we need to insert an OP_OPT item at the start of every following branch
5742 to ensure they get set correctly at run time, and also pass the new options
5743 into every subsequent branch compile.
5744
5745 This function is used during the pre-compile phase when we are trying to find
5746 out the amount of memory needed, as well as during the real compile phase. The
5747 value of lengthptr distinguishes the two phases.
5748
5749 Arguments:
5750 options option bits, including any changes for this subpattern
5751 oldims previous settings of ims option bits
5752 codeptr -> the address of the current code pointer (updated)
5753 ptrptr -> the address of the current pattern pointer (updated)
5754 errorcodeptr -> pointer to error code variable
5755 lookbehind TRUE if this is a lookbehind assertion
5756 reset_bracount TRUE to reset the count for each branch
5757 skipbytes skip this many bytes at start (for brackets and OP_COND)
5758 firstbyteptr place to put the first required character, or a negative number
5759 reqbyteptr place to put the last required character, or a negative number
5760 bcptr pointer to the chain of currently open branches
5761 cd points to the data block with tables pointers etc.
5762 lengthptr NULL during the real compile phase
5763 points to length accumulator during pre-compile phase
5764
5765 Returns: TRUE on success; FALSE if compile_branch() fails, if a lookbehind branch fails the fixed-length check, or if the accumulated length would exceed OFLOW_MAX
5766 */
5767
5768 static BOOL
5769 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5770 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5771 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5772 int *lengthptr)
5773 {
5774 const uschar *ptr = *ptrptr;
5775 uschar *code = *codeptr;
5776 uschar *last_branch = code; /* Start of the most recent branch: the bracket, then each OP_ALT */
5777 uschar *start_bracket = code; /* The opening bracket; the final OP_KET links back to it */
5778 uschar *reverse_count = NULL; /* Link field of the current branch's OP_REVERSE, if lookbehind */
5779 open_capitem capitem;
5780 int capnumber = 0; /* Stays zero unless the bracket is OP_CBRA */
5781 int firstbyte, reqbyte;
5782 int branchfirstbyte, branchreqbyte;
5783 int length;
5784 int orig_bracount; /* cd->bracount on entry; restored per branch for (?| */
5785 int max_bracount; /* Highest cd->bracount seen after any branch */
5786 branch_chain bc;
5787
5788 bc.outer = bcptr;
5789 bc.current = code;
5790
5791 firstbyte = reqbyte = REQ_UNSET;
5792
5793 /* Accumulate the length for use in the pre-compile phase. Start with the
5794 length of the BRA and KET and any extra bytes that are required at the
5795 beginning. We accumulate in a local variable to save frequent testing of
5796 lengthptr for NULL. We cannot do this by looking at the value of code at the
5797 start and end of each alternative, because compiled items are discarded during
5798 the pre-compile phase so that the work space is not exceeded. */
5799
5800 length = 2 + 2*LINK_SIZE + skipbytes;
5801
5802 /* WARNING: If the above line is changed for any reason, you must also change
5803 the code that abstracts option settings at the start of the pattern and makes
5804 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5805 pre-compile phase to find out whether anything has yet been compiled or not. */
5806
5807 /* If this is a capturing subpattern, add to the chain of open capturing items
5808 so that we can detect them if (*ACCEPT) is encountered. Note that capitem is a
5809 local variable, so the chain entry is valid only while this call is active. */
5810 if (*code == OP_CBRA)
5811 {
5812 capnumber = GET2(code, 1 + LINK_SIZE);
5813 capitem.number = capnumber;
5814 capitem.next = cd->open_caps;
5815 cd->open_caps = &capitem;
5816 }
5817
5818 /* Offset is set zero to mark that this bracket is still open */
5819
5820 PUT(code, 1, 0);
5821 code += 1 + LINK_SIZE + skipbytes;
5822
5823 /* Loop for each alternative branch */
5824
5825 orig_bracount = max_bracount = cd->bracount;
5826 for (;;)
5827 {
5828 /* For a (?| group, reset the capturing bracket count so that each branch
5829 uses the same numbers. */
5830
5831 if (reset_bracount) cd->bracount = orig_bracount;
5832
5833 /* Handle a change of ims options at the start of the branch */
5834
5835 if ((options & PCRE_IMS) != oldims)
5836 {
5837 *code++ = OP_OPT;
5838 *code++ = options & PCRE_IMS;
5839 length += 2;
5840 }
5841
5842 /* Set up dummy OP_REVERSE if lookbehind assertion */
5843
5844 if (lookbehind)
5845 {
5846 *code++ = OP_REVERSE;
5847 reverse_count = code;
5848 PUTINC(code, 0, 0);
5849 length += 1 + LINK_SIZE;
5850 }
5851
5852 /* Now compile the branch; in the pre-compile phase its length gets added
5853 into the length. */
5854
5855 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5856 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5857 {
5858 *ptrptr = ptr; /* Pass back the pattern position reached by the failed branch */
5859 return FALSE;
5860 }
5861
5862 /* Keep the highest bracket count in case (?| was used and some branch
5863 has fewer than the rest. */
5864
5865 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5866
5867 /* In the real compile phase, there is some post-processing to be done. */
5868
5869 if (lengthptr == NULL)
5870 {
5871 /* If this is the first branch, the firstbyte and reqbyte values for the
5872 branch become the values for the regex. */
5873
5874 if (*last_branch != OP_ALT)
5875 {
5876 firstbyte = branchfirstbyte;
5877 reqbyte = branchreqbyte;
5878 }
5879
5880 /* If this is not the first branch, the first char and reqbyte have to
5881 match the values from all the previous branches, except that if the
5882 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5883 and we set REQ_VARY for the regex. */
5884
5885 else
5886 {
5887 /* If we previously had a firstbyte, but it doesn't match the new branch,
5888 we have to abandon the firstbyte for the regex, but if there was
5889 previously no reqbyte, it takes on the value of the old firstbyte. */
5890
5891 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5892 {
5893 if (reqbyte < 0) reqbyte = firstbyte;
5894 firstbyte = REQ_NONE;
5895 }
5896
5897 /* If we (now or from before) have no firstbyte, a firstbyte from the
5898 branch becomes a reqbyte if there isn't a branch reqbyte. */
5899
5900 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5901 branchreqbyte = branchfirstbyte;
5902
5903 /* Now ensure that the reqbytes match */
5904
5905 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5906 reqbyte = REQ_NONE;
5907 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5908 }
5909
5910 /* If lookbehind, check that this branch matches a fixed-length string, and
5911 put the length into the OP_REVERSE item. Temporarily mark the end of the
5912 branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5913 because there may be forward references that we can't check here. Set a
5914 flag to cause another lookbehind check at the end. Why not do it all at the
5915 end? Because common, erroneous checks are picked up here and the offset of
5916 the problem can be shown. */
5917
5918 if (lookbehind)
5919 {
5920 int fixed_length;
5921 *code = OP_END;
5922 fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
5923 DPRINTF(("fixed length = %d\n", fixed_length));
5924 if (fixed_length == -3)
5925 {
5926 cd->check_lookbehind = TRUE;
5927 }
5928 else if (fixed_length < 0)
5929 {
5930 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25; /* -2 selects ERR36; any other negative value selects ERR25 */
5931 *ptrptr = ptr;
5932 return FALSE;
5933 }
5934 else { PUT(reverse_count, 0, fixed_length); }
5935 }
5936 }
5937
5938 /* Reached end of expression, either ')' or end of pattern. In the real
5939 compile phase, go back through the alternative branches and reverse the chain
5940 of offsets, with the field in the BRA item now becoming an offset to the
5941 first alternative. If there are no alternatives, it points to the end of the
5942 group. The length in the terminating ket is always the length of the whole
5943 bracketed item. If any of the ims options were changed inside the group,
5944 compile a resetting op-code following, except at the very end of the pattern.
5945 Return leaving the pointer at the terminating char. */
5946
5947 if (*ptr != CHAR_VERTICAL_LINE)
5948 {
5949 if (lengthptr == NULL)
5950 {
5951 int branch_length = code - last_branch; /* Length of the final branch */
5952 do
5953 {
5954 int prev_length = GET(last_branch, 1); /* Back-offset stored while open; zero at the opening bracket */
5955 PUT(last_branch, 1, branch_length); /* Replace it with the forward offset */
5956 branch_length = prev_length;
5957 last_branch -= branch_length; /* Step back to the start of the previous branch */
5958 }
5959 while (branch_length > 0); /* A zero back-offset means the opening bracket was just processed */
5960 }
5961
5962 /* If it was a capturing subpattern, remove it from the chain. */
5963
5964 if (capnumber > 0) cd->open_caps = cd->open_caps->next;
5965
5966 /* Fill in the ket */
5967
5968 *code = OP_KET;
5969 PUT(code, 1, code - start_bracket); /* Distance back to the opening bracket */
5970 code += 1 + LINK_SIZE;
5971
5972 /* Resetting option if needed */
5973
5974 if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
5975 {
5976 *code++ = OP_OPT;
5977 *code++ = oldims;
5978 length += 2;
5979 }
5980
5981 /* Retain the highest bracket number, in case resetting was used. */
5982
5983 cd->bracount = max_bracount;
5984
5985 /* Set values to pass back. These are written before the overflow test
5986 below, so they are also set when that test fails. */
5987 *codeptr = code;
5988 *ptrptr = ptr;
5989 *firstbyteptr = firstbyte;
5990 *reqbyteptr = reqbyte;
5991 if (lengthptr != NULL)
5992 {
5993 if (OFLOW_MAX - *lengthptr < length) /* Test before adding, so the sum cannot exceed OFLOW_MAX */
5994 {
5995 *errorcodeptr = ERR20;
5996 return FALSE;
5997 }
5998 *lengthptr += length;
5999 }
6000 return TRUE;
6001 }
6002
6003 /* Another branch follows. In the pre-compile phase, we can move the code
6004 pointer back to where it was for the start of the first branch. (That is,
6005 pretend that each branch is the only one.)
6006
6007 In the real compile phase, insert an ALT node. Its length field points back
6008 to the previous branch while the bracket remains open. At the end the chain
6009 is reversed. It's done like this so that the start of the bracket has a
6010 zero offset until it is closed, making it possible to detect recursion. */
6011
6012 if (lengthptr != NULL)
6013 {
6014 code = *codeptr + 1 + LINK_SIZE + skipbytes;
6015 length += 1 + LINK_SIZE; /* Still count the ALT item that the real compile will emit */
6016 }
6017 else
6018 {
6019 *code = OP_ALT;
6020 PUT(code, 1, code - last_branch); /* Temporary back-link to the previous branch start */
6021 bc.current = last_branch = code;
6022 code += 1 + LINK_SIZE;
6023 }
6024
6025 ptr++; /* Move past the vertical bar */
6026 }
6027 /* Control never reaches here */
6028 }
6029
6030
6031
6032
6033 /*************************************************
6034 * Check for anchored expression *
6035 *************************************************/
6036
6037 /* Try to find out if this is an anchored regular expression. Consider each
6038 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6039 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6040 it's anchored. However, if this is a multiline pattern, then only OP_SOD
6041 counts, since OP_CIRC can match in the middle.
6042
6043 We can also consider a regex to be anchored if OP_SOM starts all its branches.
6044 This is the code for \G, which means "match at start of match position, taking
6045 into account the match offset".
6046
6047 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6048 because that will try the rest of the pattern at all possible matching points,
6049 so there is no point trying again.... er ....
6050
6051 .... except when the .* appears inside capturing parentheses, and there is a
6052 subsequent back reference to those parentheses. We haven't enough information
6053 to catch that case precisely.
6054
6055 At first, the best we could do was to detect when .* was in capturing brackets
6056 and the highest back reference was greater than or equal to that level.
6057 However, by keeping a bitmap of the first 31 back references, we can catch some
6058 of the more common cases more precisely.
6059
6060 Arguments:
6061 code points to start of expression (the bracket)
6062 options points to the options setting
6063 bracket_map a bitmap of which brackets we are inside while testing; this
6064 handles up to substring 31; after that we just have to take
6065 the less precise approach
6066 backref_map the back reference bitmap
6067
6068 Returns: TRUE or FALSE
6069 */
6070
6071 static BOOL
6072 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
6073 unsigned int backref_map)
6074 {
6075 do {
6076 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6077 options, PCRE_MULTILINE, FALSE);
6078 register int op = *scode;
6079
6080 /* Non-capturing brackets */
6081
6082 if (op == OP_BRA)
6083 {
6084 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6085 }
6086
6087 /* Capturing brackets */
6088
6089 else if (op == OP_CBRA)
6090 {
6091 int n = GET2(scode, 1+LINK_SIZE);
6092 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6093 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
6094 }
6095
6096 /* Other brackets */
6097
6098 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6099 {
6100 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6101 }
6102
6103 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6104 it isn't in brackets that are or may be referenced. */
6105
6106 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6107 op == OP_TYPEPOSSTAR))
6108 {
6109 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6110 return FALSE;
6111 }
6112
6113 /* Check for explicit anchoring */
6114
6115 else if (op != OP_SOD && op != OP_SOM &&
6116 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6117 return FALSE;
6118 code += GET(code, 1);
6119 }
6120 while (*code == OP_ALT); /* Loop for each alternative */
6121 return TRUE;
6122 }
6123
6124
6125
6126 /*************************************************
6127 * Check for starting with ^ or .* *
6128 *************************************************/
6129
6130 /* This is called to find out if every branch starts with ^ or .* so that
6131 "first char" processing can be done to speed things up in multiline
6132 matching and for non-DOTALL patterns that start with .* (which must start at
6133 the beginning or after \n). As in the case of is_anchored() (see above), we
6134 have to take account of back references to capturing brackets that contain .*
6135 because in that case we can't make the assumption.
6136
6137 Arguments:
6138 code points to start of expression (the bracket)
6139 bracket_map a bitmap of which brackets we are inside while testing; this
6140 handles up to substring 31; after that we just have to take
6141 the less precise approach
6142 backref_map the back reference bitmap
6143
6144 Returns: TRUE or FALSE
6145 */
6146
6147 static BOOL
6148 is_startline(const uschar *code, unsigned int bracket_map,
6149 unsigned int backref_map)
6150 {
6151 do {
6152 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6153 NULL, 0, FALSE);
6154 register int op = *scode;
6155
6156 /* If we are at the start of a conditional assertion group, *both* the
6157 conditional assertion *and* what follows the condition must satisfy the test
6158 for start of line. Other kinds of condition fail. Note that there may be an
6159 auto-callout at the start of a condition. */
6160
6161 if (op == OP_COND)
6162 {
6163 scode += 1 + LINK_SIZE;
6164 if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6165 switch (*scode)
6166 {
6167 case OP_CREF:
6168 case OP_NCREF:
6169 case OP_RREF:
6170 case OP_NRREF:
6171 case OP_DEF:
6172 return FALSE;
6173
6174 default: /* Assertion */
6175 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6176 do scode += GET(scode, 1); while (*scode == OP_ALT);
6177 scode += 1 + LINK_SIZE;
6178 break;
6179 }
6180 scode = first_significant_code(scode, NULL, 0, FALSE);
6181 op = *scode;
6182 }
6183
6184 /* Non-capturing brackets */
6185
6186 if (op == OP_BRA)
6187 {
6188 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6189 }
6190
6191 /* Capturing brackets */
6192
6193 else if (op == OP_CBRA)
6194 {
6195 int n = GET2(scode, 1+LINK_SIZE);
6196 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6197 if (!is_startline(scode, new_map, backref_map)) return FALSE;
6198 }
6199
6200 /* Other brackets */
6201
6202 else if (op == OP_ASSERT || op == OP_ONCE)
6203 {
6204 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6205 }
6206
6207 /* .* means "start at start or after \n" if it isn't in brackets that
6208 may be referenced. */
6209
6210 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6211 {
6212 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6213 }
6214
6215 /* Check for explicit circumflex */
6216
6217 else if (op != OP_CIRC) return FALSE;
6218
6219 /* Move on to the next alternative */
6220
6221 code += GET(code, 1);
6222 }
6223 while (*code == OP_ALT); /* Loop for each alternative */
6224 return TRUE;
6225 }
6226
6227
6228
6229 /*************************************************
6230 * Check for asserted fixed first char *
6231 *************************************************/
6232
6233 /* During compilation, the "first char" settings from forward assertions are
6234 discarded, because they can cause conflicts with actual literals that follow.
6235 However, if we end up without a first char setting for an unanchored pattern,
6236 it is worth scanning the regex to see if there is an initial asserted first
6237 char. If all branches start with the same asserted char, or with a bracket all
6238 of whose alternatives start with the same asserted char (recurse ad lib), then
6239 we return that char, otherwise -1.
6240
6241 Arguments:
6242 code points to start of expression (the bracket)
6243 options pointer to the options (used to check casing changes)
6244 inassert TRUE if in an assertion
6245
6246 Returns: -1 or the fixed first char
6247 */
6248
6249 static int
6250 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6251 {
6252 register int c = -1;
6253 do {
6254 int d;
6255 const uschar *scode =
6256 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6257 register int op = *scode;
6258
6259 switch(op)
6260 {
6261 default:
6262 return -1;
6263
6264 case OP_BRA:
6265 case OP_CBRA:
6266 case OP_ASSERT:
6267 case OP_ONCE:
6268 case OP_COND:
6269 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6270 return -1;
6271 if (c < 0) c = d; else if (c != d) return -1;
6272 break;
6273
6274 case OP_EXACT: /* Fall through */
6275 scode += 2;
6276
6277 case OP_CHAR:
6278 case OP_CHARNC:
6279 case OP_PLUS:
6280 case OP_MINPLUS:
6281 case OP_POSPLUS:
6282 if (!inassert) return -1;
6283 if (c < 0)
6284 {
6285 c = scode[1];
6286 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6287 }
6288 else if (c != scode[1]) return -1;
6289 break;
6290 }
6291
6292 code += GET(code, 1);
6293 }
6294 while (*code == OP_ALT);
6295 return c;
6296 }
6297
6298
6299
6300 /*************************************************
6301 * Compile a Regular Expression *
6302 *************************************************/
6303
6304 /* This function takes a string and returns a pointer to a block of store
6305 holding a compiled version of the expression. The original API for this
6306 function had no error code return variable; it is retained for backwards
6307 compatibility. The new function is given a new name.
6308
6309 Arguments:
6310 pattern the regular expression
6311 options various option bits
6312 errorcodeptr pointer to error code variable (pcre_compile2() only)
6313 can be NULL if you don't want a code value
6314 errorptr pointer to pointer to error text
6315 erroroffset ptr offset in pattern where error was detected
6316 tables pointer to character tables or NULL
6317
6318 Returns: pointer to compiled data block, or NULL on error,
6319 with errorptr and erroroffset set
6320 */
6321
6322 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6323 pcre_compile(const char *pattern, int options, const char **errorptr,
6324 int *erroroffset, const unsigned char *tables)
6325 {
6326 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6327 }
6328
6329
6330 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6331 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6332 const char **errorptr, int *erroroffset, const unsigned char *tables)
6333 {
6334 real_pcre *re;
6335 int length = 1; /* For final END opcode */
6336 int firstbyte, reqbyte, newline;
6337 int errorcode = 0;
6338 int skipatstart = 0;
6339 BOOL utf8 = (options & PCRE_UTF8) != 0;
6340 size_t size;
6341 uschar *code;
6342 const uschar *codestart;
6343 const uschar *ptr;
6344 compile_data compile_block;
6345 compile_data *cd = &compile_block;
6346
6347 /* This space is used for "compiling" into during the first phase, when we are
6348 computing the amount of memory that is needed. Compiled items are thrown away
6349 as soon as possible, so that a fairly large buffer should be sufficient for
6350 this purpose. The same space is used in the second phase for remembering where
6351 to fill in forward references to subpatterns. */
6352
6353 uschar cworkspace[COMPILE_WORK_SIZE];
6354
6355 /* Set this early so that early errors get offset 0. */
6356
6357 ptr = (const uschar *)pattern;
6358
6359 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6360 can do is just return NULL, but we can set a code value if there is a code
6361 pointer. */
6362
6363 if (errorptr == NULL)
6364 {
6365 if (errorcodeptr != NULL) *errorcodeptr = 99;
6366 return NULL;
6367 }
6368
6369 *errorptr = NULL;
6370 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6371
6372 /* However, we can give a message for this error */
6373
6374 if (erroroffset == NULL)
6375 {
6376 errorcode = ERR16;
6377 goto PCRE_EARLY_ERROR_RETURN2;
6378 }
6379
6380 *erroroffset = 0;
6381
6382 /* Set up pointers to the individual character tables */
6383
6384 if (tables == NULL) tables = _pcre_default_tables;
6385 cd->lcc = tables + lcc_offset;
6386 cd->fcc = tables + fcc_offset;
6387 cd->cbits = tables + cbits_offset;
6388 cd->ctypes = tables + ctypes_offset;
6389
6390 /* Check that all undefined public option bits are zero */
6391
6392 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6393 {
6394 errorcode = ERR17;
6395 goto PCRE_EARLY_ERROR_RETURN;
6396 }
6397
6398 /* Check for global one-time settings at the start of the pattern, and remember
6399 the offset for later. */
6400
6401 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6402 ptr[skipatstart+1] == CHAR_ASTERISK)
6403 {
6404 int newnl = 0;
6405 int newbsr = 0;
6406
6407 if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6408 { skipatstart += 7; options |= PCRE_UTF8; continue; }
6409
6410 if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6411 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6412 else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0)
6413 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6414 else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0)
6415 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6416 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6417 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6418 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6419 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6420
6421 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6422 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6423 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6424 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6425
6426 if (newnl != 0)
6427 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6428 else if (newbsr != 0)
6429 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6430 else break;
6431 }
6432
6433 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6434
6435 #ifdef SUPPORT_UTF8
6436 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6437 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6438 {
6439 errorcode = ERR44;
6440 goto PCRE_EARLY_ERROR_RETURN2;
6441 }
6442 #else
6443 if (utf8)
6444 {
6445 errorcode = ERR32;
6446 goto PCRE_EARLY_ERROR_RETURN;
6447 }
6448 #endif
6449
6450 /* Check validity of \R options. */
6451
6452 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6453 {
6454 case 0:
6455 case PCRE_BSR_ANYCRLF:
6456 case PCRE_BSR_UNICODE:
6457 break;
6458 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6459 }
6460
6461 /* Handle different types of newline. The three bits give seven cases. The
6462 current code allows for fixed one- or two-byte sequences, plus "any" and
6463 "anycrlf". */
6464
6465 switch (options & PCRE_NEWLINE_BITS)
6466 {
6467 case 0: newline = NEWLINE; break; /* Build-time default */
6468 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6469 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6470 case PCRE_NEWLINE_CR+
6471 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6472 case PCRE_NEWLINE_ANY: newline = -1; break;
6473 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6474 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6475 }
6476
6477 if (newline == -2)
6478 {
6479 cd->nltype = NLTYPE_ANYCRLF;
6480 }
6481 else if (newline < 0)
6482 {
6483 cd->nltype = NLTYPE_ANY;
6484 }
6485 else
6486 {
6487 cd->nltype = NLTYPE_FIXED;
6488 if (newline > 255)
6489 {
6490 cd->nllen = 2;
6491 cd->nl[0] = (newline >> 8) & 255;
6492 cd->nl[1] = newline & 255;
6493 }
6494 else
6495 {
6496 cd->nllen = 1;
6497 cd->nl[0] = newline;
6498 }
6499 }
6500
6501 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6502 references to help in deciding whether (.*) can be treated as anchored or not.
6503 */
6504
6505 cd->top_backref = 0;
6506 cd->backref_map = 0;
6507
6508 /* Reflect pattern for debugging output */
6509
6510 DPRINTF(("------------------------------------------------------------------\n"));
6511 DPRINTF(("%s\n", pattern));
6512
6513 /* Pretend to compile the pattern while actually just accumulating the length
6514 of memory required. This behaviour is triggered by passing a non-NULL final
6515 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6516 to compile parts of the pattern into; the compiled code is discarded when it is
6517 no longer needed, so hopefully this workspace will never overflow, though there
6518 is a test for its doing so. */
6519
6520 cd->bracount = cd->final_bracount = 0;
6521 cd->names_found = 0;
6522 cd->name_entry_size = 0;
6523 cd->name_table = NULL;
6524 cd->start_workspace = cworkspace;
6525 cd->start_code = cworkspace;
6526 cd->hwm = cworkspace;
6527 cd->start_pattern = (const uschar *)pattern;
6528 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6529 cd->req_varyopt = 0;
6530 cd->external_options = options;
6531 cd->external_flags = 0;
6532 cd->open_caps = NULL;
6533
6534 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6535 don't need to look at the result of the function here. The initial options have
6536 been put into the cd block so that they can be changed if an option setting is
6537 found within the regex right at the beginning. Bringing initial option settings
6538 outside can help speed up starting point checks. */
6539
6540 ptr += skipatstart;
6541 code = cworkspace;
6542 *code = OP_BRA;
6543 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6544 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6545 &length);
6546 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6547
6548 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6549 cd->hwm - cworkspace));
6550
6551 if (length > MAX_PATTERN_SIZE)
6552 {
6553 errorcode = ERR20;
6554 goto PCRE_EARLY_ERROR_RETURN;
6555 }
6556
6557 /* Compute the size of data block needed and get it, either from malloc or
6558 externally provided function. Integer overflow should no longer be possible
6559 because nowadays we limit the maximum value of cd->names_found and
6560 cd->name_entry_size. */
6561
6562 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6563 re = (real_pcre *)(pcre_malloc)(size);
6564
6565 if (re == NULL)
6566 {
6567 errorcode = ERR21;
6568 goto PCRE_EARLY_ERROR_RETURN;
6569 }
6570
6571 /* Put in the magic number, and save the sizes, initial options, internal
6572 flags, and character table pointer. NULL is used for the default character
6573 tables. The nullpad field is at the end; it's there to help in the case when a
6574 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6575 pointers. */
6576
6577 re->magic_number = MAGIC_NUMBER;
6578 re->size = size;
6579 re->options = cd->external_options;
6580 re->flags = cd->external_flags;
6581 re->dummy1 = 0;
6582 re->first_byte = 0;
6583 re->req_byte = 0;
6584 re->name_table_offset = sizeof(real_pcre);
6585 re->name_entry_size = cd->name_entry_size;
6586 re->name_count = cd->names_found;
6587 re->ref_count = 0;
6588 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6589 re->nullpad = NULL;
6590
6591 /* The starting points of the name/number translation table and of the code are
6592 passed around in the compile data block. The start/end pattern and initial
6593 options are already set from the pre-compile phase, as is the name_entry_size
6594 field. Reset the bracket count and the names_found field. Also reset the hwm
6595 field; this time it's used for remembering forward references to subpatterns.
6596 */
6597
6598 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6599 cd->bracount = 0;
6600 cd->names_found = 0;
6601 cd->name_table = (uschar *)re + re->name_table_offset;
6602 codestart = cd->name_table + re->name_entry_size * re->name_count;
6603 cd->start_code = codestart;
6604 cd->hwm = cworkspace;
6605 cd->req_varyopt = 0;
6606 cd->had_accept = FALSE;
6607 cd->check_lookbehind = FALSE;
6608 cd->open_caps = NULL;
6609
6610 /* Set up a starting, non-extracting bracket, then compile the expression. On
6611 error, errorcode will be set non-zero, so we don't need to look at the result
6612 of the function here. */
6613
6614 ptr = (const uschar *)pattern + skipatstart;
6615 code = (uschar *)codestart;
6616 *code = OP_BRA;
6617 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6618 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6619 re->top_bracket = cd->bracount;
6620 re->top_backref = cd->top_backref;
6621 re->flags = cd->external_flags;
6622
6623 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6624
6625 /* If not reached end of pattern on success, there's an excess bracket. */
6626
6627 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6628
6629 /* Fill in the terminating state and check for disastrous overflow, but
6630 if debugging, leave the test till after things are printed out. */
6631
6632 *code++ = OP_END;
6633
6634 #ifndef DEBUG
6635 if (code - codestart > length) errorcode = ERR23;
6636 #endif
6637
6638 /* Fill in any forward references that are required. */
6639
6640 while (errorcode == 0 && cd->hwm > cworkspace)
6641 {
6642 int offset, recno;
6643 const uschar *groupptr;
6644 cd->hwm -= LINK_SIZE;
6645 offset = GET(cd->hwm, 0);
6646 recno = GET(codestar