/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 505 - (show annotations) (download)
Tue Mar 9 16:50:47 2010 UTC (4 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 223308 byte(s)
Improve compile-time overrun checking.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57 also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. */
59
60 #ifdef PCRE_DEBUG
61 #include "pcre_printint.src"
62 #endif
63
64
/* Macro for setting individual bits in class bitmaps: it turns on bit number
b in the byte array a. Both arguments and the whole expansion are
parenthesized so that an expression argument such as SETBIT(map, c + 1)
addresses the intended byte and bit. Note that b is evaluated twice, so it
must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68
69 /* Maximum length value to check against when making sure that the integer that
70 holds the compiled pattern length does not overflow. We make it a bit less than
71 INT_MAX to allow for adding in group terminating bytes, so that we don't have
72 to check them every time. */
73
74 #define OFLOW_MAX (INT_MAX - 20)
75
76
77 /*************************************************
78 * Code parameters and static tables *
79 *************************************************/
80
81 /* This value specifies the size of stack workspace that is used during the
82 first pre-compile phase that determines how much memory is required. The regex
83 is partly compiled into this space, but the compiled parts are discarded as
84 soon as they can be, so that hopefully there will never be an overrun. The code
85 does, however, check for an overrun. The largest amount I've seen used is 218,
86 so this number is very generous.
87
88 The same workspace is used during the second, actual compile phase for
89 remembering forward references to groups so that they can be filled in at the
90 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91 is 4 there is plenty of room. */
92
93 #define COMPILE_WORK_SIZE (4096)
94
95 /* The overrun tests check for a slightly smaller size so that they detect the
96 overrun before it actually does run off the end of the data block. */
97
98 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99
100
101 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102 are simple data values; negative values are for special things like \d and so
103 on. Zero means further processing is needed (for things like \x), or the escape
104 is invalid. */
105
106 #ifndef EBCDIC
107
108 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 in UTF-8 mode. */
110
/* Indexed by (c - CHAR_0); two entries per row. The trailing comment on each
row names the two characters that the row covers. */
111 static const short int escapes[] = {
112 0, 0, /* 0 1 */
113 0, 0, /* 2 3 */
114 0, 0, /* 4 5 */
115 0, 0, /* 6 7 */
116 0, 0, /* 8 9 */
117 CHAR_COLON, CHAR_SEMICOLON, /* : ; */
118 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
119 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
120 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
121 -ESC_B, -ESC_C, /* B C */
122 -ESC_D, -ESC_E, /* D E */
123 0, -ESC_G, /* F G */
124 -ESC_H, 0, /* H I */
125 0, -ESC_K, /* J K */
126 0, 0, /* L M */
127 0, 0, /* N O */
128 -ESC_P, -ESC_Q, /* P Q */
129 -ESC_R, -ESC_S, /* R S */
130 0, 0, /* T U */
131 -ESC_V, -ESC_W, /* V W */
132 -ESC_X, 0, /* X Y */
133 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
134 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ] */
135 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
136 CHAR_GRAVE_ACCENT, 7, /* ` a */
137 -ESC_b, 0, /* b c */
138 -ESC_d, ESC_e, /* d e */
139 ESC_f, 0, /* f g */
140 -ESC_h, 0, /* h i */
141 0, -ESC_k, /* j k */
142 0, 0, /* l m */
143 ESC_n, 0, /* n o */
144 -ESC_p, 0, /* p q */
145 ESC_r, -ESC_s, /* r s */
146 ESC_tee, 0, /* t u */
147 -ESC_v, -ESC_w, /* v w */
148 0, 0, /* x y */
149 -ESC_z /* z */
150 };
151
152 #else
153
154 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155
156 static const short int escapes[] = {
157 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
175 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180 };
181 #endif
182
183
184 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185 searched linearly. Put all the names into a single string, in order to reduce
186 the number of relocations when a shared library is dynamically linked. The
187 string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 platforms. */
189
/* One entry per verb name. The entries in verbs[] are in the same order as
the names concatenated in verbnames, and each len equals the number of
characters in the corresponding name (ACCEPT=6, COMMIT=6, F=1, FAIL=4,
PRUNE=5, SKIP=4, THEN=4), so the two tables must be kept in step. */
190 typedef struct verbitem {
191 int len; /* Number of characters in the verb's name */
192 int op; /* OP_xxx value associated with the name */
193 } verbitem;
194
195 static const char verbnames[] =
196 STRING_ACCEPT0
197 STRING_COMMIT0
198 STRING_F0
199 STRING_FAIL0
200 STRING_PRUNE0
201 STRING_SKIP0
202 STRING_THEN;
203
204 static const verbitem verbs[] = {
205 { 6, OP_ACCEPT },
206 { 6, OP_COMMIT },
207 { 1, OP_FAIL }, /* Short form: F */
208 { 4, OP_FAIL }, /* Long form: FAIL - same opcode as F */
209 { 5, OP_PRUNE },
210 { 4, OP_SKIP },
211 { 4, OP_THEN }
212 };
213
214 static const int verbcount = sizeof(verbs)/sizeof(verbitem); /* Number of entries in verbs[] */
215
216
217 /* Tables of names of POSIX character classes and their lengths. The names are
218 now all in a single string, to reduce the number of relocations when a shared
219 library is dynamically loaded. The list of lengths is terminated by a zero
220 length entry. The first three must be alpha, lower, upper, as this is assumed
221 for handling case independence. */
222
/* The names appear in the order alpha, lower, upper, alnum, ascii, blank,
cntrl, digit, graph, print, punct, space, word, xdigit. */
223 static const char posix_names[] =
224 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
225 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
226 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
227 STRING_word0 STRING_xdigit;
228
/* One length per name above, in the same order: twelve 5-character names,
then word (4) and xdigit (6), followed by the terminating zero entry. */
229 static const uschar posix_name_lengths[] = {
230 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
231
232 /* Table of class bit maps for each POSIX class. Each class is formed from a
233 base map, with an optional addition or removal of another map. Then, for some
234 classes, there is some additional tweaking: for [:blank:] the vertical space
235 characters are removed, and for [:alpha:] and [:alnum:] the underscore
236 character is removed. The triples in the table consist of the base map offset,
237 second map offset or -1 if no second map, and a non-negative value for map
238 addition or a negative value for map subtraction (if there are two maps). The
239 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
240 remove vertical space characters, 2 => remove underscore. */
241
242 static const int posix_class_maps[] = {
243 cbit_word, cbit_digit, -2, /* alpha */
244 cbit_lower, -1, 0, /* lower */
245 cbit_upper, -1, 0, /* upper */
246 cbit_word, -1, 2, /* alnum - word without underscore */
247 cbit_print, cbit_cntrl, 0, /* ascii */
248 cbit_space, -1, 1, /* blank - a GNU extension */
249 cbit_cntrl, -1, 0, /* cntrl */
250 cbit_digit, -1, 0, /* digit */
251 cbit_graph, -1, 0, /* graph */
252 cbit_print, -1, 0, /* print */
253 cbit_punct, -1, 0, /* punct */
254 cbit_space, -1, 0, /* space */
255 cbit_word, -1, 0, /* word - a Perl extension */
256 cbit_xdigit,-1, 0 /* xdigit */
257 };
258
259
260 #define STRING(a) # a
261 #define XSTRING(s) STRING(s)
262
263 /* The texts of compile-time error messages. These are "char *" because they
264 are passed to the outside world. Do not ever re-use any error number, because
265 they are documented. Always add a new error instead. Messages marked DEAD below
266 are no longer used. This used to be a table of strings, but in order to reduce
267 the number of relocations needed when a shared library is loaded dynamically,
268 it is now one long string. We cannot use a table of offsets, because the
269 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
270 simply count through to the one we want - this isn't a performance issue
271 because these strings are used only when there is a compilation error.
272
273 Each substring ends with \0 to insert a null character. This includes the final
274 substring, so that the whole string ends with \0\0, which can be detected when
275 counting through. */
276
277 static const char error_texts[] =
278 "no error\0"
279 "\\ at end of pattern\0"
280 "\\c at end of pattern\0"
281 "unrecognized character follows \\\0"
282 "numbers out of order in {} quantifier\0"
283 /* 5 */
284 "number too big in {} quantifier\0"
285 "missing terminating ] for character class\0"
286 "invalid escape sequence in character class\0"
287 "range out of order in character class\0"
288 "nothing to repeat\0"
289 /* 10 */
290 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
291 "internal error: unexpected repeat\0"
292 "unrecognized character after (? or (?-\0"
293 "POSIX named classes are supported only within a class\0"
294 "missing )\0"
295 /* 15 */
296 "reference to non-existent subpattern\0"
297 "erroffset passed as NULL\0"
298 "unknown option bit(s) set\0"
299 "missing ) after comment\0"
300 "parentheses nested too deeply\0" /** DEAD **/
301 /* 20 */
302 "regular expression is too large\0"
303 "failed to get memory\0"
304 "unmatched parentheses\0"
305 "internal error: code overflow\0"
306 "unrecognized character after (?<\0"
307 /* 25 */
308 "lookbehind assertion is not fixed length\0"
309 "malformed number or name after (?(\0"
310 "conditional group contains more than two branches\0"
311 "assertion expected after (?(\0"
312 "(?R or (?[+-]digits must be followed by )\0"
313 /* 30 */
314 "unknown POSIX class name\0"
315 "POSIX collating elements are not supported\0"
316 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
317 "spare error\0" /** DEAD **/
318 "character value in \\x{...} sequence is too large\0"
319 /* 35 */
320 "invalid condition (?(0)\0"
321 "\\C not allowed in lookbehind assertion\0"
322 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
323 "number after (?C is > 255\0"
324 "closing ) for (?C expected\0"
325 /* 40 */
326 "recursive call could loop indefinitely\0"
327 "unrecognized character after (?P\0"
328 "syntax error in subpattern name (missing terminator)\0"
329 "two named subpatterns have the same name\0"
330 "invalid UTF-8 string\0"
331 /* 45 */
332 "support for \\P, \\p, and \\X has not been compiled\0"
333 "malformed \\P or \\p sequence\0"
334 "unknown property name after \\P or \\p\0"
335 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
336 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
337 /* 50 */
338 "repeated subpattern is too long\0" /** DEAD **/
339 "octal value is greater than \\377 (not in UTF-8 mode)\0"
340 "internal error: overran compiling workspace\0"
341 "internal error: previously-checked referenced subpattern not found\0"
342 "DEFINE group contains more than one branch\0"
343 /* 55 */
344 "repeating a DEFINE group is not allowed\0"
345 "inconsistent NEWLINE options\0"
346 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
347 "a numbered reference must not be zero\0"
348 "(*VERB) with an argument is not supported\0"
349 /* 60 */
350 "(*VERB) not recognized\0"
351 "number is too big\0"
352 "subpattern name expected\0"
353 "digit expected after (?+\0"
354 "] is an invalid data character in JavaScript compatibility mode\0"
355 /* 65 */
356 "different names for subpatterns of the same number are not allowed\0";
357
358 /* Table to identify digits and hex digits. This is used when compiling
359 patterns. Note that the tables in chartables are dependent on the locale, and
360 may mark arbitrary characters as digits - but the PCRE compiling code expects
361 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
362 a private table here. It costs 256 bytes, but it is a lot faster than doing
363 character value tests (at least in some simple cases I timed), and in some
364 applications one wants PCRE to compile efficiently as well as match
365 efficiently.
366
367 For convenience, we use the same bit definitions as in chartables:
368
369 0x04 decimal digit
370 0x08 hexadecimal digit
371
372 Then we can use ctype_digit and ctype_xdigit in the code. */
373
374 #ifndef EBCDIC
375
376 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
377 UTF-8 mode. */
378
379 static const unsigned char digitab[] =
380 {
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
387 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
388 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
389 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
393 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
395 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
396 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
397 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
399 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
406 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
408 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
413
414 #else
415
416 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
417
418 static const unsigned char digitab[] =
419 {
420 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
421 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
422 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
423 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
424 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
425 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
426 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
427 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
428 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
429 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
430 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
431 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
432 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
433 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
434 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
435 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
436 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
437 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
438 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
439 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
440 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
441 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
442 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
443 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
444 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
445 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
446 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
447 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
448 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
449 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
450 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
451 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
452
453 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
454 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
455 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
456 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
457 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
458 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
459 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
460 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
461 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
462 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
463 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
464 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
465 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
466 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
467 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
468 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
469 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
470 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
471 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
472 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
473 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
474 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
475 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
476 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
477 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
478 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
479 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
480 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
481 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
482 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
483 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
484 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
485 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
486 #endif
487
488
489 /* Definition to allow mutual recursion */
490
491 static BOOL
492 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
493 int *, int *, branch_chain *, compile_data *, int *);
494
495
496
497 /*************************************************
498 * Find an error text *
499 *************************************************/
500
501 /* The error texts are now all in one long string, to save on relocations. As
502 some of the text is of unknown length, we can't use a table of offsets.
503 Instead, just count through the strings. This is not a performance issue
504 because it happens only when there has been a compilation error.
505
506 Argument: the error number
507 Returns: pointer to the error string
508 */
509
510 static const char *
511 find_error_text(int n)
512 {
513 const char *s = error_texts;
514 for (; n > 0; n--)
515 {
516 while (*s++ != 0) {};
517 if (*s == 0) return "Error text not found (please report)";
518 }
519 return s;
520 }
521
522
523 /*************************************************
524 * Handle escapes *
525 *************************************************/
526
527 /* This function is called when a \ has been encountered. It either returns a
528 positive value for a simple escape such as \n, or a negative value which
529 encodes one of the more complicated things such as \d. A backreference to group
530 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
531 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
532 ptr is pointing at the \. On exit, it is on the final character of the escape
533 sequence.
534
535 Arguments:
536 ptrptr points to the pattern position pointer
537 errorcodeptr points to the errorcode variable
538 bracount number of previous extracting brackets
539 options the options bits
540 isclass TRUE if inside a character class
541
542 Returns: zero or positive => a data character
543 negative => a special escape sequence
544 on error, errorcodeptr is set
545 */
546
547 static int
548 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
549 int options, BOOL isclass)
550 {
551 BOOL utf8 = (options & PCRE_UTF8) != 0;
552 const uschar *ptr = *ptrptr + 1;
553 int c, i;
554
555 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
556 ptr--; /* Set pointer back to the last byte */
557
558 /* If backslash is at the end of the pattern, it's an error. */
559
560 if (c == 0) *errorcodeptr = ERR1;
561
562 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
563 in a table. A non-zero result is something that can be returned immediately.
564 Otherwise further processing may be required. */
565
566 #ifndef EBCDIC /* ASCII/UTF-8 coding */
567 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
568 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
569
570 #else /* EBCDIC coding */
571 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
572 else if ((i = escapes[c - 0x48]) != 0) c = i;
573 #endif
574
575 /* Escapes that need further processing, or are illegal. */
576
577 else
578 {
579 const uschar *oldptr;
580 BOOL braced, negated;
581
582 switch (c)
583 {
584 /* A number of Perl escapes are not handled by PCRE. We give an explicit
585 error. */
586
587 case CHAR_l:
588 case CHAR_L:
589 case CHAR_N:
590 case CHAR_u:
591 case CHAR_U:
592 *errorcodeptr = ERR37;
593 break;
594
595 /* \g must be followed by one of a number of specific things:
596
597 (1) A number, either plain or braced. If positive, it is an absolute
598 backreference. If negative, it is a relative backreference. This is a Perl
599 5.10 feature.
600
601 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
602 is part of Perl's movement towards a unified syntax for back references. As
603 this is synonymous with \k{name}, we fudge it up by pretending it really
604 was \k.
605
606 (3) For Oniguruma compatibility we also support \g followed by a name or a
607 number either in angle brackets or in single quotes. However, these are
608 (possibly recursive) subroutine calls, _not_ backreferences. Just return
609 the -ESC_g code (cf \k). */
610
611 case CHAR_g:
612 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
613 {
614 c = -ESC_g;
615 break;
616 }
617
618 /* Handle the Perl-compatible cases */
619
620 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
621 {
622 const uschar *p;
623 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
624 if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
625 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
626 {
627 c = -ESC_k;
628 break;
629 }
630 braced = TRUE;
631 ptr++;
632 }
633 else braced = FALSE;
634
635 if (ptr[1] == CHAR_MINUS)
636 {
637 negated = TRUE;
638 ptr++;
639 }
640 else negated = FALSE;
641
642 c = 0;
643 while ((digitab[ptr[1]] & ctype_digit) != 0)
644 c = c * 10 + *(++ptr) - CHAR_0;
645
646 if (c < 0) /* Integer overflow */
647 {
648 *errorcodeptr = ERR61;
649 break;
650 }
651
652 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
653 {
654 *errorcodeptr = ERR57;
655 break;
656 }
657
658 if (c == 0)
659 {
660 *errorcodeptr = ERR58;
661 break;
662 }
663
664 if (negated)
665 {
666 if (c > bracount)
667 {
668 *errorcodeptr = ERR15;
669 break;
670 }
671 c = bracount - (c - 1);
672 }
673
674 c = -(ESC_REF + c);
675 break;
676
677 /* The handling of escape sequences consisting of a string of digits
678 starting with one that is not zero is not straightforward. By experiment,
679 the way Perl works seems to be as follows:
680
681 Outside a character class, the digits are read as a decimal number. If the
682 number is less than 10, or if there are that many previous extracting
683 left brackets, then it is a back reference. Otherwise, up to three octal
684 digits are read to form an escaped byte. Thus \123 is likely to be octal
685 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
686 value is greater than 377, the least significant 8 bits are taken. Inside a
687 character class, \ followed by a digit is always an octal number. */
688
689 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
690 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
691
692 if (!isclass)
693 {
694 oldptr = ptr;
695 c -= CHAR_0;
696 while ((digitab[ptr[1]] & ctype_digit) != 0)
697 c = c * 10 + *(++ptr) - CHAR_0;
698 if (c < 0) /* Integer overflow */
699 {
700 *errorcodeptr = ERR61;
701 break;
702 }
703 if (c < 10 || c <= bracount)
704 {
705 c = -(ESC_REF + c);
706 break;
707 }
708 ptr = oldptr; /* Put the pointer back and fall through */
709 }
710
711 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
712 generates a binary zero byte and treats the digit as a following literal.
713 Thus we have to pull back the pointer by one. */
714
715 if ((c = *ptr) >= CHAR_8)
716 {
717 ptr--;
718 c = 0;
719 break;
720 }
721
722 /* \0 always starts an octal number, but we may drop through to here with a
723 larger first octal digit. The original code used just to take the least
724 significant 8 bits of octal numbers (I think this is what early Perls used
725 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
726 than 3 octal digits. */
727
728 case CHAR_0:
729 c -= CHAR_0;
730 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
731 c = c * 8 + *(++ptr) - CHAR_0;
732 if (!utf8 && c > 255) *errorcodeptr = ERR51;
733 break;
734
735 /* \x is complicated. \x{ddd} is a character number which can be greater
736 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
737 treated as a data character. */
738
739 case CHAR_x:
740 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
741 {
742 const uschar *pt = ptr + 2;
743 int count = 0;
744
745 c = 0;
746 while ((digitab[*pt] & ctype_xdigit) != 0)
747 {
748 register int cc = *pt++;
749 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
750 count++;
751
752 #ifndef EBCDIC /* ASCII/UTF-8 coding */
753 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
754 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
755 #else /* EBCDIC coding */
756 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
757 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
758 #endif
759 }
760
761 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
762 {
763 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
764 ptr = pt;
765 break;
766 }
767
768 /* If the sequence of hex digits does not end with '}', then we don't
769 recognize this construct; fall through to the normal \x handling. */
770 }
771
772 /* Read just a single-byte hex-defined char */
773
774 c = 0;
775 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
776 {
777 int cc; /* Some compilers don't like */
778 cc = *(++ptr); /* ++ in initializers */
779 #ifndef EBCDIC /* ASCII/UTF-8 coding */
780 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
781 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
782 #else /* EBCDIC coding */
783 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
784 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
785 #endif
786 }
787 break;
788
789 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
790 This coding is ASCII-specific, but then the whole concept of \cx is
791 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
792
793 case CHAR_c:
794 c = *(++ptr);
795 if (c == 0)
796 {
797 *errorcodeptr = ERR2;
798 break;
799 }
800
801 #ifndef EBCDIC /* ASCII/UTF-8 coding */
802 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
803 c ^= 0x40;
804 #else /* EBCDIC coding */
805 if (c >= CHAR_a && c <= CHAR_z) c += 64;
806 c ^= 0xC0;
807 #endif
808 break;
809
810 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
811 other alphanumeric following \ is an error if PCRE_EXTRA was set;
812 otherwise, for Perl compatibility, it is a literal. This code looks a bit
813 odd, but there used to be some cases other than the default, and there may
814 be again in future, so I haven't "optimized" it. */
815
816 default:
817 if ((options & PCRE_EXTRA) != 0) switch(c)
818 {
819 default:
820 *errorcodeptr = ERR3;
821 break;
822 }
823 break;
824 }
825 }
826
827 *ptrptr = ptr;
828 return c;
829 }
830
831
832
833 #ifdef SUPPORT_UCP
834 /*************************************************
835 * Handle \P and \p *
836 *************************************************/
837
838 /* This function is called after \P or \p has been encountered, provided that
839 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
840 pointing at the P or p. On exit, it is pointing at the final character of the
841 escape sequence.
842
843 Argument:
844 ptrptr points to the pattern position pointer
845 negptr points to a boolean that is set TRUE for negation else FALSE
846 dptr points to an int that is set to the detailed property value
847 errorcodeptr points to the error code variable
848
849 Returns: type value from ucp_type_table, or -1 for an invalid type
850 */
851
852 static int
853 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
854 {
855 int c, i, bot, top;
856 const uschar *ptr = *ptrptr;
857 char name[32];
858
859 c = *(++ptr);
860 if (c == 0) goto ERROR_RETURN;
861
862 *negptr = FALSE;
863
864 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
865 negation. */
866
867 if (c == CHAR_LEFT_CURLY_BRACKET)
868 {
869 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
870 {
871 *negptr = TRUE;
872 ptr++;
873 }
874 for (i = 0; i < (int)sizeof(name) - 1; i++)
875 {
876 c = *(++ptr);
877 if (c == 0) goto ERROR_RETURN;
878 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
879 name[i] = c;
880 }
881 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
882 name[i] = 0;
883 }
884
885 /* Otherwise there is just one following character */
886
887 else
888 {
889 name[0] = c;
890 name[1] = 0;
891 }
892
893 *ptrptr = ptr;
894
895 /* Search for a recognized property name using binary chop */
896
897 bot = 0;
898 top = _pcre_utt_size;
899
900 while (bot < top)
901 {
902 i = (bot + top) >> 1;
903 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
904 if (c == 0)
905 {
906 *dptr = _pcre_utt[i].value;
907 return _pcre_utt[i].type;
908 }
909 if (c > 0) bot = i + 1; else top = i;
910 }
911
912 *errorcodeptr = ERR47;
913 *ptrptr = ptr;
914 return -1;
915
916 ERROR_RETURN:
917 *errorcodeptr = ERR46;
918 *ptrptr = ptr;
919 return -1;
920 }
921 #endif
922
923
924
925
926 /*************************************************
927 * Check for counted repeat *
928 *************************************************/
929
930 /* This function is called when a '{' is encountered in a place where it might
931 start a quantifier. It looks ahead to see if it really is a quantifier or not.
932 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
933 where the ddds are digits.
934
935 Arguments:
936 p pointer to the first char after '{'
937
938 Returns: TRUE or FALSE
939 */
940
941 static BOOL
942 is_counted_repeat(const uschar *p)
943 {
944 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
945 while ((digitab[*p] & ctype_digit) != 0) p++;
946 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
947
948 if (*p++ != CHAR_COMMA) return FALSE;
949 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
950
951 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
952 while ((digitab[*p] & ctype_digit) != 0) p++;
953
954 return (*p == CHAR_RIGHT_CURLY_BRACKET);
955 }
956
957
958
959 /*************************************************
960 * Read repeat counts *
961 *************************************************/
962
963 /* Read an item of the form {n,m} and return the values. This is called only
964 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
965 so the syntax is guaranteed to be correct, but we need to check the values.
966
967 Arguments:
968 p pointer to first char after '{'
969 minp pointer to int for min
970 maxp pointer to int for max
971 returned as -1 if no max
972 errorcodeptr points to error code variable
973
974 Returns: pointer to '}' on success;
975 current ptr on error, with errorcodeptr set non-zero
976 */
977
978 static const uschar *
979 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
980 {
981 int min = 0;
982 int max = -1;
983
984 /* Read the minimum value and do a paranoid check: a negative value indicates
985 an integer overflow. */
986
987 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
988 if (min < 0 || min > 65535)
989 {
990 *errorcodeptr = ERR5;
991 return p;
992 }
993
994 /* Read the maximum value if there is one, and again do a paranoid on its size.
995 Also, max must not be less than min. */
996
997 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
998 {
999 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1000 {
1001 max = 0;
1002 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1003 if (max < 0 || max > 65535)
1004 {
1005 *errorcodeptr = ERR5;
1006 return p;
1007 }
1008 if (max < min)
1009 {
1010 *errorcodeptr = ERR4;
1011 return p;
1012 }
1013 }
1014 }
1015
1016 /* Fill in the required variables, and pass back the pointer to the terminating
1017 '}'. */
1018
1019 *minp = min;
1020 *maxp = max;
1021 return p;
1022 }
1023
1024
1025
1026 /*************************************************
1027 * Subroutine for finding forward reference *
1028 *************************************************/
1029
1030 /* This recursive function is called only from find_parens() below. The
1031 top-level call starts at the beginning of the pattern. All other calls must
1032 start at a parenthesis. It scans along a pattern's text looking for capturing
1033 subpatterns, and counting them. If it finds a named pattern that matches the
1034 name it is given, it returns its number. Alternatively, if the name is NULL, it
1035 returns when it reaches a given numbered subpattern. We know that if (?P< is
1036 encountered, the name will be terminated by '>' because that is checked in the
1037 first pass. Recursion is used to keep track of subpatterns that reset the
1038 capturing group numbers - the (?| feature.
1039
1040 Arguments:
1041 ptrptr address of the current character pointer (updated)
1042 cd compile background data
1043 name name to seek, or NULL if seeking a numbered subpattern
1044 lorn name length, or subpattern number if name is NULL
1045 xmode TRUE if we are in /x mode
1046 count pointer to the current capturing subpattern number (updated)
1047
1048 Returns: the number of the named subpattern, or -1 if not found
1049 */
1050
1051 static int
1052 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1053 BOOL xmode, int *count)
1054 {
1055 uschar *ptr = *ptrptr;
1056 int start_count = *count;
1057 int hwm_count = start_count;
1058 BOOL dup_parens = FALSE;
1059
1060 /* If the first character is a parenthesis, check on the type of group we are
1061 dealing with. The very first call may not start with a parenthesis. */
1062
1063 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1064 {
1065 if (ptr[1] == CHAR_QUESTION_MARK &&
1066 ptr[2] == CHAR_VERTICAL_LINE)
1067 {
1068 ptr += 3;
1069 dup_parens = TRUE;
1070 }
1071
1072 /* Handle a normal, unnamed capturing parenthesis */
1073
1074 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1075 {
1076 *count += 1;
1077 if (name == NULL && *count == lorn) return *count;
1078 ptr++;
1079 }
1080
1081 /* Handle a condition. If it is an assertion, just carry on so that it
1082 is processed as normal. If not, skip to the closing parenthesis of the
1083 condition (there can't be any nested parens. */
1084
1085 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1086 {
1087 ptr += 2;
1088 if (ptr[1] != CHAR_QUESTION_MARK)
1089 {
1090 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1091 if (*ptr != 0) ptr++;
1092 }
1093 }
1094
1095 /* We have either (? or (* and not a condition */
1096
1097 else
1098 {
1099 ptr += 2;
1100 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1101
1102 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1103
1104 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1105 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1106 {
1107 int term;
1108 const uschar *thisname;
1109 *count += 1;
1110 if (name == NULL && *count == lorn) return *count;
1111 term = *ptr++;
1112 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1113 thisname = ptr;
1114 while (*ptr != term) ptr++;
1115 if (name != NULL && lorn == ptr - thisname &&
1116 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1117 return *count;
1118 term++;
1119 }
1120 }
1121 }
1122
1123 /* Past any initial parenthesis handling, scan for parentheses or vertical
1124 bars. */
1125
1126 for (; *ptr != 0; ptr++)
1127 {
1128 /* Skip over backslashed characters and also entire \Q...\E */
1129
1130 if (*ptr == CHAR_BACKSLASH)
1131 {
1132 if (*(++ptr) == 0) goto FAIL_EXIT;
1133 if (*ptr == CHAR_Q) for (;;)
1134 {
1135 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1136 if (*ptr == 0) goto FAIL_EXIT;
1137 if (*(++ptr) == CHAR_E) break;
1138 }
1139 continue;
1140 }
1141
1142 /* Skip over character classes; this logic must be similar to the way they
1143 are handled for real. If the first character is '^', skip it. Also, if the
1144 first few characters (either before or after ^) are \Q\E or \E we skip them
1145 too. This makes for compatibility with Perl. Note the use of STR macros to
1146 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1147
1148 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1149 {
1150 BOOL negate_class = FALSE;
1151 for (;;)
1152 {
1153 if (ptr[1] == CHAR_BACKSLASH)
1154 {
1155 if (ptr[2] == CHAR_E)
1156 ptr+= 2;
1157 else if (strncmp((const char *)ptr+2,
1158 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1159 ptr += 4;
1160 else
1161 break;
1162 }
1163 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1164 {
1165 negate_class = TRUE;
1166 ptr++;
1167 }
1168 else break;
1169 }
1170
1171 /* If the next character is ']', it is a data character that must be
1172 skipped, except in JavaScript compatibility mode. */
1173
1174 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1175 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1176 ptr++;
1177
1178 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1179 {
1180 if (*ptr == 0) return -1;
1181 if (*ptr == CHAR_BACKSLASH)
1182 {
1183 if (*(++ptr) == 0) goto FAIL_EXIT;
1184 if (*ptr == CHAR_Q) for (;;)
1185 {
1186 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1187 if (*ptr == 0) goto FAIL_EXIT;
1188 if (*(++ptr) == CHAR_E) break;
1189 }
1190 continue;
1191 }
1192 }
1193 continue;
1194 }
1195
1196 /* Skip comments in /x mode */
1197
1198 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1199 {
1200 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1201 if (*ptr == 0) goto FAIL_EXIT;
1202 continue;
1203 }
1204
1205 /* Check for the special metacharacters */
1206
1207 if (*ptr == CHAR_LEFT_PARENTHESIS)
1208 {
1209 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1210 if (rc > 0) return rc;
1211 if (*ptr == 0) goto FAIL_EXIT;
1212 }
1213
1214 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1215 {
1216 if (dup_parens && *count < hwm_count) *count = hwm_count;
1217 *ptrptr = ptr;
1218 return -1;
1219 }
1220
1221 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1222 {
1223 if (*count > hwm_count) hwm_count = *count;
1224 *count = start_count;
1225 }
1226 }
1227
1228 FAIL_EXIT:
1229 *ptrptr = ptr;
1230 return -1;
1231 }
1232
1233
1234
1235
1236 /*************************************************
1237 * Find forward referenced subpattern *
1238 *************************************************/
1239
1240 /* This function scans along a pattern's text looking for capturing
1241 subpatterns, and counting them. If it finds a named pattern that matches the
1242 name it is given, it returns its number. Alternatively, if the name is NULL, it
1243 returns when it reaches a given numbered subpattern. This is used for forward
1244 references to subpatterns. We used to be able to start this scan from the
1245 current compiling point, using the current count value from cd->bracount, and
1246 do it all in a single loop, but the addition of the possibility of duplicate
1247 subpattern numbers means that we have to scan from the very start, in order to
1248 take account of such duplicates, and to use a recursive function to keep track
1249 of the different types of group.
1250
1251 Arguments:
1252 cd compile background data
1253 name name to seek, or NULL if seeking a numbered subpattern
1254 lorn name length, or subpattern number if name is NULL
1255 xmode TRUE if we are in /x mode
1256
1257 Returns: the number of the found subpattern, or -1 if not found
1258 */
1259
1260 static int
1261 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1262 {
1263 uschar *ptr = (uschar *)cd->start_pattern;
1264 int count = 0;
1265 int rc;
1266
1267 /* If the pattern does not start with an opening parenthesis, the first call
1268 to find_parens_sub() will scan right to the end (if necessary). However, if it
1269 does start with a parenthesis, find_parens_sub() will return when it hits the
1270 matching closing parens. That is why we have to have a loop. */
1271
1272 for (;;)
1273 {
1274 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1275 if (rc > 0 || *ptr++ == 0) break;
1276 }
1277
1278 return rc;
1279 }
1280
1281
1282
1283
1284 /*************************************************
1285 * Find first significant op code *
1286 *************************************************/
1287
1288 /* This is called by several functions that scan a compiled expression looking
1289 for a fixed first character, or an anchoring op code etc. It skips over things
1290 that do not influence this. For some calls, a change of option is important.
1291 For some calls, it makes sense to skip negative forward and all backward
1292 assertions, and also the \b assertion; for others it does not.
1293
1294 Arguments:
1295 code pointer to the start of the group
1296 options pointer to external options
1297 optbit the option bit whose changing is significant, or
1298 zero if none are
1299 skipassert TRUE if certain assertions are to be skipped
1300
1301 Returns: pointer to the first significant opcode
1302 */
1303
1304 static const uschar*
1305 first_significant_code(const uschar *code, int *options, int optbit,
1306 BOOL skipassert)
1307 {
1308 for (;;)
1309 {
1310 switch ((int)*code)
1311 {
1312 case OP_OPT:
1313 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1314 *options = (int)code[1];
1315 code += 2;
1316 break;
1317
1318 case OP_ASSERT_NOT:
1319 case OP_ASSERTBACK:
1320 case OP_ASSERTBACK_NOT:
1321 if (!skipassert) return code;
1322 do code += GET(code, 1); while (*code == OP_ALT);
1323 code += _pcre_OP_lengths[*code];
1324 break;
1325
1326 case OP_WORD_BOUNDARY:
1327 case OP_NOT_WORD_BOUNDARY:
1328 if (!skipassert) return code;
1329 /* Fall through */
1330
1331 case OP_CALLOUT:
1332 case OP_CREF:
1333 case OP_NCREF:
1334 case OP_RREF:
1335 case OP_NRREF:
1336 case OP_DEF:
1337 code += _pcre_OP_lengths[*code];
1338 break;
1339
1340 default:
1341 return code;
1342 }
1343 }
1344 /* Control never reaches here */
1345 }
1346
1347
1348
1349
1350 /*************************************************
1351 * Find the fixed length of a branch *
1352 *************************************************/
1353
1354 /* Scan a branch and compute the fixed length of subject that will match it,
1355 if the length is fixed. This is needed for dealing with backward assertions.
1356 In UTF8 mode, the result is in characters rather than bytes. The branch is
1357 temporarily terminated with OP_END when this function is called.
1358
1359 This function is called when a backward assertion is encountered, so that if it
1360 fails, the error message can point to the correct place in the pattern.
1361 However, we cannot do this when the assertion contains subroutine calls,
1362 because they can be forward references. We solve this by remembering this case
1363 and doing the check at the end; a flag specifies which mode we are running in.
1364
1365 Arguments:
1366 code points to the start of the pattern (the bracket)
1367 options the compiling options
1368 atend TRUE if called when the pattern is complete
1369 cd the "compile data" structure
1370
1371 Returns: the fixed length,
1372 or -1 if there is no fixed length,
1373 or -2 if \C was encountered
1374 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1375 */
1376
1377 static int
1378 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1379 {
1380 int length = -1;
1381
1382 register int branchlength = 0;
1383 register uschar *cc = code + 1 + LINK_SIZE;
1384
1385 /* Scan along the opcodes for this branch. If we get to the end of the
1386 branch, check the length against that of the other branches. */
1387
1388 for (;;)
1389 {
1390 int d;
1391 uschar *ce, *cs;
1392 register int op = *cc;
1393 switch (op)
1394 {
1395 case OP_CBRA:
1396 case OP_BRA:
1397 case OP_ONCE:
1398 case OP_COND:
1399 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1400 if (d < 0) return d;
1401 branchlength += d;
1402 do cc += GET(cc, 1); while (*cc == OP_ALT);
1403 cc += 1 + LINK_SIZE;
1404 break;
1405
1406 /* Reached end of a branch; if it's a ket it is the end of a nested
1407 call. If it's ALT it is an alternation in a nested call. If it is
1408 END it's the end of the outer call. All can be handled by the same code. */
1409
1410 case OP_ALT:
1411 case OP_KET:
1412 case OP_KETRMAX:
1413 case OP_KETRMIN:
1414 case OP_END:
1415 if (length < 0) length = branchlength;
1416 else if (length != branchlength) return -1;
1417 if (*cc != OP_ALT) return length;
1418 cc += 1 + LINK_SIZE;
1419 branchlength = 0;
1420 break;
1421
1422 /* A true recursion implies not fixed length, but a subroutine call may
1423 be OK. If the subroutine is a forward reference, we can't deal with
1424 it until the end of the pattern, so return -3. */
1425
1426 case OP_RECURSE:
1427 if (!atend) return -3;
1428 cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1429 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1430 if (cc > cs && cc < ce) return -1; /* Recursion */
1431 d = find_fixedlength(cs + 2, options, atend, cd);
1432 if (d < 0) return d;
1433 branchlength += d;
1434 cc += 1 + LINK_SIZE;
1435 break;
1436
1437 /* Skip over assertive subpatterns */
1438
1439 case OP_ASSERT:
1440 case OP_ASSERT_NOT:
1441 case OP_ASSERTBACK:
1442 case OP_ASSERTBACK_NOT:
1443 do cc += GET(cc, 1); while (*cc == OP_ALT);
1444 /* Fall through */
1445
1446 /* Skip over things that don't match chars */
1447
1448 case OP_REVERSE:
1449 case OP_CREF:
1450 case OP_NCREF:
1451 case OP_RREF:
1452 case OP_NRREF:
1453 case OP_DEF:
1454 case OP_OPT:
1455 case OP_CALLOUT:
1456 case OP_SOD:
1457 case OP_SOM:
1458 case OP_SET_SOM:
1459 case OP_EOD:
1460 case OP_EODN:
1461 case OP_CIRC:
1462 case OP_DOLL:
1463 case OP_NOT_WORD_BOUNDARY:
1464 case OP_WORD_BOUNDARY:
1465 cc += _pcre_OP_lengths[*cc];
1466 break;
1467
1468 /* Handle literal characters */
1469
1470 case OP_CHAR:
1471 case OP_CHARNC:
1472 case OP_NOT:
1473 branchlength++;
1474 cc += 2;
1475 #ifdef SUPPORT_UTF8
1476 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1477 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1478 #endif
1479 break;
1480
1481 /* Handle exact repetitions. The count is already in characters, but we
1482 need to skip over a multibyte character in UTF8 mode. */
1483
1484 case OP_EXACT:
1485 branchlength += GET2(cc,1);
1486 cc += 4;
1487 #ifdef SUPPORT_UTF8
1488 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1489 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1490 #endif
1491 break;
1492
1493 case OP_TYPEEXACT:
1494 branchlength += GET2(cc,1);
1495 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1496 cc += 4;
1497 break;
1498
1499 /* Handle single-char matchers */
1500
1501 case OP_PROP:
1502 case OP_NOTPROP:
1503 cc += 2;
1504 /* Fall through */
1505
1506 case OP_NOT_DIGIT:
1507 case OP_DIGIT:
1508 case OP_NOT_WHITESPACE:
1509 case OP_WHITESPACE:
1510 case OP_NOT_WORDCHAR:
1511 case OP_WORDCHAR:
1512 case OP_ANY:
1513 case OP_ALLANY:
1514 branchlength++;
1515 cc++;
1516 break;
1517
1518 /* The single-byte matcher isn't allowed */
1519
1520 case OP_ANYBYTE:
1521 return -2;
1522
1523 /* Check a class for variable quantification */
1524
1525 #ifdef SUPPORT_UTF8
1526 case OP_XCLASS:
1527 cc += GET(cc, 1) - 33;
1528 /* Fall through */
1529 #endif
1530
1531 case OP_CLASS:
1532 case OP_NCLASS:
1533 cc += 33;
1534
1535 switch (*cc)
1536 {
1537 case OP_CRSTAR:
1538 case OP_CRMINSTAR:
1539 case OP_CRQUERY:
1540 case OP_CRMINQUERY:
1541 return -1;
1542
1543 case OP_CRRANGE:
1544 case OP_CRMINRANGE:
1545 if (GET2(cc,1) != GET2(cc,3)) return -1;
1546 branchlength += GET2(cc,1);
1547 cc += 5;
1548 break;
1549
1550 default:
1551 branchlength++;
1552 }
1553 break;
1554
1555 /* Anything else is variable length */
1556
1557 default:
1558 return -1;
1559 }
1560 }
1561 /* Control never gets here */
1562 }
1563
1564
1565
1566
1567 /*************************************************
1568 * Scan compiled regex for specific bracket *
1569 *************************************************/
1570
1571 /* This little function scans through a compiled pattern until it finds a
1572 capturing bracket with the given number, or, if the number is negative, an
1573 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1574 so that it can be called from pcre_study() when finding the minimum matching
1575 length.
1576
1577 Arguments:
1578 code points to start of expression
1579 utf8 TRUE in UTF-8 mode
1580 number the required bracket number or negative to find a lookbehind
1581
1582 Returns: pointer to the opcode for the bracket, or NULL if not found
1583 */
1584
1585 const uschar *
1586 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1587 {
1588 for (;;)
1589 {
1590 register int c = *code;
1591 if (c == OP_END) return NULL;
1592
1593 /* XCLASS is used for classes that cannot be represented just by a bit
1594 map. This includes negated single high-valued characters. The length in
1595 the table is zero; the actual length is stored in the compiled code. */
1596
1597 if (c == OP_XCLASS) code += GET(code, 1);
1598
1599 /* Handle recursion */
1600
1601 else if (c == OP_REVERSE)
1602 {
1603 if (number < 0) return (uschar *)code;
1604 code += _pcre_OP_lengths[c];
1605 }
1606
1607 /* Handle capturing bracket */
1608
1609 else if (c == OP_CBRA)
1610 {
1611 int n = GET2(code, 1+LINK_SIZE);
1612 if (n == number) return (uschar *)code;
1613 code += _pcre_OP_lengths[c];
1614 }
1615
1616 /* Otherwise, we can get the item's length from the table, except that for
1617 repeated character types, we have to test for \p and \P, which have an extra
1618 two bytes of parameters. */
1619
1620 else
1621 {
1622 switch(c)
1623 {
1624 case OP_TYPESTAR:
1625 case OP_TYPEMINSTAR:
1626 case OP_TYPEPLUS:
1627 case OP_TYPEMINPLUS:
1628 case OP_TYPEQUERY:
1629 case OP_TYPEMINQUERY:
1630 case OP_TYPEPOSSTAR:
1631 case OP_TYPEPOSPLUS:
1632 case OP_TYPEPOSQUERY:
1633 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1634 break;
1635
1636 case OP_TYPEUPTO:
1637 case OP_TYPEMINUPTO:
1638 case OP_TYPEEXACT:
1639 case OP_TYPEPOSUPTO:
1640 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1641 break;
1642 }
1643
1644 /* Add in the fixed length from the table */
1645
1646 code += _pcre_OP_lengths[c];
1647
1648 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1649 a multi-byte character. The length in the table is a minimum, so we have to
1650 arrange to skip the extra bytes. */
1651
1652 #ifdef SUPPORT_UTF8
1653 if (utf8) switch(c)
1654 {
1655 case OP_CHAR:
1656 case OP_CHARNC:
1657 case OP_EXACT:
1658 case OP_UPTO:
1659 case OP_MINUPTO:
1660 case OP_POSUPTO:
1661 case OP_STAR:
1662 case OP_MINSTAR:
1663 case OP_POSSTAR:
1664 case OP_PLUS:
1665 case OP_MINPLUS:
1666 case OP_POSPLUS:
1667 case OP_QUERY:
1668 case OP_MINQUERY:
1669 case OP_POSQUERY:
1670 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1671 break;
1672 }
1673 #else
1674 (void)(utf8); /* Keep compiler happy by referencing function argument */
1675 #endif
1676 }
1677 }
1678 }
1679
1680
1681
1682 /*************************************************
1683 * Scan compiled regex for recursion reference *
1684 *************************************************/
1685
1686 /* This little function scans through a compiled pattern until it finds an
1687 instance of OP_RECURSE.
1688
1689 Arguments:
1690 code points to start of expression
1691 utf8 TRUE in UTF-8 mode
1692
1693 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1694 */
1695
1696 static const uschar *
1697 find_recurse(const uschar *code, BOOL utf8)
1698 {
1699 for (;;)
1700 {
1701 register int c = *code;
1702 if (c == OP_END) return NULL;
1703 if (c == OP_RECURSE) return code;
1704
1705 /* XCLASS is used for classes that cannot be represented just by a bit
1706 map. This includes negated single high-valued characters. The length in
1707 the table is zero; the actual length is stored in the compiled code. */
1708
1709 if (c == OP_XCLASS) code += GET(code, 1);
1710
1711 /* Otherwise, we can get the item's length from the table, except that for
1712 repeated character types, we have to test for \p and \P, which have an extra
1713 two bytes of parameters. */
1714
1715 else
1716 {
1717 switch(c)
1718 {
1719 case OP_TYPESTAR:
1720 case OP_TYPEMINSTAR:
1721 case OP_TYPEPLUS:
1722 case OP_TYPEMINPLUS:
1723 case OP_TYPEQUERY:
1724 case OP_TYPEMINQUERY:
1725 case OP_TYPEPOSSTAR:
1726 case OP_TYPEPOSPLUS:
1727 case OP_TYPEPOSQUERY:
1728 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1729 break;
1730
1731 case OP_TYPEPOSUPTO:
1732 case OP_TYPEUPTO:
1733 case OP_TYPEMINUPTO:
1734 case OP_TYPEEXACT:
1735 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1736 break;
1737 }
1738
1739 /* Add in the fixed length from the table */
1740
1741 code += _pcre_OP_lengths[c];
1742
1743 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1744 by a multi-byte character. The length in the table is a minimum, so we have
1745 to arrange to skip the extra bytes. */
1746
1747 #ifdef SUPPORT_UTF8
1748 if (utf8) switch(c)
1749 {
1750 case OP_CHAR:
1751 case OP_CHARNC:
1752 case OP_EXACT:
1753 case OP_UPTO:
1754 case OP_MINUPTO:
1755 case OP_POSUPTO:
1756 case OP_STAR:
1757 case OP_MINSTAR:
1758 case OP_POSSTAR:
1759 case OP_PLUS:
1760 case OP_MINPLUS:
1761 case OP_POSPLUS:
1762 case OP_QUERY:
1763 case OP_MINQUERY:
1764 case OP_POSQUERY:
1765 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1766 break;
1767 }
1768 #else
1769 (void)(utf8); /* Keep compiler happy by referencing function argument */
1770 #endif
1771 }
1772 }
1773 }
1774
1775
1776
1777 /*************************************************
1778 * Scan compiled branch for non-emptiness *
1779 *************************************************/
1780
1781 /* This function scans through a branch of a compiled pattern to see whether it
1782 can match the empty string or not. It is called from could_be_empty()
1783 below and from compile_branch() when checking for an unlimited repeat of a
1784 group that can match nothing. Note that first_significant_code() skips over
1785 backward and negative forward assertions when its final argument is TRUE. If we
1786 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1787 bracket whose current branch will already have been scanned.
1788
1789 Arguments:
1790 code points to start of search
1791 endcode points to where to stop
1792 utf8 TRUE if in UTF8 mode
1793 cd contains pointers to tables etc.
1794
1795 Returns: TRUE if what is matched could be empty
1796 */
1797
1798 static BOOL
1799 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1800 compile_data *cd)
1801 {
1802 register int c;
1803 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1804 code < endcode;
1805 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1806 {
1807 const uschar *ccode;
1808
1809 c = *code;
1810
1811 /* Skip over forward assertions; the other assertions are skipped by
1812 first_significant_code() with a TRUE final argument. */
1813
1814 if (c == OP_ASSERT)
1815 {
1816 do code += GET(code, 1); while (*code == OP_ALT);
1817 c = *code;
1818 continue;
1819 }
1820
1821 /* Groups with zero repeats can of course be empty; skip them. */
1822
1823 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1824 {
1825 code += _pcre_OP_lengths[c];
1826 do code += GET(code, 1); while (*code == OP_ALT);
1827 c = *code;
1828 continue;
1829 }
1830
1831 /* For a recursion/subroutine call, if its end has been reached, which
1832 implies a subroutine call, we can scan it. */
1833
1834 if (c == OP_RECURSE)
1835 {
1836 BOOL empty_branch = FALSE;
1837 const uschar *scode = cd->start_code + GET(code, 1);
1838 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1839 do
1840 {
1841 if (could_be_empty_branch(scode, endcode, utf8, cd))
1842 {
1843 empty_branch = TRUE;
1844 break;
1845 }
1846 scode += GET(scode, 1);
1847 }
1848 while (*scode == OP_ALT);
1849 if (!empty_branch) return FALSE; /* All branches are non-empty */
1850 continue;
1851 }
1852
1853 /* For other groups, scan the branches. */
1854
1855 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1856 {
1857 BOOL empty_branch;
1858 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1859
1860 /* If a conditional group has only one branch, there is a second, implied,
1861 empty branch, so just skip over the conditional, because it could be empty.
1862 Otherwise, scan the individual branches of the group. */
1863
1864 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1865 code += GET(code, 1);
1866 else
1867 {
1868 empty_branch = FALSE;
1869 do
1870 {
1871 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1872 empty_branch = TRUE;
1873 code += GET(code, 1);
1874 }
1875 while (*code == OP_ALT);
1876 if (!empty_branch) return FALSE; /* All branches are non-empty */
1877 }
1878
1879 c = *code;
1880 continue;
1881 }
1882
1883 /* Handle the other opcodes */
1884
1885 switch (c)
1886 {
1887 /* Check for quantifiers after a class. XCLASS is used for classes that
1888 cannot be represented just by a bit map. This includes negated single
1889 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1890 actual length is stored in the compiled code, so we must update "code"
1891 here. */
1892
1893 #ifdef SUPPORT_UTF8
1894 case OP_XCLASS:
1895 ccode = code += GET(code, 1);
1896 goto CHECK_CLASS_REPEAT;
1897 #endif
1898
1899 case OP_CLASS:
1900 case OP_NCLASS:
1901 ccode = code + 33;
1902
1903 #ifdef SUPPORT_UTF8
1904 CHECK_CLASS_REPEAT:
1905 #endif
1906
1907 switch (*ccode)
1908 {
1909 case OP_CRSTAR: /* These could be empty; continue */
1910 case OP_CRMINSTAR:
1911 case OP_CRQUERY:
1912 case OP_CRMINQUERY:
1913 break;
1914
1915 default: /* Non-repeat => class must match */
1916 case OP_CRPLUS: /* These repeats aren't empty */
1917 case OP_CRMINPLUS:
1918 return FALSE;
1919
1920 case OP_CRRANGE:
1921 case OP_CRMINRANGE:
1922 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1923 break;
1924 }
1925 break;
1926
1927 /* Opcodes that must match a character */
1928
1929 case OP_PROP:
1930 case OP_NOTPROP:
1931 case OP_EXTUNI:
1932 case OP_NOT_DIGIT:
1933 case OP_DIGIT:
1934 case OP_NOT_WHITESPACE:
1935 case OP_WHITESPACE:
1936 case OP_NOT_WORDCHAR:
1937 case OP_WORDCHAR:
1938 case OP_ANY:
1939 case OP_ALLANY:
1940 case OP_ANYBYTE:
1941 case OP_CHAR:
1942 case OP_CHARNC:
1943 case OP_NOT:
1944 case OP_PLUS:
1945 case OP_MINPLUS:
1946 case OP_POSPLUS:
1947 case OP_EXACT:
1948 case OP_NOTPLUS:
1949 case OP_NOTMINPLUS:
1950 case OP_NOTPOSPLUS:
1951 case OP_NOTEXACT:
1952 case OP_TYPEPLUS:
1953 case OP_TYPEMINPLUS:
1954 case OP_TYPEPOSPLUS:
1955 case OP_TYPEEXACT:
1956 return FALSE;
1957
1958 /* These are going to continue, as they may be empty, but we have to
1959 fudge the length for the \p and \P cases. */
1960
1961 case OP_TYPESTAR:
1962 case OP_TYPEMINSTAR:
1963 case OP_TYPEPOSSTAR:
1964 case OP_TYPEQUERY:
1965 case OP_TYPEMINQUERY:
1966 case OP_TYPEPOSQUERY:
1967 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1968 break;
1969
1970 /* Same for these */
1971
1972 case OP_TYPEUPTO:
1973 case OP_TYPEMINUPTO:
1974 case OP_TYPEPOSUPTO:
1975 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1976 break;
1977
1978 /* End of branch */
1979
1980 case OP_KET:
1981 case OP_KETRMAX:
1982 case OP_KETRMIN:
1983 case OP_ALT:
1984 return TRUE;
1985
1986 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1987 MINUPTO, and POSUPTO may be followed by a multibyte character */
1988
1989 #ifdef SUPPORT_UTF8
1990 case OP_STAR:
1991 case OP_MINSTAR:
1992 case OP_POSSTAR:
1993 case OP_QUERY:
1994 case OP_MINQUERY:
1995 case OP_POSQUERY:
1996 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1997 break;
1998
1999 case OP_UPTO:
2000 case OP_MINUPTO:
2001 case OP_POSUPTO:
2002 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2003 break;
2004 #endif
2005
2006 /* None of the remaining opcodes are required to match a character. */
2007
2008 default:
2009 break;
2010 }
2011 }
2012
2013 return TRUE;
2014 }
2015
2016
2017
2018 /*************************************************
2019 * Scan compiled regex for non-emptiness *
2020 *************************************************/
2021
2022 /* This function is called to check for left recursive calls. We want to check
2023 the current branch of the current pattern to see if it could match the empty
2024 string. If it could, we must look outwards for branches at other levels,
2025 stopping when we pass beyond the bracket which is the subject of the recursion.
2026
2027 Arguments:
2028 code points to start of the recursion
2029 endcode points to where to stop (current RECURSE item)
2030 bcptr points to the chain of current (unclosed) branch starts
2031 utf8 TRUE if in UTF-8 mode
2032 cd pointers to tables etc
2033
2034 Returns: TRUE if what is matched could be empty
2035 */
2036
2037 static BOOL
2038 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2039 BOOL utf8, compile_data *cd)
2040 {
2041 while (bcptr != NULL && bcptr->current_branch >= code)
2042 {
2043 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2044 return FALSE;
2045 bcptr = bcptr->outer;
2046 }
2047 return TRUE;
2048 }
2049
2050
2051
2052 /*************************************************
2053 * Check for POSIX class syntax *
2054 *************************************************/
2055
2056 /* This function is called when the sequence "[:" or "[." or "[=" is
2057 encountered in a character class. It checks whether this is followed by a
2058 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2059 reach an unescaped ']' without the special preceding character, return FALSE.
2060
2061 Originally, this function only recognized a sequence of letters between the
2062 terminators, but it seems that Perl recognizes any sequence of characters,
2063 though of course unknown POSIX names are subsequently rejected. Perl gives an
2064 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2065 didn't consider this to be a POSIX class. Likewise for [:1234:].
2066
2067 The problem in trying to be exactly like Perl is in the handling of escapes. We
2068 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2069 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2070 below handles the special case of \], but does not try to do any other escape
2071 processing. This makes it different from Perl for cases such as [:l\ower:]
2072 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2073 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2074 I think.
2075
2076 Arguments:
2077 ptr pointer to the initial [
2078 endptr where to return the end pointer
2079
2080 Returns: TRUE or FALSE
2081 */
2082
2083 static BOOL
2084 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2085 {
2086 int terminator; /* Don't combine these lines; the Solaris cc */
2087 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2088 for (++ptr; *ptr != 0; ptr++)
2089 {
2090 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2091 {
2092 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2093 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2094 {
2095 *endptr = ptr;
2096 return TRUE;
2097 }
2098 }
2099 }
2100 return FALSE;
2101 }
2102
2103
2104
2105
2106 /*************************************************
2107 * Check POSIX class name *
2108 *************************************************/
2109
2110 /* This function is called to check the name given in a POSIX-style class entry
2111 such as [:alnum:].
2112
2113 Arguments:
2114 ptr points to the first letter
2115 len the length of the name
2116
2117 Returns: a value representing the name, or -1 if unknown
2118 */
2119
2120 static int
2121 check_posix_name(const uschar *ptr, int len)
2122 {
2123 const char *pn = posix_names;
2124 register int yield = 0;
2125 while (posix_name_lengths[yield] != 0)
2126 {
2127 if (len == posix_name_lengths[yield] &&
2128 strncmp((const char *)ptr, pn, len) == 0) return yield;
2129 pn += posix_name_lengths[yield] + 1;
2130 yield++;
2131 }
2132 return -1;
2133 }
2134
2135
2136 /*************************************************
2137 * Adjust OP_RECURSE items in repeated group *
2138 *************************************************/
2139
2140 /* OP_RECURSE items contain an offset from the start of the regex to the group
2141 that is referenced. This means that groups can be replicated for fixed
2142 repetition simply by copying (because the recursion is allowed to refer to
2143 earlier groups that are outside the current group). However, when a group is
2144 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2145 inserted before it, after it has been compiled. This means that any OP_RECURSE
2146 items within it that refer to the group itself or any contained groups have to
2147 have their offsets adjusted. That one of the jobs of this function. Before it
2148 is called, the partially compiled regex must be temporarily terminated with
2149 OP_END.
2150
2151 This function has been extended with the possibility of forward references for
2152 recursions and subroutine calls. It must also check the list of such references
2153 for the group we are dealing with. If it finds that one of the recursions in
2154 the current group is on this list, it adjusts the offset in the list, not the
2155 value in the reference (which is a group number).
2156
2157 Arguments:
2158 group points to the start of the group
2159 adjust the amount by which the group is to be moved
2160 utf8 TRUE in UTF-8 mode
2161 cd contains pointers to tables etc.
2162 save_hwm the hwm forward reference pointer at the start of the group
2163
2164 Returns: nothing
2165 */
2166
2167 static void
2168 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2169 uschar *save_hwm)
2170 {
2171 uschar *ptr = group;
2172
2173 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2174 {
2175 int offset;
2176 uschar *hc;
2177
2178 /* See if this recursion is on the forward reference list. If so, adjust the
2179 reference. */
2180
2181 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2182 {
2183 offset = GET(hc, 0);
2184 if (cd->start_code + offset == ptr + 1)
2185 {
2186 PUT(hc, 0, offset + adjust);
2187 break;
2188 }
2189 }
2190
2191 /* Otherwise, adjust the recursion offset if it's after the start of this
2192 group. */
2193
2194 if (hc >= cd->hwm)
2195 {
2196 offset = GET(ptr, 1);
2197 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2198 }
2199
2200 ptr += 1 + LINK_SIZE;
2201 }
2202 }
2203
2204
2205
2206 /*************************************************
2207 * Insert an automatic callout point *
2208 *************************************************/
2209
2210 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2211 callout points before each pattern item.
2212
2213 Arguments:
2214 code current code pointer
2215 ptr current pattern pointer
2216 cd pointers to tables etc
2217
2218 Returns: new code pointer
2219 */
2220
2221 static uschar *
2222 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2223 {
2224 *code++ = OP_CALLOUT;
2225 *code++ = 255;
2226 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2227 PUT(code, LINK_SIZE, 0); /* Default length */
2228 return code + 2*LINK_SIZE;
2229 }
2230
2231
2232
2233 /*************************************************
2234 * Complete a callout item *
2235 *************************************************/
2236
2237 /* A callout item contains the length of the next item in the pattern, which
2238 we can't fill in till after we have reached the relevant point. This is used
2239 for both automatic and manual callouts.
2240
2241 Arguments:
2242 previous_callout points to previous callout item
2243 ptr current pattern pointer
2244 cd pointers to tables etc
2245
2246 Returns: nothing
2247 */
2248
2249 static void
2250 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2251 {
2252 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2253 PUT(previous_callout, 2 + LINK_SIZE, length);
2254 }
2255
2256
2257
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support),
this function scans upwards through the characters looking for internal
ranges of characters in the "other" case. Each call hands back the next such
range and moves the start value on, so the caller can call it repeatedly.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more (in which case
              none of the three output values is written)
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = 0;
unsigned int expected;

/* Find the first character whose other case differs from itself */

while (ch <= d)
  {
  first_other = UCD_OTHERCASE(ch);
  if (first_other != ch) break;
  ch++;
  }

if (ch > d) return FALSE;

/* Extend for as long as successive characters map to successive values */

*ocptr = first_other;
expected = first_other + 1;
ch++;

while (ch <= d && UCD_OTHERCASE(ch) == expected)
  {
  ch++;
  expected++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2303
2304
2305
2306 /*************************************************
2307 * Check if auto-possessifying is possible *
2308 *************************************************/
2309
2310 /* This function is called for unlimited repeats of certain items, to see
2311 whether the next thing could possibly match the repeated item. If not, it makes
2312 sense to automatically possessify the repeated item.
2313
2314 Arguments:
2315 op_code the repeated op code
2316 this data for this item, depends on the opcode
2317 utf8 TRUE in UTF-8 mode
2318 utf8_char used for utf8 character bytes, NULL if not relevant
2319 ptr next character in pattern
2320 options options bits
2321 cd contains pointers to tables etc.
2322
2323 Returns: TRUE if possessifying is wanted
2324 */
2325
2326 static BOOL
2327 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2328 const uschar *ptr, int options, compile_data *cd)
2329 {
2330 int next;
2331
2332 /* Skip whitespace and comments in extended mode */
2333
2334 if ((options & PCRE_EXTENDED) != 0)
2335 {
2336 for (;;)
2337 {
2338 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2339 if (*ptr == CHAR_NUMBER_SIGN)
2340 {
2341 while (*(++ptr) != 0)
2342 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2343 }
2344 else break;
2345 }
2346 }
2347
2348 /* If the next item is one that we can handle, get its value. A non-negative
2349 value is a character, a negative value is an escape value. */
2350
2351 if (*ptr == CHAR_BACKSLASH)
2352 {
2353 int temperrorcode = 0;
2354 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2355 if (temperrorcode != 0) return FALSE;
2356 ptr++; /* Point after the escape sequence */
2357 }
2358
2359 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2360 {
2361 #ifdef SUPPORT_UTF8
2362 if (utf8) { GETCHARINC(next, ptr); } else
2363 #endif
2364 next = *ptr++;
2365 }
2366
2367 else return FALSE;
2368
2369 /* Skip whitespace and comments in extended mode */
2370
2371 if ((options & PCRE_EXTENDED) != 0)
2372 {
2373 for (;;)
2374 {
2375 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2376 if (*ptr == CHAR_NUMBER_SIGN)
2377 {
2378 while (*(++ptr) != 0)
2379 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2380 }
2381 else break;
2382 }
2383 }
2384
2385 /* If the next thing is itself optional, we have to give up. */
2386
2387 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2388 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2389 return FALSE;
2390
2391 /* Now compare the next item with the previous opcode. If the previous is a
2392 positive single character match, "item" either contains the character or, if
2393 "item" is greater than 127 in utf8 mode, the character's bytes are in
2394 utf8_char. */
2395
2396
2397 /* Handle cases when the next item is a character. */
2398
2399 if (next >= 0) switch(op_code)
2400 {
2401 case OP_CHAR:
2402 #ifdef SUPPORT_UTF8
2403 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2404 #else
2405 (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2406 #endif
2407 return item != next;
2408
2409 /* For CHARNC (caseless character) we must check the other case. If we have
2410 Unicode property support, we can use it to test the other case of
2411 high-valued characters. */
2412
2413 case OP_CHARNC:
2414 #ifdef SUPPORT_UTF8
2415 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2416 #endif
2417 if (item == next) return FALSE;
2418 #ifdef SUPPORT_UTF8
2419 if (utf8)
2420 {
2421 unsigned int othercase;
2422 if (next < 128) othercase = cd->fcc[next]; else
2423 #ifdef SUPPORT_UCP
2424 othercase = UCD_OTHERCASE((unsigned int)next);
2425 #else
2426 othercase = NOTACHAR;
2427 #endif
2428 return (unsigned int)item != othercase;
2429 }
2430 else
2431 #endif /* SUPPORT_UTF8 */
2432 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2433
2434 /* For OP_NOT, "item" must be a single-byte character. */
2435
2436 case OP_NOT:
2437 if (item == next) return TRUE;
2438 if ((options & PCRE_CASELESS) == 0) return FALSE;
2439 #ifdef SUPPORT_UTF8
2440 if (utf8)
2441 {
2442 unsigned int othercase;
2443 if (next < 128) othercase = cd->fcc[next]; else
2444 #ifdef SUPPORT_UCP
2445 othercase = UCD_OTHERCASE(next);
2446 #else
2447 othercase = NOTACHAR;
2448 #endif
2449 return (unsigned int)item == othercase;
2450 }
2451 else
2452 #endif /* SUPPORT_UTF8 */
2453 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2454
2455 case OP_DIGIT:
2456 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2457
2458 case OP_NOT_DIGIT:
2459 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2460
2461 case OP_WHITESPACE:
2462 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2463
2464 case OP_NOT_WHITESPACE:
2465 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2466
2467 case OP_WORDCHAR:
2468 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2469
2470 case OP_NOT_WORDCHAR:
2471 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2472
2473 case OP_HSPACE:
2474 case OP_NOT_HSPACE:
2475 switch(next)
2476 {
2477 case 0x09:
2478 case 0x20:
2479 case 0xa0:
2480 case 0x1680:
2481 case 0x180e:
2482 case 0x2000:
2483 case 0x2001:
2484 case 0x2002:
2485 case 0x2003:
2486 case 0x2004:
2487 case 0x2005:
2488 case 0x2006:
2489 case 0x2007:
2490 case 0x2008:
2491 case 0x2009:
2492 case 0x200A:
2493 case 0x202f:
2494 case 0x205f:
2495 case 0x3000:
2496 return op_code != OP_HSPACE;
2497 default:
2498 return op_code == OP_HSPACE;
2499 }
2500
2501 case OP_VSPACE:
2502 case OP_NOT_VSPACE:
2503 switch(next)
2504 {
2505 case 0x0a:
2506 case 0x0b:
2507 case 0x0c:
2508 case 0x0d:
2509 case 0x85:
2510 case 0x2028:
2511 case 0x2029:
2512 return op_code != OP_VSPACE;
2513 default:
2514 return op_code == OP_VSPACE;
2515 }
2516
2517 default:
2518 return FALSE;
2519 }
2520
2521
2522 /* Handle the case when the next item is \d, \s, etc. */
2523
2524 switch(op_code)
2525 {
2526 case OP_CHAR:
2527 case OP_CHARNC:
2528 #ifdef SUPPORT_UTF8
2529 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2530 #endif
2531 switch(-next)
2532 {
2533 case ESC_d:
2534 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2535
2536 case ESC_D:
2537 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2538
2539 case ESC_s:
2540 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2541
2542 case ESC_S:
2543 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2544
2545 case ESC_w:
2546 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2547
2548 case ESC_W:
2549 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2550
2551 case ESC_h:
2552 case ESC_H:
2553 switch(item)
2554 {
2555 case 0x09:
2556 case 0x20:
2557 case 0xa0:
2558 case 0x1680:
2559 case 0x180e:
2560 case 0x2000:
2561 case 0x2001:
2562 case 0x2002:
2563 case 0x2003:
2564 case 0x2004:
2565 case 0x2005:
2566 case 0x2006:
2567 case 0x2007:
2568 case 0x2008:
2569 case 0x2009:
2570 case 0x200A:
2571 case 0x202f:
2572 case 0x205f:
2573 case 0x3000:
2574 return -next != ESC_h;
2575 default:
2576 return -next == ESC_h;
2577 }
2578
2579 case ESC_v:
2580 case ESC_V:
2581 switch(item)
2582 {
2583 case 0x0a:
2584 case 0x0b:
2585 case 0x0c:
2586 case 0x0d:
2587 case 0x85:
2588 case 0x2028:
2589 case 0x2029:
2590 return -next != ESC_v;
2591 default:
2592 return -next == ESC_v;
2593 }
2594
2595 default:
2596 return FALSE;
2597 }
2598
2599 case OP_DIGIT:
2600 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2601 next == -ESC_h || next == -ESC_v;
2602
2603 case OP_NOT_DIGIT:
2604 return next == -ESC_d;
2605
2606 case OP_WHITESPACE:
2607 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2608
2609 case OP_NOT_WHITESPACE:
2610 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2611
2612 case OP_HSPACE:
2613 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2614
2615 case OP_NOT_HSPACE:
2616 return next == -ESC_h;
2617
2618 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2619 case OP_VSPACE:
2620 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2621
2622 case OP_NOT_VSPACE:
2623 return next == -ESC_v;
2624
2625 case OP_WORDCHAR:
2626 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2627
2628 case OP_NOT_WORDCHAR:
2629 return next == -ESC_w || next == -ESC_d;
2630
2631 default:
2632 return FALSE;
2633 }
2634
2635 /* Control does not reach here */
2636 }
2637
2638
2639
2640 /*************************************************
2641 * Compile one branch *
2642 *************************************************/
2643
2644 /* Scan the pattern, compiling it into a vector. If the options are
2645 changed during the branch, the pointer is used to change the external options
2646 bits. This function is used during the pre-compile phase when we are trying
2647 to find out the amount of memory needed, as well as during the real compile
2648 phase. The value of lengthptr distinguishes the two phases.
2649
2650 Arguments:
2651 optionsptr pointer to the option bits
2652 codeptr points to the pointer to the current code point
2653 ptrptr points to the current pattern pointer
2654 errorcodeptr points to error code variable
2655 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2656 reqbyteptr set to the last literal character required, else < 0
2657 bcptr points to current branch chain
2658 cd contains pointers to tables etc.
2659 lengthptr NULL during the real compile phase
2660 points to length accumulator during pre-compile phase
2661
2662 Returns: TRUE on success
2663 FALSE, with *errorcodeptr set non-zero on error
2664 */
2665
2666 static BOOL
2667 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2668 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2669 compile_data *cd, int *lengthptr)
2670 {
2671 int repeat_type, op_type;
2672 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2673 int bravalue = 0;
2674 int greedy_default, greedy_non_default;
2675 int firstbyte, reqbyte;
2676 int zeroreqbyte, zerofirstbyte;
2677 int req_caseopt, reqvary, tempreqvary;
2678 int options = *optionsptr;
2679 int after_manual_callout = 0;
2680 int length_prevgroup = 0;
2681 register int c;
2682 register uschar *code = *codeptr;
2683 uschar *last_code = code;
2684 uschar *orig_code = code;
2685 uschar *tempcode;
2686 BOOL inescq = FALSE;
2687 BOOL groupsetfirstbyte = FALSE;
2688 const uschar *ptr = *ptrptr;
2689 const uschar *tempptr;
2690 uschar *previous = NULL;
2691 uschar *previous_callout = NULL;
2692 uschar *save_hwm = NULL;
2693 uschar classbits[32];
2694
2695 #ifdef SUPPORT_UTF8
2696 BOOL class_utf8;
2697 BOOL utf8 = (options & PCRE_UTF8) != 0;
2698 uschar *class_utf8data;
2699 uschar *class_utf8data_base;
2700 uschar utf8_char[6];
2701 #else
2702 BOOL utf8 = FALSE;
2703 uschar *utf8_char = NULL;
2704 #endif
2705
2706 #ifdef PCRE_DEBUG
2707 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2708 #endif
2709
2710 /* Set up the default and non-default settings for greediness */
2711
2712 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2713 greedy_non_default = greedy_default ^ 1;
2714
2715 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2716 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2717 matches a non-fixed char first char; reqbyte just remains unset if we never
2718 find one.
2719
2720 When we hit a repeat whose minimum is zero, we may have to adjust these values
2721 to take the zero repeat into account. This is implemented by setting them to
2722 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2723 item types that can be repeated set these backoff variables appropriately. */
2724
2725 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2726
2727 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2728 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2729 value > 255. It is added into the firstbyte or reqbyte variables to record the
2730 case status of the value. This is used only for ASCII characters. */
2731
2732 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2733
2734 /* Switch on next character until the end of the branch */
2735
2736 for (;; ptr++)
2737 {
2738 BOOL negate_class;
2739 BOOL should_flip_negation;
2740 BOOL possessive_quantifier;
2741 BOOL is_quantifier;
2742 BOOL is_recurse;
2743 BOOL reset_bracount;
2744 int class_charcount;
2745 int class_lastchar;
2746 int newoptions;
2747 int recno;
2748 int refsign;
2749 int skipbytes;
2750 int subreqbyte;
2751 int subfirstbyte;
2752 int terminator;
2753 int mclength;
2754 uschar mcbuffer[8];
2755
2756 /* Get next byte in the pattern */
2757
2758 c = *ptr;
2759
2760 /* If we are in the pre-compile phase, accumulate the length used for the
2761 previous cycle of this loop. */
2762
2763 if (lengthptr != NULL)
2764 {
2765 #ifdef PCRE_DEBUG
2766 if (code > cd->hwm) cd->hwm = code; /* High water info */
2767 #endif
2768 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
2769 {
2770 *errorcodeptr = ERR52;
2771 goto FAILED;
2772 }
2773
2774 /* There is at least one situation where code goes backwards: this is the
2775 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2776 the class is simply eliminated. However, it is created first, so we have to
2777 allow memory for it. Therefore, don't ever reduce the length at this point.
2778 */
2779
2780 if (code < last_code) code = last_code;
2781
2782 /* Paranoid check for integer overflow */
2783
2784 if (OFLOW_MAX - *lengthptr < code - last_code)
2785 {
2786 *errorcodeptr = ERR20;
2787 goto FAILED;
2788 }
2789
2790 *lengthptr += code - last_code;
2791 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2792
2793 /* If "previous" is set and it is not at the start of the work space, move
2794 it back to there, in order to avoid filling up the work space. Otherwise,
2795 if "previous" is NULL, reset the current code pointer to the start. */
2796
2797 if (previous != NULL)
2798 {
2799 if (previous > orig_code)
2800 {
2801 memmove(orig_code, previous, code - previous);
2802 code -= previous - orig_code;
2803 previous = orig_code;
2804 }
2805 }
2806 else code = orig_code;
2807
2808 /* Remember where this code item starts so we can pick up the length
2809 next time round. */
2810
2811 last_code = code;
2812 }
2813
2814 /* In the real compile phase, just check the workspace used by the forward
2815 reference list. */
2816
2817 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
2818 {
2819 *errorcodeptr = ERR52;
2820 goto FAILED;
2821 }
2822
2823 /* If in \Q...\E, check for the end; if not, we have a literal */
2824
2825 if (inescq && c != 0)
2826 {
2827 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2828 {
2829 inescq = FALSE;
2830 ptr++;
2831 continue;
2832 }
2833 else
2834 {
2835 if (previous_callout != NULL)
2836 {
2837 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2838 complete_callout(previous_callout, ptr, cd);
2839 previous_callout = NULL;
2840 }
2841 if ((options & PCRE_AUTO_CALLOUT) != 0)
2842 {
2843 previous_callout = code;
2844 code = auto_callout(code, ptr, cd);
2845 }
2846 goto NORMAL_CHAR;
2847 }
2848 }
2849
2850 /* Fill in length of a previous callout, except when the next thing is
2851 a quantifier. */
2852
2853 is_quantifier =
2854 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2855 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2856
2857 if (!is_quantifier && previous_callout != NULL &&
2858 after_manual_callout-- <= 0)
2859 {
2860 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2861 complete_callout(previous_callout, ptr, cd);
2862 previous_callout = NULL;
2863 }
2864
2865 /* In extended mode, skip white space and comments */
2866
2867 if ((options & PCRE_EXTENDED) != 0)
2868 {
2869 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2870 if (c == CHAR_NUMBER_SIGN)
2871 {
2872 while (*(++ptr) != 0)
2873 {
2874 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2875 }
2876 if (*ptr != 0) continue;
2877
2878 /* Else fall through to handle end of string */
2879 c = 0;
2880 }
2881 }
2882
2883 /* No auto callout for quantifiers. */
2884
2885 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2886 {
2887 previous_callout = code;
2888 code = auto_callout(code, ptr, cd);
2889 }
2890
2891 switch(c)
2892 {
2893 /* ===================================================================*/
2894 case 0: /* The branch terminates at string end */
2895 case CHAR_VERTICAL_LINE: /* or | or ) */
2896 case CHAR_RIGHT_PARENTHESIS:
2897 *firstbyteptr = firstbyte;
2898 *reqbyteptr = reqbyte;
2899 *codeptr = code;
2900 *ptrptr = ptr;
2901 if (lengthptr != NULL)
2902 {
2903 if (OFLOW_MAX - *lengthptr < code - last_code)
2904 {
2905 *errorcodeptr = ERR20;
2906 goto FAILED;
2907 }
2908 *lengthptr += code - last_code; /* To include callout length */
2909 DPRINTF((">> end branch\n"));
2910 }
2911 return TRUE;
2912
2913
2914 /* ===================================================================*/
2915 /* Handle single-character metacharacters. In multiline mode, ^ disables
2916 the setting of any following char as a first character. */
2917
2918 case CHAR_CIRCUMFLEX_ACCENT:
2919 if ((options & PCRE_MULTILINE) != 0)
2920 {
2921 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2922 }
2923 previous = NULL;
2924 *code++ = OP_CIRC;
2925 break;
2926
2927 case CHAR_DOLLAR_SIGN:
2928 previous = NULL;
2929 *code++ = OP_DOLL;
2930 break;
2931
2932 /* There can never be a first char if '.' is first, whatever happens about
2933 repeats. The value of reqbyte doesn't change either. */
2934
2935 case CHAR_DOT:
2936 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2937 zerofirstbyte = firstbyte;
2938 zeroreqbyte = reqbyte;
2939 previous = code;
2940 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2941 break;
2942
2943
2944 /* ===================================================================*/
2945 /* Character classes. If the included characters are all < 256, we build a
2946 32-byte bitmap of the permitted characters, except in the special case
2947 where there is only one such character. For negated classes, we build the
2948 map as usual, then invert it at the end. However, we use a different opcode
2949 so that data characters > 255 can be handled correctly.
2950
2951 If the class contains characters outside the 0-255 range, a different
2952 opcode is compiled. It may optionally have a bit map for characters < 256,
2953 but those above are are explicitly listed afterwards. A flag byte tells
2954 whether the bitmap is present, and whether this is a negated class or not.
2955
2956 In JavaScript compatibility mode, an isolated ']' causes an error. In
2957 default (Perl) mode, it is treated as a data character. */
2958
2959 case CHAR_RIGHT_SQUARE_BRACKET:
2960 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2961 {
2962 *errorcodeptr = ERR64;
2963 goto FAILED;
2964 }
2965 goto NORMAL_CHAR;
2966
2967 case CHAR_LEFT_SQUARE_BRACKET:
2968 previous = code;
2969
2970 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2971 they are encountered at the top level, so we'll do that too. */
2972
2973 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2974 ptr[1] == CHAR_EQUALS_SIGN) &&
2975 check_posix_syntax(ptr, &tempptr))
2976 {
2977 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2978 goto FAILED;
2979 }
2980
2981 /* If the first character is '^', set the negation flag and skip it. Also,
2982 if the first few characters (either before or after ^) are \Q\E or \E we
2983 skip them too. This makes for compatibility with Perl. */
2984
2985 negate_class = FALSE;
2986 for (;;)
2987 {
2988 c = *(++ptr);
2989 if (c == CHAR_BACKSLASH)
2990 {
2991 if (ptr[1] == CHAR_E)
2992 ptr++;
2993 else if (strncmp((const char *)ptr+1,
2994 STR_Q STR_BACKSLASH STR_E, 3) == 0)
2995 ptr += 3;
2996 else
2997 break;
2998 }
2999 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3000 negate_class = TRUE;
3001 else break;
3002 }
3003
3004 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3005 an initial ']' is taken as a data character -- the code below handles
3006 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3007 [^] must match any character, so generate OP_ALLANY. */
3008
3009 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3010 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3011 {
3012 *code++ = negate_class? OP_ALLANY : OP_FAIL;
3013 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3014 zerofirstbyte = firstbyte;
3015 break;
3016 }
3017
3018 /* If a class contains a negative special such as \S, we need to flip the
3019 negation flag at the end, so that support for characters > 255 works
3020 correctly (they are all included in the class). */
3021
3022 should_flip_negation = FALSE;
3023
3024 /* Keep a count of chars with values < 256 so that we can optimize the case
3025 of just a single character (as long as it's < 256). However, For higher
3026 valued UTF-8 characters, we don't yet do any optimization. */
3027
3028 class_charcount = 0;
3029 class_lastchar = -1;
3030
3031 /* Initialize the 32-char bit map to all zeros. We build the map in a
3032 temporary bit of memory, in case the class contains only 1 character (less
3033 than 256), because in that case the compiled code doesn't use the bit map.
3034 */
3035
3036 memset(classbits, 0, 32 * sizeof(uschar));
3037
3038 #ifdef SUPPORT_UTF8
3039 class_utf8 = FALSE; /* No chars >= 256 */
3040 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3041 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3042 #endif
3043
3044 /* Process characters until ] is reached. By writing this as a "do" it
3045 means that an initial ] is taken as a data character. At the start of the
3046 loop, c contains the first byte of the character. */
3047
3048 if (c != 0) do
3049 {
3050 const uschar *oldptr;
3051
3052 #ifdef SUPPORT_UTF8
3053 if (utf8 && c > 127)
3054 { /* Braces are required because the */
3055 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3056 }
3057
3058 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3059 data and reset the pointer. This is so that very large classes that
3060 contain a zillion UTF-8 characters no longer overwrite the work space
3061 (which is on the stack). */
3062
3063 if (lengthptr != NULL)
3064 {
3065 *lengthptr += class_utf8data - class_utf8data_base;
3066 class_utf8data = class_utf8data_base;
3067 }
3068
3069 #endif
3070
3071 /* Inside \Q...\E everything is literal except \E */
3072
3073 if (inescq)
3074 {
3075 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3076 {
3077 inescq = FALSE; /* Reset literal state */
3078 ptr++; /* Skip the 'E' */
3079 continue; /* Carry on with next */
3080 }
3081 goto CHECK_RANGE; /* Could be range if \E follows */
3082 }
3083
3084 /* Handle POSIX class names. Perl allows a negation extension of the
3085 form [:^name:]. A square bracket that doesn't match the syntax is
3086 treated as a literal. We also recognize the POSIX constructions
3087 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3088 5.6 and 5.8 do. */
3089
3090 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3091 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3092 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3093 {
3094 BOOL local_negate = FALSE;
3095 int posix_class, taboffset, tabopt;
3096 register const uschar *cbits = cd->cbits;
3097 uschar pbits[32];
3098
3099 if (ptr[1] != CHAR_COLON)
3100 {
3101 *errorcodeptr = ERR31;
3102 goto FAILED;
3103 }
3104
3105 ptr += 2;
3106 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3107 {
3108 local_negate = TRUE;
3109 should_flip_negation = TRUE; /* Note negative special */
3110 ptr++;
3111 }
3112
3113 posix_class = check_posix_name(ptr, tempptr - ptr);
3114 if (posix_class < 0)
3115 {
3116 *errorcodeptr = ERR30;
3117 goto FAILED;
3118 }
3119
3120 /* If matching is caseless, upper and lower are converted to
3121 alpha. This relies on the fact that the class table starts with
3122 alpha, lower, upper as the first 3 entries. */
3123
3124 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3125 posix_class = 0;
3126
3127 /* We build the bit map for the POSIX class in a chunk of local store
3128 because we may be adding and subtracting from it, and we don't want to
3129 subtract bits that may be in the main map already. At the end we or the
3130 result into the bit map that is being built. */
3131
3132 posix_class *= 3;
3133
3134 /* Copy in the first table (always present) */
3135
3136 memcpy(pbits, cbits + posix_class_maps[posix_class],
3137 32 * sizeof(uschar));
3138
3139 /* If there is a second table, add or remove it as required. */
3140
3141 taboffset = posix_class_maps[posix_class + 1];
3142 tabopt = posix_class_maps[posix_class + 2];
3143
3144 if (taboffset >= 0)
3145 {
3146 if (tabopt >= 0)
3147 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3148 else
3149 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3150 }
3151
3152 /* Now see if we need to remove any special characters. An option
3153 value of 1 removes vertical space and 2 removes underscore. */
3154
3155 if (tabopt < 0) tabopt = -tabopt;
3156 if (tabopt == 1) pbits[1] &= ~0x3c;
3157 else if (tabopt == 2) pbits[11] &= 0x7f;
3158
3159 /* Add the POSIX table or its complement into the main table that is
3160 being built and we are done. */
3161
3162 if (local_negate)
3163 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3164 else
3165 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3166
3167 ptr = tempptr + 1;
3168 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3169 continue; /* End of POSIX syntax handling */
3170 }
3171
3172 /* Backslash may introduce a single character, or it may introduce one
3173 of the specials, which just set a flag. The sequence \b is a special
3174 case. Inside a class (and only there) it is treated as backspace.
3175 Elsewhere it marks a word boundary. Other escapes have preset maps ready
3176 to 'or' into the one we are building. We assume they have more than one
3177 character in them, so set class_charcount bigger than one. */
3178
3179 if (c == CHAR_BACKSLASH)
3180 {
3181 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3182 if (*errorcodeptr != 0) goto FAILED;
3183
3184 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3185 else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3186 else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3187 else if (-c == ESC_Q) /* Handle start of quoted string */
3188 {
3189 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3190 {
3191 ptr += 2; /* avoid empty string */
3192 }
3193 else inescq = TRUE;
3194 continue;
3195 }
3196 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3197
3198 if (c < 0)
3199 {
3200 register const uschar *cbits = cd->cbits;
3201 class_charcount += 2; /* Greater than 1 is what matters */
3202
3203 /* Save time by not doing this in the pre-compile phase. */
3204
3205 if (lengthptr == NULL) switch (-c)
3206 {
3207 case ESC_d:
3208 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3209 continue;
3210
3211 case ESC_D:
3212 should_flip_negation = TRUE;
3213 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3214 continue;
3215
3216 case ESC_w:
3217 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3218 continue;
3219
3220 case ESC_W:
3221 should_flip_negation = TRUE;
3222 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3223 continue;
3224
3225 case ESC_s:
3226 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3227 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3228 continue;
3229
3230 case ESC_S:
3231 should_flip_negation = TRUE;
3232 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3233 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3234 continue;
3235
3236 default: /* Not recognized; fall through */
3237 break; /* Need "default" setting to stop compiler warning. */
3238 }
3239
3240 /* In the pre-compile phase, just do the recognition. */
3241
3242 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3243 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3244
3245 /* We need to deal with \H, \h, \V, and \v in both phases because
3246 they use extra memory. */
3247
3248 if (-c == ESC_h)
3249 {
3250 SETBIT(classbits, 0x09); /* HT */
3251 SETBIT(classbits, 0x20); /* SPACE */
3252 SETBIT(classbits, 0xa0); /* NBSP */
3253 #ifdef SUPPORT_UTF8
3254 if (utf8)
3255 {
3256 class_utf8 = TRUE;
3257 *class_utf8data++ = XCL_SINGLE;
3258 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3259 *class_utf8data++ = XCL_SINGLE;
3260 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3261 *class_utf8data++ = XCL_RANGE;
3262 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3263 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3264 *class_utf8data++ = XCL_SINGLE;
3265 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3266 *class_utf8data++ = XCL_SINGLE;
3267 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3268 *class_utf8data++ = XCL_SINGLE;
3269 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3270 }
3271 #endif
3272 continue;
3273 }
3274
3275 if (-c == ESC_H)
3276 {
3277 for (c = 0; c < 32; c++)
3278 {
3279 int x = 0xff;
3280 switch (c)
3281 {
3282 case 0x09/8: x ^= 1 << (0x09%8); break;
3283 case 0x20/8: x ^= 1 << (0x20%8); break;
3284 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3285 default: break;
3286 }
3287 classbits[c] |= x;
3288 }
3289
3290 #ifdef SUPPORT_UTF8
3291 if (utf8)
3292 {
3293 class_utf8 = TRUE;
3294 *class_utf8data++ = XCL_RANGE;
3295 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3296 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3297 *class_utf8data++ = XCL_RANGE;
3298 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3299 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3300 *class_utf8data++ = XCL_RANGE;
3301 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3302 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3303 *class_utf8data++ = XCL_RANGE;
3304 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3305 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3306 *class_utf8data++ = XCL_RANGE;
3307 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3308 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3309 *class_utf8data++ = XCL_RANGE;
3310 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3311 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3312 *class_utf8data++ = XCL_RANGE;
3313 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3314 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3315 }
3316 #endif
3317 continue;
3318 }
3319
3320 if (-c == ESC_v)
3321 {
3322 SETBIT(classbits, 0x0a); /* LF */
3323 SETBIT(classbits, 0x0b); /* VT */
3324 SETBIT(classbits, 0x0c); /* FF */
3325 SETBIT(classbits, 0x0d); /* CR */
3326 SETBIT(classbits, 0x85); /* NEL */
3327 #ifdef SUPPORT_UTF8
3328 if (utf8)
3329 {
3330 class_utf8 = TRUE;
3331 *class_utf8data++ = XCL_RANGE;
3332 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3333 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3334 }
3335 #endif
3336 continue;
3337 }
3338
3339 if (-c == ESC_V)
3340 {
3341 for (c = 0; c < 32; c++)
3342 {
3343 int x = 0xff;
3344 switch (c)
3345 {
3346 case 0x0a/8: x ^= 1 << (0x0a%8);
3347 x ^= 1 << (0x0b%8);
3348 x ^= 1 << (0x0c%8);
3349 x ^= 1 << (0x0d%8);
3350 break;
3351 case 0x85/8: x ^= 1 << (0x85%8); break;
3352 default: break;
3353 }
3354 classbits[c] |= x;
3355 }
3356
3357 #ifdef SUPPORT_UTF8
3358 if (utf8)
3359 {
3360 class_utf8 = TRUE;
3361 *class_utf8data++ = XCL_RANGE;
3362 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3363 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3364 *class_utf8data++ = XCL_RANGE;
3365 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3366 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3367 }
3368 #endif
3369 continue;
3370 }
3371
3372 /* We need to deal with \P and \p in both phases. */
3373
3374 #ifdef SUPPORT_UCP
3375 if (-c == ESC_p || -c == ESC_P)
3376 {
3377 BOOL negated;
3378 int pdata;
3379 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3380 if (ptype < 0) goto FAILED;
3381 class_utf8 = TRUE;
3382 *class_utf8data++ = ((-c == ESC_p) != negated)?
3383 XCL_PROP : XCL_NOTPROP;
3384 *class_utf8data++ = ptype;
3385 *class_utf8data++ = pdata;
3386 class_charcount -= 2; /* Not a < 256 character */
3387 continue;
3388 }
3389 #endif
3390 /* Unrecognized escapes are faulted if PCRE is running in its
3391 strict mode. By default, for compatibility with Perl, they are
3392 treated as literals. */
3393
3394 if ((options & PCRE_EXTRA) != 0)
3395 {
3396 *errorcodeptr = ERR7;
3397 goto FAILED;
3398 }
3399
3400 class_charcount -= 2; /* Undo the default count from above */
3401 c = *ptr; /* Get the final character and fall through */
3402 }
3403
3404 /* Fall through if we have a single character (c >= 0). This may be
3405 greater than 256 in UTF-8 mode. */
3406
3407 } /* End of backslash handling */
3408
3409 /* A single character may be followed by '-' to form a range. However,
3410 Perl does not permit ']' to be the end of the range. A '-' character
3411 at the end is treated as a literal. Perl ignores orphaned \E sequences
3412 entirely. The code for handling \Q and \E is messy. */
3413
3414 CHECK_RANGE:
3415 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3416 {
3417 inescq = FALSE;
3418 ptr += 2;
3419 }
3420
3421 oldptr = ptr;
3422
3423 /* Remember \r or \n */
3424
3425 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3426
3427 /* Check for range */
3428
3429 if (!inescq && ptr[1] == CHAR_MINUS)
3430 {
3431 int d;
3432 ptr += 2;
3433 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3434
3435 /* If we hit \Q (not followed by \E) at this point, go into escaped
3436 mode. */
3437
3438 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3439 {
3440 ptr += 2;
3441 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3442 { ptr += 2; continue; }
3443 inescq = TRUE;
3444 break;
3445 }
3446
3447 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3448 {
3449 ptr = oldptr;
3450 goto LONE_SINGLE_CHARACTER;
3451 }
3452
3453 #ifdef SUPPORT_UTF8
3454 if (utf8)
3455 { /* Braces are required because the */
3456 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3457 }
3458 else
3459 #endif
3460 d = *ptr; /* Not UTF-8 mode */
3461
3462 /* The second part of a range can be a single-character escape, but
3463 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3464 in such circumstances. */
3465
3466 if (!inescq && d == CHAR_BACKSLASH)
3467 {
3468 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3469 if (*errorcodeptr != 0) goto FAILED;
3470
3471 /* \b is backspace; \X is literal X; \R is literal R; any other
3472 special means the '-' was literal */
3473
3474 if (d < 0)
3475 {
3476 if (d == -ESC_b) d = CHAR_BS;
3477 else if (d == -ESC_X) d = CHAR_X;
3478 else if (d == -ESC_R) d = CHAR_R; else
3479 {
3480 ptr = oldptr;
3481 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3482 }
3483 }
3484 }
3485
3486 /* Check that the two values are in the correct order. Optimize
3487 one-character ranges */
3488
3489 if (d < c)
3490 {
3491 *errorcodeptr = ERR8;
3492 goto FAILED;
3493 }
3494
3495 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3496
3497 /* Remember \r or \n */
3498
3499 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3500
3501 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3502 matching, we have to use an XCLASS with extra data items. Caseless
3503 matching for characters > 127 is available only if UCP support is
3504 available. */
3505
3506 #ifdef SUPPORT_UTF8
3507 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3508 {
3509 class_utf8 = TRUE;
3510
3511 /* With UCP support, we can find the other case equivalents of
3512 the relevant characters. There may be several ranges. Optimize how
3513 they fit with the basic range. */
3514
3515 #ifdef SUPPORT_UCP
3516 if ((options & PCRE_CASELESS) != 0)
3517 {
3518 unsigned int occ, ocd;
3519 unsigned int cc = c;
3520 unsigned int origd = d;
3521 while (get_othercase_range(&cc, origd, &occ, &ocd))
3522 {
3523 if (occ >= (unsigned int)c &&
3524 ocd <= (unsigned int)d)
3525 continue; /* Skip embedded ranges */
3526
3527 if (occ < (unsigned int)c &&
3528 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3529 { /* if there is overlap, */
3530 c = occ; /* noting that if occ < c */
3531 continue; /* we can't have ocd > d */
3532 } /* because a subrange is */
3533 if (ocd > (unsigned int)d &&
3534 occ <= (unsigned int)d + 1) /* always shorter than */
3535 { /* the basic range. */
3536 d = ocd;
3537 continue;
3538 }
3539
3540 if (occ == ocd)
3541 {
3542 *class_utf8data++ = XCL_SINGLE;
3543 }
3544 else
3545 {
3546 *class_utf8data++ = XCL_RANGE;
3547 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3548 }
3549 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3550 }
3551 }
3552 #endif /* SUPPORT_UCP */
3553
3554 /* Now record the original range, possibly modified for UCP caseless
3555 overlapping ranges. */
3556
3557 *class_utf8data++ = XCL_RANGE;
3558 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3559 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3560
3561 /* With UCP support, we are done. Without UCP support, there is no
3562 caseless matching for UTF-8 characters > 127; we can use the bit map
3563 for the smaller ones. */
3564
3565 #ifdef SUPPORT_UCP
3566 continue; /* With next character in the class */
3567 #else
3568 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3569
3570 /* Adjust upper limit and fall through to set up the map */
3571
3572 d = 127;
3573
3574 #endif /* SUPPORT_UCP */
3575 }
3576 #endif /* SUPPORT_UTF8 */
3577
3578 /* We use the bit map for all cases when not in UTF-8 mode; else
3579 ranges that lie entirely within 0-127 when there is UCP support; else
3580 for partial ranges without UCP support. */
3581
3582 class_charcount += d - c + 1;
3583 class_lastchar = d;
3584
3585 /* We can save a bit of time by skipping this in the pre-compile. */
3586
3587 if (lengthptr == NULL) for (; c <= d; c++)
3588 {
3589 classbits[c/8] |= (1 << (c&7));
3590 if ((options & PCRE_CASELESS) != 0)
3591 {
3592 int uc = cd->fcc[c]; /* flip case */
3593 classbits[uc/8] |= (1 << (uc&7));
3594 }
3595 }
3596
3597 continue; /* Go get the next char in the class */
3598 }
3599
3600 /* Handle a lone single character - we can get here for a normal
3601 non-escape char, or after \ that introduces a single character or for an
3602 apparent range that isn't. */
3603
3604 LONE_SINGLE_CHARACTER:
3605
3606 /* Handle a character that cannot go in the bit map */
3607
3608 #ifdef SUPPORT_UTF8
3609 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3610 {
3611 class_utf8 = TRUE;
3612 *class_utf8data++ = XCL_SINGLE;
3613 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3614
3615 #ifdef SUPPORT_UCP
3616 if ((options & PCRE_CASELESS) != 0)
3617 {
3618 unsigned int othercase;
3619 if ((othercase = UCD_OTHERCASE(c)) != c)
3620 {
3621 *class_utf8data++ = XCL_SINGLE;
3622 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3623 }
3624 }
3625 #endif /* SUPPORT_UCP */
3626
3627 }
3628 else
3629 #endif /* SUPPORT_UTF8 */
3630
3631 /* Handle a single-byte character */
3632 {
3633 classbits[c/8] |= (1 << (c&7));
3634 if ((options & PCRE_CASELESS) != 0)
3635 {
3636 c = cd->fcc[c]; /* flip case */
3637 classbits[c/8] |= (1 << (c&7));
3638 }
3639 class_charcount++;
3640 class_lastchar = c;
3641 }
3642 }
3643
3644 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3645
3646 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3647
3648 if (c == 0) /* Missing terminating ']' */
3649 {
3650 *errorcodeptr = ERR6;
3651 goto FAILED;
3652 }
3653
3654
3655 /* This code has been disabled because it would mean that \s counts as
3656 an explicit \r or \n reference, and that's not really what is wanted. Now
3657 we set the flag only if there is a literal "\r" or "\n" in the class. */
3658
3659 #if 0
3660 /* Remember whether \r or \n are in this class */
3661
3662 if (negate_class)
3663 {
3664 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3665 }
3666 else
3667 {
3668 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3669 }
3670 #endif
3671
3672
3673 /* If class_charcount is 1, we saw precisely one character whose value is
3674 less than 256. As long as there were no characters >= 128 and there was no
3675 use of \p or \P, in other words, no use of any XCLASS features, we can
3676 optimize.
3677
3678 In UTF-8 mode, we can optimize the negative case only if there were no
3679 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3680 operate on single-bytes only. This is an historical hangover. Maybe one day
3681 we can tidy these opcodes to handle multi-byte characters.
3682
3683 The optimization throws away the bit map. We turn the item into a
3684 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3685 that OP_NOT does not support multibyte characters. In the positive case, it
3686 can cause firstbyte to be set. Otherwise, there can be no first char if
3687 this item is first, whatever repeat count may follow. In the case of
3688 reqbyte, save the previous value for reinstating. */
3689
3690 #ifdef SUPPORT_UTF8
3691 if (class_charcount == 1 && !class_utf8 &&
3692 (!utf8 || !negate_class || class_lastchar < 128))
3693 #else
3694 if (class_charcount == 1)
3695 #endif
3696 {
3697 zeroreqbyte = reqbyte;
3698
3699 /* The OP_NOT opcode works on one-byte characters only. */
3700
3701 if (negate_class)
3702 {
3703 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3704 zerofirstbyte = firstbyte;
3705 *code++ = OP_NOT;
3706 *code++ = class_lastchar;
3707 break;
3708 }
3709
3710 /* For a single, positive character, get the value into mcbuffer, and
3711 then we can handle this with the normal one-character code. */
3712
3713 #ifdef SUPPORT_UTF8
3714 if (utf8 && class_lastchar > 127)
3715 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3716 else
3717 #endif
3718 {
3719 mcbuffer[0] = class_lastchar;
3720 mclength = 1;
3721 }
3722 goto ONE_CHAR;
3723 } /* End of 1-char optimization */
3724
3725 /* The general case - not the one-char optimization. If this is the first
3726 thing in the branch, there can be no first char setting, whatever the
3727 repeat count. Any reqbyte setting must remain unchanged after any kind of
3728 repeat. */
3729
3730 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3731 zerofirstbyte = firstbyte;
3732 zeroreqbyte = reqbyte;
3733
3734 /* If there are characters with values > 255, we have to compile an
3735 extended class, with its own opcode, unless there was a negated special
3736 such as \S in the class, because in that case all characters > 255 are in
3737 the class, so any that were explicitly given as well can be ignored. If
3738 (when there are explicit characters > 255 that must be listed) there are no
3739 characters < 256, we can omit the bitmap in the actual compiled code. */
3740
3741 #ifdef SUPPORT_UTF8
3742 if (class_utf8 && !should_flip_negation)
3743 {
3744 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3745 *code++ = OP_XCLASS;
3746 code += LINK_SIZE;
3747 *code = negate_class? XCL_NOT : 0;
3748
3749 /* If the map is required, move up the extra data to make room for it;
3750 otherwise just move the code pointer to the end of the extra data. */
3751
3752 if (class_charcount > 0)
3753 {
3754 *code++ |= XCL_MAP;
3755 memmove(code + 32, code, class_utf8data - code);
3756 memcpy(code, classbits, 32);
3757 code = class_utf8data + 32;
3758 }
3759 else code = class_utf8data;
3760
3761 /* Now fill in the complete length of the item */
3762
3763 PUT(previous, 1, code - previous);
3764 break; /* End of class handling */
3765 }
3766 #endif
3767
3768 /* If there are no characters > 255, set the opcode to OP_CLASS or
3769 OP_NCLASS, depending on whether the whole class was negated and whether
3770 there were negative specials such as \S in the class. Then copy the 32-byte
3771 map into the code vector, negating it if necessary. */
3772
3773 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3774 if (negate_class)
3775 {
3776 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3777 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3778 }
3779 else
3780 {
3781 memcpy(code, classbits, 32);
3782 }
3783 code += 32;
3784 break;
3785
3786
3787 /* ===================================================================*/
3788 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3789 has been tested above. */
3790
3791 case CHAR_LEFT_CURLY_BRACKET:
3792 if (!is_quantifier) goto NORMAL_CHAR;
3793 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3794 if (*errorcodeptr != 0) goto FAILED;
3795 goto REPEAT;
3796
3797 case CHAR_ASTERISK:
3798 repeat_min = 0;
3799 repeat_max = -1;
3800 goto REPEAT;
3801
3802 case CHAR_PLUS:
3803 repeat_min = 1;
3804 repeat_max = -1;
3805 goto REPEAT;
3806
3807 case CHAR_QUESTION_MARK:
3808 repeat_min = 0;
3809 repeat_max = 1;
3810
3811 REPEAT:
3812 if (previous == NULL)
3813 {
3814 *errorcodeptr = ERR9;
3815 goto FAILED;
3816 }
3817
3818 if (repeat_min == 0)
3819 {
3820 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3821 reqbyte = zeroreqbyte; /* Ditto */
3822 }
3823
3824 /* Remember whether this is a variable length repeat */
3825
3826 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3827
3828 op_type = 0; /* Default single-char op codes */
3829 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3830
3831 /* Save start of previous item, in case we have to move it up to make space
3832 for an inserted OP_ONCE for the additional '+' extension. */
3833
3834 tempcode = previous;
3835
3836 /* If the next character is '+', we have a possessive quantifier. This
3837 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3838 If the next character is '?' this is a minimizing repeat, by default,
3839 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3840 repeat type to the non-default. */
3841
3842 if (ptr[1] == CHAR_PLUS)
3843 {
3844 repeat_type = 0; /* Force greedy */
3845 possessive_quantifier = TRUE;
3846 ptr++;
3847 }
3848 else if (ptr[1] == CHAR_QUESTION_MARK)
3849 {
3850 repeat_type = greedy_non_default;
3851 ptr++;
3852 }
3853 else repeat_type = greedy_default;
3854
3855 /* If previous was a character match, abolish the item and generate a
3856 repeat item instead. If a char item has a minimum of more than one, ensure
3857 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3858 the first thing in a branch because the x will have gone into firstbyte
3859 instead. */
3860
3861 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3862 {
3863 /* Deal with UTF-8 characters that take up more than one byte. It's
3864 easier to write this out separately than try to macrify it. Use c to
3865 hold the length of the character in bytes, plus 0x80 to flag that it's a
3866 length rather than a small character. */
3867
3868 #ifdef SUPPORT_UTF8
3869 if (utf8 && (code[-1] & 0x80) != 0)
3870 {
3871 uschar *lastchar = code - 1;
3872 while((*lastchar & 0xc0) == 0x80) lastchar--;
3873 c = code - lastchar; /* Length of UTF-8 character */
3874 memcpy(utf8_char, lastchar, c); /* Save the char */
3875 c |= 0x80; /* Flag c as a length */
3876 }
3877 else
3878 #endif
3879
3880 /* Handle the case of a single byte - either with no UTF8 support, or
3881 with UTF-8 disabled, or for a UTF-8 character < 128. */
3882
3883 {
3884 c = code[-1];
3885 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3886 }
3887
3888 /* If the repetition is unlimited, it pays to see if the next thing on
3889 the line is something that cannot possibly match this character. If so,
3890 automatically possessifying this item gains some performance in the case
3891 where the match fails. */
3892
3893 if (!possessive_quantifier &&
3894 repeat_max < 0 &&
3895 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3896 options, cd))
3897 {
3898 repeat_type = 0; /* Force greedy */
3899 possessive_quantifier = TRUE;
3900 }
3901
3902 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3903 }
3904
3905 /* If previous was a single negated character ([^a] or similar), we use
3906 one of the special opcodes, replacing it. The code is shared with single-
3907 character repeats by setting op_type to add a suitable offset into
3908 repeat_type. We can also test for auto-possessification. OP_NOT is
3909 currently used only for single-byte chars. */
3910
3911 else if (*previous == OP_NOT)
3912 {
3913 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3914 c = previous[1];
3915 if (!possessive_quantifier &&
3916 repeat_max < 0 &&
3917 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3918 {
3919 repeat_type = 0; /* Force greedy */
3920 possessive_quantifier = TRUE;
3921 }
3922 goto OUTPUT_SINGLE_REPEAT;
3923 }
3924
3925 /* If previous was a character type match (\d or similar), abolish it and
3926 create a suitable repeat item. The code is shared with single-character
3927 repeats by setting op_type to add a suitable offset into repeat_type. Note
3928 that the Unicode property types will be present only when SUPPORT_UCP is
3929 defined, but we don't wrap the little bits of code here because it just
3930 makes it horribly messy. */
3931
3932 else if (*previous < OP_EODN)
3933 {
3934 uschar *oldcode;
3935 int prop_type, prop_value;
3936 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3937 c = *previous;
3938
3939 if (!possessive_quantifier &&
3940 repeat_max < 0 &&
3941 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3942 {
3943 repeat_type = 0; /* Force greedy */
3944 possessive_quantifier = TRUE;
3945 }
3946
3947 OUTPUT_SINGLE_REPEAT:
3948 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3949 {
3950 prop_type = previous[1];
3951 prop_value = previous[2];
3952 }
3953 else prop_type = prop_value = -1;
3954
3955 oldcode = code;
3956 code = previous; /* Usually overwrite previous item */
3957
3958 /* If the maximum is zero then the minimum must also be zero; Perl allows
3959 this case, so we do too - by simply omitting the item altogether. */
3960
3961 if (repeat_max == 0) goto END_REPEAT;
3962
3963 /*--------------------------------------------------------------------*/
3964 /* This code is obsolete from release 8.00; the restriction was finally
3965 removed: */
3966
3967 /* All real repeats make it impossible to handle partial matching (maybe
3968 one day we will be able to remove this restriction). */
3969
3970 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3971 /*--------------------------------------------------------------------*/
3972
3973 /* Combine the op_type with the repeat_type */
3974
3975 repeat_type += op_type;
3976
3977 /* A minimum of zero is handled either as the special case * or ?, or as
3978 an UPTO, with the maximum given. */
3979
3980 if (repeat_min == 0)
3981 {
3982 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3983 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3984 else
3985 {
3986 *code++ = OP_UPTO + repeat_type;
3987 PUT2INC(code, 0, repeat_max);
3988 }
3989 }
3990
3991 /* A repeat minimum of 1 is optimized into some special cases. If the
3992 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3993 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3994 one less than the maximum. */
3995
3996 else if (repeat_min == 1)
3997 {
3998 if (repeat_max == -1)
3999 *code++ = OP_PLUS + repeat_type;
4000 else
4001 {
4002 code = oldcode; /* leave previous item in place */
4003 if (repeat_max == 1) goto END_REPEAT;
4004 *code++ = OP_UPTO + repeat_type;
4005 PUT2INC(code, 0, repeat_max - 1);
4006 }
4007 }
4008
4009 /* The case {n,n} is just an EXACT, while the general case {n,m} is
4010 handled as an EXACT followed by an UPTO. */
4011
4012 else
4013 {
4014 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
4015 PUT2INC(code, 0, repeat_min);
4016
4017 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4018 we have to insert the character for the previous code. For a repeated
4019 Unicode property match, there are two extra bytes that define the
4020 required property. In UTF-8 mode, long characters have their length in
4021 c, with the 0x80 bit as a flag. */
4022
4023 if (repeat_max < 0)
4024 {
4025 #ifdef SUPPORT_UTF8
4026 if (utf8 && c >= 128)
4027 {
4028 memcpy(code, utf8_char, c & 7);
4029 code += c & 7;
4030 }
4031 else
4032 #endif
4033 {
4034 *code++ = c;
4035 if (prop_type >= 0)
4036 {
4037 *code++ = prop_type;
4038 *code++ = prop_value;
4039 }
4040 }
4041 *code++ = OP_STAR + repeat_type;
4042 }
4043
4044 /* Else insert an UPTO if the max is greater than the min, again
4045 preceded by the character, for the previously inserted code. If the
4046 UPTO is just for 1 instance, we can use QUERY instead. */
4047
4048 else if (repeat_max != repeat_min)
4049 {
4050 #ifdef SUPPORT_UTF8
4051 if (utf8 && c >= 128)
4052 {
4053 memcpy(code, utf8_char, c & 7);
4054 code += c & 7;
4055 }
4056 else
4057 #endif
4058 *code++ = c;
4059 if (prop_type >= 0)
4060 {
4061 *code++ = prop_type;
4062 *code++ = prop_value;
4063 }
4064 repeat_max -= repeat_min;
4065
4066 if (repeat_max == 1)
4067 {
4068 *code++ = OP_QUERY + repeat_type;
4069 }
4070 else
4071 {
4072 *code++ = OP_UPTO + repeat_type;
4073 PUT2INC(code, 0, repeat_max);
4074 }
4075 }
4076 }
4077
4078 /* The character or character type itself comes last in all cases. */
4079
4080 #ifdef SUPPORT_UTF8
4081 if (utf8 && c >= 128)
4082 {
4083 memcpy(code, utf8_char, c & 7);
4084 code += c & 7;
4085 }
4086 else
4087 #endif
4088 *code++ = c;
4089
4090 /* For a repeated Unicode property match, there are two extra bytes that
4091 define the required property. */
4092
4093 #ifdef SUPPORT_UCP
4094 if (prop_type >= 0)
4095 {
4096 *code++ = prop_type;
4097 *code++ = prop_value;
4098 }
4099 #endif
4100 }
4101
4102 /* If previous was a character class or a back reference, we put the repeat
4103 stuff after it, but just skip the item if the repeat was {0,0}. */
4104
4105 else if (*previous == OP_CLASS ||
4106 *previous == OP_NCLASS ||
4107 #ifdef SUPPORT_UTF8
4108 *previous == OP_XCLASS ||
4109 #endif
4110 *previous == OP_REF)
4111 {
4112 if (repeat_max == 0)
4113 {
4114 code = previous;
4115 goto END_REPEAT;
4116 }
4117
4118 /*--------------------------------------------------------------------*/
4119 /* This code is obsolete from release 8.00; the restriction was finally
4120 removed: */
4121
4122 /* All real repeats make it impossible to handle partial matching (maybe
4123 one day we will be able to remove this restriction). */
4124
4125 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4126 /*--------------------------------------------------------------------*/
4127
4128 if (repeat_min == 0 && repeat_max == -1)
4129 *code++ = OP_CRSTAR + repeat_type;
4130 else if (repeat_min == 1 && repeat_max == -1)
4131 *code++ = OP_CRPLUS + repeat_type;
4132 else if (repeat_min == 0 && repeat_max == 1)
4133 *code++ = OP_CRQUERY + repeat_type;
4134 else
4135 {
4136 *code++ = OP_CRRANGE + repeat_type;
4137 PUT2INC(code, 0, repeat_min);
4138 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4139 PUT2INC(code, 0, repeat_max);
4140 }
4141 }
4142
4143 /* If previous was a bracket group, we may have to replicate it in certain
4144 cases. */
4145
4146 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4147 *previous == OP_ONCE || *previous == OP_COND)
4148 {
4149 register int i;
4150 int ketoffset = 0;
4151 int len = code - previous;
4152 uschar *bralink = NULL;
4153
4154 /* Repeating a DEFINE group is pointless */
4155
4156 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4157 {
4158 *errorcodeptr = ERR55;
4159 goto FAILED;
4160 }
4161
4162 /* If the maximum repeat count is unlimited, find the end of the bracket
4163 by scanning through from the start, and compute the offset back to it
4164 from the current code pointer. There may be an OP_OPT setting following
4165 the final KET, so we can't find the end just by going back from the code
4166 pointer. */
4167
4168 if (repeat_max == -1)
4169 {
4170 register uschar *ket = previous;
4171 do ket += GET(ket, 1); while (*ket != OP_KET);
4172 ketoffset = code - ket;
4173 }
4174
4175 /* The case of a zero minimum is special because of the need to stick
4176 OP_BRAZERO in front of it, and because the group appears once in the
4177 data, whereas in other cases it appears the minimum number of times. For
4178 this reason, it is simplest to treat this case separately, as otherwise
4179 the code gets far too messy. There are several special subcases when the
4180 minimum is zero. */
4181
4182 if (repeat_min == 0)
4183 {
4184 /* If the maximum is also zero, we used to just omit the group from the
4185 output altogether, like this:
4186
4187 ** if (repeat_max == 0)
4188 ** {
4189 ** code = previous;
4190 ** goto END_REPEAT;
4191 ** }
4192
4193 However, that fails when a group is referenced as a subroutine from
4194 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4195 so that it is skipped on execution. As we don't have a list of which
4196 groups are referenced, we cannot do this selectively.
4197
4198 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4199 and do no more at this point. However, we do need to adjust any
4200 OP_RECURSE calls inside the group that refer to the group itself or any
4201 internal or forward referenced group, because the offset is from the
4202 start of the whole regex. Temporarily terminate the pattern while doing
4203 this. */
4204
4205 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4206 {
4207 *code = OP_END;
4208 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4209 memmove(previous+1, previous, len);
4210 code++;
4211 if (repeat_max == 0)
4212 {
4213 *previous++ = OP_SKIPZERO;
4214 goto END_REPEAT;
4215 }
4216 *previous++ = OP_BRAZERO + repeat_type;
4217 }
4218
4219 /* If the maximum is greater than 1 and limited, we have to replicate
4220 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4221 The first one has to be handled carefully because it's the original
4222 copy, which has to be moved up. The remainder can be handled by code
4223 that is common with the non-zero minimum case below. We have to
4224 adjust the value of repeat_max, since one less copy is required. Once
4225 again, we may have to adjust any OP_RECURSE calls inside the group. */
4226
4227 else
4228 {
4229 int offset;
4230 *code = OP_END;
4231 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4232 memmove(previous + 2 + LINK_SIZE, previous, len);
4233 code += 2 + LINK_SIZE;
4234 *previous++ = OP_BRAZERO + repeat_type;
4235 *previous++ = OP_BRA;
4236
4237 /* We chain together the bracket offset fields that have to be
4238 filled in later when the ends of the brackets are reached. */
4239
4240 offset = (bralink == NULL)? 0 : previous - bralink;
4241 bralink = previous;
4242 PUTINC(previous, 0, offset);
4243 }
4244
4245 repeat_max--;
4246 }
4247
4248 /* If the minimum is greater than zero, replicate the group as many
4249 times as necessary, and adjust the maximum to the number of subsequent
4250 copies that we need. If we set a first char from the group, and didn't
4251 set a required char, copy the latter from the former. If there are any
4252 forward reference subroutine calls in the group, there will be entries on
4253 the workspace list; replicate these with an appropriate increment. */
4254
4255 else
4256 {
4257 if (repeat_min > 1)
4258 {
4259 /* In the pre-compile phase, we don't actually do the replication. We
4260 just adjust the length as if we had. Do some paranoid checks for
4261 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4262 integer type when available, otherwise double. */
4263
4264 if (lengthptr != NULL)
4265 {
4266 int delta = (repeat_min - 1)*length_prevgroup;
4267 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4268 (INT64_OR_DOUBLE)length_prevgroup >
4269 (INT64_OR_DOUBLE)INT_MAX ||
4270 OFLOW_MAX - *lengthptr < delta)
4271 {
4272 *errorcodeptr = ERR20;
4273 goto FAILED;
4274 }
4275 *lengthptr += delta;
4276 }
4277
4278 /* This is compiling for real */
4279
4280 else
4281 {
4282 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4283 for (i = 1; i < repeat_min; i++)
4284 {
4285 uschar *hc;
4286 uschar *this_hwm = cd->hwm;
4287 memcpy(code, previous, len);
4288 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4289 {
4290 PUT(cd->hwm, 0, GET(hc, 0) + len);
4291 cd->hwm += LINK_SIZE;
4292 }
4293 save_hwm = this_hwm;
4294 code += len;
4295 }
4296 }
4297 }
4298
4299 if (repeat_max > 0) repeat_max -= repeat_min;
4300 }
4301
4302 /* This code is common to both the zero and non-zero minimum cases. If
4303 the maximum is limited, it replicates the group in a nested fashion,
4304 remembering the bracket starts on a stack. In the case of a zero minimum,
4305 the first one was set up above. In all cases the repeat_max now specifies
4306 the number of additional copies needed. Again, we must remember to
4307 replicate entries on the forward reference list. */
4308
4309 if (repeat_max >= 0)
4310 {
4311 /* In the pre-compile phase, we don't actually do the replication. We
4312 just adjust the length as if we had. For each repetition we must add 1
4313 to the length for BRAZERO and for all but the last repetition we must
4314 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4315 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4316 a 64-bit integer type when available, otherwise double. */
4317
4318 if (lengthptr != NULL && repeat_max > 0)
4319 {
4320 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4321 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4322 if ((INT64_OR_DOUBLE)repeat_max *
4323 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4324 > (INT64_OR_DOUBLE)INT_MAX ||
4325 OFLOW_MAX - *lengthptr < delta)
4326 {
4327 *errorcodeptr = ERR20;
4328 goto FAILED;
4329 }
4330 *lengthptr += delta;
4331 }
4332
4333 /* This is compiling for real */
4334
4335 else for (i = repeat_max - 1; i >= 0; i--)
4336 {
4337 uschar *hc;
4338 uschar *this_hwm = cd->hwm;
4339
4340 *code++ = OP_BRAZERO + repeat_type;
4341
4342 /* All but the final copy start a new nesting, maintaining the
4343 chain of brackets outstanding. */
4344
4345 if (i != 0)
4346 {
4347 int offset;
4348 *code++ = OP_BRA;
4349 offset = (bralink == NULL)? 0 : code - bralink;
4350 bralink = code;
4351 PUTINC(code, 0, offset);
4352 }
4353
4354 memcpy(code, previous, len);
4355 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4356 {
4357 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4358 cd->hwm += LINK_SIZE;
4359 }
4360 save_hwm = this_hwm;
4361 code += len;
4362 }
4363
4364 /* Now chain through the pending brackets, and fill in their length
4365 fields (which are holding the chain links pro tem). */
4366
4367 while (bralink != NULL)
4368 {
4369 int oldlinkoffset;
4370 int offset = code - bralink + 1;
4371 uschar *bra = code - offset;
4372 oldlinkoffset = GET(bra, 1);
4373 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4374 *code++ = OP_KET;
4375 PUTINC(code, 0, offset);
4376 PUT(bra, 1, offset);
4377 }
4378 }
4379
4380 /* If the maximum is unlimited, set a repeater in the final copy. We
4381 can't just offset backwards from the current code point, because we
4382 don't know if there's been an options resetting after the ket. The
4383 correct offset was computed above.
4384
4385 Then, when we are doing the actual compile phase, check to see whether
4386 this group is a non-atomic one that could match an empty string. If so,
4387 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4388 that runtime checking can be done. [This check is also applied to
4389 atomic groups at runtime, but in a different way.] */
4390
4391 else
4392 {
4393 uschar *ketcode = code - ketoffset;
4394 uschar *bracode = ketcode - GET(ketcode, 1);
4395 *ketcode = OP_KETRMAX + repeat_type;
4396 if (lengthptr == NULL && *bracode != OP_ONCE)
4397 {
4398 uschar *scode = bracode;
4399 do
4400 {
4401 if (could_be_empty_branch(scode, ketcode, utf8, cd))
4402 {
4403 *bracode += OP_SBRA - OP_BRA;
4404 break;
4405 }
4406 scode += GET(scode, 1);
4407 }
4408 while (*scode == OP_ALT);
4409 }
4410 }
4411 }
4412
4413 /* If previous is OP_FAIL, it was generated by an empty class [] in
4414 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4415 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4416 error above. We can just ignore the repeat in JS case. */
4417
4418 else if (*previous == OP_FAIL) goto END_REPEAT;
4419
4420 /* Else there's some kind of shambles */
4421
4422 else
4423 {
4424 *errorcodeptr = ERR11;
4425 goto FAILED;
4426 }
4427
4428 /* If the character following a repeat is '+', or if certain optimization
4429 tests above succeeded, possessive_quantifier is TRUE. For some of the
4430 simpler opcodes, there is a special alternative opcode for this. For
4431 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4432 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4433 but the special opcodes can optimize it a bit. The repeated item starts at
4434 tempcode, not at previous, which might be the first part of a string whose
4435 (former) last char we repeated.
4436
4437 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4438 an 'upto' may follow. We skip over an 'exact' item, and then test the
4439 length of what remains before proceeding. */
4440
4441 if (possessive_quantifier)
4442 {
4443 int len;
4444
4445 if (*tempcode == OP_TYPEEXACT)
4446 tempcode += _pcre_OP_lengths[*tempcode] +
4447 ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4448
4449 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4450 {
4451 tempcode += _pcre_OP_lengths[*tempcode];
4452 #ifdef SUPPORT_UTF8
4453 if (utf8 && tempcode[-1] >= 0xc0)
4454 tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4455 #endif
4456 }
4457
4458 len = code - tempcode;
4459 if (len > 0) switch (*tempcode)
4460 {
4461 case OP_STAR: *tempcode = OP_POSSTAR; break;
4462 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4463 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4464 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4465
4466 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4467 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4468 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4469 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4470
4471 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4472 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4473 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4474 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4475
4476 /* Because we are moving code along, we must ensure that any
4477 pending recursive references are updated. */
4478
4479 default:
4480 *code = OP_END;
4481 adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4482 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4483 code += 1 + LINK_SIZE;
4484 len += 1 + LINK_SIZE;
4485 tempcode[0] = OP_ONCE;
4486 *code++ = OP_KET;
4487 PUTINC(code, 0, len);
4488 PUT(tempcode, 1, len);
4489 break;
4490 }
4491 }
4492
4493 /* In all cases we no longer have a previous item. We also set the
4494 "follows varying string" flag for subsequently encountered reqbytes if
4495 it isn't already set and we have just passed a varying length item. */
4496
4497 END_REPEAT:
4498 previous = NULL;
4499 cd->req_varyopt |= reqvary;
4500 break;
4501
4502
4503 /* ===================================================================*/
4504 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4505 lookbehind or option setting or condition or all the other extended
4506 parenthesis forms. */
4507
4508 case CHAR_LEFT_PARENTHESIS:
4509 newoptions = options;
4510 skipbytes = 0;
4511 bravalue = OP_CBRA;
4512 save_hwm = cd->hwm;
4513 reset_bracount = FALSE;
4514
4515 /* First deal with various "verbs" that can be introduced by '*'. */
4516
4517 if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4518 {
4519 int i, namelen;
4520 const char *vn = verbnames;
4521 const uschar *name = ++ptr;
4522 previous = NULL;
4523 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4524 if (*ptr == CHAR_COLON)
4525 {
4526 *errorcodeptr = ERR59; /* Not supported */
4527 goto FAILED;
4528 }
4529 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4530 {
4531 *errorcodeptr = ERR60;
4532 goto FAILED;
4533 }
4534 namelen = ptr - name;
4535 for (i = 0; i < verbcount; i++)
4536 {
4537 if (namelen == verbs[i].len &&
4538 strncmp((char *)name, vn, namelen) == 0)
4539 {
4540 /* Check for open captures before ACCEPT */
4541
4542 if (verbs[i].op == OP_ACCEPT)
4543 {
4544 open_capitem *oc;
4545 cd->had_accept = TRUE;
4546 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4547 {
4548 *code++ = OP_CLOSE;
4549 PUT2INC(code, 0, oc->number);
4550 }
4551 }
4552 *code++ = verbs[i].op;
4553 break;
4554 }
4555 vn += verbs[i].len + 1;
4556 }
4557 if (i < verbcount) continue;
4558 *errorcodeptr = ERR60;
4559 goto FAILED;
4560 }
4561
4562 /* Deal with the extended parentheses; all are introduced by '?', and the
4563 appearance of any of them means that this is not a capturing group. */
4564
4565 else if (*ptr == CHAR_QUESTION_MARK)
4566 {
4567 int i, set, unset, namelen;
4568 int *optset;
4569 const uschar *name;
4570 uschar *slot;
4571
4572 switch (*(++ptr))
4573 {
4574 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4575 ptr++;
4576 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4577 if (*ptr == 0)
4578 {
4579 *errorcodeptr = ERR18;
4580 goto FAILED;
4581 }
4582 continue;
4583
4584
4585 /* ------------------------------------------------------------ */
4586 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4587 reset_bracount = TRUE;
4588 /* Fall through */
4589
4590 /* ------------------------------------------------------------ */
4591 case CHAR_COLON: /* Non-capturing bracket */
4592 bravalue = OP_BRA;
4593 ptr++;
4594 break;
4595
4596
4597 /* ------------------------------------------------------------ */
4598 case CHAR_LEFT_PARENTHESIS:
4599 bravalue = OP_COND; /* Conditional group */
4600
4601 /* A condition can be an assertion, a number (referring to a numbered
4602 group), a name (referring to a named group), or 'R', referring to
4603 recursion. R<digits> and R&name are also permitted for recursion tests.
4604
4605 There are several syntaxes for testing a named group: (?(name)) is used
4606 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4607
4608 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4609 be the recursive thing or the name 'R' (and similarly for 'R' followed
4610 by digits), and (b) a number could be a name that consists of digits.
4611 In both cases, we look for a name first; if not found, we try the other
4612 cases. */
4613
4614 /* For conditions that are assertions, check the syntax, and then exit
4615 the switch. This will take control down to where bracketed groups,
4616 including assertions, are processed. */
4617
4618 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4619 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4620 break;
4621
4622 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4623 below), and all need to skip 3 bytes at the start of the group. */
4624
4625 code[1+LINK_SIZE] = OP_CREF;
4626 skipbytes = 3;
4627 refsign = -1;
4628
4629 /* Check for a test for recursion in a named group. */
4630
4631 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4632 {
4633 terminator = -1;
4634 ptr += 2;
4635 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4636 }
4637
4638 /* Check for a test for a named group's having been set, using the Perl
4639 syntax (?(<name>) or (?('name') */
4640
4641 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4642 {
4643 terminator = CHAR_GREATER_THAN_SIGN;
4644 ptr++;
4645 }
4646 else if (ptr[1] == CHAR_APOSTROPHE)
4647 {
4648 terminator = CHAR_APOSTROPHE;
4649 ptr++;
4650 }
4651 else
4652 {
4653 terminator = 0;
4654 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4655 }
4656
4657 /* We now expect to read a name; anything else is an error */
4658
4659 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4660 {
4661 ptr += 1; /* To get the right offset */
4662 *errorcodeptr = ERR28;
4663 goto FAILED;
4664 }
4665
4666 /* Read the name, but also get it as a number if it's all digits */
4667
4668 recno = 0;
4669 name = ++ptr;
4670 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4671 {
4672 if (recno >= 0)
4673 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4674 recno * 10 + *ptr - CHAR_0 : -1;
4675 ptr++;
4676 }
4677 namelen = ptr - name;
4678
4679 if ((terminator > 0 && *ptr++ != terminator) ||
4680 *ptr++ != CHAR_RIGHT_PARENTHESIS)
4681 {
4682 ptr--; /* Error offset */
4683 *errorcodeptr = ERR26;
4684 goto FAILED;
4685 }
4686
4687 /* Do no further checking in the pre-compile phase. */
4688
4689 if (lengthptr != NULL) break;
4690
4691 /* In the real compile we do the work of looking for the actual
4692 reference. If the string started with "+" or "-" we require the rest to
4693 be digits, in which case recno will be set. */
4694
4695 if (refsign > 0)
4696 {
4697 if (recno <= 0)
4698 {
4699 *errorcodeptr = ERR58;
4700 goto FAILED;
4701 }
4702 recno = (refsign == CHAR_MINUS)?
4703 cd->bracount - recno + 1 : recno +cd->bracount;
4704 if (recno <= 0 || recno > cd->final_bracount)
4705 {
4706 *errorcodeptr = ERR15;
4707 goto FAILED;
4708 }
4709 PUT2(code, 2+LINK_SIZE, recno);
4710 break;
4711 }
4712
4713 /* Otherwise (did not start with "+" or "-"), start by looking for the
4714 name. If we find a name, add one to the opcode to change OP_CREF or
4715 OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
4716 except they record that the reference was originally to a name. The
4717 information is used to check duplicate names. */
4718
4719 slot = cd->name_table;
4720 for (i = 0; i < cd->names_found; i++)
4721 {
4722 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4723 slot += cd->name_entry_size;
4724 }
4725
4726 /* Found a previous named subpattern */
4727
4728 if (i < cd->names_found)
4729 {
4730 recno = GET2(slot, 0);
4731 PUT2(code, 2+LINK_SIZE, recno);
4732 code[1+LINK_SIZE]++;
4733 }
4734
4735 /* Search the pattern for a forward reference */
4736
4737 else if ((i = find_parens(cd, name, namelen,
4738 (options & PCRE_EXTENDED) != 0)) > 0)
4739 {
4740 PUT2(code, 2+LINK_SIZE, i);
4741 code[1+LINK_SIZE]++;
4742 }
4743
4744 /* If terminator == 0 it means that the name followed directly after
4745 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4746 some further alternatives to try. For the cases where terminator != 0
4747 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4748 now checked all the possibilities, so give an error. */
4749
4750 else if (terminator != 0)
4751 {
4752 *errorcodeptr = ERR15;
4753 goto FAILED;
4754 }
4755
4756 /* Check for (?(R) for recursion. Allow digits after R to specify a
4757 specific group number. */
4758
4759 else if (*name == CHAR_R)
4760 {
4761 recno = 0;
4762 for (i = 1; i < namelen; i++)
4763 {
4764 if ((digitab[name[i]] & ctype_digit) == 0)
4765 {
4766 *errorcodeptr = ERR15;
4767 goto FAILED;
4768 }
4769 recno = recno * 10 + name[i] - CHAR_0;
4770 }
4771 if (recno == 0) recno = RREF_ANY;
4772 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4773 PUT2(code, 2+LINK_SIZE, recno);
4774 }
4775
4776 /* Similarly, check for the (?(DEFINE) "condition", which is always
4777 false. */
4778
4779 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4780 {
4781 code[1+LINK_SIZE] = OP_DEF;
4782 skipbytes = 1;
4783 }
4784
4785 /* Check for the "name" actually being a subpattern number. We are
4786 in the second pass here, so final_bracount is set. */
4787
4788 else if (recno > 0 && recno <= cd->final_bracount)
4789 {
4790 PUT2(code, 2+LINK_SIZE, recno);
4791 }
4792
4793 /* Either an unidentified subpattern, or a reference to (?(0) */
4794
4795 else
4796 {
4797 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4798 goto FAILED;
4799 }
4800 break;
4801
4802
4803 /* ------------------------------------------------------------ */
4804 case CHAR_EQUALS_SIGN: /* Positive lookahead */
4805 bravalue = OP_ASSERT;
4806 ptr++;
4807 break;
4808
4809
4810 /* ------------------------------------------------------------ */
4811 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
4812 ptr++;
4813 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
4814 {
4815 *code++ = OP_FAIL;
4816 previous = NULL;
4817 continue;
4818 }
4819 bravalue = OP_ASSERT_NOT;
4820 break;
4821
4822
4823 /* ------------------------------------------------------------ */
4824 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
4825 switch (ptr[1])
4826 {
4827 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
4828 bravalue = OP_ASSERTBACK;
4829 ptr += 2;
4830 break;
4831
4832 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
4833 bravalue = OP_ASSERTBACK_NOT;
4834 ptr += 2;
4835 break;
4836
4837 default: /* Could be name define, else bad */
4838 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4839 ptr++; /* Correct offset for error */
4840 *errorcodeptr = ERR24;
4841 goto FAILED;
4842 }
4843 break;
4844
4845
4846 /* ------------------------------------------------------------ */
4847 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
4848 bravalue = OP_ONCE;
4849 ptr++;
4850 break;
4851
4852
4853 /* ------------------------------------------------------------ */
4854 case CHAR_C: /* Callout - may be followed by digits; */
4855 previous_callout = code; /* Save for later completion */
4856 after_manual_callout = 1; /* Skip one item before completing */
4857 *code++ = OP_CALLOUT;
4858 {
4859 int n = 0;
4860 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4861 n = n * 10 + *ptr - CHAR_0;
4862 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4863 {
4864 *errorcodeptr = ERR39;
4865 goto FAILED;
4866 }
4867 if (n > 255)
4868 {
4869 *errorcodeptr = ERR38;
4870 goto FAILED;
4871 }
4872 *code++ = n;
4873 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4874 PUT(code, LINK_SIZE, 0); /* Default length */
4875 code += 2 * LINK_SIZE;
4876 }
4877 previous = NULL;
4878 continue;
4879
4880
4881 /* ------------------------------------------------------------ */
4882 case CHAR_P: /* Python-style named subpattern handling */
4883 if (*(++ptr) == CHAR_EQUALS_SIGN ||
4884 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
4885 {
4886 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4887 terminator = CHAR_RIGHT_PARENTHESIS;
4888 goto NAMED_REF_OR_RECURSE;
4889 }
4890 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
4891 {
4892 *errorcodeptr = ERR41;
4893 goto FAILED;
4894 }
4895 /* Fall through to handle (?P< as (?< is handled */
4896
4897
4898 /* ------------------------------------------------------------ */
4899 DEFINE_NAME: /* Come here from (?< handling */
4900 case CHAR_APOSTROPHE:
4901 {
4902 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4903 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4904 name = ++ptr;
4905
4906 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4907 namelen = ptr - name;
4908
4909 /* In the pre-compile phase, just do a syntax check. */
4910
4911 if (lengthptr != NULL)
4912 {
4913 if (*ptr != terminator)
4914 {
4915 *errorcodeptr = ERR42;
4916 goto FAILED;
4917 }
4918 if (cd->names_found >= MAX_NAME_COUNT)
4919 {
4920 *errorcodeptr = ERR49;
4921 goto FAILED;
4922 }
4923 if (namelen + 3 > cd->name_entry_size)
4924 {
4925 cd->name_entry_size = namelen + 3;
4926 if (namelen > MAX_NAME_SIZE)
4927 {
4928 *errorcodeptr = ERR48;
4929 goto FAILED;
4930 }
4931 }
4932 }
4933
4934 /* In the real compile, create the entry in the table, maintaining
4935 alphabetical order. Duplicate names for different numbers are
4936 permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
4937 number are always OK. (An existing number can be re-used if (?|
4938 appears in the pattern.) In either event, a duplicate name results in
4939 a duplicate entry in the table, even if the number is the same. This
4940 is because the number of names, and hence the table size, is computed
4941 in the pre-compile, and it affects various numbers and pointers which
4942 would all have to be modified, and the compiled code moved down, if
4943 duplicates with the same number were omitted from the table. This
4944 doesn't seem worth the hassle. However, *different* names for the
4945 same number are not permitted. */
4946
4947 else
4948 {
4949 BOOL dupname = FALSE;
4950 slot = cd->name_table;
4951
4952 for (i = 0; i < cd->names_found; i++)
4953 {
4954 int crc = memcmp(name, slot+2, namelen);
4955 if (crc == 0)
4956 {
4957 if (slot[2+namelen] == 0)
4958 {
4959 if (GET2(slot, 0) != cd->bracount + 1 &&
4960 (options & PCRE_DUPNAMES) == 0)
4961 {
4962 *errorcodeptr = ERR43;
4963 goto FAILED;
4964 }
4965 else dupname = TRUE;
4966 }
4967 else crc = -1; /* Current name is a substring */
4968 }
4969
4970 /* Make space in the table and break the loop for an earlier
4971 name. For a duplicate or later name, carry on. We do this for
4972 duplicates so that in the simple case (when (?| is not used) they
4973 are in order of their numbers. */
4974
4975 if (crc < 0)
4976 {
4977 memmove(slot + cd->name_entry_size, slot,
4978 (cd->names_found - i) * cd->name_entry_size);
4979 break;
4980 }
4981
4982 /* Continue the loop for a later or duplicate name */
4983
4984 slot += cd->name_entry_size;
4985 }
4986
4987 /* For non-duplicate names, check for a duplicate number before
4988 adding the new name. */
4989
4990 if (!dupname)
4991 {
4992 uschar *cslot = cd->name_table;
4993 for (i = 0; i < cd->names_found; i++)
4994 {
4995 if (cslot != slot)
4996 {
4997 if (GET2(cslot, 0) == cd->bracount + 1)
4998 {
4999 *errorcodeptr = ERR65;
5000 goto FAILED;
5001 }
5002 }
5003 else i--;
5004 cslot += cd->name_entry_size;
5005 }
5006 }
5007
5008 PUT2(slot, 0, cd->bracount + 1);
5009 memcpy(slot + 2, name, namelen);
5010 slot[2+namelen] = 0;
5011 }
5012 }
5013
5014 /* In both pre-compile and compile, count the number of names we've
5015 encountered. */
5016
5017 cd->names_found++;
5018 ptr++; /* Move past > or ' */
5019 goto NUMBERED_GROUP;
5020
5021
5022 /* ------------------------------------------------------------ */
5023 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
5024 terminator = CHAR_RIGHT_PARENTHESIS;
5025 is_recurse = TRUE;
5026 /* Fall through */
5027
5028 /* We come here from the Python syntax above that handles both
5029 references (?P=name) and recursion (?P>name), as well as falling
5030 through from the Perl recursion syntax (?&name). We also come here from
5031 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
5032 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
5033
5034 NAMED_REF_OR_RECURSE:
5035 name = ++ptr;
5036 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5037 namelen = ptr - name;
5038
5039 /* In the pre-compile phase, do a syntax check and set a dummy
5040 reference number. */
5041
5042 if (lengthptr != NULL)
5043 {
5044 if (namelen == 0)
5045 {
5046 *errorcodeptr = ERR62;
5047 goto FAILED;
5048 }
5049 if (*ptr != terminator)
5050 {
5051 *errorcodeptr = ERR42;
5052 goto FAILED;
5053 }
5054 if (namelen > MAX_NAME_SIZE)
5055 {
5056 *errorcodeptr = ERR48;
5057 goto FAILED;
5058 }
5059 recno = 0;
5060 }
5061
5062 /* In the real compile, seek the name in the table. We check the name
5063 first, and then check that we have reached the end of the name in the
5064 table. That way, if the name is longer than any in the table,
5065 the comparison will fail without reading beyond the table entry. */
5066
5067 else
5068 {
5069 slot = cd->name_table;
5070 for (i = 0; i < cd->names_found; i++)
5071 {
5072 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
5073 slot[2+namelen] == 0)
5074 break;
5075 slot += cd->name_entry_size;
5076 }
5077
5078 if (i < cd->names_found) /* Back reference */
5079 {
5080 recno = GET2(slot, 0);
5081 }
5082 else if ((recno = /* Forward back reference */
5083 find_parens(cd, name, namelen,
5084 (options & PCRE_EXTENDED) != 0)) <= 0)
5085 {
5086 *errorcodeptr = ERR15;
5087 goto FAILED;
5088 }
5089 }
5090
5091 /* In both phases, we can now go to the code that handles numerical
5092 recursion or backreferences. */
5093
5094 if (is_recurse) goto HANDLE_RECURSION;
5095 else goto HANDLE_REFERENCE;
5096
5097
5098 /* ------------------------------------------------------------ */
5099 case CHAR_R: /* Recursion */
5100 ptr++; /* Same as (?0) */
5101 /* Fall through */
5102
5103
5104 /* ------------------------------------------------------------ */
5105 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
5106 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5107 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5108 {
5109 const uschar *called;
5110 terminator = CHAR_RIGHT_PARENTHESIS;
5111
5112 /* Come here from the \g<...> and \g'...' code (Oniguruma
5113 compatibility). However, the syntax has been checked to ensure that
5114 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5115 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5116 ever be taken. */
5117
5118 HANDLE_NUMERICAL_RECURSION:
5119
5120 if ((refsign = *ptr) == CHAR_PLUS)
5121 {
5122 ptr++;
5123 if ((digitab[*ptr] & ctype_digit) == 0)
5124 {
5125 *errorcodeptr = ERR63;
5126 goto FAILED;
5127 }
5128 }
5129 else if (refsign == CHAR_MINUS)
5130 {
5131 if ((digitab[ptr[1]] & ctype_digit) == 0)
5132 goto OTHER_CHAR_AFTER_QUERY;
5133 ptr++;
5134 }
5135
5136 recno = 0;
5137 while((digitab[*ptr] & ctype_digit) != 0)
5138 recno = recno * 10 + *ptr++ - CHAR_0;
5139
5140 if (*ptr != terminator)
5141 {
5142 *errorcodeptr = ERR29;
5143 goto FAILED;
5144 }
5145
5146 if (refsign == CHAR_MINUS)
5147 {
5148 if (recno == 0)
5149 {
5150 *errorcodeptr = ERR58;
5151 goto FAILED;
5152 }
5153 recno = cd->bracount - recno + 1;
5154 if (recno <= 0)
5155 {
5156 *errorcodeptr = ERR15;
5157 goto FAILED;
5158 }
5159 }
5160 else if (refsign == CHAR_PLUS)
5161 {
5162 if (recno == 0)
5163 {
5164 *errorcodeptr = ERR58;
5165 goto FAILED;
5166 }
5167 recno += cd->bracount;
5168 }
5169
5170 /* Come here from code above that handles a named recursion */
5171
5172 HANDLE_RECURSION:
5173
5174 previous = code;
5175 called = cd->start_code;
5176
5177 /* When we are actually compiling, find the bracket that is being
5178 referenced. Temporarily end the regex in case it doesn't exist before
5179 this point. If we end up with a forward reference, first check that
5180 the bracket does occur later so we can give the error (and position)
5181 now. Then remember this forward reference in the workspace so it can
5182 be filled in at the end. */
5183
5184 if (lengthptr == NULL)
5185 {
5186 *code = OP_END;
5187 if (recno != 0)
5188 called = _pcre_find_bracket(cd->start_code, utf8, recno);
5189
5190 /* Forward reference */
5191
5192 if (called == NULL)
5193 {
5194 if (find_parens(cd, NULL, recno,
5195 (options & PCRE_EXTENDED) != 0) < 0)
5196 {
5197 *errorcodeptr = ERR15;
5198 goto FAILED;
5199 }
5200
5201 /* Fudge the value of "called" so that when it is inserted as an
5202 offset below, what is actually inserted is the reference number
5203 of the group. */
5204
5205 called = cd->start_code + recno;
5206 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5207 }
5208
5209 /* If not a forward reference, and the subpattern is still open,
5210 this is a recursive call. We check to see if this is a left
5211 recursion that could loop for ever, and diagnose that case. */
5212
5213 else if (GET(called, 1) == 0 &&
5214 could_be_empty(called, code, bcptr, utf8, cd))
5215 {
5216 *errorcodeptr = ERR40;
5217 goto FAILED;
5218 }
5219 }
5220
5221 /* Insert the recursion/subroutine item, automatically wrapped inside
5222 "once" brackets. Set up a "previous group" length so that a
5223 subsequent quantifier will work. */
5224
5225 *code = OP_ONCE;
5226 PUT(code, 1, 2 + 2*LINK_SIZE);
5227 code += 1 + LINK_SIZE;
5228
5229 *code = OP_RECURSE;
5230 PUT(code, 1, called - cd->start_code);
5231 code += 1 + LINK_SIZE;
5232
5233 *code = OP_KET;
5234 PUT(code, 1, 2 + 2*LINK_SIZE);
5235 code += 1 + LINK_SIZE;
5236
5237 length_prevgroup = 3 + 3*LINK_SIZE;
5238 }
5239
5240 /* Can't determine a first byte now */
5241
5242 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5243 continue;
5244
5245
5246 /* ------------------------------------------------------------ */
5247 default: /* Other characters: check option setting */
5248 OTHER_CHAR_AFTER_QUERY:
5249 set = unset = 0;
5250 optset = &set;
5251
5252 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5253 {
5254 switch (*ptr++)
5255 {
5256 case CHAR_MINUS: optset = &unset; break;
5257
5258 case CHAR_J: /* Record that it changed in the external options */
5259 *optset |= PCRE_DUPNAMES;
5260 cd->external_flags |= PCRE_JCHANGED;
5261 break;
5262
5263 case CHAR_i: *optset |= PCRE_CASELESS; break;
5264 case CHAR_m: *optset |= PCRE_MULTILINE; break;
5265 case CHAR_s: *optset |= PCRE_DOTALL; break;
5266 case CHAR_x: *optset |= PCRE_EXTENDED; break;
5267 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5268 case CHAR_X: *optset |= PCRE_EXTRA; break;
5269
5270 default: *errorcodeptr = ERR12;
5271 ptr--; /* Correct the offset */
5272 goto FAILED;
5273 }
5274 }
5275
5276 /* Set up the changed option bits, but don't change anything yet. */
5277
5278 newoptions = (options | set) & (~unset);
5279
5280 /* If the options ended with ')' this is not the start of a nested
5281 group with option changes, so the options change at this level. If this
5282 item is right at the start of the pattern, the options can be
5283 abstracted and made external in the pre-compile phase, and ignored in
5284 the compile phase. This can be helpful when matching -- for instance in
5285 caseless checking of required bytes.
5286
5287 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5288 definitely *not* at the start of the pattern because something has been
5289 compiled. In the pre-compile phase, however, the code pointer can have
5290 that value after the start, because it gets reset as code is discarded
5291 during the pre-compile. However, this can happen only at top level - if
5292 we are within parentheses, the starting BRA will still be present. At
5293 any parenthesis level, the length value can be used to test if anything
5294 has been compiled at that level. Thus, a test for both these conditions
5295 is necessary to ensure we correctly detect the start of the pattern in
5296 both phases.
5297
5298 If we are not at the pattern start, compile code to change the ims
5299 options if this setting actually changes any of them, and reset the
5300 greedy defaults and the case value for firstbyte and reqbyte. */
5301
5302 if (*ptr == CHAR_RIGHT_PARENTHESIS)
5303 {
5304 if (code == cd->start_code + 1 + LINK_SIZE &&
5305 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5306 {
5307 cd->external_options = newoptions;
5308 }
5309 else
5310 {
5311 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5312 {
5313 *code++ = OP_OPT;
5314 *code++ = newoptions & PCRE_IMS;
5315 }
5316 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5317 greedy_non_default = greedy_default ^ 1;
5318 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5319 }
5320
5321 /* Change options at this level, and pass them back for use
5322 in subsequent branches. When not at the start of the pattern, this
5323 information is also necessary so that a resetting item can be
5324 compiled at the end of a group (if we are in a group). */
5325
5326 *optionsptr = options = newoptions;
5327 previous = NULL; /* This item can't be repeated */
5328 continue; /* It is complete */
5329 }
5330
5331 /* If the options ended with ':' we are heading into a nested group
5332 with possible change of options. Such groups are non-capturing and are
5333 not assertions of any kind. All we need to do is skip over the ':';
5334 the newoptions value is handled below. */
5335
5336 bravalue = OP_BRA;
5337 ptr++;
5338 } /* End of switch for character following (? */
5339 } /* End of (? handling */
5340
5341 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5342 all unadorned brackets become non-capturing and behave like (?:...)
5343 brackets. */
5344
5345 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5346 {
5347 bravalue = OP_BRA;
5348 }
5349
5350 /* Else we have a capturing group. */
5351
5352 else
5353 {
5354 NUMBERED_GROUP:
5355 cd->bracount += 1;
5356 PUT2(code, 1+LINK_SIZE, cd->bracount);
5357 skipbytes = 2;
5358 }
5359
5360 /* Process nested bracketed regex. Assertions may not be repeated, but
5361 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5362 non-register variable in order to be able to pass its address because some
5363 compilers complain otherwise. Pass in a new setting for the ims options if
5364 they have changed. */
5365
5366 previous = (bravalue >= OP_ONCE)? code : NULL;
5367 *code = bravalue;
5368 tempcode = code;
5369 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5370 length_prevgroup = 0; /* Initialize for pre-compile phase */
5371
5372 if (!compile_regex(
5373 newoptions, /* The complete new option state */
5374 options & PCRE_IMS, /* The previous ims option state */
5375 &tempcode, /* Where to put code (updated) */
5376 &ptr, /* Input pointer (updated) */
5377 errorcodeptr, /* Where to put an error message */
5378 (bravalue == OP_ASSERTBACK ||
5379 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5380 reset_bracount, /* True if (?| group */
5381 skipbytes, /* Skip over bracket number */
5382 &subfirstbyte, /* For possible first char */
5383 &subreqbyte, /* For possible last char */
5384 bcptr, /* Current branch chain */
5385 cd, /* Tables block */
5386 (lengthptr == NULL)? NULL : /* Actual compile phase */
5387 &length_prevgroup /* Pre-compile phase */
5388 ))
5389 goto FAILED;
5390
5391 /* At the end of compiling, code is still pointing to the start of the
5392 group, while tempcode has been updated to point past the end of the group
5393 and any option resetting that may follow it. The pattern pointer (ptr)
5394 is on the bracket. */
5395
5396 /* If this is a conditional bracket, check that there are no more than
5397 two branches in the group, or just one if it's a DEFINE group. We do this
5398 in the real compile phase, not in the pre-pass, where the whole group may
5399 not be available. */
5400
5401 if (bravalue == OP_COND && lengthptr == NULL)
5402 {
5403 uschar *tc = code;
5404 int condcount = 0;
5405
5406 do {
5407 condcount++;
5408 tc += GET(tc,1);
5409 }
5410 while (*tc != OP_KET);
5411
5412 /* A DEFINE group is never obeyed inline (the "condition" is always
5413 false). It must have only one branch. */
5414
5415 if (code[LINK_SIZE+1] == OP_DEF)
5416 {
5417 if (condcount > 1)
5418 {
5419 *errorcodeptr = ERR54;
5420 goto FAILED;
5421 }
5422 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5423 }
5424
5425 /* A "normal" conditional group. If there is just one branch, we must not
5426 make use of its firstbyte or reqbyte, because this is equivalent to an
5427 empty second branch. */
5428
5429 else
5430 {
5431 if (condcount > 2)
5432 {
5433 *errorcodeptr = ERR27;
5434 goto FAILED;
5435 }
5436 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5437 }
5438 }
5439
5440 /* Error if hit end of pattern */
5441
5442 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5443 {
5444 *errorcodeptr = ERR14;
5445 goto FAILED;
5446 }
5447
5448 /* In the pre-compile phase, update the length by the length of the group,
5449 less the brackets at either end. Then reduce the compiled code to just a
5450 set of non-capturing brackets so that it doesn't use much memory if it is
5451 duplicated by a quantifier.*/
5452
5453 if (lengthptr != NULL)
5454 {
5455 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5456 {
5457 *errorcodeptr = ERR20;
5458 goto FAILED;
5459 }
5460 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5461 *code++ = OP_BRA;
5462 PUTINC(code, 0, 1 + LINK_SIZE);
5463 *code++ = OP_KET;
5464 PUTINC(code, 0, 1 + LINK_SIZE);
5465 break; /* No need to waste time with special character handling */
5466 }
5467
5468 /* Otherwise update the main code pointer to the end of the group. */
5469
5470 code = tempcode;
5471
5472 /* For a DEFINE group, required and first character settings are not
5473 relevant. */
5474
5475 if (bravalue == OP_DEF) break;
5476
5477 /* Handle updating of the required and first characters for other types of
5478 group. Update for normal brackets of all kinds, and conditions with two
5479 branches (see code above). If the bracket is followed by a quantifier with
5480 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5481 zerofirstbyte outside the main loop so that they can be accessed for the
5482 back off. */
5483
5484 zeroreqbyte = reqbyte;
5485 zerofirstbyte = firstbyte;
5486 groupsetfirstbyte = FALSE;
5487
5488 if (bravalue >= OP_ONCE)
5489 {
5490 /* If we have not yet set a firstbyte in this branch, take it from the
5491 subpattern, remembering that it was set here so that a repeat of more
5492 than one can replicate it as reqbyte if necessary. If the subpattern has
5493 no firstbyte, set "none" for the whole branch. In both cases, a zero
5494 repeat forces firstbyte to "none". */
5495
5496 if (firstbyte == REQ_UNSET)
5497 {
5498 if (subfirstbyte >= 0)
5499 {
5500 firstbyte = subfirstbyte;
5501 groupsetfirstbyte = TRUE;
5502 }
5503 else firstbyte = REQ_NONE;
5504 zerofirstbyte = REQ_NONE;
5505 }
5506
5507 /* If firstbyte was previously set, convert the subpattern's firstbyte
5508 into reqbyte if there wasn't one, using the vary flag that was in
5509 existence beforehand. */
5510
5511 else if (subfirstbyte >= 0 && subreqbyte < 0)
5512 subreqbyte = subfirstbyte | tempreqvary;
5513
5514 /* If the subpattern set a required byte (or set a first byte that isn't
5515 really the first byte - see above), set it. */
5516
5517 if (subreqbyte >= 0) reqbyte = subreqbyte;
5518 }
5519
5520 /* For a forward assertion, we take the reqbyte, if set. This can be
5521 helpful if the pattern that follows the assertion doesn't set a different
5522 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5523 for an assertion, however, because it leads to an incorrect effect for patterns
5524 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5525 of a firstbyte. This is overcome by a scan at the end if there's no
5526 firstbyte, looking for an asserted first char. */
5527
5528 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5529 break; /* End of processing '(' */
5530
5531
5532 /* ===================================================================*/
5533 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5534 are arranged to be the negation of the corresponding OP_values. For the
5535 back references, the values are ESC_REF plus the reference number. Only
5536 back references and those types that consume a character may be repeated.
5537 We can test for values between ESC_b and ESC_Z for the latter; this may
5538 have to change if any new ones are ever created. */
5539
5540 case CHAR_BACKSLASH:
5541 tempptr = ptr;
5542 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5543 if (*errorcodeptr != 0) goto FAILED;
5544
5545 if (c < 0)
5546 {
5547 if (-c == ESC_Q) /* Handle start of quoted string */
5548 {
5549 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5550 ptr += 2; /* avoid empty string */
5551 else inescq = TRUE;
5552 continue;
5553 }
5554
5555 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5556
5557 /* For metasequences that actually match a character, we disable the
5558 setting of a first character if it hasn't already been set. */
5559
5560 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5561 firstbyte = REQ_NONE;
5562
5563 /* Set values to reset to if this is followed by a zero repeat. */
5564
5565 zerofirstbyte = firstbyte;
5566 zeroreqbyte = reqbyte;
5567
5568 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5569 is a subroutine call by number (Oniguruma syntax). In fact, the value
5570 -ESC_g is returned only for these cases. So we don't need to check for <
5571 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5572 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5573 that is a synonym for a named back reference). */
5574
5575 if (-c == ESC_g)
5576 {
5577 const uschar *p;
5578 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5579 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5580 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5581
5582 /* These two statements stop the compiler from warning about possibly
5583 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5584 fact, because we actually check for a number below, the paths that
5585 would actually be in error are never taken. */
5586
5587 skipbytes = 0;
5588 reset_bracount = FALSE;
5589
5590 /* Test for a name */
5591
5592 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5593 {
5594 BOOL isnumber = TRUE;
5595 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5596 {
5597 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5598 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5599 }
5600 if (*p != terminator)
5601 {
5602 *errorcodeptr = ERR57;
5603 break;
5604 }
5605 if (isnumber)
5606 {
5607 ptr++;
5608 goto HANDLE_NUMERICAL_RECURSION;
5609 }
5610 is_recurse = TRUE;
5611 goto NAMED_REF_OR_RECURSE;
5612 }
5613
5614 /* Test a signed number in angle brackets or quotes. */
5615
5616 p = ptr + 2;
5617 while ((digitab[*p] & ctype_digit) != 0) p++;
5618 if (*p != terminator)
5619 {
5620 *errorcodeptr = ERR57;
5621 break;
5622 }
5623 ptr++;
5624 goto HANDLE_NUMERICAL_RECURSION;
5625 }
5626
5627 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5628 We also support \k{name} (.NET syntax) */
5629
5630 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5631 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5632 {
5633 is_recurse = FALSE;
5634 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5635 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5636 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5637 goto NAMED_REF_OR_RECURSE;
5638 }
5639
5640 /* Back references are handled specially; must disable firstbyte if
5641 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5642 ':' later. */
5643
5644 if (-c >= ESC_REF)
5645 {
5646 open_capitem *oc;
5647 recno = -c - ESC_REF;
5648
5649 HANDLE_REFERENCE: /* Come here from named backref handling */
5650 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5651 previous = code;
5652 *code++ = OP_REF;
5653 PUT2INC(code, 0, recno);
5654 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5655 if (recno > cd->top_backref) cd->top_backref = recno;
5656
5657 /* Check to see if this back reference is recursive, that is, it
5658 is inside the group that it references. A flag is set so that the
5659 group can be made atomic. */
5660
5661 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5662 {
5663 if (oc->number == recno)
5664 {
5665 oc->flag = TRUE;
5666 break;
5667 }
5668 }
5669 }
5670
5671 /* So are Unicode property matches, if supported. */
5672
5673 #ifdef SUPPORT_UCP
5674 else if (-c == ESC_P || -c == ESC_p)
5675 {
5676 BOOL negated;
5677 int pdata;
5678 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5679 if (ptype < 0) goto FAILED;
5680 previous = code;
5681 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5682 *code++ = ptype;
5683 *code++ = pdata;
5684 }
5685 #else
5686
5687 /* If Unicode properties are not supported, \X, \P, and \p are not
5688 allowed. */
5689
5690 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5691 {
5692 *errorcodeptr = ERR45;
5693 goto FAILED;
5694 }
5695 #endif
5696
5697 /* For the rest (including \X when Unicode properties are supported), we
5698 can obtain the OP value by negating the escape value. */
5699
5700 else
5701 {
5702 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5703 *code++ = -c;
5704 }
5705 continue;
5706 }
5707
5708 /* We have a data character whose value is in c. In UTF-8 mode it may have
5709 a value > 127. We set its representation in the length/buffer, and then
5710 handle it as a data character. */
5711
5712 #ifdef SUPPORT_UTF8
5713 if (utf8 && c > 127)
5714 mclength = _pcre_ord2utf8(c, mcbuffer);
5715 else
5716 #endif
5717
5718 {
5719 mcbuffer[0] = c;
5720 mclength = 1;
5721 }
5722 goto ONE_CHAR;
5723
5724
5725 /* ===================================================================*/
5726 /* Handle a literal character. It is guaranteed not to be whitespace or #
5727 when the extended flag is set. If we are in UTF-8 mode, it may be a
5728 multi-byte literal character. */
5729
5730 default:
5731 NORMAL_CHAR:
5732 mclength = 1;
5733 mcbuffer[0] = c;
5734
5735 #ifdef SUPPORT_UTF8
5736 if (utf8 && c >= 0xc0)
5737 {
5738 while ((ptr[1] & 0xc0) == 0x80)
5739 mcbuffer[mclength++] = *(++ptr);
5740 }
5741 #endif
5742
5743 /* At this point we have the character's bytes in mcbuffer, and the length
5744 in mclength. When not in UTF-8 mode, the length is always 1. */
5745
5746 ONE_CHAR:
5747 previous = code;
5748 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5749 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5750
5751 /* Remember if \r or \n were seen */
5752
5753 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5754 cd->external_flags |= PCRE_HASCRORLF;
5755
5756 /* Set the first and required bytes appropriately. If no previous first
5757 byte, set it from this character, but revert to none on a zero repeat.
5758 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5759 repeat. */
5760
5761 if (firstbyte == REQ_UNSET)
5762 {
5763 zerofirstbyte = REQ_NONE;
5764 zeroreqbyte = reqbyte;
5765
5766 /* If the character is more than one byte long, we can set firstbyte
5767 only if it is not to be matched caselessly. */
5768
5769 if (mclength == 1 || req_caseopt == 0)
5770 {
5771 firstbyte = mcbuffer[0] | req_caseopt;
5772 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5773 }
5774 else firstbyte = reqbyte = REQ_NONE;
5775 }
5776
5777 /* firstbyte was previously set; we can set reqbyte only if the length is
5778 1 or the matching is caseful. */
5779
5780 else
5781 {
5782 zerofirstbyte = firstbyte;
5783 zeroreqbyte = reqbyte;
5784 if (mclength == 1 || req_caseopt == 0)
5785 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5786 }
5787
5788 break; /* End of literal character handling */
5789 }
5790 } /* end of big loop */
5791
5792
5793 /* Control never reaches here by falling through, only by a goto for all the
5794 error states. Pass back the position in the pattern so that it can be displayed
5795 to the user for diagnosing the error. */
5796
5797 FAILED:
5798 *ptrptr = ptr;
5799 return FALSE;
5800 }
5801
5802
5803
5804
5805 /*************************************************
5806 * Compile sequence of alternatives *
5807 *************************************************/
5808
5809 /* On entry, ptr is pointing past the bracket character, but on return it
5810 points to the closing bracket, or vertical bar, or end of string. The code
5811 variable is pointing at the byte into which the BRA operator has been stored.
5812 If the ims options are changed at the start (for a (?ims: group) or during any
5813 branch, we need to insert an OP_OPT item at the start of every following branch
5814 to ensure they get set correctly at run time, and also pass the new options
5815 into every subsequent branch compile.
5816
5817 This function is used during the pre-compile phase when we are trying to find
5818 out the amount of memory needed, as well as during the real compile phase. The
5819 value of lengthptr distinguishes the two phases.
5820
5821 Arguments:
5822 options option bits, including any changes for this subpattern
5823 oldims previous settings of ims option bits
5824 codeptr -> the address of the current code pointer
5825 ptrptr -> the address of the current pattern pointer
5826 errorcodeptr -> pointer to error code variable
5827 lookbehind TRUE if this is a lookbehind assertion
5828 reset_bracount TRUE to reset the count for each branch
5829 skipbytes skip this many bytes at start (for brackets and OP_COND)
5830 firstbyteptr place to put the first required character, or a negative number
5831 reqbyteptr place to put the last required character, or a negative number
5832 bcptr pointer to the chain of currently open branches
5833 cd points to the data block with tables pointers etc.
5834 lengthptr NULL during the real compile phase
5835 points to length accumulator during pre-compile phase
5836
5837 Returns: TRUE on success
5838 */
5839
5840 static BOOL
5841 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5842 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5843 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5844 int *lengthptr)
5845 {
5846 const uschar *ptr = *ptrptr;
5847 uschar *code = *codeptr;
5848 uschar *last_branch = code;
5849 uschar *start_bracket = code;
5850 uschar *reverse_count = NULL;
5851 open_capitem capitem;
5852 int capnumber = 0;
5853 int firstbyte, reqbyte;
5854 int branchfirstbyte, branchreqbyte;
5855 int length;
5856 int orig_bracount;
5857 int max_bracount;
5858 int old_external_options = cd->external_options;
5859 branch_chain bc;
5860
5861 bc.outer = bcptr;
5862 bc.current_branch = code;
5863
5864 firstbyte = reqbyte = REQ_UNSET;
5865
5866 /* Accumulate the length for use in the pre-compile phase. Start with the
5867 length of the BRA and KET and any extra bytes that are required at the
5868 beginning. We accumulate in a local variable to save frequent testing of
5869 lengthptr for NULL. We cannot do this by looking at the value of code at the
5870 start and end of each alternative, because compiled items are discarded during
5871 the pre-compile phase so that the work space is not exceeded. */
5872
5873 length = 2 + 2*LINK_SIZE + skipbytes;
5874
5875 /* WARNING: If the above line is changed for any reason, you must also change
5876 the code that abstracts option settings at the start of the pattern and makes
5877 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5878 pre-compile phase to find out whether anything has yet been compiled or not. */
5879
5880 /* If this is a capturing subpattern, add to the chain of open capturing items
5881 so that we can detect them if (*ACCEPT) is encountered. This is also used to
5882 detect groups that contain recursive back references to themselves. */
5883
5884 if (*code == OP_CBRA)
5885 {
5886 capnumber = GET2(code, 1 + LINK_SIZE);
5887 capitem.number = capnumber;
5888 capitem.next = cd->open_caps;
5889 capitem.flag = FALSE;
5890 cd->open_caps = &capitem;
5891 }
5892
5893 /* Offset is set zero to mark that this bracket is still open */
5894
5895 PUT(code, 1, 0);
5896 code += 1 + LINK_SIZE + skipbytes;
5897
5898 /* Loop for each alternative branch */
5899
5900 orig_bracount = max_bracount = cd->bracount;
5901 for (;;)
5902 {
5903 /* For a (?| group, reset the capturing bracket count so that each branch
5904 uses the same numbers. */
5905
5906 if (reset_bracount) cd->bracount = orig_bracount;
5907
5908 /* Handle a change of ims options at the start of the branch */
5909
5910 if ((options & PCRE_IMS) != oldims)
5911 {
5912 *code++ = OP_OPT;
5913 *code++ = options & PCRE_IMS;
5914 length += 2;
5915 }
5916
5917 /* Set up dummy OP_REVERSE if lookbehind assertion */
5918
5919 if (lookbehind)
5920 {
5921 *code++ = OP_REVERSE;
5922 reverse_count = code;
5923 PUTINC(code, 0, 0);
5924 length += 1 + LINK_SIZE;
5925 }
5926
5927 /* Now compile the branch; in the pre-compile phase its length gets added
5928 into the length. */
5929
5930 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5931 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5932 {
5933 *ptrptr = ptr;
5934 return FALSE;
5935 }
5936
5937 /* If the external options have changed during this branch, it means that we
5938 are at the top level, and a leading option setting has been encountered. We
5939 need to re-set the original option values to take account of this so that,
5940 during the pre-compile phase, we know to allow for a re-set at the start of
5941 subsequent branches. */
5942
5943 if (old_external_options != cd->external_options)
5944 oldims = cd->external_options & PCRE_IMS;
5945
5946 /* Keep the highest bracket count in case (?| was used and some branch
5947 has fewer than the rest. */
5948
5949 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5950
5951 /* In the real compile phase, there is some post-processing to be done. */
5952
5953 if (lengthptr == NULL)
5954 {
5955 /* If this is the first branch, the firstbyte and reqbyte values for the
5956 branch become the values for the regex. */
5957
5958 if (*last_branch != OP_ALT)
5959 {
5960 firstbyte = branchfirstbyte;
5961 reqbyte = branchreqbyte;
5962 }
5963
5964 /* If this is not the first branch, the first char and reqbyte have to
5965 match the values from all the previous branches, except that if the
5966 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5967 and we set REQ_VARY for the regex. */
5968
5969 else
5970 {
5971 /* If we previously had a firstbyte, but it doesn't match the new branch,
5972 we have to abandon the firstbyte for the regex, but if there was
5973 previously no reqbyte, it takes on the value of the old firstbyte. */
5974
5975 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5976 {
5977 if (reqbyte < 0) reqbyte = firstbyte;
5978 firstbyte = REQ_NONE;
5979 }
5980
5981 /* If we (now or from before) have no firstbyte, a firstbyte from the
5982 branch becomes a reqbyte if there isn't a branch reqbyte. */
5983
5984 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5985 branchreqbyte = branchfirstbyte;
5986
5987 /* Now ensure that the reqbytes match */
5988
5989 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5990 reqbyte = REQ_NONE;
5991 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5992 }
5993
5994 /* If lookbehind, check that this branch matches a fixed-length string, and
5995 put the length into the OP_REVERSE item. Temporarily mark the end of the
5996 branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5997 because there may be forward references that we can't check here. Set a
5998 flag to cause another lookbehind check at the end. Why not do it all at the
5999 end? Because common, erroneous checks are picked up here and the offset of
6000 the problem can be shown. */
6001
6002 if (lookbehind)
6003 {
6004 int fixed_length;
6005 *code = OP_END;
6006 fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
6007 DPRINTF(("fixed length = %d\n", fixed_length));
6008 if (fixed_length == -3)
6009 {
6010 cd->check_lookbehind = TRUE;
6011 }
6012 else if (fixed_length < 0)
6013 {
6014 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
6015 *ptrptr = ptr;
6016 return FALSE;
6017 }
6018 else { PUT(reverse_count, 0, fixed_length); }
6019 }
6020 }
6021
6022 /* Reached end of expression, either ')' or end of pattern. In the real
6023 compile phase, go back through the alternative branches and reverse the chain
6024 of offsets, with the field in the BRA item now becoming an offset to the
6025 first alternative. If there are no alternatives, it points to the end of the
6026 group. The length in the terminating ket is always the length of the whole
6027 bracketed item. If any of the ims options were changed inside the group,
6028 compile a resetting op-code following, except at the very end of the pattern.
6029 Return leaving the pointer at the terminating char. */
6030
6031 if (*ptr != CHAR_VERTICAL_LINE)
6032 {
6033 if (lengthptr == NULL)
6034 {
6035 int branch_length = code - last_branch;
6036 do
6037 {
6038 int prev_length = GET(last_branch, 1);
6039 PUT(last_branch, 1, branch_length);
6040 branch_length = prev_length;
6041 last_branch -= branch_length;
6042 }
6043 while (branch_length > 0);
6044 }
6045
6046 /* Fill in the ket */
6047
6048 *code = OP_KET;
6049 PUT(code, 1, code - start_bracket);
6050 code += 1 + LINK_SIZE;
6051
6052 /* If it was a capturing subpattern, check to see if it contained any
6053 recursive back references. If so, we must wrap it in atomic brackets.
6054 In any event, remove the block from the chain. */
6055
6056 if (capnumber > 0)
6057 {
6058 if (cd->open_caps->flag)
6059 {
6060 memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
6061 code - start_bracket);
6062 *start_bracket = OP_ONCE;
6063 code += 1 + LINK_SIZE;
6064 PUT(start_bracket, 1, code - start_bracket);
6065 *code = OP_KET;
6066 PUT(code, 1, code - start_bracket);
6067 code += 1 + LINK_SIZE;
6068 length += 2 + 2*LINK_SIZE;
6069 }
6070 cd->open_caps = cd->open_caps->next;
6071 }
6072
6073 /* Reset options if needed. */
6074
6075 if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
6076 {
6077 *code++ = OP_OPT;
6078 *code++ = oldims;
6079 length += 2;
6080 }
6081
6082 /* Retain the highest bracket number, in case resetting was used. */
6083
6084 cd->bracount = max_bracount;
6085
6086 /* Set values to pass back */
6087
6088 *codeptr = code;
6089 *ptrptr = ptr;
6090 *firstbyteptr = firstbyte;
6091 *reqbyteptr = reqbyte;
6092 if (lengthptr != NULL)
6093 {
6094 if (OFLOW_MAX - *lengthptr < length)
6095 {
6096 *errorcodeptr = ERR20;
6097 return FALSE;
6098 }
6099 *lengthptr += length;
6100 }
6101 return TRUE;
6102 }
6103
6104 /* Another branch follows. In the pre-compile phase, we can move the code
6105 pointer back to where it was for the start of the first branch. (That is,
6106 pretend that each branch is the only one.)
6107
6108 In the real compile phase, insert an ALT node. Its length field points back
6109 to the previous branch while the bracket remains open. At the end the chain
6110 is reversed. It's done like this so that the start of the bracket has a
6111 zero offset until it is closed, making it possible to detect recursion. */
6112
6113 if (lengthptr != NULL)
6114 {
6115 code = *codeptr + 1 + LINK_SIZE + skipbytes;
6116 length += 1 + LINK_SIZE;
6117 }
6118 else
6119 {
6120 *code = OP_ALT;
6121 PUT(code, 1, code - last_branch);
6122 bc.current_branch = last_branch = code;
6123 code += 1 + LINK_SIZE;
6124 }
6125
6126 ptr++;
6127 }
6128 /* Control never reaches here */
6129 }
6130
6131
6132
6133
6134 /*************************************************
6135 * Check for anchored expression *
6136 *************************************************/
6137
6138 /* Try to find out if this is an anchored regular expression. Consider each
6139 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6140 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6141 it's anchored. However, if this is a multiline pattern, then only OP_SOD
6142 counts, since OP_CIRC can match in the middle.
6143
6144 We can also consider a regex to be anchored if OP_SOM starts all its branches.
6145 This is the code for \G, which means "match at start of match position, taking
6146 into account the match offset".
6147
6148 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6149 because that will try the rest of the pattern at all possible matching points,
6150 so there is no point trying again.... er ....
6151
6152 .... except when the .* appears inside capturing parentheses, and there is a
6153 subsequent back reference to those parentheses. We haven't enough information
6154 to catch that case precisely.
6155
6156 At first, the best we could do was to detect when .* was in capturing brackets
6157 and the highest back reference was greater than or equal to that level.
6158 However, by keeping a bitmap of the first 31 back references, we can catch some
6159 of the more common cases more precisely.
6160
6161 Arguments:
6162 code points to start of expression (the bracket)
6163 options points to the options setting
6164 bracket_map a bitmap of which brackets we are inside while testing; this
6165 handles up to substring 31; after that we just have to take
6166 the less precise approach
6167 backref_map the back reference bitmap
6168
6169 Returns: TRUE or FALSE
6170 */
6171
6172 static BOOL
6173 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
6174 unsigned int backref_map)
6175 {
6176 do {
6177 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6178 options, PCRE_MULTILINE, FALSE);
6179 register int op = *scode;
6180
6181 /* Non-capturing brackets */
6182
6183 if (op == OP_BRA)
6184 {
6185 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6186 }
6187
6188 /* Capturing brackets */
6189
6190 else if (op == OP_CBRA)
6191 {
6192 int n = GET2(scode, 1+LINK_SIZE);
6193 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6194 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
6195 }
6196
6197 /* Other brackets */
6198
6199 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6200 {
6201 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6202 }
6203
6204 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6205 it isn't in brackets that are or may be referenced. */
6206
6207 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6208 op == OP_TYPEPOSSTAR))
6209 {
6210 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6211 return FALSE;
6212 }
6213
6214 /* Check for explicit anchoring */
6215
6216 else if (op != OP_SOD && op != OP_SOM &&
6217 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6218 return FALSE;
6219 code += GET(code, 1);
6220 }
6221 while (*code == OP_ALT); /* Loop for each alternative */
6222 return TRUE;
6223 }
6224
6225
6226
6227 /*************************************************
6228 * Check for starting with ^ or .* *
6229 *************************************************/
6230
6231 /* This is called to find out if every branch starts with ^ or .* so that
6232 "first char" processing can be done to speed things up in multiline
6233 matching and for non-DOTALL patterns that start with .* (which must start at
6234 the beginning or after \n). As in the case of is_anchored() (see above), we
6235 have to take account of back references to capturing brackets that contain .*
6236 because in that case we can't make the assumption.
6237
6238 Arguments:
6239 code points to start of expression (the bracket)
6240 bracket_map a bitmap of which brackets we are inside while testing; this
6241 handles up to substring 31; after that we just have to take
6242 the less precise approach
6243 backref_map the back reference bitmap
6244
6245 Returns: TRUE or FALSE
6246 */
6247
6248 static BOOL
6249 is_startline(const uschar *code, unsigned int bracket_map,
6250 unsigned int backref_map)
6251 {
6252 do {
6253 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6254 NULL, 0, FALSE);
6255 register int op = *scode;
6256
6257 /* If we are at the start of a conditional assertion group, *both* the
6258 conditional assertion *and* what follows the condition must satisfy the test
6259 for start of line. Other kinds of condition fail. Note that there may be an
6260 auto-callout at the start of a condition. */
6261
6262 if (op == OP_COND)
6263 {
6264 scode += 1 + LINK_SIZE;
6265 if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6266 switch (*scode)
6267 {
6268 case OP_CREF:
6269 case OP_NCREF:
6270 case OP_RREF:
6271 case OP_NRREF:
6272 case OP_DEF:
6273 return FALSE;
6274
6275 default: /* Assertion */
6276 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6277 do scode += GET(scode, 1); while (*scode == OP_ALT);
6278 scode += 1 + LINK_SIZE;
6279 break;
6280 }
6281 scode = first_significant_code(scode, NULL, 0, FALSE);
6282 op = *scode;
6283 }
6284
6285 /* Non-capturing brackets */
6286
6287 if (op == OP_BRA)
6288 {
6289 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6290 }
6291
6292 /* Capturing brackets */
6293
6294 else if (op == OP_CBRA)
6295 {
6296 int n = GET2(scode, 1+LINK_SIZE);
6297 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6298 if (!is_startline(scode, new_map, backref_map)) return FALSE;
6299 }
6300
6301 /* Other brackets */
6302
6303 else if (op == OP_ASSERT || op == OP_ONCE)
6304 {
6305 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6306 }
6307
6308 /* .* means "start at start or after \n" if it isn't in brackets that
6309 may be referenced. */
6310
6311 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6312 {
6313 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6314 }
6315
6316 /* Check for explicit circumflex */
6317
6318 else if (op != OP_CIRC) return FALSE;
6319
6320 /* Move on to the next alternative */
6321
6322 code += GET(code, 1);
6323 }
6324 while (*code == OP_ALT); /* Loop for each alternative */
6325 return TRUE;
6326 }
6327
6328
6329
6330 /*************************************************
6331 * Check for asserted fixed first char *
6332 *************************************************/
6333
6334 /* During compilation, the "first char" settings from forward assertions are
6335 discarded, because they can cause conflicts with actual literals that follow.
6336 However, if we end up without a first char setting for an unanchored pattern,
6337 it is worth scanning the regex to see if there is an initial asserted first
6338 char. If all branches start with the same asserted char, or with a bracket all
6339 of whose alternatives start with the same asserted char (recurse ad lib), then
6340 we return that char, otherwise -1.
6341
6342 Arguments:
6343 code points to start of expression (the bracket)
6344 options pointer to the options (used to check casing changes)
6345 inassert TRUE if in an assertion
6346
6347 Returns: -1 or the fixed first char
6348 */
6349
6350 static int
6351 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6352 {
6353 register int c = -1;
6354 do {
6355 int d;
6356 const uschar *scode =
6357 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6358 register int op = *scode;
6359
6360 switch(op)
6361 {
6362 default:
6363 return -1;
6364
6365 case OP_BRA:
6366 case OP_CBRA:
6367 case OP_ASSERT:
6368 case OP_ONCE:
6369 case OP_COND:
6370 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6371 return -1;
6372 if (c < 0) c = d; else if (c != d) return -1;
6373 break;
6374
6375 case OP_EXACT: /* Fall through */
6376 scode += 2;
6377
6378 case OP_CHAR:
6379 case OP_CHARNC:
6380 case OP_PLUS:
6381 case OP_MINPLUS:
6382 case OP_POSPLUS:
6383 if (!inassert) return -1;
6384 if (c < 0)
6385 {
6386 c = scode[1];
6387 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6388 }
6389 else if (c != scode[1]) return -1;
6390 break;
6391 }
6392
6393 code += GET(code, 1);
6394 }
6395 while (*code == OP_ALT);
6396 return c;
6397 }
6398
6399
6400
6401 /*************************************************
6402 * Compile a Regular Expression *
6403 *************************************************/
6404
6405 /* This function takes a string and returns a pointer to a block of store
6406 holding a compiled version of the expression. The original API for this
6407 function had no error code return variable; it is retained for backwards
6408 compatibility. The new function is given a new name.
6409
6410 Arguments:
6411 pattern the regular expression
6412 options various option bits
6413 errorcodeptr pointer to error code variable (pcre_compile2() only)
6414 can be NULL if you don't want a code value
6415 errorptr pointer to pointer to error text
6416 erroroffset ptr offset in pattern where error was detected
6417 tables pointer to character tables or NULL
6418
6419 Returns: pointer to compiled data block, or NULL on error,
6420 with errorptr and erroroffset set
6421 */
6422
6423 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6424 pcre_compile(const char *pattern, int options, const char **errorptr,
6425 int *erroroffset, const unsigned char *tables)
6426 {
6427 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6428 }
6429
6430
6431 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6432 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6433 const char **errorptr, int *erroroffset, const unsigned char *tables)
6434 {
6435 real_pcre *re;
6436 int length = 1; /* For final END opcode */
6437 int firstbyte, reqbyte, newline;
6438 int errorcode = 0;
6439 int skipatstart = 0;
6440 BOOL utf8 = (options & PCRE_UTF8) != 0;
6441 size_t size;
6442 uschar *code;
6443 const uschar *codestart;
6444 const uschar *ptr;
6445 compile_data compile_block;
6446 compile_data *cd = &compile_block;
6447
6448 /* This space is used for "compiling" into during the first phase, when we are
6449 computing the amount of memory that is needed. Compiled items are thrown away
6450 as soon as possible, so that a fairly large buffer should be sufficient for
6451 this purpose. The same space is used in the second phase for remembering where
6452 to fill in forward references to subpatterns. */
6453
6454 uschar cworkspace[COMPILE_WORK_SIZE];
6455
6456 /* Set this early so that early errors get offset 0. */
6457
6458 ptr = (const uschar *)pattern;
6459
6460 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6461 can do is just return NULL, but we can set a code value if there is a code
6462 pointer. */
6463
6464 if (errorptr == NULL)
6465 {
6466 if (errorcodeptr != NULL) *errorcodeptr = 99;
6467 return NULL;
6468 }
6469
6470 *errorptr = NULL;
6471 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6472
6473 /* However, we can give a message for this error */
6474
6475 if (erroroffset == NULL)
6476 {
6477 errorcode = ERR16;
6478 goto PCRE_EARLY_ERROR_RETURN2;
6479 }
6480
6481 *erroroffset = 0;
6482
6483 /* Set up pointers to the individual character tables */
6484
6485 if (tables == NULL) tables = _pcre_default_tables;
6486 cd->lcc = tables + lcc_offset;
6487 cd->fcc = tables + fcc_offset;
6488 cd->cbits = tables + cbits_offset;
6489 cd->ctypes = tables + ctypes_offset;
6490
6491 /* Check that all undefined public option bits are zero */
6492
6493 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6494 {
6495 errorcode = ERR17;
6496 goto PCRE_EARLY_ERROR_RETURN;
6497 }
6498
6499 /* Check for global one-time settings at the start of the pattern, and remember
6500 the offset for later. */
6501
6502 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6503 ptr[skipatstart+1] == CHAR_ASTERISK)
6504 {
6505 int newnl = 0;
6506 int newbsr = 0;
6507
6508 if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6509 { skipatstart += 7; options |= PCRE_UTF8; continue; }
6510
6511 if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6512 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6513 else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0)
6514 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6515 else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0)
6516 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6517 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6518 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6519 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6520 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6521
6522 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6523 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6524 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6525 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6526
6527 if (newnl != 0)
6528 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6529 else if (newbsr != 0)
6530 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6531 else break;
6532 }
6533
6534 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6535
6536 #ifdef SUPPORT_UTF8
6537 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6538 (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
6539 {
6540 errorcode = ERR44;
6541 goto PCRE_EARLY_ERROR_RETURN2;
6542 }
6543 #else
6544 if (utf8)
6545 {
6546 errorcode = ERR32;
6547 goto PCRE_EARLY_ERROR_RETURN;
6548 }
6549 #endif
6550
6551 /* Check validity of \R options. */
6552
6553 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6554 {
6555 case 0:
6556 case PCRE_BSR_ANYCRLF:
6557 case PCRE_BSR_UNICODE:
6558 break;
6559 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6560 }
6561
6562 /* Handle different types of newline. The three bits give seven cases. The
6563 current code allows for fixed one- or two-byte sequences, plus "any" and
6564 "anycrlf". */
6565
6566 switch (options & PCRE_NEWLINE_BITS)
6567 {
6568 case 0: newline = NEWLINE; break; /* Build-time default */
6569 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6570 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6571 case PCRE_NEWLINE_CR+
6572 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6573 case PCRE_NEWLINE_ANY: newline = -1; break;
6574 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6575 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6576 }
6577
6578 if (newline == -2)
6579 {
6580 cd->nltype = NLTYPE_ANYCRLF;
6581 }
6582 else if (newline < 0)
6583 {
6584 cd->nltype = NLTYPE_ANY;
6585 }
6586 else
6587 {
6588 cd->nltype = NLTYPE_FIXED;
6589 if (newline > 255)
6590 {
6591 cd->nllen = 2;
6592 cd->nl[0] = (newline >> 8) & 255;
6593 cd->nl[1] = newline & 255;
6594 }
6595 else
6596 {
6597 cd->nllen = 1;
6598 cd->nl[0] = newline;
6599 }
6600 }
6601
6602 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6603 references to help in deciding whether (.*) can be treated as anchored or not.
6604 */
6605
6606 cd->top_backref = 0;
6607 cd->backref_map = 0;
6608
6609 /* Reflect pattern for debugging output */
6610
6611 DPRINTF(("------------------------------------------------------------------\n"));
6612 DPRINTF(("%s\n", pattern));
6613
6614 /* Pretend to compile the pattern while actually just accumulating the length
6615 of memory required. This behaviour is triggered by passing a non-NULL final
6616 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6617 to compile parts of the pattern into; the compiled code is discarded when it is
6618 no longer needed, so hopefully this workspace will never overflow, though there
6619 is a test for its doing so. */
6620
6621 cd->bracount = cd->final_bracount = 0;
6622 cd->names_found = 0;
6623 cd->name_entry_size = 0;
6624 cd->name_table = NULL;
6625 cd->start_workspace = cworkspace;
6626 cd->start_code = cworkspace;
6627 cd->hwm = cworkspace;
6628 cd->start_pattern = (const uschar *)pattern;
6629 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6630 cd->req_varyopt = 0;
6631 cd->external_options = options;
6632 cd->external_flags = 0;
6633 cd->open_caps = NULL;
6634
6635 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6636 don't need to look at the result of the function here. The initial options have
6637 been put into the cd block so that they can be changed if an option setting is
6638 found within the regex right at the beginning. Bringing initial option settings
6639 outside can help speed up starting point checks. */
6640
6641 ptr += skipatstart;
6642 code = cworkspace;
6643 *code = OP_BRA;
6644 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6645 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6646 &length);
6647 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6648
6649 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6650 cd->hwm - cworkspace));
6651
6652 if (length > MAX_PATTERN_SIZE)
6653 {
6654 errorcode = ERR20;
6655 goto PCRE_EARLY_ERROR_RETURN;
6656 }
6657
6658 /* Compute the size of data block needed and get it, either from malloc or