/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 500 - (show annotations) (download)
Sat Mar 6 19:00:29 2010 UTC (4 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 222249 byte(s)
Fix bugs with \K in atomic groups, subroutines, and assertions.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57 also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. */
59
60 #ifdef PCRE_DEBUG
61 #include "pcre_printint.src"
62 #endif
63
64
/* Macro for setting individual bits in class bitmaps. Bit number b of the map
is held in byte b/8, at bit position b%8 within that byte. Both arguments and
the expansion as a whole are parenthesized, so that an expression argument
such as SETBIT(map, c + 1) selects the intended byte and bit. Note that b is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))

/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)
75
76
77 /*************************************************
78 * Code parameters and static tables *
79 *************************************************/
80
81 /* This value specifies the size of stack workspace that is used during the
82 first pre-compile phase that determines how much memory is required. The regex
83 is partly compiled into this space, but the compiled parts are discarded as
84 soon as they can be, so that hopefully there will never be an overrun. The code
85 does, however, check for an overrun. The largest amount I've seen used is 218,
86 so this number is very generous.
87
88 The same workspace is used during the second, actual compile phase for
89 remembering forward references to groups so that they can be filled in at the
90 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91 is 4 there is plenty of room. */
92
/* NOTE(review): the commentary above measures workspace use in bytes (entries of LINK_SIZE bytes each), so this is presumably a byte count - confirm where the workspace is declared. */
93 #define COMPILE_WORK_SIZE (4096)
94
95
96 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
97 are simple data values; negative values are for special things like \d and so
98 on. Zero means further processing is needed (for things like \x), or the escape
99 is invalid. */
100
101 #ifndef EBCDIC
102
103 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
104 in UTF-8 mode. */
105
/* The table is indexed by (c - CHAR_0) in check_escape(), which only consults it for CHAR_0 <= c <= CHAR_z. It holds 75 entries, two per row, the first row being for '0' and '1' and the final single entry for 'z'. */
106 static const short int escapes[] = {
107 0, 0,
108 0, 0,
109 0, 0,
110 0, 0,
111 0, 0,
112 CHAR_COLON, CHAR_SEMICOLON,
113 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
114 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
115 CHAR_COMMERCIAL_AT, -ESC_A,
116 -ESC_B, -ESC_C,
117 -ESC_D, -ESC_E,
118 0, -ESC_G,
119 -ESC_H, 0,
120 0, -ESC_K,
121 0, 0,
122 0, 0,
123 -ESC_P, -ESC_Q,
124 -ESC_R, -ESC_S,
125 0, 0,
126 -ESC_V, -ESC_W,
127 -ESC_X, 0,
128 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
129 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
130 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
131 CHAR_GRAVE_ACCENT, 7,
132 -ESC_b, 0,
133 -ESC_d, ESC_e,
134 ESC_f, 0,
135 -ESC_h, 0,
136 0, -ESC_k,
137 0, 0,
138 ESC_n, 0,
139 -ESC_p, 0,
140 ESC_r, -ESC_s,
141 ESC_tee, 0,
142 -ESC_v, -ESC_w,
143 0, 0,
144 -ESC_z
145 };
146
147 #else
148
149 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
150
/* The table is indexed by (c - 0x48) in check_escape(). Each row holds eight entries, and the comment at the start of a row gives, in hexadecimal, the character code that the row's first entry is for. */
151 static const short int escapes[] = {
152 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
153 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
154 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
155 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
156 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
157 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
158 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
159 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
160 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
161 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
162 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
163 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
164 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
165 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
166 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
167 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
168 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
169 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
170 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
171 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
172 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
173 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
174 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
175 };
176 #endif
177
178
179 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
180 searched linearly. Put all the names into a single string, in order to reduce
181 the number of relocations when a shared library is dynamically linked. The
182 string is built from string macros so that it works in UTF-8 mode on EBCDIC
183 platforms. */
184
185 typedef struct verbitem {
186 int len;
187 int op;
188 } verbitem;
189
190 static const char verbnames[] =
191 STRING_ACCEPT0
192 STRING_COMMIT0
193 STRING_F0
194 STRING_FAIL0
195 STRING_PRUNE0
196 STRING_SKIP0
197 STRING_THEN;
198
199 static const verbitem verbs[] = {
200 { 6, OP_ACCEPT },
201 { 6, OP_COMMIT },
202 { 1, OP_FAIL },
203 { 4, OP_FAIL },
204 { 5, OP_PRUNE },
205 { 4, OP_SKIP },
206 { 4, OP_THEN }
207 };
208
209 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
210
211
212 /* Tables of names of POSIX character classes and their lengths. The names are
213 now all in a single string, to reduce the number of relocations when a shared
214 library is dynamically loaded. The list of lengths is terminated by a zero
215 length entry. The first three must be alpha, lower, upper, as this is assumed
216 for handling case independence. */
217
/* Fourteen name macros are concatenated here, and posix_name_lengths holds fourteen lengths in the same order followed by the terminating zero entry. */
218 static const char posix_names[] =
219 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
220 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
221 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
222 STRING_word0 STRING_xdigit;
223
224 static const uschar posix_name_lengths[] = {
225 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
226
227 /* Table of class bit maps for each POSIX class. Each class is formed from a
228 base map, with an optional addition or removal of another map. Then, for some
229 classes, there is some additional tweaking: for [:blank:] the vertical space
230 characters are removed, and for [:alpha:] and [:alnum:] the underscore
231 character is removed. The triples in the table consist of the base map offset,
232 second map offset or -1 if no second map, and a non-negative value for map
233 addition or a negative value for map subtraction (if there are two maps). The
234 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
235 remove vertical space characters, 2 => remove underscore. */
236
/* One triple per row, fourteen rows in all, in the same order as the names in posix_names (the class name is shown at the end of each row). */
237 static const int posix_class_maps[] = {
238 cbit_word, cbit_digit, -2, /* alpha */
239 cbit_lower, -1, 0, /* lower */
240 cbit_upper, -1, 0, /* upper */
241 cbit_word, -1, 2, /* alnum - word without underscore */
242 cbit_print, cbit_cntrl, 0, /* ascii */
243 cbit_space, -1, 1, /* blank - a GNU extension */
244 cbit_cntrl, -1, 0, /* cntrl */
245 cbit_digit, -1, 0, /* digit */
246 cbit_graph, -1, 0, /* graph */
247 cbit_print, -1, 0, /* print */
248 cbit_punct, -1, 0, /* punct */
249 cbit_space, -1, 0, /* space */
250 cbit_word, -1, 0, /* word - a Perl extension */
251 cbit_xdigit,-1, 0 /* xdigit */
252 };
253
254
/* STRING(a) turns its argument into a string literal using the preprocessor's # operator. XSTRING(s) goes through one extra level of macro call so that an argument which is itself a macro is replaced by its value before it is stringified; error_texts uses it to embed the values of MAX_NAME_SIZE and MAX_NAME_COUNT in messages. */
255 #define STRING(a) # a
256 #define XSTRING(s) STRING(s)
257
258 /* The texts of compile-time error messages. These are "char *" because they
259 are passed to the outside world. Do not ever re-use any error number, because
260 they are documented. Always add a new error instead. Messages marked DEAD below
261 are no longer used. This used to be a table of strings, but in order to reduce
262 the number of relocations needed when a shared library is loaded dynamically,
263 it is now one long string. We cannot use a table of offsets, because the
264 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
265 simply count through to the one we want - this isn't a performance issue
266 because these strings are used only when there is a compilation error.
267
268 Each substring ends with \0 to insert a null character. This includes the final
269 substring, so that the whole string ends with \0\0, which can be detected when
270 counting through. */
271
/* find_error_text() locates message n by skipping over n zero-terminated messages from the start of this string. The numeric comments (5, 10, ...) give the number of the message that immediately follows them; "no error" is message 0. */
272 static const char error_texts[] =
273 "no error\0"
274 "\\ at end of pattern\0"
275 "\\c at end of pattern\0"
276 "unrecognized character follows \\\0"
277 "numbers out of order in {} quantifier\0"
278 /* 5 */
279 "number too big in {} quantifier\0"
280 "missing terminating ] for character class\0"
281 "invalid escape sequence in character class\0"
282 "range out of order in character class\0"
283 "nothing to repeat\0"
284 /* 10 */
285 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
286 "internal error: unexpected repeat\0"
287 "unrecognized character after (? or (?-\0"
288 "POSIX named classes are supported only within a class\0"
289 "missing )\0"
290 /* 15 */
291 "reference to non-existent subpattern\0"
292 "erroffset passed as NULL\0"
293 "unknown option bit(s) set\0"
294 "missing ) after comment\0"
295 "parentheses nested too deeply\0" /** DEAD **/
296 /* 20 */
297 "regular expression is too large\0"
298 "failed to get memory\0"
299 "unmatched parentheses\0"
300 "internal error: code overflow\0"
301 "unrecognized character after (?<\0"
302 /* 25 */
303 "lookbehind assertion is not fixed length\0"
304 "malformed number or name after (?(\0"
305 "conditional group contains more than two branches\0"
306 "assertion expected after (?(\0"
307 "(?R or (?[+-]digits must be followed by )\0"
308 /* 30 */
309 "unknown POSIX class name\0"
310 "POSIX collating elements are not supported\0"
311 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
312 "spare error\0" /** DEAD **/
313 "character value in \\x{...} sequence is too large\0"
314 /* 35 */
315 "invalid condition (?(0)\0"
316 "\\C not allowed in lookbehind assertion\0"
317 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
318 "number after (?C is > 255\0"
319 "closing ) for (?C expected\0"
320 /* 40 */
321 "recursive call could loop indefinitely\0"
322 "unrecognized character after (?P\0"
323 "syntax error in subpattern name (missing terminator)\0"
324 "two named subpatterns have the same name\0"
325 "invalid UTF-8 string\0"
326 /* 45 */
327 "support for \\P, \\p, and \\X has not been compiled\0"
328 "malformed \\P or \\p sequence\0"
329 "unknown property name after \\P or \\p\0"
330 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
331 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
332 /* 50 */
333 "repeated subpattern is too long\0" /** DEAD **/
334 "octal value is greater than \\377 (not in UTF-8 mode)\0"
335 "internal error: overran compiling workspace\0"
336 "internal error: previously-checked referenced subpattern not found\0"
337 "DEFINE group contains more than one branch\0"
338 /* 55 */
339 "repeating a DEFINE group is not allowed\0"
340 "inconsistent NEWLINE options\0"
341 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
342 "a numbered reference must not be zero\0"
343 "(*VERB) with an argument is not supported\0"
344 /* 60 */
345 "(*VERB) not recognized\0"
346 "number is too big\0"
347 "subpattern name expected\0"
348 "digit expected after (?+\0"
349 "] is an invalid data character in JavaScript compatibility mode\0"
350 /* 65 */
351 "different names for subpatterns of the same number are not allowed\0";
352
353 /* Table to identify digits and hex digits. This is used when compiling
354 patterns. Note that the tables in chartables are dependent on the locale, and
355 may mark arbitrary characters as digits - but the PCRE compiling code expects
356 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
357 a private table here. It costs 256 bytes, but it is a lot faster than doing
358 character value tests (at least in some simple cases I timed), and in some
359 applications one wants PCRE to compile efficiently as well as match
360 efficiently.
361
362 For convenience, we use the same bit definitions as in chartables:
363
364 0x04 decimal digit
365 0x08 hexadecimal digit
366
367 Then we can use ctype_digit and ctype_xdigit in the code. */
368
369 #ifndef EBCDIC
370
371 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
372 UTF-8 mode. */
373
/* 256 entries (32 rows of 8), indexed directly by a byte taken from the pattern, for example digitab[*p] in check_escape(). Only '0'-'9' (0x0c: decimal and hex digit) and 'A'-'F' / 'a'-'f' (0x08: hex digit only) are non-zero. */
374 static const unsigned char digitab[] =
375 {
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
382 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
383 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
384 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
388 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
395 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
396 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
397 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
399 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
406 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
408
409 #else
410
411 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
412
/* 256 entries (32 rows of 8), indexed directly by a byte taken from the pattern. The right-hand column of the row comments gives the hexadecimal code of the first entry of every second row. Entries of 0x0c mark decimal digits and entries of 0x08 mark the hexadecimal letters. */
413 static const unsigned char digitab[] =
414 {
415 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
416 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
417 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
418 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
419 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
420 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
421 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
422 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
423 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
424 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
425 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
426 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
427 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
428 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
429 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
430 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
431 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
432 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
433 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
434 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
435 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
436 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
437 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
438 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
439 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
440 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
441 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
442 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
443 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
444 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
445 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
446 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
447
/* 256 entries (32 rows of 8), indexed by character code. In this file check_escape() tests ebcdic_chartab[c] against the mask 0x0E to decide whether the character after a backslash is alphanumeric. NOTE(review): the meaning of the individual bits is presumably that of the ctype_xxx bits in chartables - confirm against pcre_internal.h. */
448 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
449 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
450 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
451 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
452 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
453 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
454 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
455 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
456 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
457 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
458 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
459 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
460 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
461 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
462 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
463 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
464 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
465 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
466 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
467 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
468 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
469 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
470 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
471 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
472 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
473 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
474 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
475 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
476 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
477 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
478 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
479 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
480 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
481 #endif
482
483
484 /* Forward declaration (prototype only, no body) of compile_regex(), needed to allow mutual recursion */
485
486 static BOOL
487 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
488 int *, int *, branch_chain *, compile_data *, int *);
489
490
491
492 /*************************************************
493 * Find an error text *
494 *************************************************/
495
496 /* The error texts are now all in one long string, to save on relocations. As
497 some of the text is of unknown length, we can't use a table of offsets.
498 Instead, just count through the strings. This is not a performance issue
499 because it happens only when there has been a compilation error.
500
501 Argument: the error number
502 Returns: pointer to the error string
503 */
504
505 static const char *
506 find_error_text(int n)
507 {
508 const char *s = error_texts;
509 for (; n > 0; n--)
510 {
511 while (*s++ != 0) {};
512 if (*s == 0) return "Error text not found (please report)";
513 }
514 return s;
515 }
516
517
518 /*************************************************
519 * Handle escapes *
520 *************************************************/
521
522 /* This function is called when a \ has been encountered. It either returns a
523 positive value for a simple escape such as \n, or a negative value which
524 encodes one of the more complicated things such as \d. A backreference to group
525 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
526 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
527 ptr is pointing at the \. On exit, it is on the final character of the escape
528 sequence.
529
530 Arguments:
531 ptrptr points to the pattern position pointer
532 errorcodeptr points to the errorcode variable
533 bracount number of previous extracting brackets
534 options the options bits
535 isclass TRUE if inside a character class
536
537 Returns: zero or positive => a data character
538 negative => a special escape sequence
539 on error, errorcodeptr is set
540 */
541
542 static int
543 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
544 int options, BOOL isclass)
545 {
546 BOOL utf8 = (options & PCRE_UTF8) != 0;
547 const uschar *ptr = *ptrptr + 1;
548 int c, i;
549
550 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
551 ptr--; /* Set pointer back to the last byte */
552
553 /* If backslash is at the end of the pattern, it's an error. */
554
555 if (c == 0) *errorcodeptr = ERR1;
556
557 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
558 in a table. A non-zero result is something that can be returned immediately.
559 Otherwise further processing may be required. */
560
561 #ifndef EBCDIC /* ASCII/UTF-8 coding */
562 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
563 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
564
565 #else /* EBCDIC coding */
566 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
567 else if ((i = escapes[c - 0x48]) != 0) c = i;
568 #endif
569
570 /* Escapes that need further processing, or are illegal. */
571
572 else
573 {
574 const uschar *oldptr;
575 BOOL braced, negated;
576
577 switch (c)
578 {
579 /* A number of Perl escapes are not handled by PCRE. We give an explicit
580 error. */
581
582 case CHAR_l:
583 case CHAR_L:
584 case CHAR_N:
585 case CHAR_u:
586 case CHAR_U:
587 *errorcodeptr = ERR37;
588 break;
589
590 /* \g must be followed by one of a number of specific things:
591
592 (1) A number, either plain or braced. If positive, it is an absolute
593 backreference. If negative, it is a relative backreference. This is a Perl
594 5.10 feature.
595
596 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
597 is part of Perl's movement towards a unified syntax for back references. As
598 this is synonymous with \k{name}, we fudge it up by pretending it really
599 was \k.
600
601 (3) For Oniguruma compatibility we also support \g followed by a name or a
602 number either in angle brackets or in single quotes. However, these are
603 (possibly recursive) subroutine calls, _not_ backreferences. Just return
604 the -ESC_g code (cf \k). */
605
606 case CHAR_g:
607 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
608 {
609 c = -ESC_g;
610 break;
611 }
612
613 /* Handle the Perl-compatible cases */
614
615 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
616 {
617 const uschar *p;
618 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
619 if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
620 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
621 {
622 c = -ESC_k;
623 break;
624 }
625 braced = TRUE;
626 ptr++;
627 }
628 else braced = FALSE;
629
630 if (ptr[1] == CHAR_MINUS)
631 {
632 negated = TRUE;
633 ptr++;
634 }
635 else negated = FALSE;
636
637 c = 0;
638 while ((digitab[ptr[1]] & ctype_digit) != 0)
639 c = c * 10 + *(++ptr) - CHAR_0;
640
641 if (c < 0) /* Integer overflow */
642 {
643 *errorcodeptr = ERR61;
644 break;
645 }
646
647 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
648 {
649 *errorcodeptr = ERR57;
650 break;
651 }
652
653 if (c == 0)
654 {
655 *errorcodeptr = ERR58;
656 break;
657 }
658
659 if (negated)
660 {
661 if (c > bracount)
662 {
663 *errorcodeptr = ERR15;
664 break;
665 }
666 c = bracount - (c - 1);
667 }
668
669 c = -(ESC_REF + c);
670 break;
671
672 /* The handling of escape sequences consisting of a string of digits
673 starting with one that is not zero is not straightforward. By experiment,
674 the way Perl works seems to be as follows:
675
676 Outside a character class, the digits are read as a decimal number. If the
677 number is less than 10, or if there are that many previous extracting
678 left brackets, then it is a back reference. Otherwise, up to three octal
679 digits are read to form an escaped byte. Thus \123 is likely to be octal
680 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
681 value is greater than 377, the least significant 8 bits are taken. Inside a
682 character class, \ followed by a digit is always an octal number. */
683
684 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
685 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
686
687 if (!isclass)
688 {
689 oldptr = ptr;
690 c -= CHAR_0;
691 while ((digitab[ptr[1]] & ctype_digit) != 0)
692 c = c * 10 + *(++ptr) - CHAR_0;
693 if (c < 0) /* Integer overflow */
694 {
695 *errorcodeptr = ERR61;
696 break;
697 }
698 if (c < 10 || c <= bracount)
699 {
700 c = -(ESC_REF + c);
701 break;
702 }
703 ptr = oldptr; /* Put the pointer back and fall through */
704 }
705
706 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
707 generates a binary zero byte and treats the digit as a following literal.
708 Thus we have to pull back the pointer by one. */
709
710 if ((c = *ptr) >= CHAR_8)
711 {
712 ptr--;
713 c = 0;
714 break;
715 }
716
717 /* \0 always starts an octal number, but we may drop through to here with a
718 larger first octal digit. The original code used just to take the least
719 significant 8 bits of octal numbers (I think this is what early Perls used
720 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
721 than 3 octal digits. */
722
723 case CHAR_0:
724 c -= CHAR_0;
725 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
726 c = c * 8 + *(++ptr) - CHAR_0;
727 if (!utf8 && c > 255) *errorcodeptr = ERR51;
728 break;
729
730 /* \x is complicated. \x{ddd} is a character number which can be greater
731 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
732 treated as a data character. */
733
734 case CHAR_x:
735 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
736 {
737 const uschar *pt = ptr + 2;
738 int count = 0;
739
740 c = 0;
741 while ((digitab[*pt] & ctype_xdigit) != 0)
742 {
743 register int cc = *pt++;
744 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
745 count++;
746
747 #ifndef EBCDIC /* ASCII/UTF-8 coding */
748 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
749 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
750 #else /* EBCDIC coding */
751 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
752 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
753 #endif
754 }
755
756 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
757 {
758 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
759 ptr = pt;
760 break;
761 }
762
763 /* If the sequence of hex digits does not end with '}', then we don't
764 recognize this construct; fall through to the normal \x handling. */
765 }
766
767 /* Read just a single-byte hex-defined char */
768
769 c = 0;
770 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
771 {
772 int cc; /* Some compilers don't like */
773 cc = *(++ptr); /* ++ in initializers */
774 #ifndef EBCDIC /* ASCII/UTF-8 coding */
775 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
776 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
777 #else /* EBCDIC coding */
778 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
779 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
780 #endif
781 }
782 break;
783
784 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
785 This coding is ASCII-specific, but then the whole concept of \cx is
786 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
787
788 case CHAR_c:
789 c = *(++ptr);
790 if (c == 0)
791 {
792 *errorcodeptr = ERR2;
793 break;
794 }
795
796 #ifndef EBCDIC /* ASCII/UTF-8 coding */
797 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
798 c ^= 0x40;
799 #else /* EBCDIC coding */
800 if (c >= CHAR_a && c <= CHAR_z) c += 64;
801 c ^= 0xC0;
802 #endif
803 break;
804
805 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
806 other alphanumeric following \ is an error if PCRE_EXTRA was set;
807 otherwise, for Perl compatibility, it is a literal. This code looks a bit
808 odd, but there used to be some cases other than the default, and there may
809 be again in future, so I haven't "optimized" it. */
810
811 default:
812 if ((options & PCRE_EXTRA) != 0) switch(c)
813 {
814 default:
815 *errorcodeptr = ERR3;
816 break;
817 }
818 break;
819 }
820 }
821
822 *ptrptr = ptr;
823 return c;
824 }
825
826
827
828 #ifdef SUPPORT_UCP
829 /*************************************************
830 * Handle \P and \p *
831 *************************************************/
832
833 /* This function is called after \P or \p has been encountered, provided that
834 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
835 pointing at the P or p. On exit, it is pointing at the final character of the
836 escape sequence.
837
838 Argument:
839 ptrptr points to the pattern position pointer
840 negptr points to a boolean that is set TRUE for negation else FALSE
841 dptr points to an int that is set to the detailed property value
842 errorcodeptr points to the error code variable
843
844 Returns: type value from ucp_type_table, or -1 for an invalid type
845 */
846
847 static int
848 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
849 {
850 int c, i, bot, top;
851 const uschar *ptr = *ptrptr;
852 char name[32];
853
854 c = *(++ptr);
855 if (c == 0) goto ERROR_RETURN;
856
857 *negptr = FALSE;
858
859 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
860 negation. */
861
862 if (c == CHAR_LEFT_CURLY_BRACKET)
863 {
864 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
865 {
866 *negptr = TRUE;
867 ptr++;
868 }
869 for (i = 0; i < (int)sizeof(name) - 1; i++)
870 {
871 c = *(++ptr);
872 if (c == 0) goto ERROR_RETURN;
873 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
874 name[i] = c;
875 }
876 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
877 name[i] = 0;
878 }
879
880 /* Otherwise there is just one following character */
881
882 else
883 {
884 name[0] = c;
885 name[1] = 0;
886 }
887
888 *ptrptr = ptr;
889
890 /* Search for a recognized property name using binary chop */
891
892 bot = 0;
893 top = _pcre_utt_size;
894
895 while (bot < top)
896 {
897 i = (bot + top) >> 1;
898 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
899 if (c == 0)
900 {
901 *dptr = _pcre_utt[i].value;
902 return _pcre_utt[i].type;
903 }
904 if (c > 0) bot = i + 1; else top = i;
905 }
906
907 *errorcodeptr = ERR47;
908 *ptrptr = ptr;
909 return -1;
910
911 ERROR_RETURN:
912 *errorcodeptr = ERR46;
913 *ptrptr = ptr;
914 return -1;
915 }
916 #endif
917
918
919
920
921 /*************************************************
922 * Check for counted repeat *
923 *************************************************/
924
925 /* This function is called when a '{' is encountered in a place where it might
926 start a quantifier. It looks ahead to see if it really is a quantifier or not.
927 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
928 where the ddds are digits.
929
930 Arguments:
931 p pointer to the first char after '{'
932
933 Returns: TRUE or FALSE
934 */
935
936 static BOOL
937 is_counted_repeat(const uschar *p)
938 {
939 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
940 while ((digitab[*p] & ctype_digit) != 0) p++;
941 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
942
943 if (*p++ != CHAR_COMMA) return FALSE;
944 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
945
946 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
947 while ((digitab[*p] & ctype_digit) != 0) p++;
948
949 return (*p == CHAR_RIGHT_CURLY_BRACKET);
950 }
951
952
953
954 /*************************************************
955 * Read repeat counts *
956 *************************************************/
957
958 /* Read an item of the form {n,m} and return the values. This is called only
959 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
960 so the syntax is guaranteed to be correct, but we need to check the values.
961
962 Arguments:
963 p pointer to first char after '{'
964 minp pointer to int for min
965 maxp pointer to int for max
966 returned as -1 if no max
967 errorcodeptr points to error code variable
968
969 Returns: pointer to '}' on success;
970 current ptr on error, with errorcodeptr set non-zero
971 */
972
973 static const uschar *
974 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
975 {
976 int min = 0;
977 int max = -1;
978
979 /* Read the minimum value and do a paranoid check: a negative value indicates
980 an integer overflow. */
981
982 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
983 if (min < 0 || min > 65535)
984 {
985 *errorcodeptr = ERR5;
986 return p;
987 }
988
989 /* Read the maximum value if there is one, and again do a paranoid on its size.
990 Also, max must not be less than min. */
991
992 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
993 {
994 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
995 {
996 max = 0;
997 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
998 if (max < 0 || max > 65535)
999 {
1000 *errorcodeptr = ERR5;
1001 return p;
1002 }
1003 if (max < min)
1004 {
1005 *errorcodeptr = ERR4;
1006 return p;
1007 }
1008 }
1009 }
1010
1011 /* Fill in the required variables, and pass back the pointer to the terminating
1012 '}'. */
1013
1014 *minp = min;
1015 *maxp = max;
1016 return p;
1017 }
1018
1019
1020
1021 /*************************************************
1022 * Subroutine for finding forward reference *
1023 *************************************************/
1024
1025 /* This recursive function is called only from find_parens() below. The
1026 top-level call starts at the beginning of the pattern. All other calls must
1027 start at a parenthesis. It scans along a pattern's text looking for capturing
1028 subpatterns, and counting them. If it finds a named pattern that matches the
1029 name it is given, it returns its number. Alternatively, if the name is NULL, it
1030 returns when it reaches a given numbered subpattern. We know that if (?P< is
1031 encountered, the name will be terminated by '>' because that is checked in the
1032 first pass. Recursion is used to keep track of subpatterns that reset the
1033 capturing group numbers - the (?| feature.
1034
1035 Arguments:
1036 ptrptr address of the current character pointer (updated)
1037 cd compile background data
1038 name name to seek, or NULL if seeking a numbered subpattern
1039 lorn name length, or subpattern number if name is NULL
1040 xmode TRUE if we are in /x mode
1041 count pointer to the current capturing subpattern number (updated)
1042
1043 Returns: the number of the named subpattern, or -1 if not found
1044 */
1045
1046 static int
1047 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1048 BOOL xmode, int *count)
1049 {
1050 uschar *ptr = *ptrptr;
1051 int start_count = *count;
1052 int hwm_count = start_count;
1053 BOOL dup_parens = FALSE;
1054
1055 /* If the first character is a parenthesis, check on the type of group we are
1056 dealing with. The very first call may not start with a parenthesis. */
1057
1058 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1059 {
1060 if (ptr[1] == CHAR_QUESTION_MARK &&
1061 ptr[2] == CHAR_VERTICAL_LINE)
1062 {
1063 ptr += 3;
1064 dup_parens = TRUE;
1065 }
1066
1067 /* Handle a normal, unnamed capturing parenthesis */
1068
1069 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1070 {
1071 *count += 1;
1072 if (name == NULL && *count == lorn) return *count;
1073 ptr++;
1074 }
1075
1076 /* Handle a condition. If it is an assertion, just carry on so that it
1077 is processed as normal. If not, skip to the closing parenthesis of the
1078 condition (there can't be any nested parens. */
1079
1080 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1081 {
1082 ptr += 2;
1083 if (ptr[1] != CHAR_QUESTION_MARK)
1084 {
1085 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1086 if (*ptr != 0) ptr++;
1087 }
1088 }
1089
1090 /* We have either (? or (* and not a condition */
1091
1092 else
1093 {
1094 ptr += 2;
1095 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1096
1097 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1098
1099 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1100 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1101 {
1102 int term;
1103 const uschar *thisname;
1104 *count += 1;
1105 if (name == NULL && *count == lorn) return *count;
1106 term = *ptr++;
1107 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1108 thisname = ptr;
1109 while (*ptr != term) ptr++;
1110 if (name != NULL && lorn == ptr - thisname &&
1111 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1112 return *count;
1113 term++;
1114 }
1115 }
1116 }
1117
1118 /* Past any initial parenthesis handling, scan for parentheses or vertical
1119 bars. */
1120
1121 for (; *ptr != 0; ptr++)
1122 {
1123 /* Skip over backslashed characters and also entire \Q...\E */
1124
1125 if (*ptr == CHAR_BACKSLASH)
1126 {
1127 if (*(++ptr) == 0) goto FAIL_EXIT;
1128 if (*ptr == CHAR_Q) for (;;)
1129 {
1130 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1131 if (*ptr == 0) goto FAIL_EXIT;
1132 if (*(++ptr) == CHAR_E) break;
1133 }
1134 continue;
1135 }
1136
1137 /* Skip over character classes; this logic must be similar to the way they
1138 are handled for real. If the first character is '^', skip it. Also, if the
1139 first few characters (either before or after ^) are \Q\E or \E we skip them
1140 too. This makes for compatibility with Perl. Note the use of STR macros to
1141 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1142
1143 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1144 {
1145 BOOL negate_class = FALSE;
1146 for (;;)
1147 {
1148 if (ptr[1] == CHAR_BACKSLASH)
1149 {
1150 if (ptr[2] == CHAR_E)
1151 ptr+= 2;
1152 else if (strncmp((const char *)ptr+2,
1153 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1154 ptr += 4;
1155 else
1156 break;
1157 }
1158 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1159 {
1160 negate_class = TRUE;
1161 ptr++;
1162 }
1163 else break;
1164 }
1165
1166 /* If the next character is ']', it is a data character that must be
1167 skipped, except in JavaScript compatibility mode. */
1168
1169 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1170 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1171 ptr++;
1172
1173 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1174 {
1175 if (*ptr == 0) return -1;
1176 if (*ptr == CHAR_BACKSLASH)
1177 {
1178 if (*(++ptr) == 0) goto FAIL_EXIT;
1179 if (*ptr == CHAR_Q) for (;;)
1180 {
1181 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1182 if (*ptr == 0) goto FAIL_EXIT;
1183 if (*(++ptr) == CHAR_E) break;
1184 }
1185 continue;
1186 }
1187 }
1188 continue;
1189 }
1190
1191 /* Skip comments in /x mode */
1192
1193 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1194 {
1195 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1196 if (*ptr == 0) goto FAIL_EXIT;
1197 continue;
1198 }
1199
1200 /* Check for the special metacharacters */
1201
1202 if (*ptr == CHAR_LEFT_PARENTHESIS)
1203 {
1204 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1205 if (rc > 0) return rc;
1206 if (*ptr == 0) goto FAIL_EXIT;
1207 }
1208
1209 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1210 {
1211 if (dup_parens && *count < hwm_count) *count = hwm_count;
1212 *ptrptr = ptr;
1213 return -1;
1214 }
1215
1216 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1217 {
1218 if (*count > hwm_count) hwm_count = *count;
1219 *count = start_count;
1220 }
1221 }
1222
1223 FAIL_EXIT:
1224 *ptrptr = ptr;
1225 return -1;
1226 }
1227
1228
1229
1230
1231 /*************************************************
1232 * Find forward referenced subpattern *
1233 *************************************************/
1234
1235 /* This function scans along a pattern's text looking for capturing
1236 subpatterns, and counting them. If it finds a named pattern that matches the
1237 name it is given, it returns its number. Alternatively, if the name is NULL, it
1238 returns when it reaches a given numbered subpattern. This is used for forward
1239 references to subpatterns. We used to be able to start this scan from the
1240 current compiling point, using the current count value from cd->bracount, and
1241 do it all in a single loop, but the addition of the possibility of duplicate
1242 subpattern numbers means that we have to scan from the very start, in order to
1243 take account of such duplicates, and to use a recursive function to keep track
1244 of the different types of group.
1245
1246 Arguments:
1247 cd compile background data
1248 name name to seek, or NULL if seeking a numbered subpattern
1249 lorn name length, or subpattern number if name is NULL
1250 xmode TRUE if we are in /x mode
1251
1252 Returns: the number of the found subpattern, or -1 if not found
1253 */
1254
1255 static int
1256 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1257 {
1258 uschar *ptr = (uschar *)cd->start_pattern;
1259 int count = 0;
1260 int rc;
1261
1262 /* If the pattern does not start with an opening parenthesis, the first call
1263 to find_parens_sub() will scan right to the end (if necessary). However, if it
1264 does start with a parenthesis, find_parens_sub() will return when it hits the
1265 matching closing parens. That is why we have to have a loop. */
1266
1267 for (;;)
1268 {
1269 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1270 if (rc > 0 || *ptr++ == 0) break;
1271 }
1272
1273 return rc;
1274 }
1275
1276
1277
1278
1279 /*************************************************
1280 * Find first significant op code *
1281 *************************************************/
1282
1283 /* This is called by several functions that scan a compiled expression looking
1284 for a fixed first character, or an anchoring op code etc. It skips over things
1285 that do not influence this. For some calls, a change of option is important.
1286 For some calls, it makes sense to skip negative forward and all backward
1287 assertions, and also the \b assertion; for others it does not.
1288
1289 Arguments:
1290 code pointer to the start of the group
1291 options pointer to external options
1292 optbit the option bit whose changing is significant, or
1293 zero if none are
1294 skipassert TRUE if certain assertions are to be skipped
1295
1296 Returns: pointer to the first significant opcode
1297 */
1298
1299 static const uschar*
1300 first_significant_code(const uschar *code, int *options, int optbit,
1301 BOOL skipassert)
1302 {
1303 for (;;)
1304 {
1305 switch ((int)*code)
1306 {
1307 case OP_OPT:
1308 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1309 *options = (int)code[1];
1310 code += 2;
1311 break;
1312
1313 case OP_ASSERT_NOT:
1314 case OP_ASSERTBACK:
1315 case OP_ASSERTBACK_NOT:
1316 if (!skipassert) return code;
1317 do code += GET(code, 1); while (*code == OP_ALT);
1318 code += _pcre_OP_lengths[*code];
1319 break;
1320
1321 case OP_WORD_BOUNDARY:
1322 case OP_NOT_WORD_BOUNDARY:
1323 if (!skipassert) return code;
1324 /* Fall through */
1325
1326 case OP_CALLOUT:
1327 case OP_CREF:
1328 case OP_NCREF:
1329 case OP_RREF:
1330 case OP_NRREF:
1331 case OP_DEF:
1332 code += _pcre_OP_lengths[*code];
1333 break;
1334
1335 default:
1336 return code;
1337 }
1338 }
1339 /* Control never reaches here */
1340 }
1341
1342
1343
1344
1345 /*************************************************
1346 * Find the fixed length of a branch *
1347 *************************************************/
1348
1349 /* Scan a branch and compute the fixed length of subject that will match it,
1350 if the length is fixed. This is needed for dealing with backward assertions.
1351 In UTF8 mode, the result is in characters rather than bytes. The branch is
1352 temporarily terminated with OP_END when this function is called.
1353
1354 This function is called when a backward assertion is encountered, so that if it
1355 fails, the error message can point to the correct place in the pattern.
1356 However, we cannot do this when the assertion contains subroutine calls,
1357 because they can be forward references. We solve this by remembering this case
1358 and doing the check at the end; a flag specifies which mode we are running in.
1359
1360 Arguments:
1361 code points to the start of the pattern (the bracket)
1362 options the compiling options
1363 atend TRUE if called when the pattern is complete
1364 cd the "compile data" structure
1365
1366 Returns: the fixed length,
1367 or -1 if there is no fixed length,
1368 or -2 if \C was encountered
1369 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1370 */
1371
1372 static int
1373 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1374 {
1375 int length = -1;
1376
1377 register int branchlength = 0;
1378 register uschar *cc = code + 1 + LINK_SIZE;
1379
1380 /* Scan along the opcodes for this branch. If we get to the end of the
1381 branch, check the length against that of the other branches. */
1382
1383 for (;;)
1384 {
1385 int d;
1386 uschar *ce, *cs;
1387 register int op = *cc;
1388 switch (op)
1389 {
1390 case OP_CBRA:
1391 case OP_BRA:
1392 case OP_ONCE:
1393 case OP_COND:
1394 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1395 if (d < 0) return d;
1396 branchlength += d;
1397 do cc += GET(cc, 1); while (*cc == OP_ALT);
1398 cc += 1 + LINK_SIZE;
1399 break;
1400
1401 /* Reached end of a branch; if it's a ket it is the end of a nested
1402 call. If it's ALT it is an alternation in a nested call. If it is
1403 END it's the end of the outer call. All can be handled by the same code. */
1404
1405 case OP_ALT:
1406 case OP_KET:
1407 case OP_KETRMAX:
1408 case OP_KETRMIN:
1409 case OP_END:
1410 if (length < 0) length = branchlength;
1411 else if (length != branchlength) return -1;
1412 if (*cc != OP_ALT) return length;
1413 cc += 1 + LINK_SIZE;
1414 branchlength = 0;
1415 break;
1416
1417 /* A true recursion implies not fixed length, but a subroutine call may
1418 be OK. If the subroutine is a forward reference, we can't deal with
1419 it until the end of the pattern, so return -3. */
1420
1421 case OP_RECURSE:
1422 if (!atend) return -3;
1423 cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1424 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1425 if (cc > cs && cc < ce) return -1; /* Recursion */
1426 d = find_fixedlength(cs + 2, options, atend, cd);
1427 if (d < 0) return d;
1428 branchlength += d;
1429 cc += 1 + LINK_SIZE;
1430 break;
1431
1432 /* Skip over assertive subpatterns */
1433
1434 case OP_ASSERT:
1435 case OP_ASSERT_NOT:
1436 case OP_ASSERTBACK:
1437 case OP_ASSERTBACK_NOT:
1438 do cc += GET(cc, 1); while (*cc == OP_ALT);
1439 /* Fall through */
1440
1441 /* Skip over things that don't match chars */
1442
1443 case OP_REVERSE:
1444 case OP_CREF:
1445 case OP_NCREF:
1446 case OP_RREF:
1447 case OP_NRREF:
1448 case OP_DEF:
1449 case OP_OPT:
1450 case OP_CALLOUT:
1451 case OP_SOD:
1452 case OP_SOM:
1453 case OP_SET_SOM:
1454 case OP_EOD:
1455 case OP_EODN:
1456 case OP_CIRC:
1457 case OP_DOLL:
1458 case OP_NOT_WORD_BOUNDARY:
1459 case OP_WORD_BOUNDARY:
1460 cc += _pcre_OP_lengths[*cc];
1461 break;
1462
1463 /* Handle literal characters */
1464
1465 case OP_CHAR:
1466 case OP_CHARNC:
1467 case OP_NOT:
1468 branchlength++;
1469 cc += 2;
1470 #ifdef SUPPORT_UTF8
1471 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1472 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1473 #endif
1474 break;
1475
1476 /* Handle exact repetitions. The count is already in characters, but we
1477 need to skip over a multibyte character in UTF8 mode. */
1478
1479 case OP_EXACT:
1480 branchlength += GET2(cc,1);
1481 cc += 4;
1482 #ifdef SUPPORT_UTF8
1483 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1484 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1485 #endif
1486 break;
1487
1488 case OP_TYPEEXACT:
1489 branchlength += GET2(cc,1);
1490 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1491 cc += 4;
1492 break;
1493
1494 /* Handle single-char matchers */
1495
1496 case OP_PROP:
1497 case OP_NOTPROP:
1498 cc += 2;
1499 /* Fall through */
1500
1501 case OP_NOT_DIGIT:
1502 case OP_DIGIT:
1503 case OP_NOT_WHITESPACE:
1504 case OP_WHITESPACE:
1505 case OP_NOT_WORDCHAR:
1506 case OP_WORDCHAR:
1507 case OP_ANY:
1508 case OP_ALLANY:
1509 branchlength++;
1510 cc++;
1511 break;
1512
1513 /* The single-byte matcher isn't allowed */
1514
1515 case OP_ANYBYTE:
1516 return -2;
1517
1518 /* Check a class for variable quantification */
1519
1520 #ifdef SUPPORT_UTF8
1521 case OP_XCLASS:
1522 cc += GET(cc, 1) - 33;
1523 /* Fall through */
1524 #endif
1525
1526 case OP_CLASS:
1527 case OP_NCLASS:
1528 cc += 33;
1529
1530 switch (*cc)
1531 {
1532 case OP_CRSTAR:
1533 case OP_CRMINSTAR:
1534 case OP_CRQUERY:
1535 case OP_CRMINQUERY:
1536 return -1;
1537
1538 case OP_CRRANGE:
1539 case OP_CRMINRANGE:
1540 if (GET2(cc,1) != GET2(cc,3)) return -1;
1541 branchlength += GET2(cc,1);
1542 cc += 5;
1543 break;
1544
1545 default:
1546 branchlength++;
1547 }
1548 break;
1549
1550 /* Anything else is variable length */
1551
1552 default:
1553 return -1;
1554 }
1555 }
1556 /* Control never gets here */
1557 }
1558
1559
1560
1561
1562 /*************************************************
1563 * Scan compiled regex for specific bracket *
1564 *************************************************/
1565
1566 /* This little function scans through a compiled pattern until it finds a
1567 capturing bracket with the given number, or, if the number is negative, an
1568 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1569 so that it can be called from pcre_study() when finding the minimum matching
1570 length.
1571
1572 Arguments:
1573 code points to start of expression
1574 utf8 TRUE in UTF-8 mode
1575 number the required bracket number or negative to find a lookbehind
1576
1577 Returns: pointer to the opcode for the bracket, or NULL if not found
1578 */
1579
1580 const uschar *
1581 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1582 {
1583 for (;;)
1584 {
1585 register int c = *code;
1586 if (c == OP_END) return NULL;
1587
1588 /* XCLASS is used for classes that cannot be represented just by a bit
1589 map. This includes negated single high-valued characters. The length in
1590 the table is zero; the actual length is stored in the compiled code. */
1591
1592 if (c == OP_XCLASS) code += GET(code, 1);
1593
1594 /* Handle recursion */
1595
1596 else if (c == OP_REVERSE)
1597 {
1598 if (number < 0) return (uschar *)code;
1599 code += _pcre_OP_lengths[c];
1600 }
1601
1602 /* Handle capturing bracket */
1603
1604 else if (c == OP_CBRA)
1605 {
1606 int n = GET2(code, 1+LINK_SIZE);
1607 if (n == number) return (uschar *)code;
1608 code += _pcre_OP_lengths[c];
1609 }
1610
1611 /* Otherwise, we can get the item's length from the table, except that for
1612 repeated character types, we have to test for \p and \P, which have an extra
1613 two bytes of parameters. */
1614
1615 else
1616 {
1617 switch(c)
1618 {
1619 case OP_TYPESTAR:
1620 case OP_TYPEMINSTAR:
1621 case OP_TYPEPLUS:
1622 case OP_TYPEMINPLUS:
1623 case OP_TYPEQUERY:
1624 case OP_TYPEMINQUERY:
1625 case OP_TYPEPOSSTAR:
1626 case OP_TYPEPOSPLUS:
1627 case OP_TYPEPOSQUERY:
1628 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1629 break;
1630
1631 case OP_TYPEUPTO:
1632 case OP_TYPEMINUPTO:
1633 case OP_TYPEEXACT:
1634 case OP_TYPEPOSUPTO:
1635 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1636 break;
1637 }
1638
1639 /* Add in the fixed length from the table */
1640
1641 code += _pcre_OP_lengths[c];
1642
1643 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1644 a multi-byte character. The length in the table is a minimum, so we have to
1645 arrange to skip the extra bytes. */
1646
1647 #ifdef SUPPORT_UTF8
1648 if (utf8) switch(c)
1649 {
1650 case OP_CHAR:
1651 case OP_CHARNC:
1652 case OP_EXACT:
1653 case OP_UPTO:
1654 case OP_MINUPTO:
1655 case OP_POSUPTO:
1656 case OP_STAR:
1657 case OP_MINSTAR:
1658 case OP_POSSTAR:
1659 case OP_PLUS:
1660 case OP_MINPLUS:
1661 case OP_POSPLUS:
1662 case OP_QUERY:
1663 case OP_MINQUERY:
1664 case OP_POSQUERY:
1665 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1666 break;
1667 }
1668 #else
1669 (void)(utf8); /* Keep compiler happy by referencing function argument */
1670 #endif
1671 }
1672 }
1673 }
1674
1675
1676
1677 /*************************************************
1678 * Scan compiled regex for recursion reference *
1679 *************************************************/
1680
1681 /* This little function scans through a compiled pattern until it finds an
1682 instance of OP_RECURSE.
1683
1684 Arguments:
1685 code points to start of expression
1686 utf8 TRUE in UTF-8 mode
1687
1688 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1689 */
1690
1691 static const uschar *
1692 find_recurse(const uschar *code, BOOL utf8)
1693 {
1694 for (;;)
1695 {
1696 register int c = *code;
1697 if (c == OP_END) return NULL;
1698 if (c == OP_RECURSE) return code;
1699
1700 /* XCLASS is used for classes that cannot be represented just by a bit
1701 map. This includes negated single high-valued characters. The length in
1702 the table is zero; the actual length is stored in the compiled code. */
1703
1704 if (c == OP_XCLASS) code += GET(code, 1);
1705
1706 /* Otherwise, we can get the item's length from the table, except that for
1707 repeated character types, we have to test for \p and \P, which have an extra
1708 two bytes of parameters. */
1709
1710 else
1711 {
1712 switch(c)
1713 {
1714 case OP_TYPESTAR:
1715 case OP_TYPEMINSTAR:
1716 case OP_TYPEPLUS:
1717 case OP_TYPEMINPLUS:
1718 case OP_TYPEQUERY:
1719 case OP_TYPEMINQUERY:
1720 case OP_TYPEPOSSTAR:
1721 case OP_TYPEPOSPLUS:
1722 case OP_TYPEPOSQUERY:
1723 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1724 break;
1725
1726 case OP_TYPEPOSUPTO:
1727 case OP_TYPEUPTO:
1728 case OP_TYPEMINUPTO:
1729 case OP_TYPEEXACT:
1730 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1731 break;
1732 }
1733
1734 /* Add in the fixed length from the table */
1735
1736 code += _pcre_OP_lengths[c];
1737
1738 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1739 by a multi-byte character. The length in the table is a minimum, so we have
1740 to arrange to skip the extra bytes. */
1741
1742 #ifdef SUPPORT_UTF8
1743 if (utf8) switch(c)
1744 {
1745 case OP_CHAR:
1746 case OP_CHARNC:
1747 case OP_EXACT:
1748 case OP_UPTO:
1749 case OP_MINUPTO:
1750 case OP_POSUPTO:
1751 case OP_STAR:
1752 case OP_MINSTAR:
1753 case OP_POSSTAR:
1754 case OP_PLUS:
1755 case OP_MINPLUS:
1756 case OP_POSPLUS:
1757 case OP_QUERY:
1758 case OP_MINQUERY:
1759 case OP_POSQUERY:
1760 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1761 break;
1762 }
1763 #else
1764 (void)(utf8); /* Keep compiler happy by referencing function argument */
1765 #endif
1766 }
1767 }
1768 }
1769
1770
1771
1772 /*************************************************
1773 * Scan compiled branch for non-emptiness *
1774 *************************************************/
1775
1776 /* This function scans through a branch of a compiled pattern to see whether it
1777 can match the empty string or not. It is called from could_be_empty()
1778 below and from compile_branch() when checking for an unlimited repeat of a
1779 group that can match nothing. Note that first_significant_code() skips over
1780 backward and negative forward assertions when its final argument is TRUE. If we
1781 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1782 bracket whose current branch will already have been scanned.
1783
1784 Arguments:
1785 code points to start of search
1786 endcode points to where to stop
1787 utf8 TRUE if in UTF8 mode
1788
1789 Returns: TRUE if what is matched could be empty
1790 */
1791
1792 static BOOL
1793 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1794 {
1795 register int c;
1796 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1797 code < endcode;
1798 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1799 {
1800 const uschar *ccode;
1801
1802 c = *code;
1803
1804 /* Skip over forward assertions; the other assertions are skipped by
1805 first_significant_code() with a TRUE final argument. */
1806
1807 if (c == OP_ASSERT)
1808 {
1809 do code += GET(code, 1); while (*code == OP_ALT);
1810 c = *code;
1811 continue;
1812 }
1813
1814 /* Groups with zero repeats can of course be empty; skip them. */
1815
1816 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1817 {
1818 code += _pcre_OP_lengths[c];
1819 do code += GET(code, 1); while (*code == OP_ALT);
1820 c = *code;
1821 continue;
1822 }
1823
1824 /* For other groups, scan the branches. */
1825
1826 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1827 {
1828 BOOL empty_branch;
1829 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1830
1831 /* If a conditional group has only one branch, there is a second, implied,
1832 empty branch, so just skip over the conditional, because it could be empty.
1833 Otherwise, scan the individual branches of the group. */
1834
1835 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1836 code += GET(code, 1);
1837 else
1838 {
1839 empty_branch = FALSE;
1840 do
1841 {
1842 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1843 empty_branch = TRUE;
1844 code += GET(code, 1);
1845 }
1846 while (*code == OP_ALT);
1847 if (!empty_branch) return FALSE; /* All branches are non-empty */
1848 }
1849
1850 c = *code;
1851 continue;
1852 }
1853
1854 /* Handle the other opcodes */
1855
1856 switch (c)
1857 {
1858 /* Check for quantifiers after a class. XCLASS is used for classes that
1859 cannot be represented just by a bit map. This includes negated single
1860 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1861 actual length is stored in the compiled code, so we must update "code"
1862 here. */
1863
1864 #ifdef SUPPORT_UTF8
1865 case OP_XCLASS:
1866 ccode = code += GET(code, 1);
1867 goto CHECK_CLASS_REPEAT;
1868 #endif
1869
1870 case OP_CLASS:
1871 case OP_NCLASS:
1872 ccode = code + 33;
1873
1874 #ifdef SUPPORT_UTF8
1875 CHECK_CLASS_REPEAT:
1876 #endif
1877
1878 switch (*ccode)
1879 {
1880 case OP_CRSTAR: /* These could be empty; continue */
1881 case OP_CRMINSTAR:
1882 case OP_CRQUERY:
1883 case OP_CRMINQUERY:
1884 break;
1885
1886 default: /* Non-repeat => class must match */
1887 case OP_CRPLUS: /* These repeats aren't empty */
1888 case OP_CRMINPLUS:
1889 return FALSE;
1890
1891 case OP_CRRANGE:
1892 case OP_CRMINRANGE:
1893 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1894 break;
1895 }
1896 break;
1897
1898 /* Opcodes that must match a character */
1899
1900 case OP_PROP:
1901 case OP_NOTPROP:
1902 case OP_EXTUNI:
1903 case OP_NOT_DIGIT:
1904 case OP_DIGIT:
1905 case OP_NOT_WHITESPACE:
1906 case OP_WHITESPACE:
1907 case OP_NOT_WORDCHAR:
1908 case OP_WORDCHAR:
1909 case OP_ANY:
1910 case OP_ALLANY:
1911 case OP_ANYBYTE:
1912 case OP_CHAR:
1913 case OP_CHARNC:
1914 case OP_NOT:
1915 case OP_PLUS:
1916 case OP_MINPLUS:
1917 case OP_POSPLUS:
1918 case OP_EXACT:
1919 case OP_NOTPLUS:
1920 case OP_NOTMINPLUS:
1921 case OP_NOTPOSPLUS:
1922 case OP_NOTEXACT:
1923 case OP_TYPEPLUS:
1924 case OP_TYPEMINPLUS:
1925 case OP_TYPEPOSPLUS:
1926 case OP_TYPEEXACT:
1927 return FALSE;
1928
1929 /* These are going to continue, as they may be empty, but we have to
1930 fudge the length for the \p and \P cases. */
1931
1932 case OP_TYPESTAR:
1933 case OP_TYPEMINSTAR:
1934 case OP_TYPEPOSSTAR:
1935 case OP_TYPEQUERY:
1936 case OP_TYPEMINQUERY:
1937 case OP_TYPEPOSQUERY:
1938 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1939 break;
1940
1941 /* Same for these */
1942
1943 case OP_TYPEUPTO:
1944 case OP_TYPEMINUPTO:
1945 case OP_TYPEPOSUPTO:
1946 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1947 break;
1948
1949 /* End of branch */
1950
1951 case OP_KET:
1952 case OP_KETRMAX:
1953 case OP_KETRMIN:
1954 case OP_ALT:
1955 return TRUE;
1956
1957 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1958 MINUPTO, and POSUPTO may be followed by a multibyte character */
1959
1960 #ifdef SUPPORT_UTF8
1961 case OP_STAR:
1962 case OP_MINSTAR:
1963 case OP_POSSTAR:
1964 case OP_QUERY:
1965 case OP_MINQUERY:
1966 case OP_POSQUERY:
1967 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1968 break;
1969
1970 case OP_UPTO:
1971 case OP_MINUPTO:
1972 case OP_POSUPTO:
1973 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1974 break;
1975 #endif
1976 }
1977 }
1978
1979 return TRUE;
1980 }
1981
1982
1983
1984 /*************************************************
1985 * Scan compiled regex for non-emptiness *
1986 *************************************************/
1987
1988 /* This function is called to check for left recursive calls. We want to check
1989 the current branch of the current pattern to see if it could match the empty
1990 string. If it could, we must look outwards for branches at other levels,
1991 stopping when we pass beyond the bracket which is the subject of the recursion.
1992
1993 Arguments:
1994 code points to start of the recursion
1995 endcode points to where to stop (current RECURSE item)
1996 bcptr points to the chain of current (unclosed) branch starts
1997 utf8 TRUE if in UTF-8 mode
1998
1999 Returns: TRUE if what is matched could be empty
2000 */
2001
2002 static BOOL
2003 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2004 BOOL utf8)
2005 {
2006 while (bcptr != NULL && bcptr->current_branch >= code)
2007 {
2008 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8))
2009 return FALSE;
2010 bcptr = bcptr->outer;
2011 }
2012 return TRUE;
2013 }
2014
2015
2016
2017 /*************************************************
2018 * Check for POSIX class syntax *
2019 *************************************************/
2020
2021 /* This function is called when the sequence "[:" or "[." or "[=" is
2022 encountered in a character class. It checks whether this is followed by a
2023 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2024 reach an unescaped ']' without the special preceding character, return FALSE.
2025
2026 Originally, this function only recognized a sequence of letters between the
2027 terminators, but it seems that Perl recognizes any sequence of characters,
2028 though of course unknown POSIX names are subsequently rejected. Perl gives an
2029 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2030 didn't consider this to be a POSIX class. Likewise for [:1234:].
2031
2032 The problem in trying to be exactly like Perl is in the handling of escapes. We
2033 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2034 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2035 below handles the special case of \], but does not try to do any other escape
2036 processing. This makes it different from Perl for cases such as [:l\ower:]
2037 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2038 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2039 I think.
2040
2041 Arguments:
2042 ptr pointer to the initial [
2043 endptr where to return the end pointer
2044
2045 Returns: TRUE or FALSE
2046 */
2047
2048 static BOOL
2049 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2050 {
2051 int terminator; /* Don't combine these lines; the Solaris cc */
2052 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2053 for (++ptr; *ptr != 0; ptr++)
2054 {
2055 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2056 {
2057 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2058 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2059 {
2060 *endptr = ptr;
2061 return TRUE;
2062 }
2063 }
2064 }
2065 return FALSE;
2066 }
2067
2068
2069
2070
2071 /*************************************************
2072 * Check POSIX class name *
2073 *************************************************/
2074
2075 /* This function is called to check the name given in a POSIX-style class entry
2076 such as [:alnum:].
2077
2078 Arguments:
2079 ptr points to the first letter
2080 len the length of the name
2081
2082 Returns: a value representing the name, or -1 if unknown
2083 */
2084
2085 static int
2086 check_posix_name(const uschar *ptr, int len)
2087 {
2088 const char *pn = posix_names;
2089 register int yield = 0;
2090 while (posix_name_lengths[yield] != 0)
2091 {
2092 if (len == posix_name_lengths[yield] &&
2093 strncmp((const char *)ptr, pn, len) == 0) return yield;
2094 pn += posix_name_lengths[yield] + 1;
2095 yield++;
2096 }
2097 return -1;
2098 }
2099
2100
2101 /*************************************************
2102 * Adjust OP_RECURSE items in repeated group *
2103 *************************************************/
2104
2105 /* OP_RECURSE items contain an offset from the start of the regex to the group
2106 that is referenced. This means that groups can be replicated for fixed
2107 repetition simply by copying (because the recursion is allowed to refer to
2108 earlier groups that are outside the current group). However, when a group is
2109 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2110 inserted before it, after it has been compiled. This means that any OP_RECURSE
2111 items within it that refer to the group itself or any contained groups have to
2112 have their offsets adjusted. That is one of the jobs of this function. Before it
2113 is called, the partially compiled regex must be temporarily terminated with
2114 OP_END.
2115
2116 This function has been extended with the possibility of forward references for
2117 recursions and subroutine calls. It must also check the list of such references
2118 for the group we are dealing with. If it finds that one of the recursions in
2119 the current group is on this list, it adjusts the offset in the list, not the
2120 value in the reference (which is a group number).
2121
2122 Arguments:
2123 group points to the start of the group
2124 adjust the amount by which the group is to be moved
2125 utf8 TRUE in UTF-8 mode
2126 cd contains pointers to tables etc.
2127 save_hwm the hwm forward reference pointer at the start of the group
2128
2129 Returns: nothing
2130 */
2131
2132 static void
2133 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2134 uschar *save_hwm)
2135 {
2136 uschar *ptr = group;
2137
2138 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2139 {
2140 int offset;
2141 uschar *hc;
2142
2143 /* See if this recursion is on the forward reference list. If so, adjust the
2144 reference. */
2145
2146 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2147 {
2148 offset = GET(hc, 0);
2149 if (cd->start_code + offset == ptr + 1)
2150 {
2151 PUT(hc, 0, offset + adjust);
2152 break;
2153 }
2154 }
2155
2156 /* Otherwise, adjust the recursion offset if it's after the start of this
2157 group. */
2158
2159 if (hc >= cd->hwm)
2160 {
2161 offset = GET(ptr, 1);
2162 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2163 }
2164
2165 ptr += 1 + LINK_SIZE;
2166 }
2167 }
2168
2169
2170
2171 /*************************************************
2172 * Insert an automatic callout point *
2173 *************************************************/
2174
2175 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2176 callout points before each pattern item.
2177
2178 Arguments:
2179 code current code pointer
2180 ptr current pattern pointer
2181 cd pointers to tables etc
2182
2183 Returns: new code pointer
2184 */
2185
2186 static uschar *
2187 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2188 {
2189 *code++ = OP_CALLOUT;
2190 *code++ = 255;
2191 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2192 PUT(code, LINK_SIZE, 0); /* Default length */
2193 return code + 2*LINK_SIZE;
2194 }
2195
2196
2197
2198 /*************************************************
2199 * Complete a callout item *
2200 *************************************************/
2201
2202 /* A callout item contains the length of the next item in the pattern, which
2203 we can't fill in till after we have reached the relevant point. This is used
2204 for both automatic and manual callouts.
2205
2206 Arguments:
2207 previous_callout points to previous callout item
2208 ptr current pattern pointer
2209 cd pointers to tables etc
2210
2211 Returns: nothing
2212 */
2213
2214 static void
2215 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2216 {
2217 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2218 PUT(previous_callout, 2 + LINK_SIZE, length);
2219 }
2220
2221
2222
2223 #ifdef SUPPORT_UCP
2224 /*************************************************
2225 * Get othercase range *
2226 *************************************************/
2227
2228 /* This function is passed the start and end of a class range, in UTF-8 mode
2229 with UCP support. It searches up the characters, looking for internal ranges of
2230 characters in the "other" case. Each call returns the next one, updating the
2231 start address.
2232
2233 Arguments:
2234 cptr points to starting character value; updated
2235 d end value
2236 ocptr where to put start of othercase range
2237 odptr where to put end of othercase range
2238
2239 Yield: TRUE when range returned; FALSE when no more
2240 */
2241
2242 static BOOL
2243 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2244 unsigned int *odptr)
2245 {
2246 unsigned int c, othercase, next;
2247
2248 for (c = *cptr; c <= d; c++)
2249 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2250
2251 if (c > d) return FALSE;
2252
2253 *ocptr = othercase;
2254 next = othercase + 1;
2255
2256 for (++c; c <= d; c++)
2257 {
2258 if (UCD_OTHERCASE(c) != next) break;
2259 next++;
2260 }
2261
2262 *odptr = next - 1;
2263 *cptr = c;
2264
2265 return TRUE;
2266 }
2267 #endif /* SUPPORT_UCP */
2268
2269
2270
2271 /*************************************************
2272 * Check if auto-possessifying is possible *
2273 *************************************************/
2274
2275 /* This function is called for unlimited repeats of certain items, to see
2276 whether the next thing could possibly match the repeated item. If not, it makes
2277 sense to automatically possessify the repeated item.
2278
2279 Arguments:
2280 op_code the repeated op code
2281 item data for this item, depends on the opcode
2282 utf8 TRUE in UTF-8 mode
2283 utf8_char used for utf8 character bytes, NULL if not relevant
2284 ptr next character in pattern
2285 options options bits
2286 cd contains pointers to tables etc.
2287
2288 Returns: TRUE if possessifying is wanted
2289 */
2290
2291 static BOOL
2292 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2293 const uschar *ptr, int options, compile_data *cd)
2294 {
2295 int next;
2296
2297 /* Skip whitespace and comments in extended mode */
2298
2299 if ((options & PCRE_EXTENDED) != 0)
2300 {
2301 for (;;)
2302 {
2303 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2304 if (*ptr == CHAR_NUMBER_SIGN)
2305 {
2306 while (*(++ptr) != 0)
2307 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2308 }
2309 else break;
2310 }
2311 }
2312
2313 /* If the next item is one that we can handle, get its value. A non-negative
2314 value is a character, a negative value is an escape value. */
2315
2316 if (*ptr == CHAR_BACKSLASH)
2317 {
2318 int temperrorcode = 0;
2319 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2320 if (temperrorcode != 0) return FALSE;
2321 ptr++; /* Point after the escape sequence */
2322 }
2323
2324 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2325 {
2326 #ifdef SUPPORT_UTF8
2327 if (utf8) { GETCHARINC(next, ptr); } else
2328 #endif
2329 next = *ptr++;
2330 }
2331
2332 else return FALSE;
2333
2334 /* Skip whitespace and comments in extended mode */
2335
2336 if ((options & PCRE_EXTENDED) != 0)
2337 {
2338 for (;;)
2339 {
2340 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2341 if (*ptr == CHAR_NUMBER_SIGN)
2342 {
2343 while (*(++ptr) != 0)
2344 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2345 }
2346 else break;
2347 }
2348 }
2349
2350 /* If the next thing is itself optional, we have to give up. */
2351
2352 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2353 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2354 return FALSE;
2355
2356 /* Now compare the next item with the previous opcode. If the previous is a
2357 positive single character match, "item" either contains the character or, if
2358 "item" is greater than 127 in utf8 mode, the character's bytes are in
2359 utf8_char. */
2360
2361
2362 /* Handle cases when the next item is a character. */
2363
2364 if (next >= 0) switch(op_code)
2365 {
2366 case OP_CHAR:
2367 #ifdef SUPPORT_UTF8
2368 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2369 #else
2370 (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2371 #endif
2372 return item != next;
2373
2374 /* For CHARNC (caseless character) we must check the other case. If we have
2375 Unicode property support, we can use it to test the other case of
2376 high-valued characters. */
2377
2378 case OP_CHARNC:
2379 #ifdef SUPPORT_UTF8
2380 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2381 #endif
2382 if (item == next) return FALSE;
2383 #ifdef SUPPORT_UTF8
2384 if (utf8)
2385 {
2386 unsigned int othercase;
2387 if (next < 128) othercase = cd->fcc[next]; else
2388 #ifdef SUPPORT_UCP
2389 othercase = UCD_OTHERCASE((unsigned int)next);
2390 #else
2391 othercase = NOTACHAR;
2392 #endif
2393 return (unsigned int)item != othercase;
2394 }
2395 else
2396 #endif /* SUPPORT_UTF8 */
2397 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2398
2399 /* For OP_NOT, "item" must be a single-byte character. */
2400
2401 case OP_NOT:
2402 if (item == next) return TRUE;
2403 if ((options & PCRE_CASELESS) == 0) return FALSE;
2404 #ifdef SUPPORT_UTF8
2405 if (utf8)
2406 {
2407 unsigned int othercase;
2408 if (next < 128) othercase = cd->fcc[next]; else
2409 #ifdef SUPPORT_UCP
2410 othercase = UCD_OTHERCASE(next);
2411 #else
2412 othercase = NOTACHAR;
2413 #endif
2414 return (unsigned int)item == othercase;
2415 }
2416 else
2417 #endif /* SUPPORT_UTF8 */
2418 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2419
2420 case OP_DIGIT:
2421 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2422
2423 case OP_NOT_DIGIT:
2424 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2425
2426 case OP_WHITESPACE:
2427 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2428
2429 case OP_NOT_WHITESPACE:
2430 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2431
2432 case OP_WORDCHAR:
2433 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2434
2435 case OP_NOT_WORDCHAR:
2436 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2437
2438 case OP_HSPACE:
2439 case OP_NOT_HSPACE:
2440 switch(next)
2441 {
2442 case 0x09:
2443 case 0x20:
2444 case 0xa0:
2445 case 0x1680:
2446 case 0x180e:
2447 case 0x2000:
2448 case 0x2001:
2449 case 0x2002:
2450 case 0x2003:
2451 case 0x2004:
2452 case 0x2005:
2453 case 0x2006:
2454 case 0x2007:
2455 case 0x2008:
2456 case 0x2009:
2457 case 0x200A:
2458 case 0x202f:
2459 case 0x205f:
2460 case 0x3000:
2461 return op_code != OP_HSPACE;
2462 default:
2463 return op_code == OP_HSPACE;
2464 }
2465
2466 case OP_VSPACE:
2467 case OP_NOT_VSPACE:
2468 switch(next)
2469 {
2470 case 0x0a:
2471 case 0x0b:
2472 case 0x0c:
2473 case 0x0d:
2474 case 0x85:
2475 case 0x2028:
2476 case 0x2029:
2477 return op_code != OP_VSPACE;
2478 default:
2479 return op_code == OP_VSPACE;
2480 }
2481
2482 default:
2483 return FALSE;
2484 }
2485
2486
2487 /* Handle the case when the next item is \d, \s, etc. */
2488
2489 switch(op_code)
2490 {
2491 case OP_CHAR:
2492 case OP_CHARNC:
2493 #ifdef SUPPORT_UTF8
2494 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2495 #endif
2496 switch(-next)
2497 {
2498 case ESC_d:
2499 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2500
2501 case ESC_D:
2502 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2503
2504 case ESC_s:
2505 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2506
2507 case ESC_S:
2508 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2509
2510 case ESC_w:
2511 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2512
2513 case ESC_W:
2514 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2515
2516 case ESC_h:
2517 case ESC_H:
2518 switch(item)
2519 {
2520 case 0x09:
2521 case 0x20:
2522 case 0xa0:
2523 case 0x1680:
2524 case 0x180e:
2525 case 0x2000:
2526 case 0x2001:
2527 case 0x2002:
2528 case 0x2003:
2529 case 0x2004:
2530 case 0x2005:
2531 case 0x2006:
2532 case 0x2007:
2533 case 0x2008:
2534 case 0x2009:
2535 case 0x200A:
2536 case 0x202f:
2537 case 0x205f:
2538 case 0x3000:
2539 return -next != ESC_h;
2540 default:
2541 return -next == ESC_h;
2542 }
2543
2544 case ESC_v:
2545 case ESC_V:
2546 switch(item)
2547 {
2548 case 0x0a:
2549 case 0x0b:
2550 case 0x0c:
2551 case 0x0d:
2552 case 0x85:
2553 case 0x2028:
2554 case 0x2029:
2555 return -next != ESC_v;
2556 default:
2557 return -next == ESC_v;
2558 }
2559
2560 default:
2561 return FALSE;
2562 }
2563
2564 case OP_DIGIT:
2565 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2566 next == -ESC_h || next == -ESC_v;
2567
2568 case OP_NOT_DIGIT:
2569 return next == -ESC_d;
2570
2571 case OP_WHITESPACE:
2572 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2573
2574 case OP_NOT_WHITESPACE:
2575 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2576
2577 case OP_HSPACE:
2578 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2579
2580 case OP_NOT_HSPACE:
2581 return next == -ESC_h;
2582
2583 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2584 case OP_VSPACE:
2585 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2586
2587 case OP_NOT_VSPACE:
2588 return next == -ESC_v;
2589
2590 case OP_WORDCHAR:
2591 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2592
2593 case OP_NOT_WORDCHAR:
2594 return next == -ESC_w || next == -ESC_d;
2595
2596 default:
2597 return FALSE;
2598 }
2599
2600 /* Control does not reach here */
2601 }
2602
2603
2604
2605 /*************************************************
2606 * Compile one branch *
2607 *************************************************/
2608
2609 /* Scan the pattern, compiling it into a vector. If the options are
2610 changed during the branch, the pointer is used to change the external options
2611 bits. This function is used during the pre-compile phase when we are trying
2612 to find out the amount of memory needed, as well as during the real compile
2613 phase. The value of lengthptr distinguishes the two phases.
2614
2615 Arguments:
2616 optionsptr pointer to the option bits
2617 codeptr points to the pointer to the current code point
2618 ptrptr points to the current pattern pointer
2619 errorcodeptr points to error code variable
2620 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2621 reqbyteptr set to the last literal character required, else < 0
2622 bcptr points to current branch chain
2623 cd contains pointers to tables etc.
2624 lengthptr NULL during the real compile phase
2625 points to length accumulator during pre-compile phase
2626
2627 Returns: TRUE on success
2628 FALSE, with *errorcodeptr set non-zero on error
2629 */
2630
2631 static BOOL
2632 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2633 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2634 compile_data *cd, int *lengthptr)
2635 {
2636 int repeat_type, op_type;
2637 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2638 int bravalue = 0;
2639 int greedy_default, greedy_non_default;
2640 int firstbyte, reqbyte;
2641 int zeroreqbyte, zerofirstbyte;
2642 int req_caseopt, reqvary, tempreqvary;
2643 int options = *optionsptr;
2644 int after_manual_callout = 0;
2645 int length_prevgroup = 0;
2646 register int c;
2647 register uschar *code = *codeptr;
2648 uschar *last_code = code;
2649 uschar *orig_code = code;
2650 uschar *tempcode;
2651 BOOL inescq = FALSE;
2652 BOOL groupsetfirstbyte = FALSE;
2653 const uschar *ptr = *ptrptr;
2654 const uschar *tempptr;
2655 uschar *previous = NULL;
2656 uschar *previous_callout = NULL;
2657 uschar *save_hwm = NULL;
2658 uschar classbits[32];
2659
2660 #ifdef SUPPORT_UTF8
2661 BOOL class_utf8;
2662 BOOL utf8 = (options & PCRE_UTF8) != 0;
2663 uschar *class_utf8data;
2664 uschar *class_utf8data_base;
2665 uschar utf8_char[6];
2666 #else
2667 BOOL utf8 = FALSE;
2668 uschar *utf8_char = NULL;
2669 #endif
2670
2671 #ifdef PCRE_DEBUG
2672 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2673 #endif
2674
2675 /* Set up the default and non-default settings for greediness */
2676
2677 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2678 greedy_non_default = greedy_default ^ 1;
2679
2680 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2681 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2682 matches a non-fixed char first char; reqbyte just remains unset if we never
2683 find one.
2684
2685 When we hit a repeat whose minimum is zero, we may have to adjust these values
2686 to take the zero repeat into account. This is implemented by setting them to
2687 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2688 item types that can be repeated set these backoff variables appropriately. */
2689
2690 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2691
2692 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2693 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2694 value > 255. It is added into the firstbyte or reqbyte variables to record the
2695 case status of the value. This is used only for ASCII characters. */
2696
2697 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2698
2699 /* Switch on next character until the end of the branch */
2700
2701 for (;; ptr++)
2702 {
2703 BOOL negate_class;
2704 BOOL should_flip_negation;
2705 BOOL possessive_quantifier;
2706 BOOL is_quantifier;
2707 BOOL is_recurse;
2708 BOOL reset_bracount;
2709 int class_charcount;
2710 int class_lastchar;
2711 int newoptions;
2712 int recno;
2713 int refsign;
2714 int skipbytes;
2715 int subreqbyte;
2716 int subfirstbyte;
2717 int terminator;
2718 int mclength;
2719 uschar mcbuffer[8];
2720
2721 /* Get next byte in the pattern */
2722
2723 c = *ptr;
2724
2725 /* If we are in the pre-compile phase, accumulate the length used for the
2726 previous cycle of this loop. */
2727
2728 if (lengthptr != NULL)
2729 {
2730 #ifdef PCRE_DEBUG
2731 if (code > cd->hwm) cd->hwm = code; /* High water info */
2732 #endif
2733 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2734 {
2735 *errorcodeptr = ERR52;
2736 goto FAILED;
2737 }
2738
2739 /* There is at least one situation where code goes backwards: this is the
2740 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2741 the class is simply eliminated. However, it is created first, so we have to
2742 allow memory for it. Therefore, don't ever reduce the length at this point.
2743 */
2744
2745 if (code < last_code) code = last_code;
2746
2747 /* Paranoid check for integer overflow */
2748
2749 if (OFLOW_MAX - *lengthptr < code - last_code)
2750 {
2751 *errorcodeptr = ERR20;
2752 goto FAILED;
2753 }
2754
2755 *lengthptr += code - last_code;
2756 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2757
2758 /* If "previous" is set and it is not at the start of the work space, move
2759 it back to there, in order to avoid filling up the work space. Otherwise,
2760 if "previous" is NULL, reset the current code pointer to the start. */
2761
2762 if (previous != NULL)
2763 {
2764 if (previous > orig_code)
2765 {
2766 memmove(orig_code, previous, code - previous);
2767 code -= previous - orig_code;
2768 previous = orig_code;
2769 }
2770 }
2771 else code = orig_code;
2772
2773 /* Remember where this code item starts so we can pick up the length
2774 next time round. */
2775
2776 last_code = code;
2777 }
2778
2779 /* In the real compile phase, just check the workspace used by the forward
2780 reference list. */
2781
2782 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2783 {
2784 *errorcodeptr = ERR52;
2785 goto FAILED;
2786 }
2787
2788 /* If in \Q...\E, check for the end; if not, we have a literal */
2789
2790 if (inescq && c != 0)
2791 {
2792 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2793 {
2794 inescq = FALSE;
2795 ptr++;
2796 continue;
2797 }
2798 else
2799 {
2800 if (previous_callout != NULL)
2801 {
2802 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2803 complete_callout(previous_callout, ptr, cd);
2804 previous_callout = NULL;
2805 }
2806 if ((options & PCRE_AUTO_CALLOUT) != 0)
2807 {
2808 previous_callout = code;
2809 code = auto_callout(code, ptr, cd);
2810 }
2811 goto NORMAL_CHAR;
2812 }
2813 }
2814
2815 /* Fill in length of a previous callout, except when the next thing is
2816 a quantifier. */
2817
2818 is_quantifier =
2819 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2820 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2821
2822 if (!is_quantifier && previous_callout != NULL &&
2823 after_manual_callout-- <= 0)
2824 {
2825 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2826 complete_callout(previous_callout, ptr, cd);
2827 previous_callout = NULL;
2828 }
2829
2830 /* In extended mode, skip white space and comments */
2831
2832 if ((options & PCRE_EXTENDED) != 0)
2833 {
2834 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2835 if (c == CHAR_NUMBER_SIGN)
2836 {
2837 while (*(++ptr) != 0)
2838 {
2839 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2840 }
2841 if (*ptr != 0) continue;
2842
2843 /* Else fall through to handle end of string */
2844 c = 0;
2845 }
2846 }
2847
2848 /* No auto callout for quantifiers. */
2849
2850 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2851 {
2852 previous_callout = code;
2853 code = auto_callout(code, ptr, cd);
2854 }
2855
2856 switch(c)
2857 {
2858 /* ===================================================================*/
2859 case 0: /* The branch terminates at string end */
2860 case CHAR_VERTICAL_LINE: /* or | or ) */
2861 case CHAR_RIGHT_PARENTHESIS:
2862 *firstbyteptr = firstbyte;
2863 *reqbyteptr = reqbyte;
2864 *codeptr = code;
2865 *ptrptr = ptr;
2866 if (lengthptr != NULL)
2867 {
2868 if (OFLOW_MAX - *lengthptr < code - last_code)
2869 {
2870 *errorcodeptr = ERR20;
2871 goto FAILED;
2872 }
2873 *lengthptr += code - last_code; /* To include callout length */
2874 DPRINTF((">> end branch\n"));
2875 }
2876 return TRUE;
2877
2878
2879 /* ===================================================================*/
2880 /* Handle single-character metacharacters. In multiline mode, ^ disables
2881 the setting of any following char as a first character. */
2882
2883 case CHAR_CIRCUMFLEX_ACCENT:
2884 if ((options & PCRE_MULTILINE) != 0)
2885 {
2886 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2887 }
2888 previous = NULL;
2889 *code++ = OP_CIRC;
2890 break;
2891
2892 case CHAR_DOLLAR_SIGN:
2893 previous = NULL;
2894 *code++ = OP_DOLL;
2895 break;
2896
2897 /* There can never be a first char if '.' is first, whatever happens about
2898 repeats. The value of reqbyte doesn't change either. */
2899
2900 case CHAR_DOT:
2901 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2902 zerofirstbyte = firstbyte;
2903 zeroreqbyte = reqbyte;
2904 previous = code;
2905 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2906 break;
2907
2908
2909 /* ===================================================================*/
2910 /* Character classes. If the included characters are all < 256, we build a
2911 32-byte bitmap of the permitted characters, except in the special case
2912 where there is only one such character. For negated classes, we build the
2913 map as usual, then invert it at the end. However, we use a different opcode
2914 so that data characters > 255 can be handled correctly.
2915
2916 If the class contains characters outside the 0-255 range, a different
2917 opcode is compiled. It may optionally have a bit map for characters < 256,
2918 but those above are explicitly listed afterwards. A flag byte tells
2919 whether the bitmap is present, and whether this is a negated class or not.
2920
2921 In JavaScript compatibility mode, an isolated ']' causes an error. In
2922 default (Perl) mode, it is treated as a data character. */
2923
2924 case CHAR_RIGHT_SQUARE_BRACKET:
2925 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2926 {
2927 *errorcodeptr = ERR64;
2928 goto FAILED;
2929 }
2930 goto NORMAL_CHAR;
2931
2932 case CHAR_LEFT_SQUARE_BRACKET:
2933 previous = code;
2934
2935 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2936 they are encountered at the top level, so we'll do that too. */
2937
2938 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2939 ptr[1] == CHAR_EQUALS_SIGN) &&
2940 check_posix_syntax(ptr, &tempptr))
2941 {
2942 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2943 goto FAILED;
2944 }
2945
2946 /* If the first character is '^', set the negation flag and skip it. Also,
2947 if the first few characters (either before or after ^) are \Q\E or \E we
2948 skip them too. This makes for compatibility with Perl. */
2949
2950 negate_class = FALSE;
2951 for (;;)
2952 {
2953 c = *(++ptr);
2954 if (c == CHAR_BACKSLASH)
2955 {
2956 if (ptr[1] == CHAR_E)
2957 ptr++;
2958 else if (strncmp((const char *)ptr+1,
2959 STR_Q STR_BACKSLASH STR_E, 3) == 0)
2960 ptr += 3;
2961 else
2962 break;
2963 }
2964 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2965 negate_class = TRUE;
2966 else break;
2967 }
2968
2969 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2970 an initial ']' is taken as a data character -- the code below handles
2971 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2972 [^] must match any character, so generate OP_ALLANY. */
2973
2974 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2975 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2976 {
2977 *code++ = negate_class? OP_ALLANY : OP_FAIL;
2978 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2979 zerofirstbyte = firstbyte;
2980 break;
2981 }
2982
2983 /* If a class contains a negative special such as \S, we need to flip the
2984 negation flag at the end, so that support for characters > 255 works
2985 correctly (they are all included in the class). */
2986
2987 should_flip_negation = FALSE;
2988
2989 /* Keep a count of chars with values < 256 so that we can optimize the case
2990 of just a single character (as long as it's < 256). However, For higher
2991 valued UTF-8 characters, we don't yet do any optimization. */
2992
2993 class_charcount = 0;
2994 class_lastchar = -1;
2995
2996 /* Initialize the 32-char bit map to all zeros. We build the map in a
2997 temporary bit of memory, in case the class contains only 1 character (less
2998 than 256), because in that case the compiled code doesn't use the bit map.
2999 */
3000
3001 memset(classbits, 0, 32 * sizeof(uschar));
3002
3003 #ifdef SUPPORT_UTF8
3004 class_utf8 = FALSE; /* No chars >= 256 */
3005 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3006 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3007 #endif
3008
3009 /* Process characters until ] is reached. By writing this as a "do" it
3010 means that an initial ] is taken as a data character. At the start of the
3011 loop, c contains the first byte of the character. */
3012
3013 if (c != 0) do
3014 {
3015 const uschar *oldptr;
3016
3017 #ifdef SUPPORT_UTF8
3018 if (utf8 && c > 127)
3019 { /* Braces are required because the */
3020 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3021 }
3022
3023 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3024 data and reset the pointer. This is so that very large classes that
3025 contain a zillion UTF-8 characters no longer overwrite the work space
3026 (which is on the stack). */
3027
3028 if (lengthptr != NULL)
3029 {
3030 *lengthptr += class_utf8data - class_utf8data_base;
3031 class_utf8data = class_utf8data_base;
3032 }
3033
3034 #endif
3035
3036 /* Inside \Q...\E everything is literal except \E */
3037
3038 if (inescq)
3039 {
3040 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3041 {
3042 inescq = FALSE; /* Reset literal state */
3043 ptr++; /* Skip the 'E' */
3044 continue; /* Carry on with next */
3045 }
3046 goto CHECK_RANGE; /* Could be range if \E follows */
3047 }
3048
3049 /* Handle POSIX class names. Perl allows a negation extension of the
3050 form [:^name:]. A square bracket that doesn't match the syntax is
3051 treated as a literal. We also recognize the POSIX constructions
3052 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3053 5.6 and 5.8 do. */
3054
3055 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3056 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3057 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3058 {
3059 BOOL local_negate = FALSE;
3060 int posix_class, taboffset, tabopt;
3061 register const uschar *cbits = cd->cbits;
3062 uschar pbits[32];
3063
3064 if (ptr[1] != CHAR_COLON)
3065 {
3066 *errorcodeptr = ERR31;
3067 goto FAILED;
3068 }
3069
3070 ptr += 2;
3071 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3072 {
3073 local_negate = TRUE;
3074 should_flip_negation = TRUE; /* Note negative special */
3075 ptr++;
3076 }
3077
3078 posix_class = check_posix_name(ptr, tempptr - ptr);
3079 if (posix_class < 0)
3080 {
3081 *errorcodeptr = ERR30;
3082 goto FAILED;
3083 }
3084
3085 /* If matching is caseless, upper and lower are converted to
3086 alpha. This relies on the fact that the class table starts with
3087 alpha, lower, upper as the first 3 entries. */
3088
3089 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3090 posix_class = 0;
3091
3092 /* We build the bit map for the POSIX class in a chunk of local store
3093 because we may be adding and subtracting from it, and we don't want to
3094 subtract bits that may be in the main map already. At the end we or the
3095 result into the bit map that is being built. */
3096
3097 posix_class *= 3;
3098
3099 /* Copy in the first table (always present) */
3100
3101 memcpy(pbits, cbits + posix_class_maps[posix_class],
3102 32 * sizeof(uschar));
3103
3104 /* If there is a second table, add or remove it as required. */
3105
3106 taboffset = posix_class_maps[posix_class + 1];
3107 tabopt = posix_class_maps[posix_class + 2];
3108
3109 if (taboffset >= 0)
3110 {
3111 if (tabopt >= 0)
3112 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3113 else
3114 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3115 }
3116
3117 /* Now see if we need to remove any special characters. An option
3118 value of 1 removes vertical space and 2 removes underscore. */
3119
3120 if (tabopt < 0) tabopt = -tabopt;
3121 if (tabopt == 1) pbits[1] &= ~0x3c;
3122 else if (tabopt == 2) pbits[11] &= 0x7f;
3123
3124 /* Add the POSIX table or its complement into the main table that is
3125 being built and we are done. */
3126
3127 if (local_negate)
3128 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3129 else
3130 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3131
3132 ptr = tempptr + 1;
3133 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3134 continue; /* End of POSIX syntax handling */
3135 }
3136
3137 /* Backslash may introduce a single character, or it may introduce one
3138 of the specials, which just set a flag. The sequence \b is a special
3139 case. Inside a class (and only there) it is treated as backspace.
3140 Elsewhere it marks a word boundary. Other escapes have preset maps ready
3141 to 'or' into the one we are building. We assume they have more than one
3142 character in them, so set class_charcount bigger than one. */
3143
3144 if (c == CHAR_BACKSLASH)
3145 {
3146 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3147 if (*errorcodeptr != 0) goto FAILED;
3148
3149 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3150 else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3151 else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3152 else if (-c == ESC_Q) /* Handle start of quoted string */
3153 {
3154 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3155 {
3156 ptr += 2; /* avoid empty string */
3157 }
3158 else inescq = TRUE;
3159 continue;
3160 }
3161 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3162
3163 if (c < 0)
3164 {
3165 register const uschar *cbits = cd->cbits;
3166 class_charcount += 2; /* Greater than 1 is what matters */
3167
3168 /* Save time by not doing this in the pre-compile phase. */
3169
3170 if (lengthptr == NULL) switch (-c)
3171 {
3172 case ESC_d:
3173 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3174 continue;
3175
3176 case ESC_D:
3177 should_flip_negation = TRUE;
3178 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3179 continue;
3180
3181 case ESC_w:
3182 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3183 continue;
3184
3185 case ESC_W:
3186 should_flip_negation = TRUE;
3187 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3188 continue;
3189
3190 case ESC_s:
3191 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3192 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3193 continue;
3194
3195 case ESC_S:
3196 should_flip_negation = TRUE;
3197 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3198 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3199 continue;
3200
3201 default: /* Not recognized; fall through */
3202 break; /* Need "default" setting to stop compiler warning. */
3203 }
3204
3205 /* In the pre-compile phase, just do the recognition. */
3206
3207 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3208 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3209
3210 /* We need to deal with \H, \h, \V, and \v in both phases because
3211 they use extra memory. */
3212
3213 if (-c == ESC_h)
3214 {
3215 SETBIT(classbits, 0x09); /* VT */
3216 SETBIT(classbits, 0x20); /* SPACE */
3217 SETBIT(classbits, 0xa0); /* NSBP */
3218 #ifdef SUPPORT_UTF8
3219 if (utf8)
3220 {
3221 class_utf8 = TRUE;
3222 *class_utf8data++ = XCL_SINGLE;
3223 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3224 *class_utf8data++ = XCL_SINGLE;
3225 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3226 *class_utf8data++ = XCL_RANGE;
3227 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3228 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3229 *class_utf8data++ = XCL_SINGLE;
3230 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3231 *class_utf8data++ = XCL_SINGLE;
3232 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3233 *class_utf8data++ = XCL_SINGLE;
3234 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3235 }
3236 #endif
3237 continue;
3238 }
3239
3240 if (-c == ESC_H)
3241 {
3242 for (c = 0; c < 32; c++)
3243 {
3244 int x = 0xff;
3245 switch (c)
3246 {
3247 case 0x09/8: x ^= 1 << (0x09%8); break;
3248 case 0x20/8: x ^= 1 << (0x20%8); break;
3249 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3250 default: break;
3251 }
3252 classbits[c] |= x;
3253 }
3254
3255 #ifdef SUPPORT_UTF8
3256 if (utf8)
3257 {
3258 class_utf8 = TRUE;
3259 *class_utf8data++ = XCL_RANGE;
3260 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3261 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3262 *class_utf8data++ = XCL_RANGE;
3263 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3264 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3265 *class_utf8data++ = XCL_RANGE;
3266 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3267 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3268 *class_utf8data++ = XCL_RANGE;
3269 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3270 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3271 *class_utf8data++ = XCL_RANGE;
3272 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3273 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3274 *class_utf8data++ = XCL_RANGE;
3275 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3276 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3277 *class_utf8data++ = XCL_RANGE;
3278 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3279 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3280 }
3281 #endif
3282 continue;
3283 }
3284
3285 if (-c == ESC_v)
3286 {
3287 SETBIT(classbits, 0x0a); /* LF */
3288 SETBIT(classbits, 0x0b); /* VT */
3289 SETBIT(classbits, 0x0c); /* FF */
3290 SETBIT(classbits, 0x0d); /* CR */
3291 SETBIT(classbits, 0x85); /* NEL */
3292 #ifdef SUPPORT_UTF8
3293 if (utf8)
3294 {
3295 class_utf8 = TRUE;
3296 *class_utf8data++ = XCL_RANGE;
3297 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3298 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3299 }
3300 #endif
3301 continue;
3302 }
3303
3304 if (-c == ESC_V)
3305 {
3306 for (c = 0; c < 32; c++)
3307 {
3308 int x = 0xff;
3309 switch (c)
3310 {
3311 case 0x0a/8: x ^= 1 << (0x0a%8);
3312 x ^= 1 << (0x0b%8);
3313 x ^= 1 << (0x0c%8);
3314 x ^= 1 << (0x0d%8);
3315 break;
3316 case 0x85/8: x ^= 1 << (0x85%8); break;
3317 default: break;
3318 }
3319 classbits[c] |= x;
3320 }
3321
3322 #ifdef SUPPORT_UTF8
3323 if (utf8)
3324 {
3325 class_utf8 = TRUE;
3326 *class_utf8data++ = XCL_RANGE;
3327 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3328 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3329 *class_utf8data++ = XCL_RANGE;
3330 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3331 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3332 }
3333 #endif
3334 continue;
3335 }
3336
3337 /* We need to deal with \P and \p in both phases. */
3338
3339 #ifdef SUPPORT_UCP
3340 if (-c == ESC_p || -c == ESC_P)
3341 {
3342 BOOL negated;
3343 int pdata;
3344 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3345 if (ptype < 0) goto FAILED;
3346 class_utf8 = TRUE;
3347 *class_utf8data++ = ((-c == ESC_p) != negated)?
3348 XCL_PROP : XCL_NOTPROP;
3349 *class_utf8data++ = ptype;
3350 *class_utf8data++ = pdata;
3351 class_charcount -= 2; /* Not a < 256 character */
3352 continue;
3353 }
3354 #endif
3355 /* Unrecognized escapes are faulted if PCRE is running in its
3356 strict mode. By default, for compatibility with Perl, they are
3357 treated as literals. */
3358
3359 if ((options & PCRE_EXTRA) != 0)
3360 {
3361 *errorcodeptr = ERR7;
3362 goto FAILED;
3363 }
3364
3365 class_charcount -= 2; /* Undo the default count from above */
3366 c = *ptr; /* Get the final character and fall through */
3367 }
3368
3369 /* Fall through if we have a single character (c >= 0). This may be
3370 greater than 256 in UTF-8 mode. */
3371
3372 } /* End of backslash handling */
3373
3374 /* A single character may be followed by '-' to form a range. However,
3375 Perl does not permit ']' to be the end of the range. A '-' character
3376 at the end is treated as a literal. Perl ignores orphaned \E sequences
3377 entirely. The code for handling \Q and \E is messy. */
3378
3379 CHECK_RANGE:
3380 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3381 {
3382 inescq = FALSE;
3383 ptr += 2;
3384 }
3385
3386 oldptr = ptr;
3387
3388 /* Remember \r or \n */
3389
3390 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3391
3392 /* Check for range */
3393
3394 if (!inescq && ptr[1] == CHAR_MINUS)
3395 {
3396 int d;
3397 ptr += 2;
3398 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3399
3400 /* If we hit \Q (not followed by \E) at this point, go into escaped
3401 mode. */
3402
3403 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3404 {
3405 ptr += 2;
3406 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3407 { ptr += 2; continue; }
3408 inescq = TRUE;
3409 break;
3410 }
3411
3412 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3413 {
3414 ptr = oldptr;
3415 goto LONE_SINGLE_CHARACTER;
3416 }
3417
3418 #ifdef SUPPORT_UTF8
3419 if (utf8)
3420 { /* Braces are required because the */
3421 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3422 }
3423 else
3424 #endif
3425 d = *ptr; /* Not UTF-8 mode */
3426
3427 /* The second part of a range can be a single-character escape, but
3428 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3429 in such circumstances. */
3430
3431 if (!inescq && d == CHAR_BACKSLASH)
3432 {
3433 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3434 if (*errorcodeptr != 0) goto FAILED;
3435
3436 /* \b is backspace; \X is literal X; \R is literal R; any other
3437 special means the '-' was literal */
3438
3439 if (d < 0)
3440 {
3441 if (d == -ESC_b) d = CHAR_BS;
3442 else if (d == -ESC_X) d = CHAR_X;
3443 else if (d == -ESC_R) d = CHAR_R; else
3444 {
3445 ptr = oldptr;
3446 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3447 }
3448 }
3449 }
3450
3451 /* Check that the two values are in the correct order. Optimize
3452 one-character ranges */
3453
3454 if (d < c)
3455 {
3456 *errorcodeptr = ERR8;
3457 goto FAILED;
3458 }
3459
3460 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3461
3462 /* Remember \r or \n */
3463
3464 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3465
3466 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3467 matching, we have to use an XCLASS with extra data items. Caseless
3468 matching for characters > 127 is available only if UCP support is
3469 available. */
3470
3471 #ifdef SUPPORT_UTF8
3472 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3473 {
3474 class_utf8 = TRUE;
3475
3476 /* With UCP support, we can find the other case equivalents of
3477 the relevant characters. There may be several ranges. Optimize how
3478 they fit with the basic range. */
3479
3480 #ifdef SUPPORT_UCP
3481 if ((options & PCRE_CASELESS) != 0)
3482 {
3483 unsigned int occ, ocd;
3484 unsigned int cc = c;
3485 unsigned int origd = d;
3486 while (get_othercase_range(&cc, origd, &occ, &ocd))
3487 {
3488 if (occ >= (unsigned int)c &&
3489 ocd <= (unsigned int)d)
3490 continue; /* Skip embedded ranges */
3491
3492 if (occ < (unsigned int)c &&
3493 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3494 { /* if there is overlap, */
3495 c = occ; /* noting that if occ < c */
3496 continue; /* we can't have ocd > d */
3497 } /* because a subrange is */
3498 if (ocd > (unsigned int)d &&
3499 occ <= (unsigned int)d + 1) /* always shorter than */
3500 { /* the basic range. */
3501 d = ocd;
3502 continue;
3503 }
3504
3505 if (occ == ocd)
3506 {
3507 *class_utf8data++ = XCL_SINGLE;
3508 }
3509 else
3510 {
3511 *class_utf8data++ = XCL_RANGE;
3512 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3513 }
3514 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3515 }
3516 }
3517 #endif /* SUPPORT_UCP */
3518
3519 /* Now record the original range, possibly modified for UCP caseless
3520 overlapping ranges. */
3521
3522 *class_utf8data++ = XCL_RANGE;
3523 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3524 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3525
3526 /* With UCP support, we are done. Without UCP support, there is no
3527 caseless matching for UTF-8 characters > 127; we can use the bit map
3528 for the smaller ones. */
3529
3530 #ifdef SUPPORT_UCP
3531 continue; /* With next character in the class */
3532 #else
3533 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3534
3535 /* Adjust upper limit and fall through to set up the map */
3536
3537 d = 127;
3538
3539 #endif /* SUPPORT_UCP */
3540 }
3541 #endif /* SUPPORT_UTF8 */
3542
3543 /* We use the bit map for all cases when not in UTF-8 mode; else
3544 ranges that lie entirely within 0-127 when there is UCP support; else
3545 for partial ranges without UCP support. */
3546
3547 class_charcount += d - c + 1;
3548 class_lastchar = d;
3549
3550 /* We can save a bit of time by skipping this in the pre-compile. */
3551
3552 if (lengthptr == NULL) for (; c <= d; c++)
3553 {
3554 classbits[c/8] |= (1 << (c&7));
3555 if ((options & PCRE_CASELESS) != 0)
3556 {
3557 int uc = cd->fcc[c]; /* flip case */
3558 classbits[uc/8] |= (1 << (uc&7));
3559 }
3560 }
3561
3562 continue; /* Go get the next char in the class */
3563 }
3564
3565 /* Handle a lone single character - we can get here for a normal
3566 non-escape char, or after \ that introduces a single character or for an
3567 apparent range that isn't. */
3568
3569 LONE_SINGLE_CHARACTER:
3570
3571 /* Handle a character that cannot go in the bit map */
3572
3573 #ifdef SUPPORT_UTF8
3574 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3575 {
3576 class_utf8 = TRUE;
3577 *class_utf8data++ = XCL_SINGLE;
3578 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3579
3580 #ifdef SUPPORT_UCP
3581 if ((options & PCRE_CASELESS) != 0)
3582 {
3583 unsigned int othercase;
3584 if ((othercase = UCD_OTHERCASE(c)) != c)
3585 {
3586 *class_utf8data++ = XCL_SINGLE;
3587 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3588 }
3589 }
3590 #endif /* SUPPORT_UCP */
3591
3592 }
3593 else
3594 #endif /* SUPPORT_UTF8 */
3595
3596 /* Handle a single-byte character */
3597 {
3598 classbits[c/8] |= (1 << (c&7));
3599 if ((options & PCRE_CASELESS) != 0)
3600 {
3601 c = cd->fcc[c]; /* flip case */
3602 classbits[c/8] |= (1 << (c&7));
3603 }
3604 class_charcount++;
3605 class_lastchar = c;
3606 }
3607 }
3608
3609 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3610
3611 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3612
3613 if (c == 0) /* Missing terminating ']' */
3614 {
3615 *errorcodeptr = ERR6;
3616 goto FAILED;
3617 }
3618
3619
3620 /* This code has been disabled because it would mean that \s counts as
3621 an explicit \r or \n reference, and that's not really what is wanted. Now
3622 we set the flag only if there is a literal "\r" or "\n" in the class. */
3623
3624 #if 0
3625 /* Remember whether \r or \n are in this class */
3626
3627 if (negate_class)
3628 {
3629 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3630 }
3631 else
3632 {
3633 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3634 }
3635 #endif
3636
3637
3638 /* If class_charcount is 1, we saw precisely one character whose value is
3639 less than 256. As long as there were no characters >= 128 and there was no
3640 use of \p or \P, in other words, no use of any XCLASS features, we can
3641 optimize.
3642
3643 In UTF-8 mode, we can optimize the negative case only if there were no
3644 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3645 operate on single-bytes only. This is an historical hangover. Maybe one day
3646 we can tidy these opcodes to handle multi-byte characters.
3647
3648 The optimization throws away the bit map. We turn the item into a
3649 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3650 that OP_NOT does not support multibyte characters. In the positive case, it
3651 can cause firstbyte to be set. Otherwise, there can be no first char if
3652 this item is first, whatever repeat count may follow. In the case of
3653 reqbyte, save the previous value for reinstating. */
3654
3655 #ifdef SUPPORT_UTF8
3656 if (class_charcount == 1 && !class_utf8 &&
3657 (!utf8 || !negate_class || class_lastchar < 128))
3658 #else
3659 if (class_charcount == 1)
3660 #endif
3661 {
3662 zeroreqbyte = reqbyte;
3663
3664 /* The OP_NOT opcode works on one-byte characters only. */
3665
3666 if (negate_class)
3667 {
3668 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3669 zerofirstbyte = firstbyte;
3670 *code++ = OP_NOT;
3671 *code++ = class_lastchar;
3672 break;
3673 }
3674
3675 /* For a single, positive character, get the value into mcbuffer, and
3676 then we can handle this with the normal one-character code. */
3677
3678 #ifdef SUPPORT_UTF8
3679 if (utf8 && class_lastchar > 127)
3680 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3681 else
3682 #endif
3683 {
3684 mcbuffer[0] = class_lastchar;
3685 mclength = 1;
3686 }
3687 goto ONE_CHAR;
3688 } /* End of 1-char optimization */
3689
3690 /* The general case - not the one-char optimization. If this is the first
3691 thing in the branch, there can be no first char setting, whatever the
3692 repeat count. Any reqbyte setting must remain unchanged after any kind of
3693 repeat. */
3694
3695 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3696 zerofirstbyte = firstbyte;
3697 zeroreqbyte = reqbyte;
3698
3699 /* If there are characters with values > 255, we have to compile an
3700 extended class, with its own opcode, unless there was a negated special
3701 such as \S in the class, because in that case all characters > 255 are in
3702 the class, so any that were explicitly given as well can be ignored. If
3703 (when there are explicit characters > 255 that must be listed) there are no
3704 characters < 256, we can omit the bitmap in the actual compiled code. */
3705
3706 #ifdef SUPPORT_UTF8
3707 if (class_utf8 && !should_flip_negation)
3708 {
3709 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3710 *code++ = OP_XCLASS;
3711 code += LINK_SIZE;
3712 *code = negate_class? XCL_NOT : 0;
3713
3714 /* If the map is required, move up the extra data to make room for it;
3715 otherwise just move the code pointer to the end of the extra data. */
3716
3717 if (class_charcount > 0)
3718 {
3719 *code++ |= XCL_MAP;
3720 memmove(code + 32, code, class_utf8data - code);
3721 memcpy(code, classbits, 32);
3722 code = class_utf8data + 32;
3723 }
3724 else code = class_utf8data;
3725
3726 /* Now fill in the complete length of the item */
3727
3728 PUT(previous, 1, code - previous);
3729 break; /* End of class handling */
3730 }
3731 #endif
3732
3733 /* If there are no characters > 255, set the opcode to OP_CLASS or
3734 OP_NCLASS, depending on whether the whole class was negated and whether
3735 there were negative specials such as \S in the class. Then copy the 32-byte
3736 map into the code vector, negating it if necessary. */
3737
3738 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3739 if (negate_class)
3740 {
3741 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3742 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3743 }
3744 else
3745 {
3746 memcpy(code, classbits, 32);
3747 }
3748 code += 32;
3749 break;
3750
3751
3752 /* ===================================================================*/
3753 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3754 has been tested above. */
3755
3756 case CHAR_LEFT_CURLY_BRACKET:
3757 if (!is_quantifier) goto NORMAL_CHAR;
3758 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3759 if (*errorcodeptr != 0) goto FAILED;
3760 goto REPEAT;
3761
3762 case CHAR_ASTERISK:
3763 repeat_min = 0;
3764 repeat_max = -1;
3765 goto REPEAT;
3766
3767 case CHAR_PLUS:
3768 repeat_min = 1;
3769 repeat_max = -1;
3770 goto REPEAT;
3771
3772 case CHAR_QUESTION_MARK:
3773 repeat_min = 0;
3774 repeat_max = 1;
3775
3776 REPEAT:
3777 if (previous == NULL)
3778 {
3779 *errorcodeptr = ERR9;
3780 goto FAILED;
3781 }
3782
3783 if (repeat_min == 0)
3784 {
3785 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3786 reqbyte = zeroreqbyte; /* Ditto */
3787 }
3788
3789 /* Remember whether this is a variable length repeat */
3790
3791 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3792
3793 op_type = 0; /* Default single-char op codes */
3794 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3795
3796 /* Save start of previous item, in case we have to move it up to make space
3797 for an inserted OP_ONCE for the additional '+' extension. */
3798
3799 tempcode = previous;
3800
3801 /* If the next character is '+', we have a possessive quantifier. This
3802 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3803 If the next character is '?' this is a minimizing repeat, by default,
3804 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3805 repeat type to the non-default. */
3806
3807 if (ptr[1] == CHAR_PLUS)
3808 {
3809 repeat_type = 0; /* Force greedy */
3810 possessive_quantifier = TRUE;
3811 ptr++;
3812 }
3813 else if (ptr[1] == CHAR_QUESTION_MARK)
3814 {
3815 repeat_type = greedy_non_default;
3816 ptr++;
3817 }
3818 else repeat_type = greedy_default;
3819
3820 /* If previous was a character match, abolish the item and generate a
3821 repeat item instead. If a char item has a minimum of more than one, ensure
3822 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3823 the first thing in a branch because the x will have gone into firstbyte
3824 instead. */
3825
3826 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3827 {
3828 /* Deal with UTF-8 characters that take up more than one byte. It's
3829 easier to write this out separately than try to macrify it. Use c to
3830 hold the length of the character in bytes, plus 0x80 to flag that it's a
3831 length rather than a small character. */
3832
3833 #ifdef SUPPORT_UTF8
3834 if (utf8 && (code[-1] & 0x80) != 0)
3835 {
3836 uschar *lastchar = code - 1;
3837 while((*lastchar & 0xc0) == 0x80) lastchar--;
3838 c = code - lastchar; /* Length of UTF-8 character */
3839 memcpy(utf8_char, lastchar, c); /* Save the char */
3840 c |= 0x80; /* Flag c as a length */
3841 }
3842 else
3843 #endif
3844
3845 /* Handle the case of a single byte - either with no UTF8 support, or
3846 with UTF-8 disabled, or for a UTF-8 character < 128. */
3847
3848 {
3849 c = code[-1];
3850 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3851 }
3852
3853 /* If the repetition is unlimited, it pays to see if the next thing on
3854 the line is something that cannot possibly match this character. If so,
3855 automatically possessifying this item gains some performance in the case
3856 where the match fails. */
3857
3858 if (!possessive_quantifier &&
3859 repeat_max < 0 &&
3860 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3861 options, cd))
3862 {
3863 repeat_type = 0; /* Force greedy */
3864 possessive_quantifier = TRUE;
3865 }
3866
3867 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3868 }
3869
3870 /* If previous was a single negated character ([^a] or similar), we use
3871 one of the special opcodes, replacing it. The code is shared with single-
3872 character repeats by setting op_type to add a suitable offset into
3873 repeat_type. We can also test for auto-possessification. OP_NOT is
3874 currently used only for single-byte chars. */
3875
3876 else if (*previous == OP_NOT)
3877 {
3878 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3879 c = previous[1];
3880 if (!possessive_quantifier &&
3881 repeat_max < 0 &&
3882 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3883 {
3884 repeat_type = 0; /* Force greedy */
3885 possessive_quantifier = TRUE;
3886 }
3887 goto OUTPUT_SINGLE_REPEAT;
3888 }
3889
3890 /* If previous was a character type match (\d or similar), abolish it and
3891 create a suitable repeat item. The code is shared with single-character
3892 repeats by setting op_type to add a suitable offset into repeat_type. Note
3893 that the Unicode property types will be present only when SUPPORT_UCP is
3894 defined, but we don't wrap the little bits of code here because it just
3895 makes it horribly messy. */
3896
3897 else if (*previous < OP_EODN)
3898 {
3899 uschar *oldcode;
3900 int prop_type, prop_value;
3901 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3902 c = *previous;
3903
3904 if (!possessive_quantifier &&
3905 repeat_max < 0 &&
3906 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3907 {
3908 repeat_type = 0; /* Force greedy */
3909 possessive_quantifier = TRUE;
3910 }
3911
3912 OUTPUT_SINGLE_REPEAT:
3913 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3914 {
3915 prop_type = previous[1];
3916 prop_value = previous[2];
3917 }
3918 else prop_type = prop_value = -1;
3919
3920 oldcode = code;
3921 code = previous; /* Usually overwrite previous item */
3922
3923 /* If the maximum is zero then the minimum must also be zero; Perl allows
3924 this case, so we do too - by simply omitting the item altogether. */
3925
3926 if (repeat_max == 0) goto END_REPEAT;
3927
3928 /*--------------------------------------------------------------------*/
3929 /* This code is obsolete from release 8.00; the restriction was finally
3930 removed: */
3931
3932 /* All real repeats make it impossible to handle partial matching (maybe
3933 one day we will be able to remove this restriction). */
3934
3935 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3936 /*--------------------------------------------------------------------*/
3937
3938 /* Combine the op_type with the repeat_type */
3939
3940 repeat_type += op_type;
3941
3942 /* A minimum of zero is handled either as the special case * or ?, or as
3943 an UPTO, with the maximum given. */
3944
3945 if (repeat_min == 0)
3946 {
3947 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3948 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3949 else
3950 {
3951 *code++ = OP_UPTO + repeat_type;
3952 PUT2INC(code, 0, repeat_max);
3953 }
3954 }
3955
3956 /* A repeat minimum of 1 is optimized into some special cases. If the
3957 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3958 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3959 one less than the maximum. */
3960
3961 else if (repeat_min == 1)
3962 {
3963 if (repeat_max == -1)
3964 *code++ = OP_PLUS + repeat_type;
3965 else
3966 {
3967 code = oldcode; /* leave previous item in place */
3968 if (repeat_max == 1) goto END_REPEAT;
3969 *code++ = OP_UPTO + repeat_type;
3970 PUT2INC(code, 0, repeat_max - 1);
3971 }
3972 }
3973
3974 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3975 handled as an EXACT followed by an UPTO. */
3976
3977 else
3978 {
3979 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3980 PUT2INC(code, 0, repeat_min);
3981
3982 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3983 we have to insert the character for the previous code. For a repeated
3984 Unicode property match, there are two extra bytes that define the
3985 required property. In UTF-8 mode, long characters have their length in
3986 c, with the 0x80 bit as a flag. */
3987
3988 if (repeat_max < 0)
3989 {
3990 #ifdef SUPPORT_UTF8
3991 if (utf8 && c >= 128)
3992 {
3993 memcpy(code, utf8_char, c & 7);
3994 code += c & 7;
3995 }
3996 else
3997 #endif
3998 {
3999 *code++ = c;
4000 if (prop_type >= 0)
4001 {
4002 *code++ = prop_type;
4003 *code++ = prop_value;
4004 }
4005 }
4006 *code++ = OP_STAR + repeat_type;
4007 }
4008
4009 /* Else insert an UPTO if the max is greater than the min, again
4010 preceded by the character, for the previously inserted code. If the
4011 UPTO is just for 1 instance, we can use QUERY instead. */
4012
4013 else if (repeat_max != repeat_min)
4014 {
4015 #ifdef SUPPORT_UTF8
4016 if (utf8 && c >= 128)
4017 {
4018 memcpy(code, utf8_char, c & 7);
4019 code += c & 7;
4020 }
4021 else
4022 #endif
4023 *code++ = c;
4024 if (prop_type >= 0)
4025 {
4026 *code++ = prop_type;
4027 *code++ = prop_value;
4028 }
4029 repeat_max -= repeat_min;
4030
4031 if (repeat_max == 1)
4032 {
4033 *code++ = OP_QUERY + repeat_type;
4034 }
4035 else
4036 {
4037 *code++ = OP_UPTO + repeat_type;
4038 PUT2INC(code, 0, repeat_max);
4039 }
4040 }
4041 }
4042
4043 /* The character or character type itself comes last in all cases. */
4044
4045 #ifdef SUPPORT_UTF8
4046 if (utf8 && c >= 128)
4047 {
4048 memcpy(code, utf8_char, c & 7);
4049 code += c & 7;
4050 }
4051 else
4052 #endif
4053 *code++ = c;
4054
4055 /* For a repeated Unicode property match, there are two extra bytes that
4056 define the required property. */
4057
4058 #ifdef SUPPORT_UCP
4059 if (prop_type >= 0)
4060 {
4061 *code++ = prop_type;
4062 *code++ = prop_value;
4063 }
4064 #endif
4065 }
4066
4067 /* If previous was a character class or a back reference, we put the repeat
4068 stuff after it, but just skip the item if the repeat was {0,0}. */
4069
4070 else if (*previous == OP_CLASS ||
4071 *previous == OP_NCLASS ||
4072 #ifdef SUPPORT_UTF8
4073 *previous == OP_XCLASS ||
4074 #endif
4075 *previous == OP_REF)
4076 {
4077 if (repeat_max == 0)
4078 {
4079 code = previous;
4080 goto END_REPEAT;
4081 }
4082
4083 /*--------------------------------------------------------------------*/
4084 /* This code is obsolete from release 8.00; the restriction was finally
4085 removed: */
4086
4087 /* All real repeats make it impossible to handle partial matching (maybe
4088 one day we will be able to remove this restriction). */
4089
4090 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4091 /*--------------------------------------------------------------------*/
4092
4093 if (repeat_min == 0 && repeat_max == -1)
4094 *code++ = OP_CRSTAR + repeat_type;
4095 else if (repeat_min == 1 && repeat_max == -1)
4096 *code++ = OP_CRPLUS + repeat_type;
4097 else if (repeat_min == 0 && repeat_max == 1)
4098 *code++ = OP_CRQUERY + repeat_type;
4099 else
4100 {
4101 *code++ = OP_CRRANGE + repeat_type;
4102 PUT2INC(code, 0, repeat_min);
4103 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4104 PUT2INC(code, 0, repeat_max);
4105 }
4106 }
4107
4108 /* If previous was a bracket group, we may have to replicate it in certain
4109 cases. */
4110
4111 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4112 *previous == OP_ONCE || *previous == OP_COND)
4113 {
4114 register int i;
4115 int ketoffset = 0;
4116 int len = code - previous;
4117 uschar *bralink = NULL;
4118
4119 /* Repeating a DEFINE group is pointless */
4120
4121 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4122 {
4123 *errorcodeptr = ERR55;
4124 goto FAILED;
4125 }
4126
4127 /* If the maximum repeat count is unlimited, find the end of the bracket
4128 by scanning through from the start, and compute the offset back to it
4129 from the current code pointer. There may be an OP_OPT setting following
4130 the final KET, so we can't find the end just by going back from the code
4131 pointer. */
4132
4133 if (repeat_max == -1)
4134 {
4135 register uschar *ket = previous;
4136 do ket += GET(ket, 1); while (*ket != OP_KET);
4137 ketoffset = code - ket;
4138 }
4139
4140 /* The case of a zero minimum is special because of the need to stick
4141 OP_BRAZERO in front of it, and because the group appears once in the
4142 data, whereas in other cases it appears the minimum number of times. For
4143 this reason, it is simplest to treat this case separately, as otherwise
4144 the code gets far too messy. There are several special subcases when the
4145 minimum is zero. */
4146
4147 if (repeat_min == 0)
4148 {
4149 /* If the maximum is also zero, we used to just omit the group from the
4150 output altogether, like this:
4151
4152 ** if (repeat_max == 0)
4153 ** {
4154 ** code = previous;
4155 ** goto END_REPEAT;
4156 ** }
4157
4158 However, that fails when a group is referenced as a subroutine from
4159 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4160 so that it is skipped on execution. As we don't have a list of which
4161 groups are referenced, we cannot do this selectively.
4162
4163 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4164 and do no more at this point. However, we do need to adjust any
4165 OP_RECURSE calls inside the group that refer to the group itself or any
4166 internal or forward referenced group, because the offset is from the
4167 start of the whole regex. Temporarily terminate the pattern while doing
4168 this. */
4169
4170 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4171 {
4172 *code = OP_END;
4173 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4174 memmove(previous+1, previous, len);
4175 code++;
4176 if (repeat_max == 0)
4177 {
4178 *previous++ = OP_SKIPZERO;
4179 goto END_REPEAT;
4180 }
4181 *previous++ = OP_BRAZERO + repeat_type;
4182 }
4183
4184 /* If the maximum is greater than 1 and limited, we have to replicate
4185 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4186 The first one has to be handled carefully because it's the original
4187 copy, which has to be moved up. The remainder can be handled by code
4188 that is common with the non-zero minimum case below. We have to
4189 adjust the value of repeat_max, since one less copy is required. Once
4190 again, we may have to adjust any OP_RECURSE calls inside the group. */
4191
4192 else
4193 {
4194 int offset;
4195 *code = OP_END;
4196 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4197 memmove(previous + 2 + LINK_SIZE, previous, len);
4198 code += 2 + LINK_SIZE;
4199 *previous++ = OP_BRAZERO + repeat_type;
4200 *previous++ = OP_BRA;
4201
4202 /* We chain together the bracket offset fields that have to be
4203 filled in later when the ends of the brackets are reached. */
4204
4205 offset = (bralink == NULL)? 0 : previous - bralink;
4206 bralink = previous;
4207 PUTINC(previous, 0, offset);
4208 }
4209
4210 repeat_max--;
4211 }
4212
4213 /* If the minimum is greater than zero, replicate the group as many
4214 times as necessary, and adjust the maximum to the number of subsequent
4215 copies that we need. If we set a first char from the group, and didn't
4216 set a required char, copy the latter from the former. If there are any
4217 forward reference subroutine calls in the group, there will be entries on
4218 the workspace list; replicate these with an appropriate increment. */
4219
4220 else
4221 {
4222 if (repeat_min > 1)
4223 {
4224 /* In the pre-compile phase, we don't actually do the replication. We
4225 just adjust the length as if we had. Do some paranoid checks for
4226 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4227 integer type when available, otherwise double. */
4228
4229 if (lengthptr != NULL)
4230 {
4231 int delta = (repeat_min - 1)*length_prevgroup;
4232 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4233 (INT64_OR_DOUBLE)length_prevgroup >
4234 (INT64_OR_DOUBLE)INT_MAX ||
4235 OFLOW_MAX - *lengthptr < delta)
4236 {
4237 *errorcodeptr = ERR20;
4238 goto FAILED;
4239 }
4240 *lengthptr += delta;
4241 }
4242
4243 /* This is compiling for real */
4244
4245 else
4246 {
4247 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4248 for (i = 1; i < repeat_min; i++)
4249 {
4250 uschar *hc;
4251 uschar *this_hwm = cd->hwm;
4252 memcpy(code, previous, len);
4253 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4254 {
4255 PUT(cd->hwm, 0, GET(hc, 0) + len);
4256 cd->hwm += LINK_SIZE;
4257 }
4258 save_hwm = this_hwm;
4259 code += len;
4260 }
4261 }
4262 }
4263
4264 if (repeat_max > 0) repeat_max -= repeat_min;
4265 }
4266
4267 /* This code is common to both the zero and non-zero minimum cases. If
4268 the maximum is limited, it replicates the group in a nested fashion,
4269 remembering the bracket starts on a stack. In the case of a zero minimum,
4270 the first one was set up above. In all cases the repeat_max now specifies
4271 the number of additional copies needed. Again, we must remember to
4272 replicate entries on the forward reference list. */
4273
4274 if (repeat_max >= 0)
4275 {
4276 /* In the pre-compile phase, we don't actually do the replication. We
4277 just adjust the length as if we had. For each repetition we must add 1
4278 to the length for BRAZERO and for all but the last repetition we must
4279 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
4280 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4281 a 64-bit integer type when available, otherwise double. */
4282
4283 if (lengthptr != NULL && repeat_max > 0)
4284 {
4285 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4286 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4287 if ((INT64_OR_DOUBLE)repeat_max *
4288 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4289 > (INT64_OR_DOUBLE)INT_MAX ||
4290 OFLOW_MAX - *lengthptr < delta)
4291 {
4292 *errorcodeptr = ERR20;
4293 goto FAILED;
4294 }
4295 *lengthptr += delta;
4296 }
4297
4298 /* This is compiling for real */
4299
4300 else for (i = repeat_max - 1; i >= 0; i--)
4301 {
4302 uschar *hc;
4303 uschar *this_hwm = cd->hwm;
4304
4305 *code++ = OP_BRAZERO + repeat_type;
4306
4307 /* All but the final copy start a new nesting, maintaining the
4308 chain of brackets outstanding. */
4309
4310 if (i != 0)
4311 {
4312 int offset;
4313 *code++ = OP_BRA;
4314 offset = (bralink == NULL)? 0 : code - bralink;
4315 bralink = code;
4316 PUTINC(code, 0, offset);
4317 }
4318
4319 memcpy(code, previous, len);
4320 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4321 {
4322 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4323 cd->hwm += LINK_SIZE;
4324 }
4325 save_hwm = this_hwm;
4326 code += len;
4327 }
4328
4329 /* Now chain through the pending brackets, and fill in their length
4330 fields (which are holding the chain links pro tem). */
4331
4332 while (bralink != NULL)
4333 {
4334 int oldlinkoffset;
4335 int offset = code - bralink + 1;
4336 uschar *bra = code - offset;
4337 oldlinkoffset = GET(bra, 1);
4338 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4339 *code++ = OP_KET;
4340 PUTINC(code, 0, offset);
4341 PUT(bra, 1, offset);
4342 }
4343 }
4344
4345 /* If the maximum is unlimited, set a repeater in the final copy. We
4346 can't just offset backwards from the current code point, because we
4347 don't know if there's been an options resetting after the ket. The
4348 correct offset was computed above.
4349
4350 Then, when we are doing the actual compile phase, check to see whether
4351 this group is a non-atomic one that could match an empty string. If so,
4352 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4353 that runtime checking can be done. [This check is also applied to
4354 atomic groups at runtime, but in a different way.] */
4355
4356 else
4357 {
4358 uschar *ketcode = code - ketoffset;
4359 uschar *bracode = ketcode - GET(ketcode, 1);
4360 *ketcode = OP_KETRMAX + repeat_type;
4361 if (lengthptr == NULL && *bracode != OP_ONCE)
4362 {
4363 uschar *scode = bracode;
4364 do
4365 {
4366 if (could_be_empty_branch(scode, ketcode, utf8))
4367 {
4368 *bracode += OP_SBRA - OP_BRA;
4369 break;
4370 }
4371 scode += GET(scode, 1);
4372 }
4373 while (*scode == OP_ALT);
4374 }
4375 }
4376 }
4377
4378 /* If previous is OP_FAIL, it was generated by an empty class [] in
4379 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4380 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4381 error above. We can just ignore the repeat in JS case. */
4382
4383 else if (*previous == OP_FAIL) goto END_REPEAT;
4384
4385 /* Else there's some kind of shambles */
4386
4387 else
4388 {
4389 *errorcodeptr = ERR11;
4390 goto FAILED;
4391 }
4392
4393 /* If the character following a repeat is '+', or if certain optimization
4394 tests above succeeded, possessive_quantifier is TRUE. For some of the
4395 simpler opcodes, there is a special alternative opcode for this. For
4396 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4397 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4398 but the special opcodes can optimize it a bit. The repeated item starts at
4399 tempcode, not at previous, which might be the first part of a string whose
4400 (former) last char we repeated.
4401
4402 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4403 an 'upto' may follow. We skip over an 'exact' item, and then test the
4404 length of what remains before proceeding. */
4405
4406 if (possessive_quantifier)
4407 {
4408 int len;
4409
4410 if (*tempcode == OP_TYPEEXACT)
4411 tempcode += _pcre_OP_lengths[*tempcode] +
4412 ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4413
4414 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4415 {
4416 tempcode += _pcre_OP_lengths[*tempcode];
4417 #ifdef SUPPORT_UTF8
4418 if (utf8 && tempcode[-1] >= 0xc0)
4419 tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4420 #endif
4421 }
4422
4423 len = code - tempcode;
4424 if (len > 0) switch (*tempcode)
4425 {
4426 case OP_STAR: *tempcode = OP_POSSTAR; break;
4427 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4428 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4429 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4430
4431 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4432 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4433 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4434 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4435
4436 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4437 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4438 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4439 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4440
4441 /* Because we are moving code along, we must ensure that any
4442 pending recursive references are updated. */
4443
4444 default:
4445 *code = OP_END;
4446 adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4447 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4448 code += 1 + LINK_SIZE;
4449 len += 1 + LINK_SIZE;
4450 tempcode[0] = OP_ONCE;
4451 *code++ = OP_KET;
4452 PUTINC(code, 0, len);
4453 PUT(tempcode, 1, len);
4454 break;
4455 }
4456 }
4457
4458 /* In all cases we no longer have a previous item. We also set the
4459 "follows varying string" flag for subsequently encountered reqbytes if
4460 it isn't already set and we have just passed a varying length item. */
4461
4462 END_REPEAT:
4463 previous = NULL;
4464 cd->req_varyopt |= reqvary;
4465 break;
4466
4467
4468 /* ===================================================================*/
4469 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4470 lookbehind or option setting or condition or all the other extended
4471 parenthesis forms. */
4472
4473 case CHAR_LEFT_PARENTHESIS:
4474 newoptions = options;
4475 skipbytes = 0;
4476 bravalue = OP_CBRA;
4477 save_hwm = cd->hwm;
4478 reset_bracount = FALSE;
4479
4480 /* First deal with various "verbs" that can be introduced by '*'. */
4481
4482 if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4483 {
4484 int i, namelen;
4485 const char *vn = verbnames;
4486 const uschar *name = ++ptr;
4487 previous = NULL;
4488 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4489 if (*ptr == CHAR_COLON)
4490 {
4491 *errorcodeptr = ERR59; /* Not supported */
4492 goto FAILED;
4493 }
4494 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4495 {
4496 *errorcodeptr = ERR60;
4497 goto FAILED;
4498 }
4499 namelen = ptr - name;
4500 for (i = 0; i < verbcount; i++)
4501 {
4502 if (namelen == verbs[i].len &&
4503 strncmp((char *)name, vn, namelen) == 0)
4504 {
4505 /* Check for open captures before ACCEPT */
4506
4507 if (verbs[i].op == OP_ACCEPT)
4508 {
4509 open_capitem *oc;
4510 cd->had_accept = TRUE;
4511 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4512 {
4513 *code++ = OP_CLOSE;
4514 PUT2INC(code, 0, oc->number);
4515 }
4516 }
4517 *code++ = verbs[i].op;
4518 break;
4519 }
4520 vn += verbs[i].len + 1;
4521 }
4522 if (i < verbcount) continue;
4523 *errorcodeptr = ERR60;
4524 goto FAILED;
4525 }
4526
4527 /* Deal with the extended parentheses; all are introduced by '?', and the
4528 appearance of any of them means that this is not a capturing group. */
4529
4530 else if (*ptr == CHAR_QUESTION_MARK)
4531 {
4532 int i, set, unset, namelen;
4533 int *optset;
4534 const uschar *name;
4535 uschar *slot;
4536
4537 switch (*(++ptr))
4538 {
4539 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4540 ptr++;
4541 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4542 if (*ptr == 0)
4543 {
4544 *errorcodeptr = ERR18;
4545 goto FAILED;
4546 }
4547 continue;
4548
4549
4550 /* ------------------------------------------------------------ */
4551 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4552 reset_bracount = TRUE;
4553 /* Fall through */
4554
4555 /* ------------------------------------------------------------ */
4556 case CHAR_COLON: /* Non-capturing bracket */
4557 bravalue = OP_BRA;
4558 ptr++;
4559 break;
4560
4561
4562 /* ------------------------------------------------------------ */
4563 case CHAR_LEFT_PARENTHESIS:
4564 bravalue = OP_COND; /* Conditional group */
4565
4566 /* A condition can be an assertion, a number (referring to a numbered
4567 group), a name (referring to a named group), or 'R', referring to
4568 recursion. R<digits> and R&name are also permitted for recursion tests.
4569
4570 There are several syntaxes for testing a named group: (?(name)) is used
4571 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4572
4573 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4574 be the recursive thing or the name 'R' (and similarly for 'R' followed
4575 by digits), and (b) a number could be a name that consists of digits.
4576 In both cases, we look for a name first; if not found, we try the other
4577 cases. */
4578
4579 /* For conditions that are assertions, check the syntax, and then exit
4580 the switch. This will take control down to where bracketed groups,
4581 including assertions, are processed. */
4582
4583 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4584 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4585 break;
4586
4587 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4588 below), and all need to skip 3 bytes at the start of the group. */
4589
4590 code[1+LINK_SIZE] = OP_CREF;
4591 skipbytes = 3;
4592 refsign = -1;
4593
4594 /* Check for a test for recursion in a named group. */
4595
4596 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4597 {
4598 terminator = -1;
4599 ptr += 2;
4600 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4601 }
4602
4603 /* Check for a test for a named group's having been set, using the Perl
4604 syntax (?(<name>) or (?('name') */
4605
4606 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4607 {
4608 terminator = CHAR_GREATER_THAN_SIGN;
4609 ptr++;
4610 }
4611 else if (ptr[1] == CHAR_APOSTROPHE)
4612 {
4613 terminator = CHAR_APOSTROPHE;
4614 ptr++;
4615 }
4616 else
4617 {
4618 terminator = 0;
4619 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4620 }
4621
4622 /* We now expect to read a name; anything else is an error */
4623
4624 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4625 {
4626 ptr += 1; /* To get the right offset */
4627 *errorcodeptr = ERR28;
4628 goto FAILED;
4629 }
4630
4631 /* Read the name, but also get it as a number if it's all digits */
4632
4633 recno = 0;
4634 name = ++ptr;
4635 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4636 {
4637 if (recno >= 0)
4638 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4639 recno * 10 + *ptr - CHAR_0 : -1;
4640 ptr++;
4641 }
4642 namelen = ptr - name;
4643
4644 if ((terminator > 0 && *ptr++ != terminator) ||
4645 *ptr++ != CHAR_RIGHT_PARENTHESIS)
4646 {
4647 ptr--; /* Error offset */
4648 *errorcodeptr = ERR26;
4649 goto FAILED;
4650 }
4651
4652 /* Do no further checking in the pre-compile phase. */
4653
4654 if (lengthptr != NULL) break;
4655
4656 /* In the real compile we do the work of looking for the actual
4657 reference. If the string started with "+" or "-" we require the rest to
4658 be digits, in which case recno will be set. */
4659
4660 if (refsign > 0)
4661 {
4662 if (recno <= 0)
4663 {
4664 *errorcodeptr = ERR58;
4665 goto FAILED;
4666 }
4667 recno = (refsign == CHAR_MINUS)?
4668 cd->bracount - recno + 1 : recno +cd->bracount;
4669 if (recno <= 0 || recno > cd->final_bracount)
4670 {
4671 *errorcodeptr = ERR15;
4672 goto FAILED;
4673 }
4674 PUT2(code, 2+LINK_SIZE, recno);
4675 break;
4676 }
4677
4678 /* Otherwise (did not start with "+" or "-"), start by looking for the
4679 name. If we find a name, add one to the opcode to change OP_CREF or
4680 OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
4681 except they record that the reference was originally to a name. The
4682 information is used to check duplicate names. */
4683
4684 slot = cd->name_table;
4685 for (i = 0; i < cd->names_found; i++)
4686 {
4687 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4688 slot += cd->name_entry_size;
4689 }
4690
4691 /* Found a previous named subpattern */
4692
4693 if (i < cd->names_found)
4694 {
4695 recno = GET2(slot, 0);
4696 PUT2(code, 2+LINK_SIZE, recno);
4697 code[1+LINK_SIZE]++;
4698 }
4699
4700 /* Search the pattern for a forward reference */
4701
4702 else if ((i = find_parens(cd, name, namelen,
4703 (options & PCRE_EXTENDED) != 0)) > 0)
4704 {
4705 PUT2(code, 2+LINK_SIZE, i);
4706 code[1+LINK_SIZE]++;
4707 }
4708
4709 /* If terminator == 0 it means that the name followed directly after
4710 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4711 some further alternatives to try. For the cases where terminator != 0
4712 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4713 now checked all the possibilities, so give an error. */
4714
4715 else if (terminator != 0)
4716 {
4717 *errorcodeptr = ERR15;
4718 goto FAILED;
4719 }
4720
4721 /* Check for (?(R) for recursion. Allow digits after R to specify a
4722 specific group number. */
4723
4724 else if (*name == CHAR_R)
4725 {
4726 recno = 0;
4727 for (i = 1; i < namelen; i++)
4728 {
4729 if ((digitab[name[i]] & ctype_digit) == 0)
4730 {
4731 *errorcodeptr = ERR15;
4732 goto FAILED;
4733 }
4734 recno = recno * 10 + name[i] - CHAR_0;
4735 }
4736 if (recno == 0) recno = RREF_ANY;
4737 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4738 PUT2(code, 2+LINK_SIZE, recno);
4739 }
4740
4741 /* Similarly, check for the (?(DEFINE) "condition", which is always
4742 false. */
4743
4744 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4745 {
4746 code[1+LINK_SIZE] = OP_DEF;
4747 skipbytes = 1;
4748 }
4749
4750 /* Check for the "name" actually being a subpattern number. We are
4751 in the second pass here, so final_bracount is set. */
4752
4753 else if (recno > 0 && recno <= cd->final_bracount)
4754 {
4755 PUT2(code, 2+LINK_SIZE, recno);
4756 }
4757
4758 /* Either an unidentified subpattern, or a reference to (?(0) */
4759
4760 else
4761 {
4762 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4763 goto FAILED;
4764 }
4765 break;
4766
4767
4768 /* ------------------------------------------------------------ */
4769 case CHAR_EQUALS_SIGN: /* Positive lookahead */
4770 bravalue = OP_ASSERT;
4771 ptr++;
4772 break;
4773
4774
4775 /* ------------------------------------------------------------ */
4776 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
4777 ptr++;
4778 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
4779 {
4780 *code++ = OP_FAIL;
4781 previous = NULL;
4782 continue;
4783 }
4784 bravalue = OP_ASSERT_NOT;
4785 break;
4786
4787
4788 /* ------------------------------------------------------------ */
4789 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
4790 switch (ptr[1])
4791 {
4792 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
4793 bravalue = OP_ASSERTBACK;
4794 ptr += 2;
4795 break;
4796
4797 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
4798 bravalue = OP_ASSERTBACK_NOT;
4799 ptr += 2;
4800 break;
4801
4802 default: /* Could be name define, else bad */
4803 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4804 ptr++; /* Correct offset for error */
4805 *errorcodeptr = ERR24;
4806 goto FAILED;
4807 }
4808 break;
4809
4810
4811 /* ------------------------------------------------------------ */
4812 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
4813 bravalue = OP_ONCE;
4814 ptr++;
4815 break;
4816
4817
4818 /* ------------------------------------------------------------ */
4819 case CHAR_C: /* Callout - may be followed by digits; */
4820 previous_callout = code; /* Save for later completion */
4821 after_manual_callout = 1; /* Skip one item before completing */
4822 *code++ = OP_CALLOUT;
4823 {
4824 int n = 0;
4825 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4826 n = n * 10 + *ptr - CHAR_0;
4827 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4828 {
4829 *errorcodeptr = ERR39;
4830 goto FAILED;
4831 }
4832 if (n > 255)
4833 {
4834 *errorcodeptr = ERR38;
4835 goto FAILED;
4836 }
4837 *code++ = n;
4838 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4839 PUT(code, LINK_SIZE, 0); /* Default length */
4840 code += 2 * LINK_SIZE;
4841 }
4842 previous = NULL;
4843 continue;
4844
4845
4846 /* ------------------------------------------------------------ */
4847 case CHAR_P: /* Python-style named subpattern handling */
4848 if (*(++ptr) == CHAR_EQUALS_SIGN ||
4849 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
4850 {
4851 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4852 terminator = CHAR_RIGHT_PARENTHESIS;
4853 goto NAMED_REF_OR_RECURSE;
4854 }
4855 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
4856 {
4857 *errorcodeptr = ERR41;
4858 goto FAILED;
4859 }
4860 /* Fall through to handle (?P< as (?< is handled */
4861
4862
4863 /* ------------------------------------------------------------ */
4864 DEFINE_NAME: /* Come here from (?< handling */
4865 case CHAR_APOSTROPHE:
4866 {
4867 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4868 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4869 name = ++ptr;
4870
4871 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4872 namelen = ptr - name;
4873
4874 /* In the pre-compile phase, just do a syntax check. */
4875
4876 if (lengthptr != NULL)
4877 {
4878 if (*ptr != terminator)
4879 {
4880 *errorcodeptr = ERR42;
4881 goto FAILED;
4882 }
4883 if (cd->names_found >= MAX_NAME_COUNT)
4884 {
4885 *errorcodeptr = ERR49;
4886 goto FAILED;
4887 }
4888 if (namelen + 3 > cd->name_entry_size)
4889 {
4890 cd->name_entry_size = namelen + 3;
4891 if (namelen > MAX_NAME_SIZE)
4892 {
4893 *errorcodeptr = ERR48;
4894 goto FAILED;
4895 }
4896 }
4897 }
4898
4899 /* In the real compile, create the entry in the table, maintaining
4900 alphabetical order. Duplicate names for different numbers are
4901 permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
4902 number are always OK. (An existing number can be re-used if (?|
4903 appears in the pattern.) In either event, a duplicate name results in
4904 a duplicate entry in the table, even if the number is the same. This
4905 is because the number of names, and hence the table size, is computed
4906 in the pre-compile, and it affects various numbers and pointers which
4907 would all have to be modified, and the compiled code moved down, if
4908 duplicates with the same number were omitted from the table. This
4909 doesn't seem worth the hassle. However, *different* names for the
4910 same number are not permitted. */
4911
4912 else
4913 {
4914 BOOL dupname = FALSE;
4915 slot = cd->name_table;
4916
4917 for (i = 0; i < cd->names_found; i++)
4918 {
4919 int crc = memcmp(name, slot+2, namelen);
4920 if (crc == 0)
4921 {
4922 if (slot[2+namelen] == 0)
4923 {
4924 if (GET2(slot, 0) != cd->bracount + 1 &&
4925 (options & PCRE_DUPNAMES) == 0)
4926 {
4927 *errorcodeptr = ERR43;
4928 goto FAILED;
4929 }
4930 else dupname = TRUE;
4931 }
4932 else crc = -1; /* Current name is a substring */
4933 }
4934
4935 /* Make space in the table and break the loop for an earlier
4936 name. For a duplicate or later name, carry on. We do this for
4937 duplicates so that in the simple case (when (?| is not used) they
4938 are in order of their numbers. */
4939
4940 if (crc < 0)
4941 {
4942 memmove(slot + cd->name_entry_size, slot,
4943 (cd->names_found - i) * cd->name_entry_size);
4944 break;
4945 }
4946
4947 /* Continue the loop for a later or duplicate name */
4948
4949 slot += cd->name_entry_size;
4950 }
4951
4952 /* For non-duplicate names, check for a duplicate number before
4953 adding the new name. */
4954
4955 if (!dupname)
4956 {
4957 uschar *cslot = cd->name_table;
4958 for (i = 0; i < cd->names_found; i++)
4959 {
4960 if (cslot != slot)
4961 {
4962 if (GET2(cslot, 0) == cd->bracount + 1)
4963 {
4964 *errorcodeptr = ERR65;
4965 goto FAILED;
4966 }
4967 }
4968 else i--;
4969 cslot += cd->name_entry_size;
4970 }
4971 }
4972
4973 PUT2(slot, 0, cd->bracount + 1);
4974 memcpy(slot + 2, name, namelen);
4975 slot[2+namelen] = 0;
4976 }
4977 }
4978
4979 /* In both pre-compile and compile, count the number of names we've
4980 encountered. */
4981
4982 cd->names_found++;
4983 ptr++; /* Move past > or ' */
4984 goto NUMBERED_GROUP;
4985
4986
4987 /* ------------------------------------------------------------ */
4988 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
4989 terminator = CHAR_RIGHT_PARENTHESIS;
4990 is_recurse = TRUE;
4991 /* Fall through */
4992
4993 /* We come here from the Python syntax above that handles both
4994 references (?P=name) and recursion (?P>name), as well as falling
4995 through from the Perl recursion syntax (?&name). We also come here from
4996 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4997 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4998
4999 NAMED_REF_OR_RECURSE:
5000 name = ++ptr;
5001 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5002 namelen = ptr - name;
5003
5004 /* In the pre-compile phase, do a syntax check and set a dummy
5005 reference number. */
5006
5007 if (lengthptr != NULL)
5008 {
5009 if (namelen == 0)
5010 {
5011 *errorcodeptr = ERR62;
5012 goto FAILED;
5013 }
5014 if (*ptr != terminator)
5015 {
5016 *errorcodeptr = ERR42;
5017 goto FAILED;
5018 }
5019 if (namelen > MAX_NAME_SIZE)
5020 {
5021 *errorcodeptr = ERR48;
5022 goto FAILED;
5023 }
5024 recno = 0;
5025 }
5026
5027 /* In the real compile, seek the name in the table. We check the name
5028 first, and then check that we have reached the end of the name in the
5029 table. That way, if the name is longer than any in the table,
5030 the comparison will fail without reading beyond the table entry. */
5031
5032 else
5033 {
5034 slot = cd->name_table;
5035 for (i = 0; i < cd->names_found; i++)
5036 {
5037 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
5038 slot[2+namelen] == 0)
5039 break;
5040 slot += cd->name_entry_size;
5041 }
5042
5043 if (i < cd->names_found) /* Back reference */
5044 {
5045 recno = GET2(slot, 0);
5046 }
5047 else if ((recno = /* Forward back reference */
5048 find_parens(cd, name, namelen,
5049 (options & PCRE_EXTENDED) != 0)) <= 0)
5050 {
5051 *errorcodeptr = ERR15;
5052 goto FAILED;
5053 }
5054 }
5055
5056 /* In both phases, we can now go to the code that handles numerical
5057 recursion or backreferences. */
5058
5059 if (is_recurse) goto HANDLE_RECURSION;
5060 else goto HANDLE_REFERENCE;
5061
5062
5063 /* ------------------------------------------------------------ */
5064 case CHAR_R: /* Recursion */
5065 ptr++; /* Same as (?0) */
5066 /* Fall through */
5067
5068
5069 /* ------------------------------------------------------------ */
5070 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
5071 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5072 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5073 {
5074 const uschar *called;
5075 terminator = CHAR_RIGHT_PARENTHESIS;
5076
5077 /* Come here from the \g<...> and \g'...' code (Oniguruma
5078 compatibility). However, the syntax has been checked to ensure that
5079 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5080 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5081 ever be taken. */
5082
5083 HANDLE_NUMERICAL_RECURSION:
5084
5085 if ((refsign = *ptr) == CHAR_PLUS)
5086 {
5087 ptr++;
5088 if ((digitab[*ptr] & ctype_digit) == 0)
5089 {
5090 *errorcodeptr = ERR63;
5091 goto FAILED;
5092 }
5093 }
5094 else if (refsign == CHAR_MINUS)
5095 {
5096 if ((digitab[ptr[1]] & ctype_digit) == 0)
5097 goto OTHER_CHAR_AFTER_QUERY;
5098 ptr++;
5099 }
5100
5101 recno = 0;
5102 while((digitab[*ptr] & ctype_digit) != 0)
5103 recno = recno * 10 + *ptr++ - CHAR_0;
5104
5105 if (*ptr != terminator)
5106 {
5107 *errorcodeptr = ERR29;
5108 goto FAILED;
5109 }
5110
5111 if (refsign == CHAR_MINUS)
5112 {
5113 if (recno == 0)
5114 {
5115 *errorcodeptr = ERR58;
5116 goto FAILED;
5117 }
5118 recno = cd->bracount - recno + 1;
5119 if (recno <= 0)
5120 {
5121 *errorcodeptr = ERR15;
5122 goto FAILED;
5123 }
5124 }
5125 else if (refsign == CHAR_PLUS)
5126 {
5127 if (recno == 0)
5128 {
5129 *errorcodeptr = ERR58;
5130 goto FAILED;
5131 }
5132 recno += cd->bracount;
5133 }
5134
5135 /* Come here from code above that handles a named recursion */
5136
5137 HANDLE_RECURSION:
5138
5139 previous = code;
5140 called = cd->start_code;
5141
5142 /* When we are actually compiling, find the bracket that is being
5143 referenced. Temporarily end the regex in case it doesn't exist before
5144 this point. If we end up with a forward reference, first check that
5145 the bracket does occur later so we can give the error (and position)
5146 now. Then remember this forward reference in the workspace so it can
5147 be filled in at the end. */
5148
5149 if (lengthptr == NULL)
5150 {
5151 *code = OP_END;
5152 if (recno != 0)
5153 called = _pcre_find_bracket(cd->start_code, utf8, recno);
5154
5155 /* Forward reference */
5156
5157 if (called == NULL)
5158 {
5159 if (find_parens(cd, NULL, recno,
5160 (options & PCRE_EXTENDED) != 0) < 0)
5161 {
5162 *errorcodeptr = ERR15;
5163 goto FAILED;
5164 }
5165
5166 /* Fudge the value of "called" so that when it is inserted as an
5167 offset below, what is actually inserted is the reference number
5168 of the group. */
5169
5170 called = cd->start_code + recno;
5171 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5172 }
5173
5174 /* If not a forward reference, and the subpattern is still open,
5175 this is a recursive call. We check to see if this is a left
5176 recursion that could loop for ever, and diagnose that case. */
5177
5178 else if (GET(called, 1) == 0 &&
5179 could_be_empty(called, code, bcptr, utf8))
5180 {
5181 *errorcodeptr = ERR40;
5182 goto FAILED;
5183 }
5184 }
5185
5186 /* Insert the recursion/subroutine item, automatically wrapped inside
5187 "once" brackets. Set up a "previous group" length so that a
5188 subsequent quantifier will work. */
5189
5190 *code = OP_ONCE;
5191 PUT(code, 1, 2 + 2*LINK_SIZE);
5192 code += 1 + LINK_SIZE;
5193
5194 *code = OP_RECURSE;
5195 PUT(code, 1, called - cd->start_code);
5196 code += 1 + LINK_SIZE;
5197
5198 *code = OP_KET;
5199 PUT(code, 1, 2 + 2*LINK_SIZE);
5200 code += 1 + LINK_SIZE;
5201
5202 length_prevgroup = 3 + 3*LINK_SIZE;
5203 }
5204
5205 /* Can't determine a first byte now */
5206
5207 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5208 continue;
5209
5210
5211 /* ------------------------------------------------------------ */
5212 default: /* Other characters: check option setting */
5213 OTHER_CHAR_AFTER_QUERY:
5214 set = unset = 0;
5215 optset = &set;
5216
5217 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5218 {
5219 switch (*ptr++)
5220 {
5221 case CHAR_MINUS: optset = &unset; break;
5222
5223 case CHAR_J: /* Record that it changed in the external options */
5224 *optset |= PCRE_DUPNAMES;
5225 cd->external_flags |= PCRE_JCHANGED;
5226 break;
5227
5228 case CHAR_i: *optset |= PCRE_CASELESS; break;
5229 case CHAR_m: *optset |= PCRE_MULTILINE; break;
5230 case CHAR_s: *optset |= PCRE_DOTALL; break;
5231 case CHAR_x: *optset |= PCRE_EXTENDED; break;
5232 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5233 case CHAR_X: *optset |= PCRE_EXTRA; break;
5234
5235 default: *errorcodeptr = ERR12;
5236 ptr--; /* Correct the offset */
5237 goto FAILED;
5238 }
5239 }
5240
5241 /* Set up the changed option bits, but don't change anything yet. */
5242
5243 newoptions = (options | set) & (~unset);
5244
5245 /* If the options ended with ')' this is not the start of a nested
5246 group with option changes, so the options change at this level. If this
5247 item is right at the start of the pattern, the options can be
5248 abstracted and made external in the pre-compile phase, and ignored in
5249 the compile phase. This can be helpful when matching -- for instance in
5250 caseless checking of required bytes.
5251
5252 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5253 definitely *not* at the start of the pattern because something has been
5254 compiled. In the pre-compile phase, however, the code pointer can have
5255 that value after the start, because it gets reset as code is discarded
5256 during the pre-compile. However, this can happen only at top level - if
5257 we are within parentheses, the starting BRA will still be present. At
5258 any parenthesis level, the length value can be used to test if anything
5259 has been compiled at that level. Thus, a test for both these conditions
5260 is necessary to ensure we correctly detect the start of the pattern in
5261 both phases.
5262
5263 If we are not at the pattern start, compile code to change the ims
5264 options if this setting actually changes any of them, and reset the
5265 greedy defaults and the case value for firstbyte and reqbyte. */
5266
5267 if (*ptr == CHAR_RIGHT_PARENTHESIS)
5268 {
5269 if (code == cd->start_code + 1 + LINK_SIZE &&
5270 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5271 {
5272 cd->external_options = newoptions;
5273 }
5274 else
5275 {
5276 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5277 {
5278 *code++ = OP_OPT;
5279 *code++ = newoptions & PCRE_IMS;
5280 }
5281 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5282 greedy_non_default = greedy_default ^ 1;
5283 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5284 }
5285
5286 /* Change options at this level, and pass them back for use
5287 in subsequent branches. When not at the start of the pattern, this
5288 information is also necessary so that a resetting item can be
5289 compiled at the end of a group (if we are in a group). */
5290
5291 *optionsptr = options = newoptions;
5292 previous = NULL; /* This item can't be repeated */
5293 continue; /* It is complete */
5294 }
5295
5296 /* If the options ended with ':' we are heading into a nested group
5297 with possible change of options. Such groups are non-capturing and are
5298 not assertions of any kind. All we need to do is skip over the ':';
5299 the newoptions value is handled below. */
5300
5301 bravalue = OP_BRA;
5302 ptr++;
5303 } /* End of switch for character following (? */
5304 } /* End of (? handling */
5305
5306 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5307 all unadorned brackets become non-capturing and behave like (?:...)
5308 brackets. */
5309
5310 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5311 {
5312 bravalue = OP_BRA;
5313 }
5314
5315 /* Else we have a capturing group. */
5316
5317 else
5318 {
5319 NUMBERED_GROUP:
5320 cd->bracount += 1;
5321 PUT2(code, 1+LINK_SIZE, cd->bracount);
5322 skipbytes = 2;
5323 }
5324
5325 /* Process nested bracketed regex. Assertions may not be repeated, but
5326 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5327 non-register variable in order to be able to pass its address because some
5328 compilers complain otherwise. Pass in a new setting for the ims options if
5329 they have changed. */
5330
5331 previous = (bravalue >= OP_ONCE)? code : NULL;
5332 *code = bravalue;
5333 tempcode = code;
5334 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5335 length_prevgroup = 0; /* Initialize for pre-compile phase */
5336
5337 if (!compile_regex(
5338 newoptions, /* The complete new option state */
5339 options & PCRE_IMS, /* The previous ims option state */
5340 &tempcode, /* Where to put code (updated) */
5341 &ptr, /* Input pointer (updated) */
5342 errorcodeptr, /* Where to put an error message */
5343 (bravalue == OP_ASSERTBACK ||
5344 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5345 reset_bracount, /* True if (?| group */
5346 skipbytes, /* Skip over bracket number */
5347 &subfirstbyte, /* For possible first char */
5348 &subreqbyte, /* For possible last char */
5349 bcptr, /* Current branch chain */
5350 cd, /* Tables block */
5351 (lengthptr == NULL)? NULL : /* Actual compile phase */
5352 &length_prevgroup /* Pre-compile phase */
5353 ))
5354 goto FAILED;
5355
5356 /* At the end of compiling, code is still pointing to the start of the
5357 group, while tempcode has been updated to point past the end of the group
5358 and any option resetting that may follow it. The pattern pointer (ptr)
5359 is on the bracket. */
5360
5361 /* If this is a conditional bracket, check that there are no more than
5362 two branches in the group, or just one if it's a DEFINE group. We do this
5363 in the real compile phase, not in the pre-pass, where the whole group may
5364 not be available. */
5365
5366 if (bravalue == OP_COND && lengthptr == NULL)
5367 {
5368 uschar *tc = code;
5369 int condcount = 0;
5370
5371 do {
5372 condcount++;
5373 tc += GET(tc,1);
5374 }
5375 while (*tc != OP_KET);
5376
5377 /* A DEFINE group is never obeyed inline (the "condition" is always
5378 false). It must have only one branch. */
5379
5380 if (code[LINK_SIZE+1] == OP_DEF)
5381 {
5382 if (condcount > 1)
5383 {
5384 *errorcodeptr = ERR54;
5385 goto FAILED;
5386 }
5387 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5388 }
5389
5390 /* A "normal" conditional group. If there is just one branch, we must not
5391 make use of its firstbyte or reqbyte, because this is equivalent to an
5392 empty second branch. */
5393
5394 else
5395 {
5396 if (condcount > 2)
5397 {
5398 *errorcodeptr = ERR27;
5399 goto FAILED;
5400 }
5401 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5402 }
5403 }
5404
5405 /* Error if hit end of pattern */
5406
5407 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5408 {
5409 *errorcodeptr = ERR14;
5410 goto FAILED;
5411 }
5412
5413 /* In the pre-compile phase, update the length by the length of the group,
5414 less the brackets at either end. Then reduce the compiled code to just a
5415 set of non-capturing brackets so that it doesn't use much memory if it is
5416 duplicated by a quantifier.*/
5417
5418 if (lengthptr != NULL)
5419 {
5420 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5421 {
5422 *errorcodeptr = ERR20;
5423 goto FAILED;
5424 }
5425 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5426 *code++ = OP_BRA;
5427 PUTINC(code, 0, 1 + LINK_SIZE);
5428 *code++ = OP_KET;
5429 PUTINC(code, 0, 1 + LINK_SIZE);
5430 break; /* No need to waste time with special character handling */
5431 }
5432
5433 /* Otherwise update the main code pointer to the end of the group. */
5434
5435 code = tempcode;
5436
5437 /* For a DEFINE group, required and first character settings are not
5438 relevant. */
5439
5440 if (bravalue == OP_DEF) break;
5441
5442 /* Handle updating of the required and first characters for other types of
5443 group. Update for normal brackets of all kinds, and conditions with two
5444 branches (see code above). If the bracket is followed by a quantifier with
5445 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5446 zerofirstbyte outside the main loop so that they can be accessed for the
5447 back off. */
5448
5449 zeroreqbyte = reqbyte;
5450 zerofirstbyte = firstbyte;
5451 groupsetfirstbyte = FALSE;
5452
5453 if (bravalue >= OP_ONCE)
5454 {
5455 /* If we have not yet set a firstbyte in this branch, take it from the
5456 subpattern, remembering that it was set here so that a repeat of more
5457 than one can replicate it as reqbyte if necessary. If the subpattern has
5458 no firstbyte, set "none" for the whole branch. In both cases, a zero
5459 repeat forces firstbyte to "none". */
5460
5461 if (firstbyte == REQ_UNSET)
5462 {
5463 if (subfirstbyte >= 0)
5464 {
5465 firstbyte = subfirstbyte;
5466 groupsetfirstbyte = TRUE;
5467 }
5468 else firstbyte = REQ_NONE;
5469 zerofirstbyte = REQ_NONE;
5470 }
5471
5472 /* If firstbyte was previously set, convert the subpattern's firstbyte
5473 into reqbyte if there wasn't one, using the vary flag that was in
5474 existence beforehand. */
5475
5476 else if (subfirstbyte >= 0 && subreqbyte < 0)
5477 subreqbyte = subfirstbyte | tempreqvary;
5478
5479 /* If the subpattern set a required byte (or set a first byte that isn't
5480 really the first byte - see above), set it. */
5481
5482 if (subreqbyte >= 0) reqbyte = subreqbyte;
5483 }
5484
5485 /* For a forward assertion, we take the reqbyte, if set. This can be
5486 helpful if the pattern that follows the assertion doesn't set a different
5487 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5488 for an assertion, however because it leads to incorrect effect for patterns
5489 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5490 of a firstbyte. This is overcome by a scan at the end if there's no
5491 firstbyte, looking for an asserted first char. */
5492
5493 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5494 break; /* End of processing '(' */
5495
5496
5497 /* ===================================================================*/
5498 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5499 are arranged to be the negation of the corresponding OP_values. For the
5500 back references, the values are ESC_REF plus the reference number. Only
5501 back references and those types that consume a character may be repeated.
5502 We can test for values between ESC_b and ESC_Z for the latter; this may
5503 have to change if any new ones are ever created. */
5504
5505 case CHAR_BACKSLASH:
5506 tempptr = ptr;
5507 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5508 if (*errorcodeptr != 0) goto FAILED;
5509
5510 if (c < 0)
5511 {
5512 if (-c == ESC_Q) /* Handle start of quoted string */
5513 {
5514 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5515 ptr += 2; /* avoid empty string */
5516 else inescq = TRUE;
5517 continue;
5518 }
5519
5520 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5521
5522 /* For metasequences that actually match a character, we disable the
5523 setting of a first character if it hasn't already been set. */
5524
5525 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5526 firstbyte = REQ_NONE;
5527
5528 /* Set values to reset to if this is followed by a zero repeat. */
5529
5530 zerofirstbyte = firstbyte;
5531 zeroreqbyte = reqbyte;
5532
5533 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5534 is a subroutine call by number (Oniguruma syntax). In fact, the value
5535 -ESC_g is returned only for these cases. So we don't need to check for <
5536 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5537 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5538 that is a synonym for a named back reference). */
5539
5540 if (-c == ESC_g)
5541 {
5542 const uschar *p;
5543 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5544 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5545 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5546
5547 /* These two statements stop the compiler from warning about possibly
5548 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5549 fact, because we actually check for a number below, the paths that
5550 would actually be in error are never taken. */
5551
5552 skipbytes = 0;
5553 reset_bracount = FALSE;
5554
5555 /* Test for a name */
5556
5557 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5558 {
5559 BOOL isnumber = TRUE;
5560 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5561 {
5562 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5563 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5564 }
5565 if (*p != terminator)
5566 {
5567 *errorcodeptr = ERR57;
5568 break;
5569 }
5570 if (isnumber)
5571 {
5572 ptr++;
5573 goto HANDLE_NUMERICAL_RECURSION;
5574 }
5575 is_recurse = TRUE;
5576 goto NAMED_REF_OR_RECURSE;
5577 }
5578
5579 /* Test a signed number in angle brackets or quotes. */
5580
5581 p = ptr + 2;
5582 while ((digitab[*p] & ctype_digit) != 0) p++;
5583 if (*p != terminator)
5584 {
5585 *errorcodeptr = ERR57;
5586 break;
5587 }
5588 ptr++;
5589 goto HANDLE_NUMERICAL_RECURSION;
5590 }
5591
5592 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5593 We also support \k{name} (.NET syntax) */
5594
5595 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5596 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5597 {
5598 is_recurse = FALSE;
5599 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5600 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5601 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5602 goto NAMED_REF_OR_RECURSE;
5603 }
5604
5605 /* Back references are handled specially; must disable firstbyte if
5606 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5607 ':' later. */
5608
5609 if (-c >= ESC_REF)
5610 {
5611 open_capitem *oc;
5612 recno = -c - ESC_REF;
5613
5614 HANDLE_REFERENCE: /* Come here from named backref handling */
5615 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5616 previous = code;
5617 *code++ = OP_REF;
5618 PUT2INC(code, 0, recno);
5619 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5620 if (recno > cd->top_backref) cd->top_backref = recno;
5621
5622 /* Check to see if this back reference is recursive, that is, it
5623 is inside the group that it references. A flag is set so that the
5624 group can be made atomic. */
5625
5626 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5627 {
5628 if (oc->number == recno)
5629 {
5630 oc->flag = TRUE;
5631 break;
5632 }
5633 }
5634 }
5635
5636 /* So are Unicode property matches, if supported. */
5637
5638 #ifdef SUPPORT_UCP
5639 else if (-c == ESC_P || -c == ESC_p)
5640 {
5641 BOOL negated;
5642 int pdata;
5643 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5644 if (ptype < 0) goto FAILED;
5645 previous = code;
5646 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5647 *code++ = ptype;
5648 *code++ = pdata;
5649 }
5650 #else
5651
5652 /* If Unicode properties are not supported, \X, \P, and \p are not
5653 allowed. */
5654
5655 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5656 {
5657 *errorcodeptr = ERR45;
5658 goto FAILED;
5659 }
5660 #endif
5661
5662 /* For the rest (including \X when Unicode properties are supported), we
5663 can obtain the OP value by negating the escape value. */
5664
5665 else
5666 {
5667 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5668 *code++ = -c;
5669 }
5670 continue;
5671 }
5672
5673 /* We have a data character whose value is in c. In UTF-8 mode it may have
5674 a value > 127. We set its representation in the length/buffer, and then
5675 handle it as a data character. */
5676
5677 #ifdef SUPPORT_UTF8
5678 if (utf8 && c > 127)
5679 mclength = _pcre_ord2utf8(c, mcbuffer);
5680 else
5681 #endif
5682
5683 {
5684 mcbuffer[0] = c;
5685 mclength = 1;
5686 }
5687 goto ONE_CHAR;
5688
5689
5690 /* ===================================================================*/
5691 /* Handle a literal character. It is guaranteed not to be whitespace or #
5692 when the extended flag is set. If we are in UTF-8 mode, it may be a
5693 multi-byte literal character. */
5694
5695 default:
5696 NORMAL_CHAR:
5697 mclength = 1;
5698 mcbuffer[0] = c;
5699
5700 #ifdef SUPPORT_UTF8
5701 if (utf8 && c >= 0xc0)
5702 {
5703 while ((ptr[1] & 0xc0) == 0x80)
5704 mcbuffer[mclength++] = *(++ptr);
5705 }
5706 #endif
5707
5708 /* At this point we have the character's bytes in mcbuffer, and the length
5709 in mclength. When not in UTF-8 mode, the length is always 1. */
5710
5711 ONE_CHAR:
5712 previous = code;
5713 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5714 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5715
5716 /* Remember if \r or \n were seen */
5717
5718 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5719 cd->external_flags |= PCRE_HASCRORLF;
5720
5721 /* Set the first and required bytes appropriately. If no previous first
5722 byte, set it from this character, but revert to none on a zero repeat.
5723 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5724 repeat. */
5725
5726 if (firstbyte == REQ_UNSET)
5727 {
5728 zerofirstbyte = REQ_NONE;
5729 zeroreqbyte = reqbyte;
5730
5731 /* If the character is more than one byte long, we can set firstbyte
5732 only if it is not to be matched caselessly. */
5733
5734 if (mclength == 1 || req_caseopt == 0)
5735 {
5736 firstbyte = mcbuffer[0] | req_caseopt;
5737 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5738 }
5739 else firstbyte = reqbyte = REQ_NONE;
5740 }
5741
5742 /* firstbyte was previously set; we can set reqbyte only if the length is
5743 1 or the matching is caseful. */
5744
5745 else
5746 {
5747 zerofirstbyte = firstbyte;
5748 zeroreqbyte = reqbyte;
5749 if (mclength == 1 || req_caseopt == 0)
5750 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5751 }
5752
5753 break; /* End of literal character handling */
5754 }
5755 } /* end of big loop */
5756
5757
5758 /* Control never reaches here by falling through, only by a goto for all the
5759 error states. Pass back the position in the pattern so that it can be displayed
5760 to the user for diagnosing the error. */
5761
5762 FAILED:
5763 *ptrptr = ptr;
5764 return FALSE;
5765 }
5766
5767
5768
5769
5770 /*************************************************
5771 * Compile sequence of alternatives *
5772 *************************************************/
5773
5774 /* On entry, ptr is pointing past the bracket character, but on return it
5775 points to the closing bracket, or vertical bar, or end of string. The code
5776 variable is pointing at the byte into which the BRA operator has been stored.
5777 If the ims options are changed at the start (for a (?ims: group) or during any
5778 branch, we need to insert an OP_OPT item at the start of every following branch
5779 to ensure they get set correctly at run time, and also pass the new options
5780 into every subsequent branch compile.
5781
5782 This function is used during the pre-compile phase when we are trying to find
5783 out the amount of memory needed, as well as during the real compile phase. The
5784 value of lengthptr distinguishes the two phases.
5785
5786 Arguments:
5787 options option bits, including any changes for this subpattern
5788 oldims previous settings of ims option bits
5789 codeptr -> the address of the current code pointer
5790 ptrptr -> the address of the current pattern pointer
5791 errorcodeptr -> pointer to error code variable
5792 lookbehind TRUE if this is a lookbehind assertion
5793 reset_bracount TRUE to reset the count for each branch
5794 skipbytes skip this many bytes at start (for brackets and OP_COND)
5795 firstbyteptr place to put the first required character, or a negative number
5796 reqbyteptr place to put the last required character, or a negative number
5797 bcptr pointer to the chain of currently open branches
5798 cd points to the data block with tables pointers etc.
5799 lengthptr NULL during the real compile phase
5800 points to length accumulator during pre-compile phase
5801
5802 Returns: TRUE on success; FALSE on error, with *ptrptr set to the current pattern position
5803 */
5804
5805 static BOOL
5806 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5807 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5808 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5809 int *lengthptr)
5810 {
5811 const uschar *ptr = *ptrptr; /* Working copy of the pattern pointer */
5812 uschar *code = *codeptr; /* Working copy of the code pointer */
5813 uschar *last_branch = code; /* Start of the most recent branch (BRA or ALT item) */
5814 uschar *start_bracket = code; /* Start of the whole bracketed group */
5815 uschar *reverse_count = NULL; /* Where a lookbehind's OP_REVERSE length goes */
5816 open_capitem capitem; /* Entry for the chain of open capturing groups */
5817 int capnumber = 0; /* Group number; stays zero unless *code is OP_CBRA */
5818 int firstbyte, reqbyte; /* Values for the group as a whole */
5819 int branchfirstbyte, branchreqbyte; /* Values for the branch just compiled */
5820 int length; /* Length accumulated for the pre-compile phase */
5821 int orig_bracount; /* Bracket count on entry; restored per branch for (?| */
5822 int max_bracount; /* Highest bracket count reached by any branch */
5823 int old_external_options = cd->external_options; /* To detect a leading option setting */
5824 branch_chain bc; /* This group's entry in the chain of open branches */
5825
5826 bc.outer = bcptr;
5827 bc.current_branch = code;
5828
5829 firstbyte = reqbyte = REQ_UNSET;
5830
5831 /* Accumulate the length for use in the pre-compile phase. Start with the
5832 length of the BRA and KET and any extra bytes that are required at the
5833 beginning. We accumulate in a local variable to save frequent testing of
5834 lengthptr for NULL. We cannot do this by looking at the value of code at the
5835 start and end of each alternative, because compiled items are discarded during
5836 the pre-compile phase so that the work space is not exceeded. */
5837
5838 length = 2 + 2*LINK_SIZE + skipbytes;
5839
5840 /* WARNING: If the above line is changed for any reason, you must also change
5841 the code that abstracts option settings at the start of the pattern and makes
5842 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5843 pre-compile phase to find out whether anything has yet been compiled or not. */
5844
5845 /* If this is a capturing subpattern, add to the chain of open capturing items
5846 so that we can detect them if (*ACCEPT) is encountered. This is also used to
5847 detect groups that contain recursive back references to themselves. */
5848
5849 if (*code == OP_CBRA)
5850 {
5851 capnumber = GET2(code, 1 + LINK_SIZE);
5852 capitem.number = capnumber;
5853 capitem.next = cd->open_caps;
5854 capitem.flag = FALSE; /* Set TRUE elsewhere if a back reference to this group is seen inside it */
5855 cd->open_caps = &capitem;
5856 }
5857
5858 /* Offset is set zero to mark that this bracket is still open */
5859
5860 PUT(code, 1, 0);
5861 code += 1 + LINK_SIZE + skipbytes; /* Move past the BRA item and any extra bytes */
5862
5863 /* Loop for each alternative branch */
5864
5865 orig_bracount = max_bracount = cd->bracount;
5866 for (;;)
5867 {
5868 /* For a (?| group, reset the capturing bracket count so that each branch
5869 uses the same numbers. */
5870
5871 if (reset_bracount) cd->bracount = orig_bracount;
5872
5873 /* Handle a change of ims options at the start of the branch */
5874
5875 if ((options & PCRE_IMS) != oldims)
5876 {
5877 *code++ = OP_OPT;
5878 *code++ = options & PCRE_IMS;
5879 length += 2;
5880 }
5881
5882 /* Set up dummy OP_REVERSE if lookbehind assertion */
5883
5884 if (lookbehind)
5885 {
5886 *code++ = OP_REVERSE;
5887 reverse_count = code; /* Remember where the length must be filled in later */
5888 PUTINC(code, 0, 0);
5889 length += 1 + LINK_SIZE;
5890 }
5891
5892 /* Now compile the branch; in the pre-compile phase its length gets added
5893 into the length. */
5894
5895 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5896 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5897 {
5898 *ptrptr = ptr; /* Pass back the pattern position for error reporting */
5899 return FALSE;
5900 }
5901
5902 /* If the external options have changed during this branch, it means that we
5903 are at the top level, and a leading option setting has been encountered. We
5904 need to re-set the original option values to take account of this so that,
5905 during the pre-compile phase, we know to allow for a re-set at the start of
5906 subsequent branches. */
5907
5908 if (old_external_options != cd->external_options)
5909 oldims = cd->external_options & PCRE_IMS;
5910
5911 /* Keep the highest bracket count in case (?| was used and some branch
5912 has fewer than the rest. */
5913
5914 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5915
5916 /* In the real compile phase, there is some post-processing to be done. */
5917
5918 if (lengthptr == NULL)
5919 {
5920 /* If this is the first branch, the firstbyte and reqbyte values for the
5921 branch become the values for the regex. */
5922
5923 if (*last_branch != OP_ALT)
5924 {
5925 firstbyte = branchfirstbyte;
5926 reqbyte = branchreqbyte;
5927 }
5928
5929 /* If this is not the first branch, the first char and reqbyte have to
5930 match the values from all the previous branches, except that if the
5931 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5932 and we set REQ_VARY for the regex. */
5933
5934 else
5935 {
5936 /* If we previously had a firstbyte, but it doesn't match the new branch,
5937 we have to abandon the firstbyte for the regex, but if there was
5938 previously no reqbyte, it takes on the value of the old firstbyte. */
5939
5940 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5941 {
5942 if (reqbyte < 0) reqbyte = firstbyte;
5943 firstbyte = REQ_NONE;
5944 }
5945
5946 /* If we (now or from before) have no firstbyte, a firstbyte from the
5947 branch becomes a reqbyte if there isn't a branch reqbyte. */
5948
5949 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5950 branchreqbyte = branchfirstbyte;
5951
5952 /* Now ensure that the reqbytes match */
5953
5954 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5955 reqbyte = REQ_NONE;
5956 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5957 }
5958
5959 /* If lookbehind, check that this branch matches a fixed-length string, and
5960 put the length into the OP_REVERSE item. Temporarily mark the end of the
5961 branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5962 because there may be forward references that we can't check here. Set a
5963 flag to cause another lookbehind check at the end. Why not do it all at the
5964 end? Because common, erroneous checks are picked up here and the offset of
5965 the problem can be shown. */
5966
5967 if (lookbehind)
5968 {
5969 int fixed_length;
5970 *code = OP_END;
5971 fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
5972 DPRINTF(("fixed length = %d\n", fixed_length));
5973 if (fixed_length == -3)
5974 {
5975 cd->check_lookbehind = TRUE;
5976 }
5977 else if (fixed_length < 0)
5978 {
5979 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5980 *ptrptr = ptr; /* Pass back the pattern position for error reporting */
5981 return FALSE;
5982 }
5983 else { PUT(reverse_count, 0, fixed_length); }
5984 }
5985 }
5986
5987 /* Reached end of expression, either ')' or end of pattern. In the real
5988 compile phase, go back through the alternative branches and reverse the chain
5989 of offsets, with the field in the BRA item now becoming an offset to the
5990 first alternative. If there are no alternatives, it points to the end of the
5991 group. The length in the terminating ket is always the length of the whole
5992 bracketed item. If any of the ims options were changed inside the group,
5993 compile a resetting op-code following, except at the very end of the pattern.
5994 Return leaving the pointer at the terminating char. */
5995
5996 if (*ptr != CHAR_VERTICAL_LINE)
5997 {
5998 if (lengthptr == NULL)
5999 {
6000 int branch_length = code - last_branch; /* Length of the final branch */
6001 do
6002 {
6003 int prev_length = GET(last_branch, 1); /* Backward offset to the previous branch; zero at the BRA */
6004 PUT(last_branch, 1, branch_length); /* Replace it with the forward offset */
6005 branch_length = prev_length;
6006 last_branch -= branch_length; /* Step back to the previous branch */
6007 }
6008 while (branch_length > 0);
6009 }
6010
6011 /* Fill in the ket */
6012
6013 *code = OP_KET;
6014 PUT(code, 1, code - start_bracket);
6015 code += 1 + LINK_SIZE;
6016
6017 /* If it was a capturing subpattern, check to see if it contained any
6018 recursive back references. If so, we must wrap it in atomic brackets.
6019 In any event, remove the block from the chain. */
6020
6021 if (capnumber > 0)
6022 {
6023 if (cd->open_caps->flag)
6024 {
6025 memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
6026 code - start_bracket); /* Shift the group up to make room for OP_ONCE */
6027 *start_bracket = OP_ONCE;
6028 code += 1 + LINK_SIZE; /* Allow for the shift just made */
6029 PUT(start_bracket, 1, code - start_bracket);
6030 *code = OP_KET; /* Closing ket for the OP_ONCE wrapper */
6031 PUT(code, 1, code - start_bracket);
6032 code += 1 + LINK_SIZE;
6033 length += 2 + 2*LINK_SIZE; /* For the added OP_ONCE and OP_KET items */
6034 }
6035 cd->open_caps = cd->open_caps->next; /* Take the head item off the chain */
6036 }
6037
6038 /* Reset options if needed. */
6039
6040 if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
6041 {
6042 *code++ = OP_OPT;
6043 *code++ = oldims;
6044 length += 2;
6045 }
6046
6047 /* Retain the highest bracket number, in case resetting was used. */
6048
6049 cd->bracount = max_bracount;
6050
6051 /* Set values to pass back */
6052
6053 *codeptr = code;
6054 *ptrptr = ptr;
6055 *firstbyteptr = firstbyte;
6056 *reqbyteptr = reqbyte;
6057 if (lengthptr != NULL)
6058 {
6059 if (OFLOW_MAX - *lengthptr < length) /* Adding length would exceed OFLOW_MAX */
6060 {
6061 *errorcodeptr = ERR20;
6062 return FALSE;
6063 }
6064 *lengthptr += length;
6065 }
6066 return TRUE;
6067 }
6068
6069 /* Another branch follows. In the pre-compile phase, we can move the code
6070 pointer back to where it was for the start of the first branch. (That is,
6071 pretend that each branch is the only one.)
6072
6073 In the real compile phase, insert an ALT node. Its length field points back
6074 to the previous branch while the bracket remains open. At the end the chain
6075 is reversed. It's done like this so that the start of the bracket has a
6076 zero offset until it is closed, making it possible to detect recursion. */
6077
6078 if (lengthptr != NULL)
6079 {
6080 code = *codeptr + 1 + LINK_SIZE + skipbytes; /* Back to the start of the first branch */
6081 length += 1 + LINK_SIZE; /* Same size as the ALT item written in the real compile */
6082 }
6083 else
6084 {
6085 *code = OP_ALT;
6086 PUT(code, 1, code - last_branch);
6087 bc.current_branch = last_branch = code;
6088 code += 1 + LINK_SIZE;
6089 }
6090
6091 ptr++; /* Skip the vertical bar */
6092 }
6093 /* Control never reaches here */
6094 }
6095
6096
6097
6098
6099 /*************************************************
6100 * Check for anchored expression *
6101 *************************************************/
6102
6103 /* Try to find out if this is an anchored regular expression. Consider each
6104 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6105 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6106 it's anchored. However, if this is a multiline pattern, then only OP_SOD
6107 counts, since OP_CIRC can match in the middle.
6108
6109 We can also consider a regex to be anchored if OP_SOM starts all its branches.
6110 This is the code for \G, which means "match at start of match position, taking
6111 into account the match offset".
6112
6113 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6114 because that will try the rest of the pattern at all possible matching points,
6115 so there is no point trying again.... er ....
6116
6117 .... except when the .* appears inside capturing parentheses, and there is a
6118 subsequent back reference to those parentheses. We haven't enough information
6119 to catch that case precisely.
6120
6121 At first, the best we could do was to detect when .* was in capturing brackets
6122 and the highest back reference was greater than or equal to that level.
6123 However, by keeping a bitmap of the first 31 back references, we can catch some
6124 of the more common cases more precisely.
6125
6126 Arguments:
6127 code points to start of expression (the bracket)
6128 options points to the options setting
6129 bracket_map a bitmap of which brackets we are inside while testing; this
6130 handles up to substring 31; after that we just have to take
6131 the less precise approach
6132 backref_map the back reference bitmap
6133
6134 Returns: TRUE or FALSE
6135 */
6136
6137 static BOOL
6138 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
6139 unsigned int backref_map)
6140 {
6141 do {
6142 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6143 options, PCRE_MULTILINE, FALSE);
6144 register int op = *scode;
6145
6146 /* Non-capturing brackets */
6147
6148 if (op == OP_BRA)
6149 {
6150 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6151 }
6152
6153 /* Capturing brackets */
6154
6155 else if (op == OP_CBRA)
6156 {
6157 int n = GET2(scode, 1+LINK_SIZE);
6158 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6159 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
6160 }
6161
6162 /* Other brackets */
6163
6164 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6165 {
6166 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6167 }
6168
6169 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6170 it isn't in brackets that are or may be referenced. */
6171
6172 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6173 op == OP_TYPEPOSSTAR))
6174 {
6175 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6176 return FALSE;
6177 }
6178
6179 /* Check for explicit anchoring */
6180
6181 else if (op != OP_SOD && op != OP_SOM &&
6182 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6183 return FALSE;
6184 code += GET(code, 1);
6185 }
6186 while (*code == OP_ALT); /* Loop for each alternative */
6187 return TRUE;
6188 }
6189
6190
6191
6192 /*************************************************
6193 * Check for starting with ^ or .* *
6194 *************************************************/
6195
6196 /* This is called to find out if every branch starts with ^ or .* so that
6197 "first char" processing can be done to speed things up in multiline
6198 matching and for non-DOTALL patterns that start with .* (which must start at
6199 the beginning or after \n). As in the case of is_anchored() (see above), we
6200 have to take account of back references to capturing brackets that contain .*
6201 because in that case we can't make the assumption.
6202
6203 Arguments:
6204 code points to start of expression (the bracket)
6205 bracket_map a bitmap of which brackets we are inside while testing; this
6206 handles up to substring 31; after that we just have to take
6207 the less precise approach
6208 backref_map the back reference bitmap
6209
6210 Returns: TRUE or FALSE
6211 */
6212
6213 static BOOL
6214 is_startline(const uschar *code, unsigned int bracket_map,
6215 unsigned int backref_map)
6216 {
6217 do {
6218 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6219 NULL, 0, FALSE);
6220 register int op = *scode;
6221
6222 /* If we are at the start of a conditional assertion group, *both* the
6223 conditional assertion *and* what follows the condition must satisfy the test
6224 for start of line. Other kinds of condition fail. Note that there may be an
6225 auto-callout at the start of a condition. */
6226
6227 if (op == OP_COND)
6228 {
6229 scode += 1 + LINK_SIZE;
6230 if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6231 switch (*scode)
6232 {
6233 case OP_CREF:
6234 case OP_NCREF:
6235 case OP_RREF:
6236 case OP_NRREF:
6237 case OP_DEF:
6238 return FALSE;
6239
6240 default: /* Assertion */
6241 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6242 do scode += GET(scode, 1); while (*scode == OP_ALT);
6243 scode += 1 + LINK_SIZE;
6244 break;
6245 }
6246 scode = first_significant_code(scode, NULL, 0, FALSE);
6247 op = *scode;
6248 }
6249
6250 /* Non-capturing brackets */
6251
6252 if (op == OP_BRA)
6253 {
6254 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6255 }
6256
6257 /* Capturing brackets */
6258
6259 else if (op == OP_CBRA)
6260 {
6261 int n = GET2(scode, 1+LINK_SIZE);
6262 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6263 if (!is_startline(scode, new_map, backref_map)) return FALSE;
6264 }
6265
6266 /* Other brackets */
6267
6268 else if (op == OP_ASSERT || op == OP_ONCE)
6269 {
6270 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6271 }
6272
6273 /* .* means "start at start or after \n" if it isn't in brackets that
6274 may be referenced. */
6275
6276 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6277 {
6278 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6279 }
6280
6281 /* Check for explicit circumflex */
6282
6283 else if (op != OP_CIRC) return FALSE;
6284
6285 /* Move on to the next alternative */
6286
6287 code += GET(code, 1);
6288 }
6289 while (*code == OP_ALT); /* Loop for each alternative */
6290 return TRUE;
6291 }
6292
6293
6294
6295 /*************************************************
6296 * Check for asserted fixed first char *
6297 *************************************************/
6298
6299 /* During compilation, the "first char" settings from forward assertions are
6300 discarded, because they can cause conflicts with actual literals that follow.
6301 However, if we end up without a first char setting for an unanchored pattern,
6302 it is worth scanning the regex to see if there is an initial asserted first
6303 char. If all branches start with the same asserted char, or with a bracket all
6304 of whose alternatives start with the same asserted char (recurse ad lib), then
6305 we return that char, otherwise -1.
6306
6307 Arguments:
6308 code points to start of expression (the bracket)
6309 options pointer to the options (used to check casing changes)
6310 inassert TRUE if in an assertion
6311
6312 Returns: -1 or the fixed first char
6313 */
6314
6315 static int
6316 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6317 {
6318 register int c = -1;
6319 do {
6320 int d;
6321 const uschar *scode =
6322 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6323 register int op = *scode;
6324
6325 switch(op)
6326 {
6327 default:
6328 return -1;
6329
6330 case OP_BRA:
6331 case OP_CBRA:
6332 case OP_ASSERT:
6333 case OP_ONCE:
6334 case OP_COND:
6335 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6336 return -1;
6337 if (c < 0) c = d; else if (c != d) return -1;
6338 break;
6339
6340 case OP_EXACT: /* Fall through */
6341 scode += 2;
6342
6343 case OP_CHAR:
6344 case OP_CHARNC:
6345 case OP_PLUS:
6346 case OP_MINPLUS:
6347 case OP_POSPLUS:
6348 if (!inassert) return -1;
6349 if (c < 0)
6350 {
6351 c = scode[1];
6352 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6353 }
6354 else if (c != scode[1]) return -1;
6355 break;
6356 }
6357
6358 code += GET(code, 1);
6359 }
6360 while (*code == OP_ALT);
6361 return c;
6362 }
6363
6364
6365
6366 /*************************************************
6367 * Compile a Regular Expression *
6368 *************************************************/
6369
6370 /* This function takes a string and returns a pointer to a block of store
6371 holding a compiled version of the expression. The original API for this
6372 function had no error code return variable; it is retained for backwards
6373 compatibility. The new function is given a new name.
6374
6375 Arguments:
6376 pattern the regular expression
6377 options various option bits
6378 errorcodeptr pointer to error code variable (pcre_compile2() only)
6379 can be NULL if you don't want a code value
6380 errorptr pointer to pointer to error text
6381 erroroffset ptr offset in pattern where error was detected
6382 tables pointer to character tables or NULL
6383
6384 Returns: pointer to compiled data block, or NULL on error,
6385 with errorptr and erroroffset set
6386 */
6387
6388 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6389 pcre_compile(const char *pattern, int options, const char **errorptr,
6390 int *erroroffset, const unsigned char *tables)
6391 {
6392 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6393 }
6394
6395
6396 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6397 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6398 const char **errorptr, int *erroroffset, const unsigned char *tables)
6399 {
6400 real_pcre *re;
6401 int length = 1; /* For final END opcode */
6402 int firstbyte, reqbyte, newline;
6403 int errorcode = 0;
6404 int skipatstart = 0;
6405 BOOL utf8 = (options & PCRE_UTF8) != 0;
6406 size_t size;
6407 uschar *code;
6408 const uschar *codestart;
6409 const uschar *ptr;
6410 compile_data compile_block;
6411 compile_data *cd = &compile_block;
6412
6413 /* This space is used for "compiling" into during the first phase, when we are
6414 computing the amount of memory that is needed. Compiled items are thrown away
6415 as soon as possible, so that a fairly large buffer should be sufficient for
6416 this purpose. The same space is used in the second phase for remembering where
6417 to fill in forward references to subpatterns. */
6418
6419 uschar cworkspace[COMPILE_WORK_SIZE];
6420
6421 /* Set this early so that early errors get offset 0. */
6422
6423 ptr = (const uschar *)pattern;
6424
6425 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6426 can do is just return NULL, but we can set a code value if there is a code
6427 pointer. */
6428
6429 if (errorptr == NULL)
6430 {
6431 if (errorcodeptr != NULL) *errorcodeptr = 99;
6432 return NULL;
6433 }
6434
6435 *errorptr = NULL;
6436 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6437
6438 /* However, we can give a message for this error */
6439
6440 if (erroroffset == NULL)
6441 {
6442 errorcode = ERR16;
6443 goto PCRE_EARLY_ERROR_RETURN2;
6444 }
6445
6446 *erroroffset = 0;
6447
6448 /* Set up pointers to the individual character tables */
6449
6450 if (tables == NULL) tables = _pcre_default_tables;
6451 cd->lcc = tables + lcc_offset;
6452 cd->fcc = tables + fcc_offset;
6453 cd->cbits = tables + cbits_offset;
6454 cd->ctypes = tables + ctypes_offset;
6455
6456 /* Check that all undefined public option bits are zero */
6457
6458 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6459 {
6460 errorcode = ERR17;
6461 goto PCRE_EARLY_ERROR_RETURN;
6462 }
6463
6464 /* Check for global one-time settings at the start of the pattern, and remember
6465 the offset for later. */
6466
6467 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6468 ptr[skipatstart+1] == CHAR_ASTERISK)
6469 {
6470 int newnl = 0;
6471 int newbsr = 0;
6472
6473 if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6474 { skipatstart += 7; options |= PCRE_UTF8; continue; }
6475
6476 if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6477 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6478 else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0)
6479 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6480 else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0)
6481 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6482 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6483 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6484 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6485 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6486
6487 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6488 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6489 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6490 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6491
6492 if (newnl != 0)
6493 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6494 else if (newbsr != 0)
6495 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6496 else break;
6497 }
6498
6499 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6500
6501 #ifdef SUPPORT_UTF8
6502 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6503 (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
6504 {
6505 errorcode = ERR44;
6506 goto PCRE_EARLY_ERROR_RETURN2;
6507 }
6508 #else
6509 if (utf8)
6510 {
6511 errorcode = ERR32;
6512 goto PCRE_EARLY_ERROR_RETURN;
6513 }
6514 #endif
6515
6516 /* Check validity of \R options. */
6517
6518 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6519 {
6520 case 0:
6521 case PCRE_BSR_ANYCRLF:
6522 case PCRE_BSR_UNICODE:
6523 break;
6524 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6525 }
6526
6527 /* Handle different types of newline. The three bits give seven cases. The
6528 current code allows for fixed one- or two-byte sequences, plus "any" and
6529 "anycrlf". */
6530
6531 switch (options & PCRE_NEWLINE_BITS)
6532 {
6533 case 0: newline = NEWLINE; break; /* Build-time default */
6534 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6535 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6536 case PCRE_NEWLINE_CR+
6537 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6538 case PCRE_NEWLINE_ANY: newline = -1; break;
6539 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6540 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6541 }
6542
6543 if (newline == -2)
6544 {
6545 cd->nltype = NLTYPE_ANYCRLF;
6546 }
6547 else if (newline < 0)
6548 {
6549 cd->nltype = NLTYPE_ANY;
6550 }
6551 else
6552 {
6553 cd->nltype = NLTYPE_FIXED;
6554 if (newline > 255)
6555 {
6556 cd->nllen = 2;
6557 cd->nl[0] = (newline >> 8) & 255;
6558 cd->nl[1] = newline & 255;
6559 }
6560 else
6561 {
6562 cd->nllen = 1;
6563 cd->nl[0] = newline;
6564 }
6565 }
6566
6567 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6568 references to help in deciding whether (.*) can be treated as anchored or not.
6569 */
6570
6571 cd->top_backref = 0;
6572 cd->backref_map = 0;
6573
6574 /* Reflect pattern for debugging output */
6575
6576 DPRINTF(("------------------------------------------------------------------\n"));
6577 DPRINTF(("%s\n", pattern));
6578
6579 /* Pretend to compile the pattern while actually just accumulating the length
6580 of memory required. This behaviour is triggered by passing a non-NULL final
6581 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6582 to compile parts of the pattern into; the compiled code is discarded when it is
6583 no longer needed, so hopefully this workspace will never overflow, though there
6584 is a test for its doing so. */
6585
6586 cd->bracount = cd->final_bracount = 0;
6587 cd->names_found = 0;
6588 cd->name_entry_size = 0;
6589 cd->name_table = NULL;
6590 cd->start_workspace = cworkspace;
6591 cd->start_code = cworkspace;
6592 cd->hwm = cworkspace;
6593 cd->start_pattern = (const uschar *)pattern;
6594 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6595 cd->req_varyopt = 0;
6596 cd->external_options = options;
6597 cd->external_flags = 0;
6598 cd->open_caps = NULL;
6599
6600 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6601 don't need to look at the result of the function here. The initial options have
6602 been put into the cd block so that they can be changed if an option setting is
6603 found within the regex right at the beginning. Bringing initial option settings
6604 outside can help speed up starting point checks. */
6605
6606 ptr += skipatstart;
6607 code = cworkspace;
6608 *code = OP_BRA;
6609 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6610 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6611 &length);
6612 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6613
6614 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6615 cd->hwm - cworkspace));
6616
6617 if (length > MAX_PATTERN_SIZE)
6618 {
6619 errorcode = ERR20;
6620 goto PCRE_EARLY_ERROR_RETURN;
6621 }
6622
6623 /* Compute the size of data block needed and get it, either from malloc or
6624 externally provided function. Integer overflow should no longer be possible
6625 because nowadays we limit the maximum value of cd->names_found and
6626 cd->name_entry_size. */
6627
6628 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6629 re = (real_pcre *)(pcre_malloc)(size);
6630
6631 if (re == NULL)
6632 {
6633 errorcode = ERR21;
6634 goto PCRE_EARLY_ERROR_RETURN;
6635 }
6636
6637 /* Put in the magic number, and save the sizes, initial options, internal
6638 flags, and character table pointer. NULL is used for the default character
6639 tables. The nullpad field is at the end; it's there to help in the case when a