/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 438 - (show annotations) (download)
Sun Sep 6 20:00:47 2009 UTC (4 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 212803 byte(s)
Fix internal error for forward reference with [^m] interposing.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a (bit b lives in byte b/8, at position b%8 within that byte).

Both arguments and the whole expansion are parenthesized so that expression
arguments such as SETBIT(map, c + 1) index the intended byte, and so that the
macro can be used safely inside a larger expression. Note that b is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC
101
102 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103 in UTF-8 mode. */
104
105 static const short int escapes[] = {
106 0, 0,
107 0, 0,
108 0, 0,
109 0, 0,
110 0, 0,
111 CHAR_COLON, CHAR_SEMICOLON,
112 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
113 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
114 CHAR_COMMERCIAL_AT, -ESC_A,
115 -ESC_B, -ESC_C,
116 -ESC_D, -ESC_E,
117 0, -ESC_G,
118 -ESC_H, 0,
119 0, -ESC_K,
120 0, 0,
121 0, 0,
122 -ESC_P, -ESC_Q,
123 -ESC_R, -ESC_S,
124 0, 0,
125 -ESC_V, -ESC_W,
126 -ESC_X, 0,
127 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
128 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
129 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
130 CHAR_GRAVE_ACCENT, 7,
131 -ESC_b, 0,
132 -ESC_d, ESC_e,
133 ESC_f, 0,
134 -ESC_h, 0,
135 0, -ESC_k,
136 0, 0,
137 ESC_n, 0,
138 -ESC_p, 0,
139 ESC_r, -ESC_s,
140 ESC_tee, 0,
141 -ESC_v, -ESC_w,
142 0, 0,
143 -ESC_z
144 };
145
146 #else
147
148 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149
150 static const short int escapes[] = {
151 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
152 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
153 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
154 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
155 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
156 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
157 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
158 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
159 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
160 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
161 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
162 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
163 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
164 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
165 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
166 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
167 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
168 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
169 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
170 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
171 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
172 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
173 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
174 };
175 #endif
176
177
178 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179 searched linearly. Put all the names into a single string, in order to reduce
180 the number of relocations when a shared library is dynamically linked. The
181 string is built from string macros so that it works in UTF-8 mode on EBCDIC
182 platforms. */
183
184 typedef struct verbitem {
185 int len;
186 int op;
187 } verbitem;
188
189 static const char verbnames[] =
190 STRING_ACCEPT0
191 STRING_COMMIT0
192 STRING_F0
193 STRING_FAIL0
194 STRING_PRUNE0
195 STRING_SKIP0
196 STRING_THEN;
197
198 static const verbitem verbs[] = {
199 { 6, OP_ACCEPT },
200 { 6, OP_COMMIT },
201 { 1, OP_FAIL },
202 { 4, OP_FAIL },
203 { 5, OP_PRUNE },
204 { 4, OP_SKIP },
205 { 4, OP_THEN }
206 };
207
208 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
209
210
211 /* Tables of names of POSIX character classes and their lengths. The names are
212 now all in a single string, to reduce the number of relocations when a shared
213 library is dynamically loaded. The list of lengths is terminated by a zero
214 length entry. The first three must be alpha, lower, upper, as this is assumed
215 for handling case independence. */
216
217 static const char posix_names[] =
218 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221 STRING_word0 STRING_xdigit;
222
223 static const uschar posix_name_lengths[] = {
224 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
225
226 /* Table of class bit maps for each POSIX class. Each class is formed from a
227 base map, with an optional addition or removal of another map. Then, for some
228 classes, there is some additional tweaking: for [:blank:] the vertical space
229 characters are removed, and for [:alpha:] and [:alnum:] the underscore
230 character is removed. The triples in the table consist of the base map offset,
231 second map offset or -1 if no second map, and a non-negative value for map
232 addition or a negative value for map subtraction (if there are two maps). The
233 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
234 remove vertical space characters, 2 => remove underscore. */
235
236 static const int posix_class_maps[] = {
237 cbit_word, cbit_digit, -2, /* alpha */
238 cbit_lower, -1, 0, /* lower */
239 cbit_upper, -1, 0, /* upper */
240 cbit_word, -1, 2, /* alnum - word without underscore */
241 cbit_print, cbit_cntrl, 0, /* ascii */
242 cbit_space, -1, 1, /* blank - a GNU extension */
243 cbit_cntrl, -1, 0, /* cntrl */
244 cbit_digit, -1, 0, /* digit */
245 cbit_graph, -1, 0, /* graph */
246 cbit_print, -1, 0, /* print */
247 cbit_punct, -1, 0, /* punct */
248 cbit_space, -1, 0, /* space */
249 cbit_word, -1, 0, /* word - a Perl extension */
250 cbit_xdigit,-1, 0 /* xdigit */
251 };
252
253
/* Two-level stringification. STRING() turns its argument into a string literal
exactly as written; XSTRING() lets the argument be macro-expanded first and
then stringifies the result. */

#define STRING(text)    #text
#define XSTRING(macro)  STRING(macro)
256
257 /* The texts of compile-time error messages. These are "char *" because they
258 are passed to the outside world. Do not ever re-use any error number, because
259 they are documented. Always add a new error instead. Messages marked DEAD below
260 are no longer used. This used to be a table of strings, but in order to reduce
261 the number of relocations needed when a shared library is loaded dynamically,
262 it is now one long string. We cannot use a table of offsets, because the
263 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264 simply count through to the one we want - this isn't a performance issue
265 because these strings are used only when there is a compilation error. */
266
267 static const char error_texts[] =
268 "no error\0"
269 "\\ at end of pattern\0"
270 "\\c at end of pattern\0"
271 "unrecognized character follows \\\0"
272 "numbers out of order in {} quantifier\0"
273 /* 5 */
274 "number too big in {} quantifier\0"
275 "missing terminating ] for character class\0"
276 "invalid escape sequence in character class\0"
277 "range out of order in character class\0"
278 "nothing to repeat\0"
279 /* 10 */
280 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
281 "internal error: unexpected repeat\0"
282 "unrecognized character after (? or (?-\0"
283 "POSIX named classes are supported only within a class\0"
284 "missing )\0"
285 /* 15 */
286 "reference to non-existent subpattern\0"
287 "erroffset passed as NULL\0"
288 "unknown option bit(s) set\0"
289 "missing ) after comment\0"
290 "parentheses nested too deeply\0" /** DEAD **/
291 /* 20 */
292 "regular expression is too large\0"
293 "failed to get memory\0"
294 "unmatched parentheses\0"
295 "internal error: code overflow\0"
296 "unrecognized character after (?<\0"
297 /* 25 */
298 "lookbehind assertion is not fixed length\0"
299 "malformed number or name after (?(\0"
300 "conditional group contains more than two branches\0"
301 "assertion expected after (?(\0"
302 "(?R or (?[+-]digits must be followed by )\0"
303 /* 30 */
304 "unknown POSIX class name\0"
305 "POSIX collating elements are not supported\0"
306 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307 "spare error\0" /** DEAD **/
308 "character value in \\x{...} sequence is too large\0"
309 /* 35 */
310 "invalid condition (?(0)\0"
311 "\\C not allowed in lookbehind assertion\0"
312 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313 "number after (?C is > 255\0"
314 "closing ) for (?C expected\0"
315 /* 40 */
316 "recursive call could loop indefinitely\0"
317 "unrecognized character after (?P\0"
318 "syntax error in subpattern name (missing terminator)\0"
319 "two named subpatterns have the same name\0"
320 "invalid UTF-8 string\0"
321 /* 45 */
322 "support for \\P, \\p, and \\X has not been compiled\0"
323 "malformed \\P or \\p sequence\0"
324 "unknown property name after \\P or \\p\0"
325 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327 /* 50 */
328 "repeated subpattern is too long\0" /** DEAD **/
329 "octal value is greater than \\377 (not in UTF-8 mode)\0"
330 "internal error: overran compiling workspace\0"
331 "internal error: previously-checked referenced subpattern not found\0"
332 "DEFINE group contains more than one branch\0"
333 /* 55 */
334 "repeating a DEFINE group is not allowed\0"
335 "inconsistent NEWLINE options\0"
336 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337 "a numbered reference must not be zero\0"
338 "(*VERB) with an argument is not supported\0"
339 /* 60 */
340 "(*VERB) not recognized\0"
341 "number is too big\0"
342 "subpattern name expected\0"
343 "digit expected after (?+\0"
344 "] is an invalid data character in JavaScript compatibility mode";
345
346
347 /* Table to identify digits and hex digits. This is used when compiling
348 patterns. Note that the tables in chartables are dependent on the locale, and
349 may mark arbitrary characters as digits - but the PCRE compiling code expects
350 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
351 a private table here. It costs 256 bytes, but it is a lot faster than doing
352 character value tests (at least in some simple cases I timed), and in some
353 applications one wants PCRE to compile efficiently as well as match
354 efficiently.
355
356 For convenience, we use the same bit definitions as in chartables:
357
358 0x04 decimal digit
359 0x08 hexadecimal digit
360
361 Then we can use ctype_digit and ctype_xdigit in the code. */
362
363 #ifndef EBCDIC
364
365 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
366 UTF-8 mode. */
367
368 static const unsigned char digitab[] =
369 {
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
376 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
377 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
378 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
382 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
395 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
396 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
397 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
399 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
402
403 #else
404
405 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
406
407 static const unsigned char digitab[] =
408 {
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
413 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
414 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
415 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
416 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
417 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
418 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
419 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
420 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
421 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
422 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
423 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
424 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
425 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
426 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
427 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
428 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
429 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
430 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
431 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
432 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
433 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
434 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
435 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
436 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
437 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
438 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
439 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
440 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
441
442 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
443 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
444 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
445 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
446 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
447 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
448 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
449 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
450 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
451 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
452 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
453 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
454 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
455 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
456 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
457 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
458 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
459 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
460 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
461 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
462 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
463 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
464 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
465 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
466 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
467 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
468 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
469 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
470 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
471 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
472 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
473 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
474 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
475 #endif
476
477
478 /* Definition to allow mutual recursion */
479
480 static BOOL
481 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
482 int *, int *, branch_chain *, compile_data *, int *);
483
484
485
486 /*************************************************
487 * Find an error text *
488 *************************************************/
489
490 /* The error texts are now all in one long string, to save on relocations. As
491 some of the text is of unknown length, we can't use a table of offsets.
492 Instead, just count through the strings. This is not a performance issue
493 because it happens only when there has been a compilation error.
494
495 Argument: the error number
496 Returns: pointer to the error string
497 */
498
499 static const char *
500 find_error_text(int n)
501 {
502 const char *s = error_texts;
503 for (; n > 0; n--) while (*s++ != 0) {};
504 return s;
505 }
506
507
508 /*************************************************
509 * Handle escapes *
510 *************************************************/
511
512 /* This function is called when a \ has been encountered. It either returns a
513 positive value for a simple escape such as \n, or a negative value which
514 encodes one of the more complicated things such as \d. A backreference to group
515 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
516 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
517 ptr is pointing at the \. On exit, it is on the final character of the escape
518 sequence.
519
520 Arguments:
521 ptrptr points to the pattern position pointer
522 errorcodeptr points to the errorcode variable
523 bracount number of previous extracting brackets
524 options the options bits
525 isclass TRUE if inside a character class
526
527 Returns: zero or positive => a data character
528 negative => a special escape sequence
529 on error, errorcodeptr is set
530 */
531
532 static int
533 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
534 int options, BOOL isclass)
535 {
536 BOOL utf8 = (options & PCRE_UTF8) != 0;
537 const uschar *ptr = *ptrptr + 1;
538 int c, i;
539
540 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
541 ptr--; /* Set pointer back to the last byte */
542
543 /* If backslash is at the end of the pattern, it's an error. */
544
545 if (c == 0) *errorcodeptr = ERR1;
546
547 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
548 in a table. A non-zero result is something that can be returned immediately.
549 Otherwise further processing may be required. */
550
551 #ifndef EBCDIC /* ASCII/UTF-8 coding */
552 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
553 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
554
555 #else /* EBCDIC coding */
556 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
557 else if ((i = escapes[c - 0x48]) != 0) c = i;
558 #endif
559
560 /* Escapes that need further processing, or are illegal. */
561
562 else
563 {
564 const uschar *oldptr;
565 BOOL braced, negated;
566
567 switch (c)
568 {
569 /* A number of Perl escapes are not handled by PCRE. We give an explicit
570 error. */
571
572 case CHAR_l:
573 case CHAR_L:
574 case CHAR_N:
575 case CHAR_u:
576 case CHAR_U:
577 *errorcodeptr = ERR37;
578 break;
579
580 /* \g must be followed by one of a number of specific things:
581
582 (1) A number, either plain or braced. If positive, it is an absolute
583 backreference. If negative, it is a relative backreference. This is a Perl
584 5.10 feature.
585
586 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
587 is part of Perl's movement towards a unified syntax for back references. As
588 this is synonymous with \k{name}, we fudge it up by pretending it really
589 was \k.
590
591 (3) For Oniguruma compatibility we also support \g followed by a name or a
592 number either in angle brackets or in single quotes. However, these are
593 (possibly recursive) subroutine calls, _not_ backreferences. Just return
594 the -ESC_g code (cf \k). */
595
596 case CHAR_g:
597 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
598 {
599 c = -ESC_g;
600 break;
601 }
602
603 /* Handle the Perl-compatible cases */
604
605 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
606 {
607 const uschar *p;
608 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
609 if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
610 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
611 {
612 c = -ESC_k;
613 break;
614 }
615 braced = TRUE;
616 ptr++;
617 }
618 else braced = FALSE;
619
620 if (ptr[1] == CHAR_MINUS)
621 {
622 negated = TRUE;
623 ptr++;
624 }
625 else negated = FALSE;
626
627 c = 0;
628 while ((digitab[ptr[1]] & ctype_digit) != 0)
629 c = c * 10 + *(++ptr) - CHAR_0;
630
631 if (c < 0) /* Integer overflow */
632 {
633 *errorcodeptr = ERR61;
634 break;
635 }
636
637 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
638 {
639 *errorcodeptr = ERR57;
640 break;
641 }
642
643 if (c == 0)
644 {
645 *errorcodeptr = ERR58;
646 break;
647 }
648
649 if (negated)
650 {
651 if (c > bracount)
652 {
653 *errorcodeptr = ERR15;
654 break;
655 }
656 c = bracount - (c - 1);
657 }
658
659 c = -(ESC_REF + c);
660 break;
661
662 /* The handling of escape sequences consisting of a string of digits
663 starting with one that is not zero is not straightforward. By experiment,
664 the way Perl works seems to be as follows:
665
666 Outside a character class, the digits are read as a decimal number. If the
667 number is less than 10, or if there are that many previous extracting
668 left brackets, then it is a back reference. Otherwise, up to three octal
669 digits are read to form an escaped byte. Thus \123 is likely to be octal
670 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
671 value is greater than 377, the least significant 8 bits are taken. Inside a
672 character class, \ followed by a digit is always an octal number. */
673
674 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
675 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
676
677 if (!isclass)
678 {
679 oldptr = ptr;
680 c -= CHAR_0;
681 while ((digitab[ptr[1]] & ctype_digit) != 0)
682 c = c * 10 + *(++ptr) - CHAR_0;
683 if (c < 0) /* Integer overflow */
684 {
685 *errorcodeptr = ERR61;
686 break;
687 }
688 if (c < 10 || c <= bracount)
689 {
690 c = -(ESC_REF + c);
691 break;
692 }
693 ptr = oldptr; /* Put the pointer back and fall through */
694 }
695
696 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
697 generates a binary zero byte and treats the digit as a following literal.
698 Thus we have to pull back the pointer by one. */
699
700 if ((c = *ptr) >= CHAR_8)
701 {
702 ptr--;
703 c = 0;
704 break;
705 }
706
707 /* \0 always starts an octal number, but we may drop through to here with a
708 larger first octal digit. The original code used just to take the least
709 significant 8 bits of octal numbers (I think this is what early Perls used
710 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
711 than 3 octal digits. */
712
713 case CHAR_0:
714 c -= CHAR_0;
715 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
716 c = c * 8 + *(++ptr) - CHAR_0;
717 if (!utf8 && c > 255) *errorcodeptr = ERR51;
718 break;
719
720 /* \x is complicated. \x{ddd} is a character number which can be greater
721 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
722 treated as a data character. */
723
724 case CHAR_x:
725 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
726 {
727 const uschar *pt = ptr + 2;
728 int count = 0;
729
730 c = 0;
731 while ((digitab[*pt] & ctype_xdigit) != 0)
732 {
733 register int cc = *pt++;
734 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
735 count++;
736
737 #ifndef EBCDIC /* ASCII/UTF-8 coding */
738 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
739 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
740 #else /* EBCDIC coding */
741 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
742 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
743 #endif
744 }
745
746 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
747 {
748 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
749 ptr = pt;
750 break;
751 }
752
753 /* If the sequence of hex digits does not end with '}', then we don't
754 recognize this construct; fall through to the normal \x handling. */
755 }
756
757 /* Read just a single-byte hex-defined char */
758
759 c = 0;
760 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
761 {
762 int cc; /* Some compilers don't like */
763 cc = *(++ptr); /* ++ in initializers */
764 #ifndef EBCDIC /* ASCII/UTF-8 coding */
765 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
766 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
767 #else /* EBCDIC coding */
768 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
769 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
770 #endif
771 }
772 break;
773
774 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
775 This coding is ASCII-specific, but then the whole concept of \cx is
776 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
777
778 case CHAR_c:
779 c = *(++ptr);
780 if (c == 0)
781 {
782 *errorcodeptr = ERR2;
783 break;
784 }
785
786 #ifndef EBCDIC /* ASCII/UTF-8 coding */
787 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
788 c ^= 0x40;
789 #else /* EBCDIC coding */
790 if (c >= CHAR_a && c <= CHAR_z) c += 64;
791 c ^= 0xC0;
792 #endif
793 break;
794
795 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
796 other alphanumeric following \ is an error if PCRE_EXTRA was set;
797 otherwise, for Perl compatibility, it is a literal. This code looks a bit
798 odd, but there used to be some cases other than the default, and there may
799 be again in future, so I haven't "optimized" it. */
800
801 default:
802 if ((options & PCRE_EXTRA) != 0) switch(c)
803 {
804 default:
805 *errorcodeptr = ERR3;
806 break;
807 }
808 break;
809 }
810 }
811
812 *ptrptr = ptr;
813 return c;
814 }
815
816
817
818 #ifdef SUPPORT_UCP
819 /*************************************************
820 * Handle \P and \p *
821 *************************************************/
822
823 /* This function is called after \P or \p has been encountered, provided that
824 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
825 pointing at the P or p. On exit, it is pointing at the final character of the
826 escape sequence.
827
828 Argument:
829 ptrptr points to the pattern position pointer
830 negptr points to a boolean that is set TRUE for negation else FALSE
831 dptr points to an int that is set to the detailed property value
832 errorcodeptr points to the error code variable
833
834 Returns: type value from ucp_type_table, or -1 for an invalid type
835 */
836
837 static int
838 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
839 {
840 int c, i, bot, top;
841 const uschar *ptr = *ptrptr;
842 char name[32];
843
844 c = *(++ptr);
845 if (c == 0) goto ERROR_RETURN;
846
847 *negptr = FALSE;
848
849 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
850 negation. */
851
852 if (c == CHAR_LEFT_CURLY_BRACKET)
853 {
854 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
855 {
856 *negptr = TRUE;
857 ptr++;
858 }
859 for (i = 0; i < (int)sizeof(name) - 1; i++)
860 {
861 c = *(++ptr);
862 if (c == 0) goto ERROR_RETURN;
863 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
864 name[i] = c;
865 }
866 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
867 name[i] = 0;
868 }
869
870 /* Otherwise there is just one following character */
871
872 else
873 {
874 name[0] = c;
875 name[1] = 0;
876 }
877
878 *ptrptr = ptr;
879
880 /* Search for a recognized property name using binary chop */
881
882 bot = 0;
883 top = _pcre_utt_size;
884
885 while (bot < top)
886 {
887 i = (bot + top) >> 1;
888 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
889 if (c == 0)
890 {
891 *dptr = _pcre_utt[i].value;
892 return _pcre_utt[i].type;
893 }
894 if (c > 0) bot = i + 1; else top = i;
895 }
896
897 *errorcodeptr = ERR47;
898 *ptrptr = ptr;
899 return -1;
900
901 ERROR_RETURN:
902 *errorcodeptr = ERR46;
903 *ptrptr = ptr;
904 return -1;
905 }
906 #endif
907
908
909
910
911 /*************************************************
912 * Check for counted repeat *
913 *************************************************/
914
915 /* This function is called when a '{' is encountered in a place where it might
916 start a quantifier. It looks ahead to see if it really is a quantifier or not.
917 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
918 where the ddds are digits.
919
920 Arguments:
921 p pointer to the first char after '{'
922
923 Returns: TRUE or FALSE
924 */
925
926 static BOOL
927 is_counted_repeat(const uschar *p)
928 {
929 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
930 while ((digitab[*p] & ctype_digit) != 0) p++;
931 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
932
933 if (*p++ != CHAR_COMMA) return FALSE;
934 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935
936 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
937 while ((digitab[*p] & ctype_digit) != 0) p++;
938
939 return (*p == CHAR_RIGHT_CURLY_BRACKET);
940 }
941
942
943
944 /*************************************************
945 * Read repeat counts *
946 *************************************************/
947
948 /* Read an item of the form {n,m} and return the values. This is called only
949 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
950 so the syntax is guaranteed to be correct, but we need to check the values.
951
952 Arguments:
953 p pointer to first char after '{'
954 minp pointer to int for min
955 maxp pointer to int for max
956 returned as -1 if no max
957 errorcodeptr points to error code variable
958
959 Returns: pointer to '}' on success;
960 current ptr on error, with errorcodeptr set non-zero
961 */
962
963 static const uschar *
964 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
965 {
966 int min = 0;
967 int max = -1;
968
969 /* Read the minimum value and do a paranoid check: a negative value indicates
970 an integer overflow. */
971
972 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
973 if (min < 0 || min > 65535)
974 {
975 *errorcodeptr = ERR5;
976 return p;
977 }
978
979 /* Read the maximum value if there is one, and again do a paranoid on its size.
980 Also, max must not be less than min. */
981
982 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
983 {
984 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
985 {
986 max = 0;
987 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
988 if (max < 0 || max > 65535)
989 {
990 *errorcodeptr = ERR5;
991 return p;
992 }
993 if (max < min)
994 {
995 *errorcodeptr = ERR4;
996 return p;
997 }
998 }
999 }
1000
1001 /* Fill in the required variables, and pass back the pointer to the terminating
1002 '}'. */
1003
1004 *minp = min;
1005 *maxp = max;
1006 return p;
1007 }
1008
1009
1010
1011 /*************************************************
1012 * Subroutine for finding forward reference *
1013 *************************************************/
1014
1015 /* This recursive function is called only from find_parens() below. The
1016 top-level call starts at the beginning of the pattern. All other calls must
1017 start at a parenthesis. It scans along a pattern's text looking for capturing
1018 subpatterns, and counting them. If it finds a named pattern that matches the
1019 name it is given, it returns its number. Alternatively, if the name is NULL, it
1020 returns when it reaches a given numbered subpattern. We know that if (?P< is
1021 encountered, the name will be terminated by '>' because that is checked in the
1022 first pass. Recursion is used to keep track of subpatterns that reset the
1023 capturing group numbers - the (?| feature.
1024
1025 Arguments:
1026 ptrptr address of the current character pointer (updated)
1027 cd compile background data
1028 name name to seek, or NULL if seeking a numbered subpattern
1029 lorn name length, or subpattern number if name is NULL
1030 xmode TRUE if we are in /x mode
1031 count pointer to the current capturing subpattern number (updated)
1032
1033 Returns: the number of the named subpattern, or -1 if not found
1034 */
1035
1036 static int
1037 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1038 BOOL xmode, int *count)
1039 {
1040 uschar *ptr = *ptrptr;
1041 int start_count = *count;
1042 int hwm_count = start_count;
1043 BOOL dup_parens = FALSE;
1044
1045 /* If the first character is a parenthesis, check on the type of group we are
1046 dealing with. The very first call may not start with a parenthesis. */
1047
1048 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1049 {
1050 if (ptr[1] == CHAR_QUESTION_MARK &&
1051 ptr[2] == CHAR_VERTICAL_LINE)
1052 {
1053 ptr += 3;
1054 dup_parens = TRUE;
1055 }
1056
1057 /* Handle a normal, unnamed capturing parenthesis */
1058
1059 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1060 {
1061 *count += 1;
1062 if (name == NULL && *count == lorn) return *count;
1063 ptr++;
1064 }
1065
1066 /* Handle a condition. If it is an assertion, just carry on so that it
1067 is processed as normal. If not, skip to the closing parenthesis of the
1068 condition (there can't be any nested parens. */
1069
1070 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1071 {
1072 ptr += 2;
1073 if (ptr[1] != CHAR_QUESTION_MARK)
1074 {
1075 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1076 if (*ptr != 0) ptr++;
1077 }
1078 }
1079
1080 /* We have either (? or (* and not a condition */
1081
1082 else
1083 {
1084 ptr += 2;
1085 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1086
1087 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1088
1089 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1090 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1091 {
1092 int term;
1093 const uschar *thisname;
1094 *count += 1;
1095 if (name == NULL && *count == lorn) return *count;
1096 term = *ptr++;
1097 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1098 thisname = ptr;
1099 while (*ptr != term) ptr++;
1100 if (name != NULL && lorn == ptr - thisname &&
1101 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1102 return *count;
1103 term++;
1104 }
1105 }
1106 }
1107
1108 /* Past any initial parenthesis handling, scan for parentheses or vertical
1109 bars. */
1110
1111 for (; *ptr != 0; ptr++)
1112 {
1113 /* Skip over backslashed characters and also entire \Q...\E */
1114
1115 if (*ptr == CHAR_BACKSLASH)
1116 {
1117 if (*(++ptr) == 0) goto FAIL_EXIT;
1118 if (*ptr == CHAR_Q) for (;;)
1119 {
1120 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1121 if (*ptr == 0) goto FAIL_EXIT;
1122 if (*(++ptr) == CHAR_E) break;
1123 }
1124 continue;
1125 }
1126
1127 /* Skip over character classes; this logic must be similar to the way they
1128 are handled for real. If the first character is '^', skip it. Also, if the
1129 first few characters (either before or after ^) are \Q\E or \E we skip them
1130 too. This makes for compatibility with Perl. Note the use of STR macros to
1131 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1132
1133 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1134 {
1135 BOOL negate_class = FALSE;
1136 for (;;)
1137 {
1138 if (ptr[1] == CHAR_BACKSLASH)
1139 {
1140 if (ptr[2] == CHAR_E)
1141 ptr+= 2;
1142 else if (strncmp((const char *)ptr+2,
1143 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1144 ptr += 4;
1145 else
1146 break;
1147 }
1148 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1149 {
1150 negate_class = TRUE;
1151 ptr++;
1152 }
1153 else break;
1154 }
1155
1156 /* If the next character is ']', it is a data character that must be
1157 skipped, except in JavaScript compatibility mode. */
1158
1159 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1160 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1161 ptr++;
1162
1163 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1164 {
1165 if (*ptr == 0) return -1;
1166 if (*ptr == CHAR_BACKSLASH)
1167 {
1168 if (*(++ptr) == 0) goto FAIL_EXIT;
1169 if (*ptr == CHAR_Q) for (;;)
1170 {
1171 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1172 if (*ptr == 0) goto FAIL_EXIT;
1173 if (*(++ptr) == CHAR_E) break;
1174 }
1175 continue;
1176 }
1177 }
1178 continue;
1179 }
1180
1181 /* Skip comments in /x mode */
1182
1183 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1184 {
1185 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1186 if (*ptr == 0) goto FAIL_EXIT;
1187 continue;
1188 }
1189
1190 /* Check for the special metacharacters */
1191
1192 if (*ptr == CHAR_LEFT_PARENTHESIS)
1193 {
1194 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1195 if (rc > 0) return rc;
1196 if (*ptr == 0) goto FAIL_EXIT;
1197 }
1198
1199 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1200 {
1201 if (dup_parens && *count < hwm_count) *count = hwm_count;
1202 *ptrptr = ptr;
1203 return -1;
1204 }
1205
1206 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1207 {
1208 if (*count > hwm_count) hwm_count = *count;
1209 *count = start_count;
1210 }
1211 }
1212
1213 FAIL_EXIT:
1214 *ptrptr = ptr;
1215 return -1;
1216 }
1217
1218
1219
1220
1221 /*************************************************
1222 * Find forward referenced subpattern *
1223 *************************************************/
1224
1225 /* This function scans along a pattern's text looking for capturing
1226 subpatterns, and counting them. If it finds a named pattern that matches the
1227 name it is given, it returns its number. Alternatively, if the name is NULL, it
1228 returns when it reaches a given numbered subpattern. This is used for forward
1229 references to subpatterns. We used to be able to start this scan from the
1230 current compiling point, using the current count value from cd->bracount, and
1231 do it all in a single loop, but the addition of the possibility of duplicate
1232 subpattern numbers means that we have to scan from the very start, in order to
1233 take account of such duplicates, and to use a recursive function to keep track
1234 of the different types of group.
1235
1236 Arguments:
1237 cd compile background data
1238 name name to seek, or NULL if seeking a numbered subpattern
1239 lorn name length, or subpattern number if name is NULL
1240 xmode TRUE if we are in /x mode
1241
1242 Returns: the number of the found subpattern, or -1 if not found
1243 */
1244
1245 static int
1246 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1247 {
1248 uschar *ptr = (uschar *)cd->start_pattern;
1249 int count = 0;
1250 int rc;
1251
1252 /* If the pattern does not start with an opening parenthesis, the first call
1253 to find_parens_sub() will scan right to the end (if necessary). However, if it
1254 does start with a parenthesis, find_parens_sub() will return when it hits the
1255 matching closing parens. That is why we have to have a loop. */
1256
1257 for (;;)
1258 {
1259 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1260 if (rc > 0 || *ptr++ == 0) break;
1261 }
1262
1263 return rc;
1264 }
1265
1266
1267
1268
1269 /*************************************************
1270 * Find first significant op code *
1271 *************************************************/
1272
1273 /* This is called by several functions that scan a compiled expression looking
1274 for a fixed first character, or an anchoring op code etc. It skips over things
1275 that do not influence this. For some calls, a change of option is important.
1276 For some calls, it makes sense to skip negative forward and all backward
1277 assertions, and also the \b assertion; for others it does not.
1278
1279 Arguments:
1280 code pointer to the start of the group
1281 options pointer to external options
1282 optbit the option bit whose changing is significant, or
1283 zero if none are
1284 skipassert TRUE if certain assertions are to be skipped
1285
1286 Returns: pointer to the first significant opcode
1287 */
1288
1289 static const uschar*
1290 first_significant_code(const uschar *code, int *options, int optbit,
1291 BOOL skipassert)
1292 {
1293 for (;;)
1294 {
1295 switch ((int)*code)
1296 {
1297 case OP_OPT:
1298 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1299 *options = (int)code[1];
1300 code += 2;
1301 break;
1302
1303 case OP_ASSERT_NOT:
1304 case OP_ASSERTBACK:
1305 case OP_ASSERTBACK_NOT:
1306 if (!skipassert) return code;
1307 do code += GET(code, 1); while (*code == OP_ALT);
1308 code += _pcre_OP_lengths[*code];
1309 break;
1310
1311 case OP_WORD_BOUNDARY:
1312 case OP_NOT_WORD_BOUNDARY:
1313 if (!skipassert) return code;
1314 /* Fall through */
1315
1316 case OP_CALLOUT:
1317 case OP_CREF:
1318 case OP_RREF:
1319 case OP_DEF:
1320 code += _pcre_OP_lengths[*code];
1321 break;
1322
1323 default:
1324 return code;
1325 }
1326 }
1327 /* Control never reaches here */
1328 }
1329
1330
1331
1332
1333 /*************************************************
1334 * Find the fixed length of a pattern *
1335 *************************************************/
1336
1337 /* Scan a pattern and compute the fixed length of subject that will match it,
1338 if the length is fixed. This is needed for dealing with backward assertions.
1339 In UTF8 mode, the result is in characters rather than bytes.
1340
1341 Arguments:
1342 code points to the start of the pattern (the bracket)
1343 options the compiling options
1344
1345 Returns: the fixed length, or -1 if there is no fixed length,
1346 or -2 if \C was encountered
1347 */
1348
1349 static int
1350 find_fixedlength(uschar *code, int options)
1351 {
1352 int length = -1;
1353
1354 register int branchlength = 0;
1355 register uschar *cc = code + 1 + LINK_SIZE;
1356
1357 /* Scan along the opcodes for this branch. If we get to the end of the
1358 branch, check the length against that of the other branches. */
1359
1360 for (;;)
1361 {
1362 int d;
1363 register int op = *cc;
1364 switch (op)
1365 {
1366 case OP_CBRA:
1367 case OP_BRA:
1368 case OP_ONCE:
1369 case OP_COND:
1370 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1371 if (d < 0) return d;
1372 branchlength += d;
1373 do cc += GET(cc, 1); while (*cc == OP_ALT);
1374 cc += 1 + LINK_SIZE;
1375 break;
1376
1377 /* Reached end of a branch; if it's a ket it is the end of a nested
1378 call. If it's ALT it is an alternation in a nested call. If it is
1379 END it's the end of the outer call. All can be handled by the same code. */
1380
1381 case OP_ALT:
1382 case OP_KET:
1383 case OP_KETRMAX:
1384 case OP_KETRMIN:
1385 case OP_END:
1386 if (length < 0) length = branchlength;
1387 else if (length != branchlength) return -1;
1388 if (*cc != OP_ALT) return length;
1389 cc += 1 + LINK_SIZE;
1390 branchlength = 0;
1391 break;
1392
1393 /* Skip over assertive subpatterns */
1394
1395 case OP_ASSERT:
1396 case OP_ASSERT_NOT:
1397 case OP_ASSERTBACK:
1398 case OP_ASSERTBACK_NOT:
1399 do cc += GET(cc, 1); while (*cc == OP_ALT);
1400 /* Fall through */
1401
1402 /* Skip over things that don't match chars */
1403
1404 case OP_REVERSE:
1405 case OP_CREF:
1406 case OP_RREF:
1407 case OP_DEF:
1408 case OP_OPT:
1409 case OP_CALLOUT:
1410 case OP_SOD:
1411 case OP_SOM:
1412 case OP_EOD:
1413 case OP_EODN:
1414 case OP_CIRC:
1415 case OP_DOLL:
1416 case OP_NOT_WORD_BOUNDARY:
1417 case OP_WORD_BOUNDARY:
1418 cc += _pcre_OP_lengths[*cc];
1419 break;
1420
1421 /* Handle literal characters */
1422
1423 case OP_CHAR:
1424 case OP_CHARNC:
1425 case OP_NOT:
1426 branchlength++;
1427 cc += 2;
1428 #ifdef SUPPORT_UTF8
1429 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1430 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1431 #endif
1432 break;
1433
1434 /* Handle exact repetitions. The count is already in characters, but we
1435 need to skip over a multibyte character in UTF8 mode. */
1436
1437 case OP_EXACT:
1438 branchlength += GET2(cc,1);
1439 cc += 4;
1440 #ifdef SUPPORT_UTF8
1441 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1442 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1443 #endif
1444 break;
1445
1446 case OP_TYPEEXACT:
1447 branchlength += GET2(cc,1);
1448 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1449 cc += 4;
1450 break;
1451
1452 /* Handle single-char matchers */
1453
1454 case OP_PROP:
1455 case OP_NOTPROP:
1456 cc += 2;
1457 /* Fall through */
1458
1459 case OP_NOT_DIGIT:
1460 case OP_DIGIT:
1461 case OP_NOT_WHITESPACE:
1462 case OP_WHITESPACE:
1463 case OP_NOT_WORDCHAR:
1464 case OP_WORDCHAR:
1465 case OP_ANY:
1466 case OP_ALLANY:
1467 branchlength++;
1468 cc++;
1469 break;
1470
1471 /* The single-byte matcher isn't allowed */
1472
1473 case OP_ANYBYTE:
1474 return -2;
1475
1476 /* Check a class for variable quantification */
1477
1478 #ifdef SUPPORT_UTF8
1479 case OP_XCLASS:
1480 cc += GET(cc, 1) - 33;
1481 /* Fall through */
1482 #endif
1483
1484 case OP_CLASS:
1485 case OP_NCLASS:
1486 cc += 33;
1487
1488 switch (*cc)
1489 {
1490 case OP_CRSTAR:
1491 case OP_CRMINSTAR:
1492 case OP_CRQUERY:
1493 case OP_CRMINQUERY:
1494 return -1;
1495
1496 case OP_CRRANGE:
1497 case OP_CRMINRANGE:
1498 if (GET2(cc,1) != GET2(cc,3)) return -1;
1499 branchlength += GET2(cc,1);
1500 cc += 5;
1501 break;
1502
1503 default:
1504 branchlength++;
1505 }
1506 break;
1507
1508 /* Anything else is variable length */
1509
1510 default:
1511 return -1;
1512 }
1513 }
1514 /* Control never gets here */
1515 }
1516
1517
1518
1519
1520 /*************************************************
1521 * Scan compiled regex for numbered bracket *
1522 *************************************************/
1523
1524 /* This little function scans through a compiled pattern until it finds a
1525 capturing bracket with the given number.
1526
1527 Arguments:
1528 code points to start of expression
1529 utf8 TRUE in UTF-8 mode
1530 number the required bracket number
1531
1532 Returns: pointer to the opcode for the bracket, or NULL if not found
1533 */
1534
1535 static const uschar *
1536 find_bracket(const uschar *code, BOOL utf8, int number)
1537 {
1538 for (;;)
1539 {
1540 register int c = *code;
1541 if (c == OP_END) return NULL;
1542
1543 /* XCLASS is used for classes that cannot be represented just by a bit
1544 map. This includes negated single high-valued characters. The length in
1545 the table is zero; the actual length is stored in the compiled code. */
1546
1547 if (c == OP_XCLASS) code += GET(code, 1);
1548
1549 /* Handle capturing bracket */
1550
1551 else if (c == OP_CBRA)
1552 {
1553 int n = GET2(code, 1+LINK_SIZE);
1554 if (n == number) return (uschar *)code;
1555 code += _pcre_OP_lengths[c];
1556 }
1557
1558 /* Otherwise, we can get the item's length from the table, except that for
1559 repeated character types, we have to test for \p and \P, which have an extra
1560 two bytes of parameters. */
1561
1562 else
1563 {
1564 switch(c)
1565 {
1566 case OP_TYPESTAR:
1567 case OP_TYPEMINSTAR:
1568 case OP_TYPEPLUS:
1569 case OP_TYPEMINPLUS:
1570 case OP_TYPEQUERY:
1571 case OP_TYPEMINQUERY:
1572 case OP_TYPEPOSSTAR:
1573 case OP_TYPEPOSPLUS:
1574 case OP_TYPEPOSQUERY:
1575 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1576 break;
1577
1578 case OP_TYPEUPTO:
1579 case OP_TYPEMINUPTO:
1580 case OP_TYPEEXACT:
1581 case OP_TYPEPOSUPTO:
1582 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1583 break;
1584 }
1585
1586 /* Add in the fixed length from the table */
1587
1588 code += _pcre_OP_lengths[c];
1589
1590 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1591 a multi-byte character. The length in the table is a minimum, so we have to
1592 arrange to skip the extra bytes. */
1593
1594 #ifdef SUPPORT_UTF8
1595 if (utf8) switch(c)
1596 {
1597 case OP_CHAR:
1598 case OP_CHARNC:
1599 case OP_EXACT:
1600 case OP_UPTO:
1601 case OP_MINUPTO:
1602 case OP_POSUPTO:
1603 case OP_STAR:
1604 case OP_MINSTAR:
1605 case OP_POSSTAR:
1606 case OP_PLUS:
1607 case OP_MINPLUS:
1608 case OP_POSPLUS:
1609 case OP_QUERY:
1610 case OP_MINQUERY:
1611 case OP_POSQUERY:
1612 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1613 break;
1614 }
1615 #else
1616 (void)(utf8); /* Keep compiler happy by referencing function argument */
1617 #endif
1618 }
1619 }
1620 }
1621
1622
1623
1624 /*************************************************
1625 * Scan compiled regex for recursion reference *
1626 *************************************************/
1627
1628 /* This little function scans through a compiled pattern until it finds an
1629 instance of OP_RECURSE.
1630
1631 Arguments:
1632 code points to start of expression
1633 utf8 TRUE in UTF-8 mode
1634
1635 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1636 */
1637
1638 static const uschar *
1639 find_recurse(const uschar *code, BOOL utf8)
1640 {
1641 for (;;)
1642 {
1643 register int c = *code;
1644 if (c == OP_END) return NULL;
1645 if (c == OP_RECURSE) return code;
1646
1647 /* XCLASS is used for classes that cannot be represented just by a bit
1648 map. This includes negated single high-valued characters. The length in
1649 the table is zero; the actual length is stored in the compiled code. */
1650
1651 if (c == OP_XCLASS) code += GET(code, 1);
1652
1653 /* Otherwise, we can get the item's length from the table, except that for
1654 repeated character types, we have to test for \p and \P, which have an extra
1655 two bytes of parameters. */
1656
1657 else
1658 {
1659 switch(c)
1660 {
1661 case OP_TYPESTAR:
1662 case OP_TYPEMINSTAR:
1663 case OP_TYPEPLUS:
1664 case OP_TYPEMINPLUS:
1665 case OP_TYPEQUERY:
1666 case OP_TYPEMINQUERY:
1667 case OP_TYPEPOSSTAR:
1668 case OP_TYPEPOSPLUS:
1669 case OP_TYPEPOSQUERY:
1670 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1671 break;
1672
1673 case OP_TYPEPOSUPTO:
1674 case OP_TYPEUPTO:
1675 case OP_TYPEMINUPTO:
1676 case OP_TYPEEXACT:
1677 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1678 break;
1679 }
1680
1681 /* Add in the fixed length from the table */
1682
1683 code += _pcre_OP_lengths[c];
1684
1685 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1686 by a multi-byte character. The length in the table is a minimum, so we have
1687 to arrange to skip the extra bytes. */
1688
1689 #ifdef SUPPORT_UTF8
1690 if (utf8) switch(c)
1691 {
1692 case OP_CHAR:
1693 case OP_CHARNC:
1694 case OP_EXACT:
1695 case OP_UPTO:
1696 case OP_MINUPTO:
1697 case OP_POSUPTO:
1698 case OP_STAR:
1699 case OP_MINSTAR:
1700 case OP_POSSTAR:
1701 case OP_PLUS:
1702 case OP_MINPLUS:
1703 case OP_POSPLUS:
1704 case OP_QUERY:
1705 case OP_MINQUERY:
1706 case OP_POSQUERY:
1707 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1708 break;
1709 }
1710 #else
1711 (void)(utf8); /* Keep compiler happy by referencing function argument */
1712 #endif
1713 }
1714 }
1715 }
1716
1717
1718
1719 /*************************************************
1720 * Scan compiled branch for non-emptiness *
1721 *************************************************/
1722
1723 /* This function scans through a branch of a compiled pattern to see whether it
1724 can match the empty string or not. It is called from could_be_empty()
1725 below and from compile_branch() when checking for an unlimited repeat of a
1726 group that can match nothing. Note that first_significant_code() skips over
1727 backward and negative forward assertions when its final argument is TRUE. If we
1728 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1729 bracket whose current branch will already have been scanned.
1730
1731 Arguments:
1732 code points to start of search
1733 endcode points to where to stop
1734 utf8 TRUE if in UTF8 mode
1735
1736 Returns: TRUE if what is matched could be empty
1737 */
1738
1739 static BOOL
1740 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1741 {
1742 register int c;
1743 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1744 code < endcode;
1745 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1746 {
1747 const uschar *ccode;
1748
1749 c = *code;
1750
1751 /* Skip over forward assertions; the other assertions are skipped by
1752 first_significant_code() with a TRUE final argument. */
1753
1754 if (c == OP_ASSERT)
1755 {
1756 do code += GET(code, 1); while (*code == OP_ALT);
1757 c = *code;
1758 continue;
1759 }
1760
1761 /* Groups with zero repeats can of course be empty; skip them. */
1762
1763 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1764 {
1765 code += _pcre_OP_lengths[c];
1766 do code += GET(code, 1); while (*code == OP_ALT);
1767 c = *code;
1768 continue;
1769 }
1770
1771 /* For other groups, scan the branches. */
1772
1773 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1774 {
1775 BOOL empty_branch;
1776 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1777
1778 /* If a conditional group has only one branch, there is a second, implied,
1779 empty branch, so just skip over the conditional, because it could be empty.
1780 Otherwise, scan the individual branches of the group. */
1781
1782 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1783 code += GET(code, 1);
1784 else
1785 {
1786 empty_branch = FALSE;
1787 do
1788 {
1789 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1790 empty_branch = TRUE;
1791 code += GET(code, 1);
1792 }
1793 while (*code == OP_ALT);
1794 if (!empty_branch) return FALSE; /* All branches are non-empty */
1795 }
1796
1797 c = *code;
1798 continue;
1799 }
1800
1801 /* Handle the other opcodes */
1802
1803 switch (c)
1804 {
1805 /* Check for quantifiers after a class. XCLASS is used for classes that
1806 cannot be represented just by a bit map. This includes negated single
1807 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1808 actual length is stored in the compiled code, so we must update "code"
1809 here. */
1810
1811 #ifdef SUPPORT_UTF8
1812 case OP_XCLASS:
1813 ccode = code += GET(code, 1);
1814 goto CHECK_CLASS_REPEAT;
1815 #endif
1816
1817 case OP_CLASS:
1818 case OP_NCLASS:
1819 ccode = code + 33;
1820
1821 #ifdef SUPPORT_UTF8
1822 CHECK_CLASS_REPEAT:
1823 #endif
1824
1825 switch (*ccode)
1826 {
1827 case OP_CRSTAR: /* These could be empty; continue */
1828 case OP_CRMINSTAR:
1829 case OP_CRQUERY:
1830 case OP_CRMINQUERY:
1831 break;
1832
1833 default: /* Non-repeat => class must match */
1834 case OP_CRPLUS: /* These repeats aren't empty */
1835 case OP_CRMINPLUS:
1836 return FALSE;
1837
1838 case OP_CRRANGE:
1839 case OP_CRMINRANGE:
1840 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1841 break;
1842 }
1843 break;
1844
1845 /* Opcodes that must match a character */
1846
1847 case OP_PROP:
1848 case OP_NOTPROP:
1849 case OP_EXTUNI:
1850 case OP_NOT_DIGIT:
1851 case OP_DIGIT:
1852 case OP_NOT_WHITESPACE:
1853 case OP_WHITESPACE:
1854 case OP_NOT_WORDCHAR:
1855 case OP_WORDCHAR:
1856 case OP_ANY:
1857 case OP_ALLANY:
1858 case OP_ANYBYTE:
1859 case OP_CHAR:
1860 case OP_CHARNC:
1861 case OP_NOT:
1862 case OP_PLUS:
1863 case OP_MINPLUS:
1864 case OP_POSPLUS:
1865 case OP_EXACT:
1866 case OP_NOTPLUS:
1867 case OP_NOTMINPLUS:
1868 case OP_NOTPOSPLUS:
1869 case OP_NOTEXACT:
1870 case OP_TYPEPLUS:
1871 case OP_TYPEMINPLUS:
1872 case OP_TYPEPOSPLUS:
1873 case OP_TYPEEXACT:
1874 return FALSE;
1875
1876 /* These are going to continue, as they may be empty, but we have to
1877 fudge the length for the \p and \P cases. */
1878
1879 case OP_TYPESTAR:
1880 case OP_TYPEMINSTAR:
1881 case OP_TYPEPOSSTAR:
1882 case OP_TYPEQUERY:
1883 case OP_TYPEMINQUERY:
1884 case OP_TYPEPOSQUERY:
1885 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1886 break;
1887
1888 /* Same for these */
1889
1890 case OP_TYPEUPTO:
1891 case OP_TYPEMINUPTO:
1892 case OP_TYPEPOSUPTO:
1893 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1894 break;
1895
1896 /* End of branch */
1897
1898 case OP_KET:
1899 case OP_KETRMAX:
1900 case OP_KETRMIN:
1901 case OP_ALT:
1902 return TRUE;
1903
1904 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1905 MINUPTO, and POSUPTO may be followed by a multibyte character */
1906
1907 #ifdef SUPPORT_UTF8
1908 case OP_STAR:
1909 case OP_MINSTAR:
1910 case OP_POSSTAR:
1911 case OP_QUERY:
1912 case OP_MINQUERY:
1913 case OP_POSQUERY:
1914 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1915 break;
1916
1917 case OP_UPTO:
1918 case OP_MINUPTO:
1919 case OP_POSUPTO:
1920 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1921 break;
1922 #endif
1923 }
1924 }
1925
1926 return TRUE;
1927 }
1928
1929
1930
1931 /*************************************************
1932 * Scan compiled regex for non-emptiness *
1933 *************************************************/
1934
1935 /* This function is called to check for left recursive calls. We want to check
1936 the current branch of the current pattern to see if it could match the empty
1937 string. If it could, we must look outwards for branches at other levels,
1938 stopping when we pass beyond the bracket which is the subject of the recursion.
1939
1940 Arguments:
1941 code points to start of the recursion
1942 endcode points to where to stop (current RECURSE item)
1943 bcptr points to the chain of current (unclosed) branch starts
1944 utf8 TRUE if in UTF-8 mode
1945
1946 Returns: TRUE if what is matched could be empty
1947 */
1948
1949 static BOOL
1950 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1951 BOOL utf8)
1952 {
1953 while (bcptr != NULL && bcptr->current >= code)
1954 {
1955 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1956 bcptr = bcptr->outer;
1957 }
1958 return TRUE;
1959 }
1960
1961
1962
1963 /*************************************************
1964 * Check for POSIX class syntax *
1965 *************************************************/
1966
1967 /* This function is called when the sequence "[:" or "[." or "[=" is
1968 encountered in a character class. It checks whether this is followed by a
1969 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1970 reach an unescaped ']' without the special preceding character, return FALSE.
1971
1972 Originally, this function only recognized a sequence of letters between the
1973 terminators, but it seems that Perl recognizes any sequence of characters,
1974 though of course unknown POSIX names are subsequently rejected. Perl gives an
1975 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1976 didn't consider this to be a POSIX class. Likewise for [:1234:].
1977
1978 The problem in trying to be exactly like Perl is in the handling of escapes. We
1979 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1980 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1981 below handles the special case of \], but does not try to do any other escape
1982 processing. This makes it different from Perl for cases such as [:l\ower:]
1983 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1984 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1985 I think.
1986
1987 Arguments:
1988 ptr pointer to the initial [
1989 endptr where to return the end pointer
1990
1991 Returns: TRUE or FALSE
1992 */
1993
1994 static BOOL
1995 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1996 {
1997 int terminator; /* Don't combine these lines; the Solaris cc */
1998 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1999 for (++ptr; *ptr != 0; ptr++)
2000 {
2001 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2002 {
2003 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2004 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2005 {
2006 *endptr = ptr;
2007 return TRUE;
2008 }
2009 }
2010 }
2011 return FALSE;
2012 }
2013
2014
2015
2016
2017 /*************************************************
2018 * Check POSIX class name *
2019 *************************************************/
2020
2021 /* This function is called to check the name given in a POSIX-style class entry
2022 such as [:alnum:].
2023
2024 Arguments:
2025 ptr points to the first letter
2026 len the length of the name
2027
2028 Returns: a value representing the name, or -1 if unknown
2029 */
2030
2031 static int
2032 check_posix_name(const uschar *ptr, int len)
2033 {
2034 const char *pn = posix_names;
2035 register int yield = 0;
2036 while (posix_name_lengths[yield] != 0)
2037 {
2038 if (len == posix_name_lengths[yield] &&
2039 strncmp((const char *)ptr, pn, len) == 0) return yield;
2040 pn += posix_name_lengths[yield] + 1;
2041 yield++;
2042 }
2043 return -1;
2044 }
2045
2046
2047 /*************************************************
2048 * Adjust OP_RECURSE items in repeated group *
2049 *************************************************/
2050
2051 /* OP_RECURSE items contain an offset from the start of the regex to the group
2052 that is referenced. This means that groups can be replicated for fixed
2053 repetition simply by copying (because the recursion is allowed to refer to
2054 earlier groups that are outside the current group). However, when a group is
2055 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2056 inserted before it, after it has been compiled. This means that any OP_RECURSE
2057 items within it that refer to the group itself or any contained groups have to
2058 have their offsets adjusted. That one of the jobs of this function. Before it
2059 is called, the partially compiled regex must be temporarily terminated with
2060 OP_END.
2061
2062 This function has been extended with the possibility of forward references for
2063 recursions and subroutine calls. It must also check the list of such references
2064 for the group we are dealing with. If it finds that one of the recursions in
2065 the current group is on this list, it adjusts the offset in the list, not the
2066 value in the reference (which is a group number).
2067
2068 Arguments:
2069 group points to the start of the group
2070 adjust the amount by which the group is to be moved
2071 utf8 TRUE in UTF-8 mode
2072 cd contains pointers to tables etc.
2073 save_hwm the hwm forward reference pointer at the start of the group
2074
2075 Returns: nothing
2076 */
2077
2078 static void
2079 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2080 uschar *save_hwm)
2081 {
2082 uschar *ptr = group;
2083
2084 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2085 {
2086 int offset;
2087 uschar *hc;
2088
2089 /* See if this recursion is on the forward reference list. If so, adjust the
2090 reference. */
2091
2092 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2093 {
2094 offset = GET(hc, 0);
2095 if (cd->start_code + offset == ptr + 1)
2096 {
2097 PUT(hc, 0, offset + adjust);
2098 break;
2099 }
2100 }
2101
2102 /* Otherwise, adjust the recursion offset if it's after the start of this
2103 group. */
2104
2105 if (hc >= cd->hwm)
2106 {
2107 offset = GET(ptr, 1);
2108 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2109 }
2110
2111 ptr += 1 + LINK_SIZE;
2112 }
2113 }
2114
2115
2116
2117 /*************************************************
2118 * Insert an automatic callout point *
2119 *************************************************/
2120
2121 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2122 callout points before each pattern item.
2123
2124 Arguments:
2125 code current code pointer
2126 ptr current pattern pointer
2127 cd pointers to tables etc
2128
2129 Returns: new code pointer
2130 */
2131
2132 static uschar *
2133 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2134 {
2135 *code++ = OP_CALLOUT;
2136 *code++ = 255;
2137 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2138 PUT(code, LINK_SIZE, 0); /* Default length */
2139 return code + 2*LINK_SIZE;
2140 }
2141
2142
2143
2144 /*************************************************
2145 * Complete a callout item *
2146 *************************************************/
2147
2148 /* A callout item contains the length of the next item in the pattern, which
2149 we can't fill in till after we have reached the relevant point. This is used
2150 for both automatic and manual callouts.
2151
2152 Arguments:
2153 previous_callout points to previous callout item
2154 ptr current pattern pointer
2155 cd pointers to tables etc
2156
2157 Returns: nothing
2158 */
2159
2160 static void
2161 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2162 {
2163 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2164 PUT(previous_callout, 2 + LINK_SIZE, length);
2165 }
2166
2167
2168
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address. When FALSE is returned, none of the output locations has been
written.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int range_first = 0;
unsigned int range_last;

/* Find the first character in ch..d whose other case differs from itself. */

while (ch <= d)
  {
  range_first = UCD_OTHERCASE(ch);
  if (range_first != ch) break;
  ch++;
  }

if (ch > d) return FALSE;

*ocptr = range_first;

/* Extend the range for as long as successive characters have successive
other-case values. */

range_last = range_first;
ch++;
while (ch <= d && UCD_OTHERCASE(ch) == range_last + 1)
  {
  range_last++;
  ch++;
  }

*odptr = range_last;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2214
2215
2216
2217 /*************************************************
2218 * Check if auto-possessifying is possible *
2219 *************************************************/
2220
2221 /* This function is called for unlimited repeats of certain items, to see
2222 whether the next thing could possibly match the repeated item. If not, it makes
2223 sense to automatically possessify the repeated item.
2224
2225 Arguments:
2226 op_code the repeated op code
2227 this data for this item, depends on the opcode
2228 utf8 TRUE in UTF-8 mode
2229 utf8_char used for utf8 character bytes, NULL if not relevant
2230 ptr next character in pattern
2231 options options bits
2232 cd contains pointers to tables etc.
2233
2234 Returns: TRUE if possessifying is wanted
2235 */
2236
2237 static BOOL
2238 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2239 const uschar *ptr, int options, compile_data *cd)
2240 {
2241 int next;
2242
2243 /* Skip whitespace and comments in extended mode */
2244
2245 if ((options & PCRE_EXTENDED) != 0)
2246 {
2247 for (;;)
2248 {
2249 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2250 if (*ptr == CHAR_NUMBER_SIGN)
2251 {
2252 while (*(++ptr) != 0)
2253 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2254 }
2255 else break;
2256 }
2257 }
2258
2259 /* If the next item is one that we can handle, get its value. A non-negative
2260 value is a character, a negative value is an escape value. */
2261
2262 if (*ptr == CHAR_BACKSLASH)
2263 {
2264 int temperrorcode = 0;
2265 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2266 if (temperrorcode != 0) return FALSE;
2267 ptr++; /* Point after the escape sequence */
2268 }
2269
2270 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2271 {
2272 #ifdef SUPPORT_UTF8
2273 if (utf8) { GETCHARINC(next, ptr); } else
2274 #endif
2275 next = *ptr++;
2276 }
2277
2278 else return FALSE;
2279
2280 /* Skip whitespace and comments in extended mode */
2281
2282 if ((options & PCRE_EXTENDED) != 0)
2283 {
2284 for (;;)
2285 {
2286 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2287 if (*ptr == CHAR_NUMBER_SIGN)
2288 {
2289 while (*(++ptr) != 0)
2290 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2291 }
2292 else break;
2293 }
2294 }
2295
2296 /* If the next thing is itself optional, we have to give up. */
2297
2298 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2299 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2300 return FALSE;
2301
2302 /* Now compare the next item with the previous opcode. If the previous is a
2303 positive single character match, "item" either contains the character or, if
2304 "item" is greater than 127 in utf8 mode, the character's bytes are in
2305 utf8_char. */
2306
2307
2308 /* Handle cases when the next item is a character. */
2309
2310 if (next >= 0) switch(op_code)
2311 {
2312 case OP_CHAR:
2313 #ifdef SUPPORT_UTF8
2314 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2315 #else
2316 (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2317 #endif
2318 return item != next;
2319
2320 /* For CHARNC (caseless character) we must check the other case. If we have
2321 Unicode property support, we can use it to test the other case of
2322 high-valued characters. */
2323
2324 case OP_CHARNC:
2325 #ifdef SUPPORT_UTF8
2326 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2327 #endif
2328 if (item == next) return FALSE;
2329 #ifdef SUPPORT_UTF8
2330 if (utf8)
2331 {
2332 unsigned int othercase;
2333 if (next < 128) othercase = cd->fcc[next]; else
2334 #ifdef SUPPORT_UCP
2335 othercase = UCD_OTHERCASE((unsigned int)next);
2336 #else
2337 othercase = NOTACHAR;
2338 #endif
2339 return (unsigned int)item != othercase;
2340 }
2341 else
2342 #endif /* SUPPORT_UTF8 */
2343 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2344
2345 /* For OP_NOT, "item" must be a single-byte character. */
2346
2347 case OP_NOT:
2348 if (item == next) return TRUE;
2349 if ((options & PCRE_CASELESS) == 0) return FALSE;
2350 #ifdef SUPPORT_UTF8
2351 if (utf8)
2352 {
2353 unsigned int othercase;
2354 if (next < 128) othercase = cd->fcc[next]; else
2355 #ifdef SUPPORT_UCP
2356 othercase = UCD_OTHERCASE(next);
2357 #else
2358 othercase = NOTACHAR;
2359 #endif
2360 return (unsigned int)item == othercase;
2361 }
2362 else
2363 #endif /* SUPPORT_UTF8 */
2364 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2365
2366 case OP_DIGIT:
2367 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2368
2369 case OP_NOT_DIGIT:
2370 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2371
2372 case OP_WHITESPACE:
2373 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2374
2375 case OP_NOT_WHITESPACE:
2376 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2377
2378 case OP_WORDCHAR:
2379 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2380
2381 case OP_NOT_WORDCHAR:
2382 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2383
2384 case OP_HSPACE:
2385 case OP_NOT_HSPACE:
2386 switch(next)
2387 {
2388 case 0x09:
2389 case 0x20:
2390 case 0xa0:
2391 case 0x1680:
2392 case 0x180e:
2393 case 0x2000:
2394 case 0x2001:
2395 case 0x2002:
2396 case 0x2003:
2397 case 0x2004:
2398 case 0x2005:
2399 case 0x2006:
2400 case 0x2007:
2401 case 0x2008:
2402 case 0x2009:
2403 case 0x200A:
2404 case 0x202f:
2405 case 0x205f:
2406 case 0x3000:
2407 return op_code != OP_HSPACE;
2408 default:
2409 return op_code == OP_HSPACE;
2410 }
2411
2412 case OP_VSPACE:
2413 case OP_NOT_VSPACE:
2414 switch(next)
2415 {
2416 case 0x0a:
2417 case 0x0b:
2418 case 0x0c:
2419 case 0x0d:
2420 case 0x85:
2421 case 0x2028:
2422 case 0x2029:
2423 return op_code != OP_VSPACE;
2424 default:
2425 return op_code == OP_VSPACE;
2426 }
2427
2428 default:
2429 return FALSE;
2430 }
2431
2432
2433 /* Handle the case when the next item is \d, \s, etc. */
2434
2435 switch(op_code)
2436 {
2437 case OP_CHAR:
2438 case OP_CHARNC:
2439 #ifdef SUPPORT_UTF8
2440 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2441 #endif
2442 switch(-next)
2443 {
2444 case ESC_d:
2445 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2446
2447 case ESC_D:
2448 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2449
2450 case ESC_s:
2451 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2452
2453 case ESC_S:
2454 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2455
2456 case ESC_w:
2457 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2458
2459 case ESC_W:
2460 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2461
2462 case ESC_h:
2463 case ESC_H:
2464 switch(item)
2465 {
2466 case 0x09:
2467 case 0x20:
2468 case 0xa0:
2469 case 0x1680:
2470 case 0x180e:
2471 case 0x2000:
2472 case 0x2001:
2473 case 0x2002:
2474 case 0x2003:
2475 case 0x2004:
2476 case 0x2005:
2477 case 0x2006:
2478 case 0x2007:
2479 case 0x2008:
2480 case 0x2009:
2481 case 0x200A:
2482 case 0x202f:
2483 case 0x205f:
2484 case 0x3000:
2485 return -next != ESC_h;
2486 default:
2487 return -next == ESC_h;
2488 }
2489
2490 case ESC_v:
2491 case ESC_V:
2492 switch(item)
2493 {
2494 case 0x0a:
2495 case 0x0b:
2496 case 0x0c:
2497 case 0x0d:
2498 case 0x85:
2499 case 0x2028:
2500 case 0x2029:
2501 return -next != ESC_v;
2502 default:
2503 return -next == ESC_v;
2504 }
2505
2506 default:
2507 return FALSE;
2508 }
2509
2510 case OP_DIGIT:
2511 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2512 next == -ESC_h || next == -ESC_v;
2513
2514 case OP_NOT_DIGIT:
2515 return next == -ESC_d;
2516
2517 case OP_WHITESPACE:
2518 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2519
2520 case OP_NOT_WHITESPACE:
2521 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2522
2523 case OP_HSPACE:
2524 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2525
2526 case OP_NOT_HSPACE:
2527 return next == -ESC_h;
2528
2529 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2530 case OP_VSPACE:
2531 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2532
2533 case OP_NOT_VSPACE:
2534 return next == -ESC_v;
2535
2536 case OP_WORDCHAR:
2537 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2538
2539 case OP_NOT_WORDCHAR:
2540 return next == -ESC_w || next == -ESC_d;
2541
2542 default:
2543 return FALSE;
2544 }
2545
2546 /* Control does not reach here */
2547 }
2548
2549
2550
2551 /*************************************************
2552 * Compile one branch *
2553 *************************************************/
2554
2555 /* Scan the pattern, compiling it into a vector. If the options are
2556 changed during the branch, the pointer is used to change the external options
2557 bits. This function is used during the pre-compile phase when we are trying
2558 to find out the amount of memory needed, as well as during the real compile
2559 phase. The value of lengthptr distinguishes the two phases.
2560
2561 Arguments:
2562 optionsptr pointer to the option bits
2563 codeptr points to the pointer to the current code point
2564 ptrptr points to the current pattern pointer
2565 errorcodeptr points to error code variable
2566 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2567 reqbyteptr set to the last literal character required, else < 0
2568 bcptr points to current branch chain
2569 cd contains pointers to tables etc.
2570 lengthptr NULL during the real compile phase
2571 points to length accumulator during pre-compile phase
2572
2573 Returns: TRUE on success
2574 FALSE, with *errorcodeptr set non-zero on error
2575 */
2576
2577 static BOOL
2578 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2579 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2580 compile_data *cd, int *lengthptr)
2581 {
2582 int repeat_type, op_type;
2583 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2584 int bravalue = 0;
2585 int greedy_default, greedy_non_default;
2586 int firstbyte, reqbyte;
2587 int zeroreqbyte, zerofirstbyte;
2588 int req_caseopt, reqvary, tempreqvary;
2589 int options = *optionsptr;
2590 int after_manual_callout = 0;
2591 int length_prevgroup = 0;
2592 register int c;
2593 register uschar *code = *codeptr;
2594 uschar *last_code = code;
2595 uschar *orig_code = code;
2596 uschar *tempcode;
2597 BOOL inescq = FALSE;
2598 BOOL groupsetfirstbyte = FALSE;
2599 const uschar *ptr = *ptrptr;
2600 const uschar *tempptr;
2601 uschar *previous = NULL;
2602 uschar *previous_callout = NULL;
2603 uschar *save_hwm = NULL;
2604 uschar classbits[32];
2605
2606 #ifdef SUPPORT_UTF8
2607 BOOL class_utf8;
2608 BOOL utf8 = (options & PCRE_UTF8) != 0;
2609 uschar *class_utf8data;
2610 uschar *class_utf8data_base;
2611 uschar utf8_char[6];
2612 #else
2613 BOOL utf8 = FALSE;
2614 uschar *utf8_char = NULL;
2615 #endif
2616
2617 #ifdef DEBUG
2618 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2619 #endif
2620
2621 /* Set up the default and non-default settings for greediness */
2622
2623 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2624 greedy_non_default = greedy_default ^ 1;
2625
2626 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2627 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2628 matches a non-fixed char first char; reqbyte just remains unset if we never
2629 find one.
2630
2631 When we hit a repeat whose minimum is zero, we may have to adjust these values
2632 to take the zero repeat into account. This is implemented by setting them to
2633 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2634 item types that can be repeated set these backoff variables appropriately. */
2635
2636 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2637
2638 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2639 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2640 value > 255. It is added into the firstbyte or reqbyte variables to record the
2641 case status of the value. This is used only for ASCII characters. */
2642
2643 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2644
2645 /* Switch on next character until the end of the branch */
2646
2647 for (;; ptr++)
2648 {
2649 BOOL negate_class;
2650 BOOL should_flip_negation;
2651 BOOL possessive_quantifier;
2652 BOOL is_quantifier;
2653 BOOL is_recurse;
2654 BOOL reset_bracount;
2655 int class_charcount;
2656 int class_lastchar;
2657 int newoptions;
2658 int recno;
2659 int refsign;
2660 int skipbytes;
2661 int subreqbyte;
2662 int subfirstbyte;
2663 int terminator;
2664 int mclength;
2665 uschar mcbuffer[8];
2666
2667 /* Get next byte in the pattern */
2668
2669 c = *ptr;
2670
2671 /* If we are in the pre-compile phase, accumulate the length used for the
2672 previous cycle of this loop. */
2673
2674 if (lengthptr != NULL)
2675 {
2676 #ifdef DEBUG
2677 if (code > cd->hwm) cd->hwm = code; /* High water info */
2678 #endif
2679 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2680 {
2681 *errorcodeptr = ERR52;
2682 goto FAILED;
2683 }
2684
2685 /* There is at least one situation where code goes backwards: this is the
2686 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2687 the class is simply eliminated. However, it is created first, so we have to
2688 allow memory for it. Therefore, don't ever reduce the length at this point.
2689 */
2690
2691 if (code < last_code) code = last_code;
2692
2693 /* Paranoid check for integer overflow */
2694
2695 if (OFLOW_MAX - *lengthptr < code - last_code)
2696 {
2697 *errorcodeptr = ERR20;
2698 goto FAILED;
2699 }
2700
2701 *lengthptr += code - last_code;
2702 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2703
2704 /* If "previous" is set and it is not at the start of the work space, move
2705 it back to there, in order to avoid filling up the work space. Otherwise,
2706 if "previous" is NULL, reset the current code pointer to the start. */
2707
2708 if (previous != NULL)
2709 {
2710 if (previous > orig_code)
2711 {
2712 memmove(orig_code, previous, code - previous);
2713 code -= previous - orig_code;
2714 previous = orig_code;
2715 }
2716 }
2717 else code = orig_code;
2718
2719 /* Remember where this code item starts so we can pick up the length
2720 next time round. */
2721
2722 last_code = code;
2723 }
2724
2725 /* In the real compile phase, just check the workspace used by the forward
2726 reference list. */
2727
2728 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2729 {
2730 *errorcodeptr = ERR52;
2731 goto FAILED;
2732 }
2733
2734 /* If in \Q...\E, check for the end; if not, we have a literal */
2735
2736 if (inescq && c != 0)
2737 {
2738 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2739 {
2740 inescq = FALSE;
2741 ptr++;
2742 continue;
2743 }
2744 else
2745 {
2746 if (previous_callout != NULL)
2747 {
2748 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2749 complete_callout(previous_callout, ptr, cd);
2750 previous_callout = NULL;
2751 }
2752 if ((options & PCRE_AUTO_CALLOUT) != 0)
2753 {
2754 previous_callout = code;
2755 code = auto_callout(code, ptr, cd);
2756 }
2757 goto NORMAL_CHAR;
2758 }
2759 }
2760
2761 /* Fill in length of a previous callout, except when the next thing is
2762 a quantifier. */
2763
2764 is_quantifier =
2765 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2766 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2767
2768 if (!is_quantifier && previous_callout != NULL &&
2769 after_manual_callout-- <= 0)
2770 {
2771 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2772 complete_callout(previous_callout, ptr, cd);
2773 previous_callout = NULL;
2774 }
2775
2776 /* In extended mode, skip white space and comments */
2777
2778 if ((options & PCRE_EXTENDED) != 0)
2779 {
2780 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2781 if (c == CHAR_NUMBER_SIGN)
2782 {
2783 while (*(++ptr) != 0)
2784 {
2785 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2786 }
2787 if (*ptr != 0) continue;
2788
2789 /* Else fall through to handle end of string */
2790 c = 0;
2791 }
2792 }
2793
2794 /* No auto callout for quantifiers. */
2795
2796 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2797 {
2798 previous_callout = code;
2799 code = auto_callout(code, ptr, cd);
2800 }
2801
2802 switch(c)
2803 {
2804 /* ===================================================================*/
2805 case 0: /* The branch terminates at string end */
2806 case CHAR_VERTICAL_LINE: /* or | or ) */
2807 case CHAR_RIGHT_PARENTHESIS:
2808 *firstbyteptr = firstbyte;
2809 *reqbyteptr = reqbyte;
2810 *codeptr = code;
2811 *ptrptr = ptr;
2812 if (lengthptr != NULL)
2813 {
2814 if (OFLOW_MAX - *lengthptr < code - last_code)
2815 {
2816 *errorcodeptr = ERR20;
2817 goto FAILED;
2818 }
2819 *lengthptr += code - last_code; /* To include callout length */
2820 DPRINTF((">> end branch\n"));
2821 }
2822 return TRUE;
2823
2824
2825 /* ===================================================================*/
2826 /* Handle single-character metacharacters. In multiline mode, ^ disables
2827 the setting of any following char as a first character. */
2828
2829 case CHAR_CIRCUMFLEX_ACCENT:
2830 if ((options & PCRE_MULTILINE) != 0)
2831 {
2832 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2833 }
2834 previous = NULL;
2835 *code++ = OP_CIRC;
2836 break;
2837
2838 case CHAR_DOLLAR_SIGN:
2839 previous = NULL;
2840 *code++ = OP_DOLL;
2841 break;
2842
2843 /* There can never be a first char if '.' is first, whatever happens about
2844 repeats. The value of reqbyte doesn't change either. */
2845
2846 case CHAR_DOT:
2847 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2848 zerofirstbyte = firstbyte;
2849 zeroreqbyte = reqbyte;
2850 previous = code;
2851 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2852 break;
2853
2854
2855 /* ===================================================================*/
2856 /* Character classes. If the included characters are all < 256, we build a
2857 32-byte bitmap of the permitted characters, except in the special case
2858 where there is only one such character. For negated classes, we build the
2859 map as usual, then invert it at the end. However, we use a different opcode
2860 so that data characters > 255 can be handled correctly.
2861
2862 If the class contains characters outside the 0-255 range, a different
2863 opcode is compiled. It may optionally have a bit map for characters < 256,
2864 but those above are explicitly listed afterwards. A flag byte tells
2865 whether the bitmap is present, and whether this is a negated class or not.
2866
2867 In JavaScript compatibility mode, an isolated ']' causes an error. In
2868 default (Perl) mode, it is treated as a data character. */
2869
2870 case CHAR_RIGHT_SQUARE_BRACKET:
2871 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2872 {
2873 *errorcodeptr = ERR64;
2874 goto FAILED;
2875 }
2876 goto NORMAL_CHAR;
2877
2878 case CHAR_LEFT_SQUARE_BRACKET:
2879 previous = code;
2880
2881 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2882 they are encountered at the top level, so we'll do that too. */
2883
2884 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2885 ptr[1] == CHAR_EQUALS_SIGN) &&
2886 check_posix_syntax(ptr, &tempptr))
2887 {
2888 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2889 goto FAILED;
2890 }
2891
2892 /* If the first character is '^', set the negation flag and skip it. Also,
2893 if the first few characters (either before or after ^) are \Q\E or \E we
2894 skip them too. This makes for compatibility with Perl. */
2895
2896 negate_class = FALSE;
2897 for (;;)
2898 {
2899 c = *(++ptr);
2900 if (c == CHAR_BACKSLASH)
2901 {
2902 if (ptr[1] == CHAR_E)
2903 ptr++;
2904 else if (strncmp((const char *)ptr+1,
2905 STR_Q STR_BACKSLASH STR_E, 3) == 0)
2906 ptr += 3;
2907 else
2908 break;
2909 }
2910 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2911 negate_class = TRUE;
2912 else break;
2913 }
2914
2915 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2916 an initial ']' is taken as a data character -- the code below handles
2917 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2918 [^] must match any character, so generate OP_ALLANY. */
2919
2920 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2921 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2922 {
2923 *code++ = negate_class? OP_ALLANY : OP_FAIL;
2924 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2925 zerofirstbyte = firstbyte;
2926 break;
2927 }
2928
2929 /* If a class contains a negative special such as \S, we need to flip the
2930 negation flag at the end, so that support for characters > 255 works
2931 correctly (they are all included in the class). */
2932
2933 should_flip_negation = FALSE;
2934
2935 /* Keep a count of chars with values < 256 so that we can optimize the case
2936 of just a single character (as long as it's < 256). However, For higher
2937 valued UTF-8 characters, we don't yet do any optimization. */
2938
2939 class_charcount = 0;
2940 class_lastchar = -1;
2941
2942 /* Initialize the 32-char bit map to all zeros. We build the map in a
2943 temporary bit of memory, in case the class contains only 1 character (less
2944 than 256), because in that case the compiled code doesn't use the bit map.
2945 */
2946
2947 memset(classbits, 0, 32 * sizeof(uschar));
2948
2949 #ifdef SUPPORT_UTF8
2950 class_utf8 = FALSE; /* No chars >= 256 */
2951 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2952 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2953 #endif
2954
2955 /* Process characters until ] is reached. By writing this as a "do" it
2956 means that an initial ] is taken as a data character. At the start of the
2957 loop, c contains the first byte of the character. */
2958
2959 if (c != 0) do
2960 {
2961 const uschar *oldptr;
2962
2963 #ifdef SUPPORT_UTF8
2964 if (utf8 && c > 127)
2965 { /* Braces are required because the */
2966 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2967 }
2968
2969 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2970 data and reset the pointer. This is so that very large classes that
2971 contain a zillion UTF-8 characters no longer overwrite the work space
2972 (which is on the stack). */
2973
2974 if (lengthptr != NULL)
2975 {
2976 *lengthptr += class_utf8data - class_utf8data_base;
2977 class_utf8data = class_utf8data_base;
2978 }
2979
2980 #endif
2981
2982 /* Inside \Q...\E everything is literal except \E */
2983
2984 if (inescq)
2985 {
2986 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
2987 {
2988 inescq = FALSE; /* Reset literal state */
2989 ptr++; /* Skip the 'E' */
2990 continue; /* Carry on with next */
2991 }
2992 goto CHECK_RANGE; /* Could be range if \E follows */
2993 }
2994
2995 /* Handle POSIX class names. Perl allows a negation extension of the
2996 form [:^name:]. A square bracket that doesn't match the syntax is
2997 treated as a literal. We also recognize the POSIX constructions
2998 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2999 5.6 and 5.8 do. */
3000
3001 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3002 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3003 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3004 {
3005 BOOL local_negate = FALSE;
3006 int posix_class, taboffset, tabopt;
3007 register const uschar *cbits = cd->cbits;
3008 uschar pbits[32];
3009
3010 if (ptr[1] != CHAR_COLON)
3011 {
3012 *errorcodeptr = ERR31;
3013 goto FAILED;
3014 }
3015
3016 ptr += 2;
3017 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3018 {
3019 local_negate = TRUE;
3020 should_flip_negation = TRUE; /* Note negative special */
3021 ptr++;
3022 }
3023
3024 posix_class = check_posix_name(ptr, tempptr - ptr);
3025 if (posix_class < 0)
3026 {
3027 *errorcodeptr = ERR30;
3028 goto FAILED;
3029 }
3030
3031 /* If matching is caseless, upper and lower are converted to
3032 alpha. This relies on the fact that the class table starts with
3033 alpha, lower, upper as the first 3 entries. */
3034
3035 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3036 posix_class = 0;
3037
3038 /* We build the bit map for the POSIX class in a chunk of local store
3039 because we may be adding and subtracting from it, and we don't want to
3040 subtract bits that may be in the main map already. At the end we or the
3041 result into the bit map that is being built. */
3042
3043 posix_class *= 3;
3044
3045 /* Copy in the first table (always present) */
3046
3047 memcpy(pbits, cbits + posix_class_maps[posix_class],
3048 32 * sizeof(uschar));
3049
3050 /* If there is a second table, add or remove it as required. */
3051
3052 taboffset = posix_class_maps[posix_class + 1];
3053 tabopt = posix_class_maps[posix_class + 2];
3054
3055 if (taboffset >= 0)
3056 {
3057 if (tabopt >= 0)
3058 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3059 else
3060 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3061 }
3062
3063 /* Now see if we need to remove any special characters. An option
3064 value of 1 removes vertical space and 2 removes underscore. */
3065
3066 if (tabopt < 0) tabopt = -tabopt;
3067 if (tabopt == 1) pbits[1] &= ~0x3c;
3068 else if (tabopt == 2) pbits[11] &= 0x7f;
3069
3070 /* Add the POSIX table or its complement into the main table that is
3071 being built and we are done. */
3072
3073 if (local_negate)
3074 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3075 else
3076 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3077
3078 ptr = tempptr + 1;
3079 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3080 continue; /* End of POSIX syntax handling */
3081 }
3082
3083 /* Backslash may introduce a single character, or it may introduce one
3084 of the specials, which just set a flag. The sequence \b is a special
3085 case. Inside a class (and only there) it is treated as backspace.
3086 Elsewhere it marks a word boundary. Other escapes have preset maps ready
3087 to 'or' into the one we are building. We assume they have more than one
3088 character in them, so set class_charcount bigger than one. */
3089
3090 if (c == CHAR_BACKSLASH)
3091 {
3092 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3093 if (*errorcodeptr != 0) goto FAILED;
3094
3095 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3096 else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3097 else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3098 else if (-c == ESC_Q) /* Handle start of quoted string */
3099 {
3100 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3101 {
3102 ptr += 2; /* avoid empty string */
3103 }
3104 else inescq = TRUE;
3105 continue;
3106 }
3107 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3108
3109 if (c < 0)
3110 {
3111 register const uschar *cbits = cd->cbits;
3112 class_charcount += 2; /* Greater than 1 is what matters */
3113
3114 /* Save time by not doing this in the pre-compile phase. */
3115
3116 if (lengthptr == NULL) switch (-c)
3117 {
3118 case ESC_d:
3119 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3120 continue;
3121
3122 case ESC_D:
3123 should_flip_negation = TRUE;
3124 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3125 continue;
3126
3127 case ESC_w:
3128 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3129 continue;
3130
3131 case ESC_W:
3132 should_flip_negation = TRUE;
3133 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3134 continue;
3135
3136 case ESC_s:
3137 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3138 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3139 continue;
3140
3141 case ESC_S:
3142 should_flip_negation = TRUE;
3143 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3144 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3145 continue;
3146
3147 default: /* Not recognized; fall through */
3148 break; /* Need "default" setting to stop compiler warning. */
3149 }
3150
3151 /* In the pre-compile phase, just do the recognition. */
3152
3153 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3154 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3155
3156 /* We need to deal with \H, \h, \V, and \v in both phases because
3157 they use extra memory. */
3158
3159 if (-c == ESC_h)
3160 {
3161 SETBIT(classbits, 0x09); /* HT */
3162 SETBIT(classbits, 0x20); /* SPACE */
3163 SETBIT(classbits, 0xa0); /* NBSP */
3164 #ifdef SUPPORT_UTF8
3165 if (utf8)
3166 {
3167 class_utf8 = TRUE;
3168 *class_utf8data++ = XCL_SINGLE;
3169 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3170 *class_utf8data++ = XCL_SINGLE;
3171 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3172 *class_utf8data++ = XCL_RANGE;
3173 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3174 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3175 *class_utf8data++ = XCL_SINGLE;
3176 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3177 *class_utf8data++ = XCL_SINGLE;
3178 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3179 *class_utf8data++ = XCL_SINGLE;
3180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3181 }
3182 #endif
3183 continue;
3184 }
3185
3186 if (-c == ESC_H)
3187 {
3188 for (c = 0; c < 32; c++)
3189 {
3190 int x = 0xff;
3191 switch (c)
3192 {
3193 case 0x09/8: x ^= 1 << (0x09%8); break;
3194 case 0x20/8: x ^= 1 << (0x20%8); break;
3195 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3196 default: break;
3197 }
3198 classbits[c] |= x;
3199 }
3200
3201 #ifdef SUPPORT_UTF8
3202 if (utf8)
3203 {
3204 class_utf8 = TRUE;
3205 *class_utf8data++ = XCL_RANGE;
3206 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3207 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3208 *class_utf8data++ = XCL_RANGE;
3209 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3210 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3211 *class_utf8data++ = XCL_RANGE;
3212 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3213 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3214 *class_utf8data++ = XCL_RANGE;
3215 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3216 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3217 *class_utf8data++ = XCL_RANGE;
3218 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3219 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3220 *class_utf8data++ = XCL_RANGE;
3221 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3222 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3223 *class_utf8data++ = XCL_RANGE;
3224 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3225 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3226 }
3227 #endif
3228 continue;
3229 }
3230
3231 if (-c == ESC_v)
3232 {
3233 SETBIT(classbits, 0x0a); /* LF */
3234 SETBIT(classbits, 0x0b); /* VT */
3235 SETBIT(classbits, 0x0c); /* FF */
3236 SETBIT(classbits, 0x0d); /* CR */
3237 SETBIT(classbits, 0x85); /* NEL */
3238 #ifdef SUPPORT_UTF8
3239 if (utf8)
3240 {
3241 class_utf8 = TRUE;
3242 *class_utf8data++ = XCL_RANGE;
3243 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3244 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3245 }
3246 #endif
3247 continue;
3248 }
3249
3250 if (-c == ESC_V)
3251 {
3252 for (c = 0; c < 32; c++)
3253 {
3254 int x = 0xff;
3255 switch (c)
3256 {
3257 case 0x0a/8: x ^= 1 << (0x0a%8);
3258 x ^= 1 << (0x0b%8);
3259 x ^= 1 << (0x0c%8);
3260 x ^= 1 << (0x0d%8);
3261 break;
3262 case 0x85/8: x ^= 1 << (0x85%8); break;
3263 default: break;
3264 }
3265 classbits[c] |= x;
3266 }
3267
3268 #ifdef SUPPORT_UTF8
3269 if (utf8)
3270 {
3271 class_utf8 = TRUE;
3272 *class_utf8data++ = XCL_RANGE;
3273 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3274 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3275 *class_utf8data++ = XCL_RANGE;
3276 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3277 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3278 }
3279 #endif
3280 continue;
3281 }
3282
3283 /* We need to deal with \P and \p in both phases. */
3284
3285 #ifdef SUPPORT_UCP
3286 if (-c == ESC_p || -c == ESC_P)
3287 {
3288 BOOL negated;
3289 int pdata;
3290 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3291 if (ptype < 0) goto FAILED;
3292 class_utf8 = TRUE;
3293 *class_utf8data++ = ((-c == ESC_p) != negated)?
3294 XCL_PROP : XCL_NOTPROP;
3295 *class_utf8data++ = ptype;
3296 *class_utf8data++ = pdata;
3297 class_charcount -= 2; /* Not a < 256 character */
3298 continue;
3299 }
3300 #endif
3301 /* Unrecognized escapes are faulted if PCRE is running in its
3302 strict mode. By default, for compatibility with Perl, they are
3303 treated as literals. */
3304
3305 if ((options & PCRE_EXTRA) != 0)
3306 {
3307 *errorcodeptr = ERR7;
3308 goto FAILED;
3309 }
3310
3311 class_charcount -= 2; /* Undo the default count from above */
3312 c = *ptr; /* Get the final character and fall through */
3313 }
3314
3315 /* Fall through if we have a single character (c >= 0). This may be
3316 greater than 256 in UTF-8 mode. */
3317
3318 } /* End of backslash handling */
3319
3320 /* A single character may be followed by '-' to form a range. However,
3321 Perl does not permit ']' to be the end of the range. A '-' character
3322 at the end is treated as a literal. Perl ignores orphaned \E sequences
3323 entirely. The code for handling \Q and \E is messy. */
3324
3325 CHECK_RANGE:
3326 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3327 {
3328 inescq = FALSE;
3329 ptr += 2;
3330 }
3331
3332 oldptr = ptr;
3333
3334 /* Remember \r or \n */
3335
3336 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3337
3338 /* Check for range */
3339
3340 if (!inescq && ptr[1] == CHAR_MINUS)
3341 {
3342 int d;
3343 ptr += 2;
3344 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3345
3346 /* If we hit \Q (not followed by \E) at this point, go into escaped
3347 mode. */
3348
3349 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3350 {
3351 ptr += 2;
3352 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3353 { ptr += 2; continue; }
3354 inescq = TRUE;
3355 break;
3356 }
3357
3358 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3359 {
3360 ptr = oldptr;
3361 goto LONE_SINGLE_CHARACTER;
3362 }
3363
3364 #ifdef SUPPORT_UTF8
3365 if (utf8)
3366 { /* Braces are required because the */
3367 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3368 }
3369 else
3370 #endif
3371 d = *ptr; /* Not UTF-8 mode */
3372
3373 /* The second part of a range can be a single-character escape, but
3374 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3375 in such circumstances. */
3376
3377 if (!inescq && d == CHAR_BACKSLASH)
3378 {
3379 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3380 if (*errorcodeptr != 0) goto FAILED;
3381
3382 /* \b is backspace; \X is literal X; \R is literal R; any other
3383 special means the '-' was literal */
3384
3385 if (d < 0)
3386 {
3387 if (d == -ESC_b) d = CHAR_BS;
3388 else if (d == -ESC_X) d = CHAR_X;
3389 else if (d == -ESC_R) d = CHAR_R; else
3390 {
3391 ptr = oldptr;
3392 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3393 }
3394 }
3395 }
3396
3397 /* Check that the two values are in the correct order. Optimize
3398 one-character ranges */
3399
3400 if (d < c)
3401 {
3402 *errorcodeptr = ERR8;
3403 goto FAILED;
3404 }
3405
3406 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3407
3408 /* Remember \r or \n */
3409
3410 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3411
3412 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3413 matching, we have to use an XCLASS with extra data items. Caseless
3414 matching for characters > 127 is available only if UCP support is
3415 available. */
3416
3417 #ifdef SUPPORT_UTF8
3418 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3419 {
3420 class_utf8 = TRUE;
3421
3422 /* With UCP support, we can find the other case equivalents of
3423 the relevant characters. There may be several ranges. Optimize how
3424 they fit with the basic range. */
3425
3426 #ifdef SUPPORT_UCP
3427 if ((options & PCRE_CASELESS) != 0)
3428 {
3429 unsigned int occ, ocd;
3430 unsigned int cc = c;
3431 unsigned int origd = d;
3432 while (get_othercase_range(&cc, origd, &occ, &ocd))
3433 {
3434 if (occ >= (unsigned int)c &&
3435 ocd <= (unsigned int)d)
3436 continue; /* Skip embedded ranges */
3437
3438 if (occ < (unsigned int)c &&
3439 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3440 { /* if there is overlap, */
3441 c = occ; /* noting that if occ < c */
3442 continue; /* we can't have ocd > d */
3443 } /* because a subrange is */
3444 if (ocd > (unsigned int)d &&
3445 occ <= (unsigned int)d + 1) /* always shorter than */
3446 { /* the basic range. */
3447 d = ocd;
3448 continue;
3449 }
3450
3451 if (occ == ocd)
3452 {
3453 *class_utf8data++ = XCL_SINGLE;
3454 }
3455 else
3456 {
3457 *class_utf8data++ = XCL_RANGE;
3458 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3459 }
3460 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3461 }
3462 }
3463 #endif /* SUPPORT_UCP */
3464
3465 /* Now record the original range, possibly modified for UCP caseless
3466 overlapping ranges. */
3467
3468 *class_utf8data++ = XCL_RANGE;
3469 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3470 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3471
3472 /* With UCP support, we are done. Without UCP support, there is no
3473 caseless matching for UTF-8 characters > 127; we can use the bit map
3474 for the smaller ones. */
3475
3476 #ifdef SUPPORT_UCP
3477 continue; /* With next character in the class */
3478 #else
3479 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3480
3481 /* Adjust upper limit and fall through to set up the map */
3482
3483 d = 127;
3484
3485 #endif /* SUPPORT_UCP */
3486 }
3487 #endif /* SUPPORT_UTF8 */
3488
3489 /* We use the bit map for all cases when not in UTF-8 mode; else
3490 ranges that lie entirely within 0-127 when there is UCP support; else
3491 for partial ranges without UCP support. */
3492
3493 class_charcount += d - c + 1;
3494 class_lastchar = d;
3495
3496 /* We can save a bit of time by skipping this in the pre-compile. */
3497
3498 if (lengthptr == NULL) for (; c <= d; c++)
3499 {
3500 classbits[c/8] |= (1 << (c&7));
3501 if ((options & PCRE_CASELESS) != 0)
3502 {
3503 int uc = cd->fcc[c]; /* flip case */
3504 classbits[uc/8] |= (1 << (uc&7));
3505 }
3506 }
3507
3508 continue; /* Go get the next char in the class */
3509 }
3510
3511 /* Handle a lone single character - we can get here for a normal
3512 non-escape char, or after \ that introduces a single character or for an
3513 apparent range that isn't. */
3514
3515 LONE_SINGLE_CHARACTER:
3516
3517 /* Handle a character that cannot go in the bit map */
3518
3519 #ifdef SUPPORT_UTF8
3520 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3521 {
3522 class_utf8 = TRUE;
3523 *class_utf8data++ = XCL_SINGLE;
3524 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3525
3526 #ifdef SUPPORT_UCP
3527 if ((options & PCRE_CASELESS) != 0)
3528 {
3529 unsigned int othercase;
3530 if ((othercase = UCD_OTHERCASE(c)) != c)
3531 {
3532 *class_utf8data++ = XCL_SINGLE;
3533 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3534 }
3535 }
3536 #endif /* SUPPORT_UCP */
3537
3538 }
3539 else
3540 #endif /* SUPPORT_UTF8 */
3541
3542 /* Handle a single-byte character */
3543 {
3544 classbits[c/8] |= (1 << (c&7));
3545 if ((options & PCRE_CASELESS) != 0)
3546 {
3547 c = cd->fcc[c]; /* flip case */
3548 classbits[c/8] |= (1 << (c&7));
3549 }
3550 class_charcount++;
3551 class_lastchar = c;
3552 }
3553 }
3554
3555 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3556
3557 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3558
3559 if (c == 0) /* Missing terminating ']' */
3560 {
3561 *errorcodeptr = ERR6;
3562 goto FAILED;
3563 }
3564
3565
3566 /* This code has been disabled because it would mean that \s counts as
3567 an explicit \r or \n reference, and that's not really what is wanted. Now
3568 we set the flag only if there is a literal "\r" or "\n" in the class. */
3569
3570 #if 0
3571 /* Remember whether \r or \n are in this class */
3572
3573 if (negate_class)
3574 {
3575 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3576 }
3577 else
3578 {
3579 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3580 }
3581 #endif
3582
3583
3584 /* If class_charcount is 1, we saw precisely one character whose value is
3585 less than 256. As long as there were no characters >= 128 and there was no
3586 use of \p or \P, in other words, no use of any XCLASS features, we can
3587 optimize.
3588
3589 In UTF-8 mode, we can optimize the negative case only if there were no
3590 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3591 operate on single-bytes only. This is an historical hangover. Maybe one day
3592 we can tidy these opcodes to handle multi-byte characters.
3593
3594 The optimization throws away the bit map. We turn the item into a
3595 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3596 that OP_NOT does not support multibyte characters. In the positive case, it
3597 can cause firstbyte to be set. Otherwise, there can be no first char if
3598 this item is first, whatever repeat count may follow. In the case of
3599 reqbyte, save the previous value for reinstating. */
3600
3601 #ifdef SUPPORT_UTF8
3602 if (class_charcount == 1 && !class_utf8 &&
3603 (!utf8 || !negate_class || class_lastchar < 128))
3604 #else
3605 if (class_charcount == 1)
3606 #endif
3607 {
3608 zeroreqbyte = reqbyte;
3609
3610 /* The OP_NOT opcode works on one-byte characters only. */
3611
3612 if (negate_class)
3613 {
3614 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3615 zerofirstbyte = firstbyte;
3616 *code++ = OP_NOT;
3617 *code++ = class_lastchar;
3618 break;
3619 }
3620
3621 /* For a single, positive character, get the value into mcbuffer, and
3622 then we can handle this with the normal one-character code. */
3623
3624 #ifdef SUPPORT_UTF8
3625 if (utf8 && class_lastchar > 127)
3626 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3627 else
3628 #endif
3629 {
3630 mcbuffer[0] = class_lastchar;
3631 mclength = 1;
3632 }
3633 goto ONE_CHAR;
3634 } /* End of 1-char optimization */
3635
3636 /* The general case - not the one-char optimization. If this is the first
3637 thing in the branch, there can be no first char setting, whatever the
3638 repeat count. Any reqbyte setting must remain unchanged after any kind of
3639 repeat. */
3640
3641 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3642 zerofirstbyte = firstbyte;
3643 zeroreqbyte = reqbyte;
3644
3645 /* If there are characters with values > 255, we have to compile an
3646 extended class, with its own opcode, unless there was a negated special
3647 such as \S in the class, because in that case all characters > 255 are in
3648 the class, so any that were explicitly given as well can be ignored. If
3649 (when there are explicit characters > 255 that must be listed) there are no
3650 characters < 256, we can omit the bitmap in the actual compiled code. */
3651
3652 #ifdef SUPPORT_UTF8
3653 if (class_utf8 && !should_flip_negation)
3654 {
3655 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3656 *code++ = OP_XCLASS;
3657 code += LINK_SIZE;
3658 *code = negate_class? XCL_NOT : 0;
3659
3660 /* If the map is required, move up the extra data to make room for it;
3661 otherwise just move the code pointer to the end of the extra data. */
3662
3663 if (class_charcount > 0)
3664 {
3665 *code++ |= XCL_MAP;
3666 memmove(code + 32, code, class_utf8data - code);
3667 memcpy(code, classbits, 32);
3668 code = class_utf8data + 32;
3669 }
3670 else code = class_utf8data;
3671
3672 /* Now fill in the complete length of the item */
3673
3674 PUT(previous, 1, code - previous);
3675 break; /* End of class handling */
3676 }
3677 #endif
3678
3679 /* If there are no characters > 255, set the opcode to OP_CLASS or
3680 OP_NCLASS, depending on whether the whole class was negated and whether
3681 there were negative specials such as \S in the class. Then copy the 32-byte
3682 map into the code vector, negating it if necessary. */
3683
3684 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3685 if (negate_class)
3686 {
3687 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3688 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3689 }
3690 else
3691 {
3692 memcpy(code, classbits, 32);
3693 }
3694 code += 32;
3695 break;
3696
3697
3698 /* ===================================================================*/
3699 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3700 has been tested above. */
3701
3702 case CHAR_LEFT_CURLY_BRACKET:
3703 if (!is_quantifier) goto NORMAL_CHAR;
3704 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3705 if (*errorcodeptr != 0) goto FAILED;
3706 goto REPEAT;
3707
3708 case CHAR_ASTERISK:
3709 repeat_min = 0;
3710 repeat_max = -1;
3711 goto REPEAT;
3712
3713 case CHAR_PLUS:
3714 repeat_min = 1;
3715 repeat_max = -1;
3716 goto REPEAT;
3717
3718 case CHAR_QUESTION_MARK:
3719 repeat_min = 0;
3720 repeat_max = 1;
3721
3722 REPEAT:
3723 if (previous == NULL)
3724 {
3725 *errorcodeptr = ERR9;
3726 goto FAILED;
3727 }
3728
3729 if (repeat_min == 0)
3730 {
3731 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3732 reqbyte = zeroreqbyte; /* Ditto */
3733 }
3734
3735 /* Remember whether this is a variable length repeat */
3736
3737 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3738
3739 op_type = 0; /* Default single-char op codes */
3740 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3741
3742 /* Save start of previous item, in case we have to move it up to make space
3743 for an inserted OP_ONCE for the additional '+' extension. */
3744
3745 tempcode = previous;
3746
3747 /* If the next character is '+', we have a possessive quantifier. This
3748 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3749 If the next character is '?' this is a minimizing repeat, by default,
3750 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3751 repeat type to the non-default. */
3752
3753 if (ptr[1] == CHAR_PLUS)
3754 {
3755 repeat_type = 0; /* Force greedy */
3756 possessive_quantifier = TRUE;
3757 ptr++;
3758 }
3759 else if (ptr[1] == CHAR_QUESTION_MARK)
3760 {
3761 repeat_type = greedy_non_default;
3762 ptr++;
3763 }
3764 else repeat_type = greedy_default;
3765
3766 /* If previous was a character match, abolish the item and generate a
3767 repeat item instead. If a char item has a minimum of more than one, ensure
3768 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3769 the first thing in a branch because the x will have gone into firstbyte
3770 instead. */
3771
3772 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3773 {
3774 /* Deal with UTF-8 characters that take up more than one byte. It's
3775 easier to write this out separately than try to macrify it. Use c to
3776 hold the length of the character in bytes, plus 0x80 to flag that it's a
3777 length rather than a small character. */
3778
3779 #ifdef SUPPORT_UTF8
3780 if (utf8 && (code[-1] & 0x80) != 0)
3781 {
3782 uschar *lastchar = code - 1;
3783 while((*lastchar & 0xc0) == 0x80) lastchar--;
3784 c = code - lastchar; /* Length of UTF-8 character */
3785 memcpy(utf8_char, lastchar, c); /* Save the char */
3786 c |= 0x80; /* Flag c as a length */
3787 }
3788 else
3789 #endif
3790
3791 /* Handle the case of a single byte - either with no UTF8 support, or
3792 with UTF-8 disabled, or for a UTF-8 character < 128. */
3793
3794 {
3795 c = code[-1];
3796 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3797 }
3798
3799 /* If the repetition is unlimited, it pays to see if the next thing on
3800 the line is something that cannot possibly match this character. If so,
3801 automatically possessifying this item gains some performance in the case
3802 where the match fails. */
3803
3804 if (!possessive_quantifier &&
3805 repeat_max < 0 &&
3806 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3807 options, cd))
3808 {
3809 repeat_type = 0; /* Force greedy */
3810 possessive_quantifier = TRUE;
3811 }
3812
3813 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3814 }
3815
3816 /* If previous was a single negated character ([^a] or similar), we use
3817 one of the special opcodes, replacing it. The code is shared with single-
3818 character repeats by setting op_type to add a suitable offset into
3819 repeat_type. We can also test for auto-possessification. OP_NOT is
3820 currently used only for single-byte chars. */
3821
3822 else if (*previous == OP_NOT)
3823 {
3824 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3825 c = previous[1];
3826 if (!possessive_quantifier &&
3827 repeat_max < 0 &&
3828 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3829 {
3830 repeat_type = 0; /* Force greedy */
3831 possessive_quantifier = TRUE;
3832 }
3833 goto OUTPUT_SINGLE_REPEAT;
3834 }
3835
3836 /* If previous was a character type match (\d or similar), abolish it and
3837 create a suitable repeat item. The code is shared with single-character
3838 repeats by setting op_type to add a suitable offset into repeat_type. Note
3839 that the Unicode property types will be present only when SUPPORT_UCP is
3840 defined, but we don't wrap the little bits of code here because it just
3841 makes it horribly messy. */
3842
3843 else if (*previous < OP_EODN)
3844 {
3845 uschar *oldcode;
3846 int prop_type, prop_value;
3847 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3848 c = *previous;
3849
3850 if (!possessive_quantifier &&
3851 repeat_max < 0 &&
3852 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3853 {
3854 repeat_type = 0; /* Force greedy */
3855 possessive_quantifier = TRUE;
3856 }
3857
3858 OUTPUT_SINGLE_REPEAT:
3859 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3860 {
3861 prop_type = previous[1];
3862 prop_value = previous[2];
3863 }
3864 else prop_type = prop_value = -1;
3865
3866 oldcode = code;
3867 code = previous; /* Usually overwrite previous item */
3868
3869 /* If the maximum is zero then the minimum must also be zero; Perl allows
3870 this case, so we do too - by simply omitting the item altogether. */
3871
3872 if (repeat_max == 0) goto END_REPEAT;
3873
3874 /*--------------------------------------------------------------------*/
3875 /* This code is obsolete from release 8.00; the restriction was finally
3876 removed: */
3877
3878 /* All real repeats make it impossible to handle partial matching (maybe
3879 one day we will be able to remove this restriction). */
3880
3881 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3882 /*--------------------------------------------------------------------*/
3883
3884 /* Combine the op_type with the repeat_type */
3885
3886 repeat_type += op_type;
3887
3888 /* A minimum of zero is handled either as the special case * or ?, or as
3889 an UPTO, with the maximum given. */
3890
3891 if (repeat_min == 0)
3892 {
3893 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3894 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3895 else
3896 {
3897 *code++ = OP_UPTO + repeat_type;
3898 PUT2INC(code, 0, repeat_max);
3899 }
3900 }
3901
3902 /* A repeat minimum of 1 is optimized into some special cases. If the
3903 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3904 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3905 one less than the maximum. */
3906
3907 else if (repeat_min == 1)
3908 {
3909 if (repeat_max == -1)
3910 *code++ = OP_PLUS + repeat_type;
3911 else
3912 {
3913 code = oldcode; /* leave previous item in place */
3914 if (repeat_max == 1) goto END_REPEAT;
3915 *code++ = OP_UPTO + repeat_type;
3916 PUT2INC(code, 0, repeat_max - 1);
3917 }
3918 }
3919
3920 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3921 handled as an EXACT followed by an UPTO. */
3922
3923 else
3924 {
3925 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3926 PUT2INC(code, 0, repeat_min);
3927
3928 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3929 we have to insert the character for the previous code. For a repeated
3930 Unicode property match, there are two extra bytes that define the
3931 required property. In UTF-8 mode, long characters have their length in
3932 c, with the 0x80 bit as a flag. */
3933
3934 if (repeat_max < 0)
3935 {
3936 #ifdef SUPPORT_UTF8
3937 if (utf8 && c >= 128)
3938 {
3939 memcpy(code, utf8_char, c & 7);
3940 code += c & 7;
3941 }
3942 else
3943 #endif
3944 {
3945 *code++ = c;
3946 if (prop_type >= 0)
3947 {
3948 *code++ = prop_type;
3949 *code++ = prop_value;
3950 }
3951 }
3952 *code++ = OP_STAR + repeat_type;
3953 }
3954
3955 /* Else insert an UPTO if the max is greater than the min, again
3956 preceded by the character, for the previously inserted code. If the
3957 UPTO is just for 1 instance, we can use QUERY instead. */
3958
3959 else if (repeat_max != repeat_min)
3960 {
3961 #ifdef SUPPORT_UTF8
3962 if (utf8 && c >= 128)
3963 {
3964 memcpy(code, utf8_char, c & 7);
3965 code += c & 7;
3966 }
3967 else
3968 #endif
3969 *code++ = c;
3970 if (prop_type >= 0)
3971 {
3972 *code++ = prop_type;
3973 *code++ = prop_value;
3974 }
3975 repeat_max -= repeat_min;
3976
3977 if (repeat_max == 1)
3978 {
3979 *code++ = OP_QUERY + repeat_type;
3980 }
3981 else
3982 {
3983 *code++ = OP_UPTO + repeat_type;
3984 PUT2INC(code, 0, repeat_max);
3985 }
3986 }
3987 }
3988
3989 /* The character or character type itself comes last in all cases. */
3990
3991 #ifdef SUPPORT_UTF8
3992 if (utf8 && c >= 128)
3993 {
3994 memcpy(code, utf8_char, c & 7);
3995 code += c & 7;
3996 }
3997 else
3998 #endif
3999 *code++ = c;
4000
4001 /* For a repeated Unicode property match, there are two extra bytes that
4002 define the required property. */
4003
4004 #ifdef SUPPORT_UCP
4005 if (prop_type >= 0)
4006 {
4007 *code++ = prop_type;
4008 *code++ = prop_value;
4009 }
4010 #endif
4011 }
4012
4013 /* If previous was a character class or a back reference, we put the repeat
4014 stuff after it, but just skip the item if the repeat was {0,0}. */
4015
4016 else if (*previous == OP_CLASS ||
4017 *previous == OP_NCLASS ||
4018 #ifdef SUPPORT_UTF8
4019 *previous == OP_XCLASS ||
4020 #endif
4021 *previous == OP_REF)
4022 {
4023 if (repeat_max == 0)
4024 {
4025 code = previous;
4026 goto END_REPEAT;
4027 }
4028
4029 /*--------------------------------------------------------------------*/
4030 /* This code is obsolete from release 8.00; the restriction was finally
4031 removed: */
4032
4033 /* All real repeats make it impossible to handle partial matching (maybe
4034 one day we will be able to remove this restriction). */
4035
4036 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4037 /*--------------------------------------------------------------------*/
4038
4039 if (repeat_min == 0 && repeat_max == -1)
4040 *code++ = OP_CRSTAR + repeat_type;
4041 else if (repeat_min == 1 && repeat_max == -1)
4042 *code++ = OP_CRPLUS + repeat_type;
4043 else if (repeat_min == 0 && repeat_max == 1)
4044 *code++ = OP_CRQUERY + repeat_type;
4045 else
4046 {
4047 *code++ = OP_CRRANGE + repeat_type;
4048 PUT2INC(code, 0, repeat_min);
4049 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4050 PUT2INC(code, 0, repeat_max);
4051 }
4052 }
4053
4054 /* If previous was a bracket group, we may have to replicate it in certain
4055 cases. */
4056
4057 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4058 *previous == OP_ONCE || *previous == OP_COND)
4059 {
4060 register int i;
4061 int ketoffset = 0;
4062 int len = code - previous;
4063 uschar *bralink = NULL;
4064
4065 /* Repeating a DEFINE group is pointless */
4066
4067 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4068 {
4069 *errorcodeptr = ERR55;
4070 goto FAILED;
4071 }
4072
4073 /* If the maximum repeat count is unlimited, find the end of the bracket
4074 by scanning through from the start, and compute the offset back to it
4075 from the current code pointer. There may be an OP_OPT setting following
4076 the final KET, so we can't find the end just by going back from the code
4077 pointer. */
4078
4079 if (repeat_max == -1)
4080 {
4081 register uschar *ket = previous;
4082 do ket += GET(ket, 1); while (*ket != OP_KET);
4083 ketoffset = code - ket;
4084 }
4085
4086 /* The case of a zero minimum is special because of the need to stick
4087 OP_BRAZERO in front of it, and because the group appears once in the
4088 data, whereas in other cases it appears the minimum number of times. For
4089 this reason, it is simplest to treat this case separately, as otherwise
4090 the code gets far too messy. There are several special subcases when the
4091 minimum is zero. */
4092
4093 if (repeat_min == 0)
4094 {
4095 /* If the maximum is also zero, we used to just omit the group from the
4096 output altogether, like this:
4097
4098 ** if (repeat_max == 0)
4099 ** {
4100 ** code = previous;
4101 ** goto END_REPEAT;
4102 ** }
4103
4104 However, that fails when a group is referenced as a subroutine from
4105 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4106 so that it is skipped on execution. As we don't have a list of which
4107 groups are referenced, we cannot do this selectively.
4108
4109 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4110 and do no more at this point. However, we do need to adjust any
4111 OP_RECURSE calls inside the group that refer to the group itself or any
4112 internal or forward referenced group, because the offset is from the
4113 start of the whole regex. Temporarily terminate the pattern while doing
4114 this. */
4115
4116 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4117 {
4118 *code = OP_END;
4119 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4120 memmove(previous+1, previous, len);
4121 code++;
4122 if (repeat_max == 0)
4123 {
4124 *previous++ = OP_SKIPZERO;
4125 goto END_REPEAT;
4126 }
4127 *previous++ = OP_BRAZERO + repeat_type;
4128 }
4129
4130 /* If the maximum is greater than 1 and limited, we have to replicate
4131 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4132 The first one has to be handled carefully because it's the original
4133 copy, which has to be moved up. The remainder can be handled by code
4134 that is common with the non-zero minimum case below. We have to
4135 adjust the value of repeat_max, since one less copy is required. Once
4136 again, we may have to adjust any OP_RECURSE calls inside the group. */
4137
4138 else
4139 {
4140 int offset;
4141 *code = OP_END;
4142 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4143 memmove(previous + 2 + LINK_SIZE, previous, len);
4144 code += 2 + LINK_SIZE;
4145 *previous++ = OP_BRAZERO + repeat_type;
4146 *previous++ = OP_BRA;
4147
4148 /* We chain together the bracket offset fields that have to be
4149 filled in later when the ends of the brackets are reached. */
4150
4151 offset = (bralink == NULL)? 0 : previous - bralink;
4152 bralink = previous;
4153 PUTINC(previous, 0, offset);
4154 }
4155
4156 repeat_max--;
4157 }
4158
4159 /* If the minimum is greater than zero, replicate the group as many
4160 times as necessary, and adjust the maximum to the number of subsequent
4161 copies that we need. If we set a first char from the group, and didn't
4162 set a required char, copy the latter from the former. If there are any
4163 forward reference subroutine calls in the group, there will be entries on
4164 the workspace list; replicate these with an appropriate increment. */
4165
4166 else
4167 {
4168 if (repeat_min > 1)
4169 {
4170 /* In the pre-compile phase, we don't actually do the replication. We
4171 just adjust the length as if we had. Do some paranoid checks for
4172 potential integer overflow. */
4173
4174 if (lengthptr != NULL)
4175 {
4176 int delta = (repeat_min - 1)*length_prevgroup;
4177 if ((double)(repeat_min - 1)*(double)length_prevgroup >
4178 (double)INT_MAX ||
4179 OFLOW_MAX - *lengthptr < delta)
4180 {
4181 *errorcodeptr = ERR20;
4182 goto FAILED;
4183 }
4184 *lengthptr += delta;
4185 }
4186
4187 /* This is compiling for real */
4188
4189 else
4190 {
4191 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4192 for (i = 1; i < repeat_min; i++)
4193 {
4194 uschar *hc;
4195 uschar *this_hwm = cd->hwm;
4196 memcpy(code, previous, len);
4197 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4198 {
4199 PUT(cd->hwm, 0, GET(hc, 0) + len);
4200 cd->hwm += LINK_SIZE;
4201 }
4202 save_hwm = this_hwm;
4203 code += len;
4204 }
4205 }
4206 }
4207
4208 if (repeat_max > 0) repeat_max -= repeat_min;
4209 }
4210
4211 /* This code is common to both the zero and non-zero minimum cases. If
4212 the maximum is limited, it replicates the group in a nested fashion,
4213 remembering the bracket starts on a stack. In the case of a zero minimum,
4214 the first one was set up above. In all cases the repeat_max now specifies
4215 the number of additional copies needed. Again, we must remember to
4216 replicate entries on the forward reference list. */
4217
4218 if (repeat_max >= 0)
4219 {
4220 /* In the pre-compile phase, we don't actually do the replication. We
4221 just adjust the length as if we had. For each repetition we must add 1
4222 to the length for BRAZERO and for all but the last repetition we must
4223 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4224 paranoid checks to avoid integer overflow. */
4225
4226 if (lengthptr != NULL && repeat_max > 0)
4227 {
4228 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4229 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4230 if ((double)repeat_max *
4231 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4232 > (double)INT_MAX ||
4233 OFLOW_MAX - *lengthptr < delta)
4234 {
4235 *errorcodeptr = ERR20;
4236 goto FAILED;
4237 }
4238 *lengthptr += delta;
4239 }
4240
4241 /* This is compiling for real */
4242
4243 else for (i = repeat_max - 1; i >= 0; i--)
4244 {
4245 uschar *hc;
4246 uschar *this_hwm = cd->hwm;
4247
4248 *code++ = OP_BRAZERO + repeat_type;
4249
4250 /* All but the final copy start a new nesting, maintaining the
4251 chain of brackets outstanding. */
4252
4253 if (i != 0)
4254 {
4255 int offset;
4256 *code++ = OP_BRA;
4257 offset = (bralink == NULL)? 0 : code - bralink;
4258 bralink = code;
4259 PUTINC(code, 0, offset);
4260 }
4261
4262 memcpy(code, previous, len);
4263 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4264 {
4265 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4266 cd->hwm += LINK_SIZE;
4267 }
4268 save_hwm = this_hwm;
4269 code += len;
4270 }
4271
4272 /* Now chain through the pending brackets, and fill in their length
4273 fields (which are holding the chain links pro tem). */
4274
4275 while (bralink != NULL)
4276 {
4277 int oldlinkoffset;
4278 int offset = code - bralink + 1;
4279 uschar *bra = code - offset;
4280 oldlinkoffset = GET(bra, 1);
4281 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4282 *code++ = OP_KET;
4283 PUTINC(code, 0, offset);
4284 PUT(bra, 1, offset);
4285 }
4286 }
4287
4288 /* If the maximum is unlimited, set a repeater in the final copy. We
4289 can't just offset backwards from the current code point, because we
4290 don't know if there's been an options resetting after the ket. The
4291 correct offset was computed above.
4292
4293 Then, when we are doing the actual compile phase, check to see whether
4294 this group is a non-atomic one that could match an empty string. If so,
4295 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4296 that runtime checking can be done. [This check is also applied to
4297 atomic groups at runtime, but in a different way.] */
4298
4299 else
4300 {
4301 uschar *ketcode = code - ketoffset;
4302 uschar *bracode = ketcode - GET(ketcode, 1);
4303 *ketcode = OP_KETRMAX + repeat_type;
4304 if (lengthptr == NULL && *bracode != OP_ONCE)
4305 {
4306 uschar *scode = bracode;
4307 do
4308 {
4309 if (could_be_empty_branch(scode, ketcode, utf8))
4310 {
4311 *bracode += OP_SBRA - OP_BRA;
4312 break;
4313 }
4314 scode += GET(scode, 1);
4315 }
4316 while (*scode == OP_ALT);
4317 }
4318 }
4319 }
4320
4321 /* If previous is OP_FAIL, it was generated by an empty class [] in
4322 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4323 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4324 error above. We can just ignore the repeat in JS case. */
4325
4326 else if (*previous == OP_FAIL) goto END_REPEAT;
4327
4328 /* Else there's some kind of shambles */
4329
4330 else
4331 {
4332 *errorcodeptr = ERR11;
4333 goto FAILED;
4334 }
4335
4336 /* If the character following a repeat is '+', or if certain optimization
4337 tests above succeeded, possessive_quantifier is TRUE. For some of the
4338 simpler opcodes, there is a special alternative opcode for this. For
4339 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4340 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4341 but the special opcodes can optimize it a bit. The repeated item starts at
4342 tempcode, not at previous, which might be the first part of a string whose
4343 (former) last char we repeated.
4344
4345 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4346 an 'upto' may follow. We skip over an 'exact' item, and then test the
4347 length of what remains before proceeding. */
4348
4349 if (possessive_quantifier)
4350 {
4351 int len;
4352
4353 if (*tempcode == OP_TYPEEXACT)
4354 tempcode += _pcre_OP_lengths[*tempcode] +
4355 ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4356
4357 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4358 {
4359 tempcode += _pcre_OP_lengths[*tempcode];
4360 #ifdef SUPPORT_UTF8
4361 if (utf8 && tempcode[-1] >= 0xc0)
4362 tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4363 #endif
4364 }
4365
4366 len = code - tempcode;
4367 if (len > 0) switch (*tempcode)
4368 {
4369 case OP_STAR: *tempcode = OP_POSSTAR; break;
4370 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4371 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4372 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4373
4374 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4375 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4376 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4377 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4378
4379 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4380 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4381 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4382 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4383
4384 default:
4385 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4386 code += 1 + LINK_SIZE;
4387 len += 1 + LINK_SIZE;
4388 tempcode[0] = OP_ONCE;
4389 *code++ = OP_KET;
4390 PUTINC(code, 0, len);
4391 PUT(tempcode, 1, len);
4392 break;
4393 }
4394 }
4395
4396 /* In all cases we no longer have a previous item. We also set the
4397 "follows varying string" flag for subsequently encountered reqbytes if
4398 it isn't already set and we have just passed a varying length item. */
4399
4400 END_REPEAT:
4401 previous = NULL;
4402 cd->req_varyopt |= reqvary;
4403 break;
4404
4405
4406 /* ===================================================================*/
4407 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4408 lookbehind or option setting or condition or all the other extended
4409 parenthesis forms. */
4410
4411 case CHAR_LEFT_PARENTHESIS:
4412 newoptions = options;
4413 skipbytes = 0;
4414 bravalue = OP_CBRA;
4415 save_hwm = cd->hwm;
4416 reset_bracount = FALSE;
4417
4418 /* First deal with various "verbs" that can be introduced by '*'. */
4419
4420 if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4421 {
4422 int i, namelen;
4423 const char *vn = verbnames;
4424 const uschar *name = ++ptr;
4425 previous = NULL;
4426 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4427 if (*ptr == CHAR_COLON)
4428 {
4429 *errorcodeptr = ERR59; /* Not supported */
4430 goto FAILED;
4431 }
4432 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4433 {
4434 *errorcodeptr = ERR60;
4435 goto FAILED;
4436 }
4437 namelen = ptr - name;
4438 for (i = 0; i < verbcount; i++)
4439 {
4440 if (namelen == verbs[i].len &&
4441 strncmp((char *)name, vn, namelen) == 0)
4442 {
4443 *code = verbs[i].op;
4444 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4445 break;
4446 }
4447 vn += verbs[i].len + 1;
4448 }
4449 if (i < verbcount) continue;
4450 *errorcodeptr = ERR60;
4451 goto FAILED;
4452 }
4453
4454 /* Deal with the extended parentheses; all are introduced by '?', and the
4455 appearance of any of them means that this is not a capturing group. */
4456
4457 else if (*ptr == CHAR_QUESTION_MARK)
4458 {
4459 int i, set, unset, namelen;
4460 int *optset;
4461 const uschar *name;
4462 uschar *slot;
4463
4464 switch (*(++ptr))
4465 {
4466 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4467 ptr++;
4468 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4469 if (*ptr == 0)
4470 {
4471 *errorcodeptr = ERR18;
4472 goto FAILED;
4473 }
4474 continue;
4475
4476
4477 /* ------------------------------------------------------------ */
4478 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4479 reset_bracount = TRUE;
4480 /* Fall through */
4481
4482 /* ------------------------------------------------------------ */
4483 case CHAR_COLON: /* Non-capturing bracket */
4484 bravalue = OP_BRA;
4485 ptr++;
4486 break;
4487
4488
4489 /* ------------------------------------------------------------ */
4490 case CHAR_LEFT_PARENTHESIS:
4491 bravalue = OP_COND; /* Conditional group */
4492
4493 /* A condition can be an assertion, a number (referring to a numbered
4494 group), a name (referring to a named group), or 'R', referring to
4495 recursion. R<digits> and R&name are also permitted for recursion tests.
4496
4497 There are several syntaxes for testing a named group: (?(name)) is used
4498 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4499
4500 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4501 be the recursive thing or the name 'R' (and similarly for 'R' followed
4502 by digits), and (b) a number could be a name that consists of digits.
4503 In both cases, we look for a name first; if not found, we try the other
4504 cases. */
4505
4506 /* For conditions that are assertions, check the syntax, and then exit
4507 the switch. This will take control down to where bracketed groups,
4508 including assertions, are processed. */
4509
4510 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4511 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4512 break;
4513
4514 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4515 below), and all need to skip 3 bytes at the start of the group. */
4516
4517 code[1+LINK_SIZE] = OP_CREF;
4518 skipbytes = 3;
4519 refsign = -1;
4520
4521 /* Check for a test for recursion in a named group. */
4522
4523 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4524 {
4525 terminator = -1;
4526 ptr += 2;
4527 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4528 }
4529
4530 /* Check for a test for a named group's having been set, using the Perl
4531 syntax (?(<name>) or (?('name') */
4532
4533 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4534 {
4535 terminator = CHAR_GREATER_THAN_SIGN;
4536 ptr++;
4537 }
4538 else if (ptr[1] == CHAR_APOSTROPHE)
4539 {
4540 terminator = CHAR_APOSTROPHE;
4541 ptr++;
4542 }
4543 else
4544 {
4545 terminator = 0;
4546 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4547 }
4548
4549 /* We now expect to read a name; anything else is an error */
4550
4551 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4552 {
4553 ptr += 1; /* To get the right offset */
4554 *errorcodeptr = ERR28;
4555 goto FAILED;
4556 }
4557
4558 /* Read the name, but also get it as a number if it's all digits */
4559
4560 recno = 0;
4561 name = ++ptr;
4562 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4563 {
4564 if (recno >= 0)
4565 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4566 recno * 10 + *ptr - CHAR_0 : -1;
4567 ptr++;
4568 }
4569 namelen = ptr - name;
4570
4571 if ((terminator > 0 && *ptr++ != terminator) ||
4572 *ptr++ != CHAR_RIGHT_PARENTHESIS)
4573 {
4574 ptr--; /* Error offset */
4575 *errorcodeptr = ERR26;
4576 goto FAILED;
4577 }
4578
4579 /* Do no further checking in the pre-compile phase. */
4580
4581 if (lengthptr != NULL) break;
4582
4583 /* In the real compile we do the work of looking for the actual
4584 reference. If the string started with "+" or "-" we require the rest to
4585 be digits, in which case recno will be set. */
4586
4587 if (refsign > 0)
4588 {
4589 if (recno <= 0)
4590 {
4591 *errorcodeptr = ERR58;
4592 goto FAILED;
4593 }
4594 recno = (refsign == CHAR_MINUS)?
4595 cd->bracount - recno + 1 : recno +cd->bracount;
4596 if (recno <= 0 || recno > cd->final_bracount)
4597 {
4598 *errorcodeptr = ERR15;
4599 goto FAILED;
4600 }
4601 PUT2(code, 2+LINK_SIZE, recno);
4602 break;
4603 }
4604
4605 /* Otherwise (did not start with "+" or "-"), start by looking for the
4606 name. */
4607
4608 slot = cd->name_table;
4609 for (i = 0; i < cd->names_found; i++)
4610 {
4611 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4612 slot += cd->name_entry_size;
4613 }
4614
4615 /* Found a previous named subpattern */
4616
4617 if (i < cd->names_found)
4618 {
4619 recno = GET2(slot, 0);
4620 PUT2(code, 2+LINK_SIZE, recno);
4621 }
4622
4623 /* Search the pattern for a forward reference */
4624
4625 else if ((i = find_parens(cd, name, namelen,
4626 (options & PCRE_EXTENDED) != 0)) > 0)
4627 {
4628 PUT2(code, 2+LINK_SIZE, i);
4629 }
4630
4631 /* If terminator == 0 it means that the name followed directly after
4632 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4633 some further alternatives to try. For the cases where terminator != 0
4634 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4635 now checked all the possibilities, so give an error. */
4636
4637 else if (terminator != 0)
4638 {
4639 *errorcodeptr = ERR15;
4640 goto FAILED;
4641 }
4642
4643 /* Check for (?(R) for recursion. Allow digits after R to specify a
4644 specific group number. */
4645
4646 else if (*name == CHAR_R)
4647 {
4648 recno = 0;
4649 for (i = 1; i < namelen; i++)
4650 {
4651 if ((digitab[name[i]] & ctype_digit) == 0)
4652 {
4653 *errorcodeptr = ERR15;
4654 goto FAILED;
4655 }
4656 recno = recno * 10 + name[i] - CHAR_0;
4657 }
4658 if (recno == 0) recno = RREF_ANY;
4659 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4660 PUT2(code, 2+LINK_SIZE, recno);
4661 }
4662
4663 /* Similarly, check for the (?(DEFINE) "condition", which is always
4664 false. */
4665
4666 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4667 {
4668 code[1+LINK_SIZE] = OP_DEF;
4669 skipbytes = 1;
4670 }
4671
4672 /* Check for the "name" actually being a subpattern number. We are
4673 in the second pass here, so final_bracount is set. */
4674
4675 else if (recno > 0 && recno <= cd->final_bracount)
4676 {
4677 PUT2(code, 2+LINK_SIZE, recno);
4678 }
4679
4680 /* Either an unidentified subpattern, or a reference to (?(0) */
4681
4682 else
4683 {
4684 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4685 goto FAILED;
4686 }
4687 break;
4688
4689
4690 /* ------------------------------------------------------------ */
4691 case CHAR_EQUALS_SIGN: /* Positive lookahead */
4692 bravalue = OP_ASSERT;
4693 ptr++;
4694 break;
4695
4696
4697 /* ------------------------------------------------------------ */
4698 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
4699 ptr++;
4700 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
4701 {
4702 *code++ = OP_FAIL;
4703 previous = NULL;
4704 continue;
4705 }
4706 bravalue = OP_ASSERT_NOT;
4707 break;
4708
4709
4710 /* ------------------------------------------------------------ */
4711 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
4712 switch (ptr[1])
4713 {
4714 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
4715 bravalue = OP_ASSERTBACK;
4716 ptr += 2;
4717 break;
4718
4719 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
4720 bravalue = OP_ASSERTBACK_NOT;
4721 ptr += 2;
4722 break;
4723
4724 default: /* Could be name define, else bad */
4725 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4726 ptr++; /* Correct offset for error */
4727 *errorcodeptr = ERR24;
4728 goto FAILED;
4729 }
4730 break;
4731
4732
4733 /* ------------------------------------------------------------ */
4734 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
4735 bravalue = OP_ONCE;
4736 ptr++;
4737 break;
4738
4739
4740 /* ------------------------------------------------------------ */
4741 case CHAR_C: /* Callout - may be followed by digits; */
4742 previous_callout = code; /* Save for later completion */
4743 after_manual_callout = 1; /* Skip one item before completing */
4744 *code++ = OP_CALLOUT;
4745 {
4746 int n = 0;
4747 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4748 n = n * 10 + *ptr - CHAR_0;
4749 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4750 {
4751 *errorcodeptr = ERR39;
4752 goto FAILED;
4753 }
4754 if (n > 255)
4755 {
4756 *errorcodeptr = ERR38;
4757 goto FAILED;
4758 }
4759 *code++ = n;
4760 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4761 PUT(code, LINK_SIZE, 0); /* Default length */
4762 code += 2 * LINK_SIZE;
4763 }
4764 previous = NULL;
4765 continue;
4766
4767
4768 /* ------------------------------------------------------------ */
4769 case CHAR_P: /* Python-style named subpattern handling */
4770 if (*(++ptr) == CHAR_EQUALS_SIGN ||
4771 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
4772 {
4773 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4774 terminator = CHAR_RIGHT_PARENTHESIS;
4775 goto NAMED_REF_OR_RECURSE;
4776 }
4777 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
4778 {
4779 *errorcodeptr = ERR41;
4780 goto FAILED;
4781 }
4782 /* Fall through to handle (?P< as (?< is handled */
4783
4784
4785 /* ------------------------------------------------------------ */
4786 DEFINE_NAME: /* Come here from (?< handling */
4787 case CHAR_APOSTROPHE:
4788 {
4789 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4790 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4791 name = ++ptr;
4792
4793 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4794 namelen = ptr - name;
4795
4796 /* In the pre-compile phase, just do a syntax check. */
4797
4798 if (lengthptr != NULL)
4799 {
4800 if (*ptr != terminator)
4801 {
4802 *errorcodeptr = ERR42;
4803 goto FAILED;
4804 }
4805 if (cd->names_found >= MAX_NAME_COUNT)
4806 {
4807 *errorcodeptr = ERR49;
4808 goto FAILED;
4809 }
4810 if (namelen + 3 > cd->name_entry_size)
4811 {
4812 cd->name_entry_size = namelen + 3;
4813 if (namelen > MAX_NAME_SIZE)
4814 {
4815 *errorcodeptr = ERR48;
4816 goto FAILED;
4817 }
4818 }
4819 }
4820
4821 /* In the real compile, create the entry in the table */
4822
4823 else
4824 {
4825 slot = cd->name_table;
4826 for (i = 0; i < cd->names_found; i++)
4827 {
4828 int crc = memcmp(name, slot+2, namelen);
4829 if (crc == 0)
4830 {
4831 if (slot[2+namelen] == 0)
4832 {
4833 if ((options & PCRE_DUPNAMES) == 0)
4834 {
4835 *errorcodeptr = ERR43;
4836 goto FAILED;
4837 }
4838 }
4839 else crc = -1; /* Current name is substring */
4840 }
4841 if (crc < 0)
4842 {
4843 memmove(slot + cd->name_entry_size, slot,
4844 (cd->names_found - i) * cd->name_entry_size);
4845 break;
4846 }
4847 slot += cd->name_entry_size;
4848 }
4849
4850 PUT2(slot, 0, cd->bracount + 1);
4851 memcpy(slot + 2, name, namelen);
4852 slot[2+namelen] = 0;
4853 }
4854 }
4855
4856 /* In both cases, count the number of names we've encountered. */
4857
4858 ptr++; /* Move past > or ' */
4859 cd->names_found++;
4860 goto NUMBERED_GROUP;
4861
4862
4863 /* ------------------------------------------------------------ */
4864 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
4865 terminator = CHAR_RIGHT_PARENTHESIS;
4866 is_recurse = TRUE;
4867 /* Fall through */
4868
4869 /* We come here from the Python syntax above that handles both
4870 references (?P=name) and recursion (?P>name), as well as falling
4871 through from the Perl recursion syntax (?&name). We also come here from
4872 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4873 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4874
4875 NAMED_REF_OR_RECURSE:
4876 name = ++ptr;
4877 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4878 namelen = ptr - name;
4879
4880 /* In the pre-compile phase, do a syntax check and set a dummy
4881 reference number. */
4882
4883 if (lengthptr != NULL)
4884 {
4885 if (namelen == 0)
4886 {
4887 *errorcodeptr = ERR62;
4888 goto FAILED;
4889 }
4890 if (*ptr != terminator)
4891 {
4892 *errorcodeptr = ERR42;
4893 goto FAILED;
4894 }
4895 if (namelen > MAX_NAME_SIZE)
4896 {
4897 *errorcodeptr = ERR48;
4898 goto FAILED;
4899 }
4900 recno = 0;
4901 }
4902
4903 /* In the real compile, seek the name in the table. We check the name
4904 first, and then check that we have reached the end of the name in the
4905 table. That way, if the name is longer than any in the table,
4906 the comparison will fail without reading beyond the table entry. */
4907
4908 else
4909 {
4910 slot = cd->name_table;
4911 for (i = 0; i < cd->names_found; i++)
4912 {
4913 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4914 slot[2+namelen] == 0)
4915 break;
4916 slot += cd->name_entry_size;
4917 }
4918
4919 if (i < cd->names_found) /* Back reference */
4920 {
4921 recno = GET2(slot, 0);
4922 }
4923 else if ((recno = /* Forward back reference */
4924 find_parens(cd, name, namelen,
4925 (options & PCRE_EXTENDED) != 0)) <= 0)
4926 {
4927 *errorcodeptr = ERR15;
4928 goto FAILED;
4929 }
4930 }
4931
4932 /* In both phases, we can now go to the code that handles numerical
4933 recursion or backreferences. */
4934
4935 if (is_recurse) goto HANDLE_RECURSION;
4936 else goto HANDLE_REFERENCE;
4937
4938
4939 /* ------------------------------------------------------------ */
4940 case CHAR_R: /* Recursion */
4941 ptr++; /* Same as (?0) */
4942 /* Fall through */
4943
4944
4945 /* ------------------------------------------------------------ */
4946 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
4947 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4948 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4949 {
4950 const uschar *called;
4951 terminator = CHAR_RIGHT_PARENTHESIS;
4952
4953 /* Come here from the \g<...> and \g'...' code (Oniguruma
4954 compatibility). However, the syntax has been checked to ensure that
4955 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4956 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4957 ever be taken. */
4958
4959 HANDLE_NUMERICAL_RECURSION:
4960
4961 if ((refsign = *ptr) == CHAR_PLUS)
4962 {
4963 ptr++;
4964 if ((digitab[*ptr] & ctype_digit) == 0)
4965 {
4966 *errorcodeptr = ERR63;
4967 goto FAILED;
4968 }
4969 }
4970 else if (refsign == CHAR_MINUS)
4971 {
4972 if ((digitab[ptr[1]] & ctype_digit) == 0)
4973 goto OTHER_CHAR_AFTER_QUERY;
4974 ptr++;
4975 }
4976
4977 recno = 0;
4978 while((digitab[*ptr] & ctype_digit) != 0)
4979 recno = recno * 10 + *ptr++ - CHAR_0;
4980
4981 if (*ptr != terminator)
4982 {
4983 *errorcodeptr = ERR29;
4984 goto FAILED;
4985 }
4986
4987 if (refsign == CHAR_MINUS)
4988 {
4989 if (recno == 0)
4990 {
4991 *errorcodeptr = ERR58;
4992 goto FAILED;
4993 }
4994 recno = cd->bracount - recno + 1;
4995 if (recno <= 0)
4996 {
4997 *errorcodeptr = ERR15;
4998 goto FAILED;
4999 }
5000 }
5001 else if (refsign == CHAR_PLUS)
5002 {
5003 if (recno == 0)
5004 {
5005 *errorcodeptr = ERR58;
5006 goto FAILED;
5007 }
5008 recno += cd->bracount;
5009 }
5010
5011 /* Come here from code above that handles a named recursion */
5012
5013 HANDLE_RECURSION:
5014
5015 previous = code;
5016 called = cd->start_code;
5017
5018 /* When we are actually compiling, find the bracket that is being
5019 referenced. Temporarily end the regex in case it doesn't exist before
5020 this point. If we end up with a forward reference, first check that
5021 the bracket does occur later so we can give the error (and position)
5022 now. Then remember this forward reference in the workspace so it can
5023 be filled in at the end. */
5024
5025 if (lengthptr == NULL)
5026 {
5027 *code = OP_END;
5028 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
5029
5030 /* Forward reference */
5031
5032 if (called == NULL)
5033 {
5034 if (find_parens(cd, NULL, recno,
5035 (options & PCRE_EXTENDED) != 0) < 0)
5036 {
5037 *errorcodeptr = ERR15;
5038 goto FAILED;
5039 }
5040 called = cd->start_code + recno;
5041 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5042 }
5043
5044 /* If not a forward reference, and the subpattern is still open,
5045 this is a recursive call. We check to see if this is a left
5046 recursion that could loop for ever, and diagnose that case. */
5047
5048 else if (GET(called, 1) == 0 &&
5049 could_be_empty(called, code, bcptr, utf8))
5050 {
5051 *errorcodeptr = ERR40;
5052 goto FAILED;
5053 }
5054 }
5055
5056 /* Insert the recursion/subroutine item, automatically wrapped inside
5057 "once" brackets. Set up a "previous group" length so that a
5058 subsequent quantifier will work. */
5059
5060 *code = OP_ONCE;
5061 PUT(code, 1, 2 + 2*LINK_SIZE);
5062 code += 1 + LINK_SIZE;
5063
5064 *code = OP_RECURSE;
5065 PUT(code, 1, called - cd->start_code);
5066 code += 1 + LINK_SIZE;
5067
5068 *code = OP_KET;
5069 PUT(code, 1, 2 + 2*LINK_SIZE);
5070 code += 1 + LINK_SIZE;
5071
5072 length_prevgroup = 3 + 3*LINK_SIZE;
5073 }
5074
5075 /* Can't determine a first byte now */
5076
5077 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5078 continue;
5079
5080
5081 /* ------------------------------------------------------------ */
5082 default: /* Other characters: check option setting */
5083 OTHER_CHAR_AFTER_QUERY:
5084 set = unset = 0;
5085 optset = &set;
5086
5087 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5088 {
5089 switch (*ptr++)
5090 {
5091 case CHAR_MINUS: optset = &unset; break;
5092
5093 case CHAR_J: /* Record that it changed in the external options */
5094 *optset |= PCRE_DUPNAMES;
5095 cd->external_flags |= PCRE_JCHANGED;
5096 break;
5097
5098 case CHAR_i: *optset |= PCRE_CASELESS; break;
5099 case CHAR_m: *optset |= PCRE_MULTILINE; break;
5100 case CHAR_s: *optset |= PCRE_DOTALL; break;
5101 case CHAR_x: *optset |= PCRE_EXTENDED; break;
5102 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5103 case CHAR_X: *optset |= PCRE_EXTRA; break;
5104
5105 default: *errorcodeptr = ERR12;
5106 ptr--; /* Correct the offset */
5107 goto FAILED;
5108 }
5109 }
5110
5111 /* Set up the changed option bits, but don't change anything yet. */
5112
5113 newoptions = (options | set) & (~unset);
5114
5115 /* If the options ended with ')' this is not the start of a nested
5116 group with option changes, so the options change at this level. If this
5117 item is right at the start of the pattern, the options can be
5118 abstracted and made external in the pre-compile phase, and ignored in
5119 the compile phase. This can be helpful when matching -- for instance in
5120 caseless checking of required bytes.
5121
5122 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5123 definitely *not* at the start of the pattern because something has been
5124 compiled. In the pre-compile phase, however, the code pointer can have
5125 that value after the start, because it gets reset as code is discarded
5126 during the pre-compile. However, this can happen only at top level - if
5127 we are within parentheses, the starting BRA will still be present. At
5128 any parenthesis level, the length value can be used to test if anything
5129 has been compiled at that level. Thus, a test for both these conditions
5130 is necessary to ensure we correctly detect the start of the pattern in
5131 both phases.
5132
5133 If we are not at the pattern start, compile code to change the ims
5134 options if this setting actually changes any of them, and reset the
5135 greedy defaults and the case value for firstbyte and reqbyte. */
5136
5137 if (*ptr == CHAR_RIGHT_PARENTHESIS)
5138 {
5139 if (code == cd->start_code + 1 + LINK_SIZE &&
5140 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5141 {
5142 cd->external_options = newoptions;
5143 }
5144 else
5145 {
5146 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5147 {
5148 *code++ = OP_OPT;
5149 *code++ = newoptions & PCRE_IMS;
5150 }
5151 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5152 greedy_non_default = greedy_default ^ 1;
5153 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5154 }
5155
5156 /* Change options at this level, and pass them back for use
5157 in subsequent branches. When not at the start of the pattern, this
5158 information is also necessary so that a resetting item can be
5159 compiled at the end of a group (if we are in a group). */
5160
5161 *optionsptr = options = newoptions;
5162 previous = NULL; /* This item can't be repeated */
5163 continue; /* It is complete */
5164 }
5165
5166 /* If the options ended with ':' we are heading into a nested group
5167 with possible change of options. Such groups are non-capturing and are
5168 not assertions of any kind. All we need to do is skip over the ':';
5169 the newoptions value is handled below. */
5170
5171 bravalue = OP_BRA;
5172 ptr++;
5173 } /* End of switch for character following (? */
5174 } /* End of (? handling */
5175
5176 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5177 all unadorned brackets become non-capturing and behave like (?:...)
5178 brackets. */
5179
5180 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5181 {
5182 bravalue = OP_BRA;
5183 }
5184
5185 /* Else we have a capturing group. */
5186
5187 else
5188 {
5189 NUMBERED_GROUP:
5190 cd->bracount += 1;
5191 PUT2(code, 1+LINK_SIZE, cd->bracount);
5192 skipbytes = 2;
5193 }
5194
5195 /* Process nested bracketed regex. Assertions may not be repeated, but
5196 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5197 non-register variable in order to be able to pass its address because some
5198 compilers complain otherwise. Pass in a new setting for the ims options if
5199 they have changed. */
5200
5201 previous = (bravalue >= OP_ONCE)? code : NULL;
5202 *code = bravalue;
5203 tempcode = code;
5204 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5205 length_prevgroup = 0; /* Initialize for pre-compile phase */
5206
5207 if (!compile_regex(
5208 newoptions, /* The complete new option state */
5209 options & PCRE_IMS, /* The previous ims option state */
5210 &tempcode, /* Where to put code (updated) */
5211 &ptr, /* Input pointer (updated) */
5212 errorcodeptr, /* Where to put an error message */
5213 (bravalue == OP_ASSERTBACK ||
5214 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5215 reset_bracount, /* True if (?| group */
5216 skipbytes, /* Skip over bracket number */
5217 &subfirstbyte, /* For possible first char */
5218 &subreqbyte, /* For possible last char */
5219 bcptr, /* Current branch chain */
5220 cd, /* Tables block */
5221 (lengthptr == NULL)? NULL : /* Actual compile phase */
5222 &length_prevgroup /* Pre-compile phase */
5223 ))
5224 goto FAILED;
5225
5226 /* At the end of compiling, code is still pointing to the start of the
5227 group, while tempcode has been updated to point past the end of the group
5228 and any option resetting that may follow it. The pattern pointer (ptr)
5229 is on the bracket. */
5230
5231 /* If this is a conditional bracket, check that there are no more than
5232 two branches in the group, or just one if it's a DEFINE group. We do this
5233 in the real compile phase, not in the pre-pass, where the whole group may
5234 not be available. */
5235
5236 if (bravalue == OP_COND && lengthptr == NULL)
5237 {
5238 uschar *tc = code;
5239 int condcount = 0;
5240
5241 do {
5242 condcount++;
5243 tc += GET(tc,1);
5244 }
5245 while (*tc != OP_KET);
5246
5247 /* A DEFINE group is never obeyed inline (the "condition" is always
5248 false). It must have only one branch. */
5249
5250 if (code[LINK_SIZE+1] == OP_DEF)
5251 {
5252 if (condcount > 1)
5253 {
5254 *errorcodeptr = ERR54;
5255 goto FAILED;
5256 }
5257 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5258 }
5259
5260 /* A "normal" conditional group. If there is just one branch, we must not
5261 make use of its firstbyte or reqbyte, because this is equivalent to an
5262 empty second branch. */
5263
5264 else
5265 {
5266 if (condcount > 2)
5267 {
5268 *errorcodeptr = ERR27;
5269 goto FAILED;
5270 }
5271 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5272 }
5273 }
5274
5275 /* Error if hit end of pattern */
5276
5277 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5278 {
5279 *errorcodeptr = ERR14;
5280 goto FAILED;
5281 }
5282
5283 /* In the pre-compile phase, update the length by the length of the group,
5284 less the brackets at either end. Then reduce the compiled code to just a
5285 set of non-capturing brackets so that it doesn't use much memory if it is
5286 duplicated by a quantifier.*/
5287
5288 if (lengthptr != NULL)
5289 {
5290 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5291 {
5292 *errorcodeptr = ERR20;
5293 goto FAILED;
5294 }
5295 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5296 *code++ = OP_BRA;
5297 PUTINC(code, 0, 1 + LINK_SIZE);
5298 *code++ = OP_KET;
5299 PUTINC(code, 0, 1 + LINK_SIZE);
5300 break; /* No need to waste time with special character handling */
5301 }
5302
5303 /* Otherwise update the main code pointer to the end of the group. */
5304
5305 code = tempcode;
5306
5307 /* For a DEFINE group, required and first character settings are not
5308 relevant. */
5309
5310 if (bravalue == OP_DEF) break;
5311
5312 /* Handle updating of the required and first characters for other types of
5313 group. Update for normal brackets of all kinds, and conditions with two
5314 branches (see code above). If the bracket is followed by a quantifier with
5315 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5316 zerofirstbyte outside the main loop so that they can be accessed for the
5317 back off. */
5318
5319 zeroreqbyte = reqbyte;
5320 zerofirstbyte = firstbyte;
5321 groupsetfirstbyte = FALSE;
5322
5323 if (bravalue >= OP_ONCE)
5324 {
5325 /* If we have not yet set a firstbyte in this branch, take it from the
5326 subpattern, remembering that it was set here so that a repeat of more
5327 than one can replicate it as reqbyte if necessary. If the subpattern has
5328 no firstbyte, set "none" for the whole branch. In both cases, a zero
5329 repeat forces firstbyte to "none". */
5330
5331 if (firstbyte == REQ_UNSET)
5332 {
5333 if (subfirstbyte >= 0)
5334 {
5335 firstbyte = subfirstbyte;
5336 groupsetfirstbyte = TRUE;
5337 }
5338 else firstbyte = REQ_NONE;
5339 zerofirstbyte = REQ_NONE;
5340 }
5341
5342 /* If firstbyte was previously set, convert the subpattern's firstbyte
5343 into reqbyte if there wasn't one, using the vary flag that was in
5344 existence beforehand. */
5345
5346 else if (subfirstbyte >= 0 && subreqbyte < 0)
5347 subreqbyte = subfirstbyte | tempreqvary;
5348
5349 /* If the subpattern set a required byte (or set a first byte that isn't
5350 really the first byte - see above), set it. */
5351
5352 if (subreqbyte >= 0) reqbyte = subreqbyte;
5353 }
5354
5355 /* For a forward assertion, we take the reqbyte, if set. This can be
5356 helpful if the pattern that follows the assertion doesn't set a different
5357 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5358 for an assertion, however because it leads to incorrect effect for patterns
5359 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5360 of a firstbyte. This is overcome by a scan at the end if there's no
5361 firstbyte, looking for an asserted first char. */
5362
5363 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5364 break; /* End of processing '(' */
5365
5366
5367 /* ===================================================================*/
5368 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5369 are arranged to be the negation of the corresponding OP_values. For the
5370 back references, the values are ESC_REF plus the reference number. Only
5371 back references and those types that consume a character may be repeated.
5372 We can test for values between ESC_b and ESC_Z for the latter; this may
5373 have to change if any new ones are ever created. */
5374
5375 case CHAR_BACKSLASH:
5376 tempptr = ptr;
5377 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5378 if (*errorcodeptr != 0) goto FAILED;
5379
5380 if (c < 0)
5381 {
5382 if (-c == ESC_Q) /* Handle start of quoted string */
5383 {
5384 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5385 ptr += 2; /* avoid empty string */
5386 else inescq = TRUE;
5387 continue;
5388 }
5389
5390 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5391
5392 /* For metasequences that actually match a character, we disable the
5393 setting of a first character if it hasn't already been set. */
5394
5395 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5396 firstbyte = REQ_NONE;
5397
5398 /* Set values to reset to if this is followed by a zero repeat. */
5399
5400 zerofirstbyte = firstbyte;
5401 zeroreqbyte = reqbyte;
5402
5403 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5404 is a subroutine call by number (Oniguruma syntax). In fact, the value
5405 -ESC_g is returned only for these cases. So we don't need to check for <
5406 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5407 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5408 that is a synonym for a named back reference). */
5409
5410 if (-c == ESC_g)
5411 {
5412 const uschar *p;
5413 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5414 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5415 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5416
5417 /* These two statements stop the compiler from warning about possibly
5418 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5419 fact, because we actually check for a number below, the paths that
5420 would actually be in error are never taken. */
5421
5422 skipbytes = 0;
5423 reset_bracount = FALSE;
5424
5425 /* Test for a name */
5426
5427 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5428 {
5429 BOOL isnumber = TRUE;
5430 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5431 {
5432 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5433 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5434 }
5435 if (*p != terminator)
5436 {
5437 *errorcodeptr = ERR57;
5438 break;
5439 }
5440 if (isnumber)
5441 {
5442 ptr++;
5443 goto HANDLE_NUMERICAL_RECURSION;
5444 }
5445 is_recurse = TRUE;
5446 goto NAMED_REF_OR_RECURSE;
5447 }
5448
5449 /* Test a signed number in angle brackets or quotes. */
5450
5451 p = ptr + 2;
5452 while ((digitab[*p] & ctype_digit) != 0) p++;
5453 if (*p != terminator)
5454 {
5455 *errorcodeptr = ERR57;
5456 break;
5457 }
5458 ptr++;
5459 goto HANDLE_NUMERICAL_RECURSION;
5460 }
5461
5462 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5463 We also support \k{name} (.NET syntax) */
5464
5465 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5466 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5467 {
5468 is_recurse = FALSE;
5469 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5470 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5471 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5472 goto NAMED_REF_OR_RECURSE;
5473 }
5474
5475 /* Back references are handled specially; must disable firstbyte if
5476 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5477 ':' later. */
5478
5479 if (-c >= ESC_REF)
5480 {
5481 recno = -c - ESC_REF;
5482
5483 HANDLE_REFERENCE: /* Come here from named backref handling */
5484 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5485 previous = code;
5486 *code++ = OP_REF;
5487 PUT2INC(code, 0, recno);
5488 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5489 if (recno > cd->top_backref) cd->top_backref = recno;
5490 }
5491
5492 /* So are Unicode property matches, if supported. */
5493
5494 #ifdef SUPPORT_UCP
5495 else if (-c == ESC_P || -c == ESC_p)
5496 {
5497 BOOL negated;
5498 int pdata;
5499 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5500 if (ptype < 0) goto FAILED;
5501 previous = code;
5502 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5503 *code++ = ptype;
5504 *code++ = pdata;
5505 }
5506 #else
5507
5508 /* If Unicode properties are not supported, \X, \P, and \p are not
5509 allowed. */
5510
5511 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5512 {
5513 *errorcodeptr = ERR45;
5514 goto FAILED;
5515 }
5516 #endif
5517
5518 /* For the rest (including \X when Unicode properties are supported), we
5519 can obtain the OP value by negating the escape value. */
5520
5521 else
5522 {
5523 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5524 *code++ = -c;
5525 }
5526 continue;
5527 }
5528
5529 /* We have a data character whose value is in c. In UTF-8 mode it may have
5530 a value > 127. We set its representation in the length/buffer, and then
5531 handle it as a data character. */
5532
5533 #ifdef SUPPORT_UTF8
5534 if (utf8 && c > 127)
5535 mclength = _pcre_ord2utf8(c, mcbuffer);
5536 else
5537 #endif
5538
5539 {
5540 mcbuffer[0] = c;
5541 mclength = 1;
5542 }
5543 goto ONE_CHAR;
5544
5545
5546 /* ===================================================================*/
5547 /* Handle a literal character. It is guaranteed not to be whitespace or #
5548 when the extended flag is set. If we are in UTF-8 mode, it may be a
5549 multi-byte literal character. */
5550
5551 default:
5552 NORMAL_CHAR:
5553 mclength = 1;
5554 mcbuffer[0] = c;
5555
5556 #ifdef SUPPORT_UTF8
5557 if (utf8 && c >= 0xc0)
5558 {
5559 while ((ptr[1] & 0xc0) == 0x80)
5560 mcbuffer[mclength++] = *(++ptr);
5561 }
5562 #endif
5563
5564 /* At this point we have the character's bytes in mcbuffer, and the length
5565 in mclength. When not in UTF-8 mode, the length is always 1. */
5566
5567 ONE_CHAR:
5568 previous = code;
5569 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5570 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5571
5572 /* Remember if \r or \n were seen */
5573
5574 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5575 cd->external_flags |= PCRE_HASCRORLF;
5576
5577 /* Set the first and required bytes appropriately. If no previous first
5578 byte, set it from this character, but revert to none on a zero repeat.
5579 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5580 repeat. */
5581
5582 if (firstbyte == REQ_UNSET)
5583 {
5584 zerofirstbyte = REQ_NONE;
5585 zeroreqbyte = reqbyte;
5586
5587 /* If the character is more than one byte long, we can set firstbyte
5588 only if it is not to be matched caselessly. */
5589
5590 if (mclength == 1 || req_caseopt == 0)
5591 {
5592 firstbyte = mcbuffer[0] | req_caseopt;
5593 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5594 }
5595 else firstbyte = reqbyte = REQ_NONE;
5596 }
5597
5598 /* firstbyte was previously set; we can set reqbyte only if the length is
5599 1 or the matching is caseful. */
5600
5601 else
5602 {
5603 zerofirstbyte = firstbyte;
5604 zeroreqbyte = reqbyte;
5605 if (mclength == 1 || req_caseopt == 0)
5606 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5607 }
5608
5609 break; /* End of literal character handling */
5610 }
5611 } /* end of big loop */
5612
5613
5614 /* Control never reaches here by falling through, only by a goto for all the
5615 error states. Pass back the position in the pattern so that it can be displayed
5616 to the user for diagnosing the error. */
5617
5618 FAILED:
5619 *ptrptr = ptr;
5620 return FALSE;
5621 }
5622
5623
5624
5625
5626 /*************************************************
5627 * Compile sequence of alternatives *
5628 *************************************************/
5629
5630 /* On entry, ptr is pointing past the bracket character, but on return it
5631 points to the closing bracket, or vertical bar, or end of string. The code
5632 variable is pointing at the byte into which the BRA operator has been stored.
5633 If the ims options are changed at the start (for a (?ims: group) or during any
5634 branch, we need to insert an OP_OPT item at the start of every following branch
5635 to ensure they get set correctly at run time, and also pass the new options
5636 into every subsequent branch compile.
5637
5638 This function is used during the pre-compile phase when we are trying to find
5639 out the amount of memory needed, as well as during the real compile phase. The
5640 value of lengthptr distinguishes the two phases.
5641
5642 Arguments:
5643 options option bits, including any changes for this subpattern
5644 oldims previous settings of ims option bits
5645 codeptr -> the address of the current code pointer
5646 ptrptr -> the address of the current pattern pointer
5647 errorcodeptr -> pointer to error code variable
5648 lookbehind TRUE if this is a lookbehind assertion
5649 reset_bracount TRUE to reset the count for each branch
5650 skipbytes skip this many bytes at start (for brackets and OP_COND)
5651 firstbyteptr place to put the first required character, or a negative number
5652 reqbyteptr place to put the last required character, or a negative number
5653 bcptr pointer to the chain of currently open branches
5654 cd points to the data block with tables pointers etc.
5655 lengthptr NULL during the real compile phase
5656 points to length accumulator during pre-compile phase
5657
5658 Returns: TRUE on success
5659 */
5660
5661 static BOOL
5662 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5663 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5664 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5665 int *lengthptr)
5666 {
5667 const uschar *ptr = *ptrptr;
5668 uschar *code = *codeptr;
5669 uschar *last_branch = code;
5670 uschar *start_bracket = code;
5671 uschar *reverse_count = NULL;
5672 int firstbyte, reqbyte;
5673 int branchfirstbyte, branchreqbyte;
5674 int length;
5675 int orig_bracount;
5676 int max_bracount;
5677 branch_chain bc;
5678
5679 bc.outer = bcptr;
5680 bc.current = code;
5681
5682 firstbyte = reqbyte = REQ_UNSET;
5683
5684 /* Accumulate the length for use in the pre-compile phase. Start with the
5685 length of the BRA and KET and any extra bytes that are required at the
5686 beginning. We accumulate in a local variable to save frequent testing of
5687 lenthptr for NULL. We cannot do this by looking at the value of code at the
5688 start and end of each alternative, because compiled items are discarded during
5689 the pre-compile phase so that the work space is not exceeded. */
5690
5691 length = 2 + 2*LINK_SIZE + skipbytes;
5692
5693 /* WARNING: If the above line is changed for any reason, you must also change
5694 the code that abstracts option settings at the start of the pattern and makes
5695 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5696 pre-compile phase to find out whether anything has yet been compiled or not. */
5697
5698 /* Offset is set zero to mark that this bracket is still open */
5699
5700 PUT(code, 1, 0);
5701 code += 1 + LINK_SIZE + skipbytes;
5702
5703 /* Loop for each alternative branch */
5704
5705 orig_bracount = max_bracount = cd->bracount;
5706 for (;;)
5707 {
5708 /* For a (?| group, reset the capturing bracket count so that each branch
5709 uses the same numbers. */
5710
5711 if (reset_bracount) cd->bracount = orig_bracount;
5712
5713 /* Handle a change of ims options at the start of the branch */
5714
5715 if ((options & PCRE_IMS) != oldims)
5716 {
5717 *code++ = OP_OPT;
5718 *code++ = options & PCRE_IMS;
5719 length += 2;
5720 }
5721
5722 /* Set up dummy OP_REVERSE if lookbehind assertion */
5723
5724 if (lookbehind)
5725 {
5726 *code++ = OP_REVERSE;
5727 reverse_count = code;
5728 PUTINC(code, 0, 0);
5729 length += 1 + LINK_SIZE;
5730 }
5731
5732 /* Now compile the branch; in the pre-compile phase its length gets added
5733 into the length. */
5734
5735 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5736 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5737 {
5738 *ptrptr = ptr;
5739 return FALSE;
5740 }
5741
5742 /* Keep the highest bracket count in case (?| was used and some branch
5743 has fewer than the rest. */
5744
5745 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5746
5747 /* In the real compile phase, there is some post-processing to be done. */
5748
5749 if (lengthptr == NULL)
5750 {
5751 /* If this is the first branch, the firstbyte and reqbyte values for the
5752 branch become the values for the regex. */
5753
5754 if (*last_branch != OP_ALT)
5755 {
5756 firstbyte = branchfirstbyte;
5757 reqbyte = branchreqbyte;
5758 }
5759
5760 /* If this is not the first branch, the first char and reqbyte have to
5761 match the values from all the previous branches, except that if the
5762 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5763 and we set REQ_VARY for the regex. */
5764
5765 else
5766 {
5767 /* If we previously had a firstbyte, but it doesn't match the new branch,
5768 we have to abandon the firstbyte for the regex, but if there was
5769 previously no reqbyte, it takes on the value of the old firstbyte. */
5770
5771 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5772 {
5773 if (reqbyte < 0) reqbyte = firstbyte;
5774 firstbyte = REQ_NONE;
5775 }
5776
5777 /* If we (now or from before) have no firstbyte, a firstbyte from the
5778 branch becomes a reqbyte if there isn't a branch reqbyte. */
5779
5780 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5781 branchreqbyte = branchfirstbyte;
5782
5783 /* Now ensure that the reqbytes match */
5784
5785 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5786 reqbyte = REQ_NONE;
5787 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5788 }
5789
5790 /* If lookbehind, check that this branch matches a fixed-length string, and
5791 put the length into the OP_REVERSE item. Temporarily mark the end of the
5792 branch with OP_END. */
5793
5794 if (lookbehind)
5795 {
5796 int fixed_length;
5797 *code = OP_END;
5798 fixed_length = find_fixedlength(last_branch, options);
5799 DPRINTF(("fixed length = %d\n", fixed_length));
5800 if (fixed_length < 0)
5801 {
5802 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5803 *ptrptr = ptr;
5804 return FALSE;
5805 }
5806 PUT(reverse_count, 0, fixed_length);
5807 }
5808 }
5809
5810 /* Reached end of expression, either ')' or end of pattern. In the real
5811 compile phase, go back through the alternative branches and reverse the chain
5812 of offsets, with the field in the BRA item now becoming an offset to the
5813 first alternative. If there are no alternatives, it points to the end of the
5814 group. The length in the terminating ket is always the length of the whole
5815 bracketed item. If any of the ims options were changed inside the group,
5816 compile a resetting op-code following, except at the very end of the pattern.
5817 Return leaving the pointer at the terminating char. */
5818
5819 if (*ptr != CHAR_VERTICAL_LINE)
5820 {
5821 if (lengthptr == NULL)
5822 {
5823 int branch_length = code - last_branch;
5824 do
5825 {
5826 int prev_length = GET(last_branch, 1);
5827 PUT(last_branch, 1, branch_length);
5828 branch_length = prev_length;
5829 last_branch -= branch_length;
5830 }
5831 while (branch_length > 0);
5832 }
5833
5834 /* Fill in the ket */
5835
5836 *code = OP_KET;
5837 PUT(code, 1, code - start_bracket);
5838 code += 1 + LINK_SIZE;
5839
5840 /* Resetting option if needed */
5841
5842 if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
5843 {
5844 *code++ = OP_OPT;
5845 *code++ = oldims;
5846 length += 2;
5847 }
5848
5849 /* Retain the highest bracket number, in case resetting was used. */
5850
5851 cd->bracount = max_bracount;
5852
5853 /* Set values to pass back */
5854
5855 *codeptr = code;
5856 *ptrptr = ptr;
5857 *firstbyteptr = firstbyte;
5858 *reqbyteptr = reqbyte;
5859 if (lengthptr != NULL)
5860 {
5861 if (OFLOW_MAX - *lengthptr < length)
5862 {
5863 *errorcodeptr = ERR20;
5864 return FALSE;
5865 }
5866 *lengthptr += length;
5867 }
5868 return TRUE;
5869 }
5870
5871 /* Another branch follows. In the pre-compile phase, we can move the code
5872 pointer back to where it was for the start of the first branch. (That is,
5873 pretend that each branch is the only one.)
5874
5875 In the real compile phase, insert an ALT node. Its length field points back
5876 to the previous branch while the bracket remains open. At the end the chain
5877 is reversed. It's done like this so that the start of the bracket has a
5878 zero offset until it is closed, making it possible to detect recursion. */
5879
5880 if (lengthptr != NULL)
5881 {
5882 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5883 length += 1 + LINK_SIZE;
5884 }
5885 else
5886 {
5887 *code = OP_ALT;
5888 PUT(code, 1, code - last_branch);
5889 bc.current = last_branch = code;
5890 code += 1 + LINK_SIZE;
5891 }
5892
5893 ptr++;
5894 }
5895 /* Control never reaches here */
5896 }
5897
5898
5899
5900
5901 /*************************************************
5902 * Check for anchored expression *
5903 *************************************************/
5904
5905 /* Try to find out if this is an anchored regular expression. Consider each
5906 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5907 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5908 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5909 counts, since OP_CIRC can match in the middle.
5910
5911 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5912 This is the code for \G, which means "match at start of match position, taking
5913 into account the match offset".
5914
5915 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5916 because that will try the rest of the pattern at all possible matching points,
5917 so there is no point trying again.... er ....
5918
5919 .... except when the .* appears inside capturing parentheses, and there is a
5920 subsequent back reference to those parentheses. We haven't enough information
5921 to catch that case precisely.
5922
5923 At first, the best we could do was to detect when .* was in capturing brackets
5924 and the highest back reference was greater than or equal to that level.
5925 However, by keeping a bitmap of the first 31 back references, we can catch some
5926 of the more common cases more precisely.
5927
5928 Arguments:
5929 code points to start of expression (the bracket)
5930 options points to the options setting
5931 bracket_map a bitmap of which brackets we are inside while testing; this
5932 handles up to substring 31; after that we just have to take
5933 the less precise approach
5934 backref_map the back reference bitmap
5935
5936 Returns: TRUE or FALSE
5937 */
5938
5939 static BOOL
5940 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5941 unsigned int backref_map)
5942 {
5943 do {
5944 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5945 options, PCRE_MULTILINE, FALSE);
5946 register int op = *scode;
5947
5948 /* Non-capturing brackets */
5949
5950 if (op == OP_BRA)
5951 {
5952 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5953 }
5954
5955 /* Capturing brackets */
5956
5957 else if (op == OP_CBRA)
5958 {
5959 int n = GET2(scode, 1+LINK_SIZE);
5960 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5961 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5962 }
5963
5964 /* Other brackets */
5965
5966 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5967 {
5968 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5969 }
5970
5971 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
5972 it isn't in brackets that are or may be referenced. */
5973
5974 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5975 op == OP_TYPEPOSSTAR))
5976 {
5977 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
5978 return FALSE;
5979 }
5980
5981 /* Check for explicit anchoring */
5982
5983 else if (op != OP_SOD && op != OP_SOM &&
5984 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5985 return FALSE;
5986 code += GET(code, 1);
5987 }
5988 while (*code == OP_ALT); /* Loop for each alternative */
5989 return TRUE;
5990 }
5991
5992
5993
5994 /*************************************************
5995 * Check for starting with ^ or .* *
5996 *************************************************/
5997
5998 /* This is called to find out if every branch starts with ^ or .* so that
5999 "first char" processing can be done to speed things up in multiline
6000 matching and for non-DOTALL patterns that start with .* (which must start at
6001 the beginning or after \n). As in the case of is_anchored() (see above), we
6002 have to take account of back references to capturing brackets that contain .*
6003 because in that case we can't make the assumption.
6004
6005 Arguments:
6006 code points to start of expression (the bracket)
6007 bracket_map a bitmap of which brackets we are inside while testing; this
6008 handles up to substring 31; after that we just have to take
6009 the less precise approach
6010 backref_map the back reference bitmap
6011
6012 Returns: TRUE or FALSE
6013 */
6014
6015 static BOOL
6016 is_startline(const uschar *code, unsigned int bracket_map,
6017 unsigned int backref_map)
6018 {
6019 do {
6020 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6021 NULL, 0, FALSE);
6022 register int op = *scode;
6023
6024 /* If we are at the start of a conditional assertion group, *both* the
6025 conditional assertion *and* what follows the condition must satisfy the test
6026 for start of line. Other kinds of condition fail. Note that there may be an
6027 auto-callout at the start of a condition. */
6028
6029 if (op == OP_COND)
6030 {
6031 scode += 1 + LINK_SIZE;
6032 if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6033 switch (*scode)
6034 {
6035 case OP_CREF:
6036 case OP_RREF:
6037 case OP_DEF:
6038 return FALSE;
6039
6040 default: /* Assertion */
6041 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6042 do scode += GET(scode, 1); while (*scode == OP_ALT);
6043 scode += 1 + LINK_SIZE;
6044 break;
6045 }
6046 scode = first_significant_code(scode, NULL, 0, FALSE);
6047 op = *scode;
6048 }
6049
6050 /* Non-capturing brackets */
6051
6052 if (op == OP_BRA)
6053 {
6054 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6055 }
6056
6057 /* Capturing brackets */
6058
6059 else if (op == OP_CBRA)
6060 {
6061 int n = GET2(scode, 1+LINK_SIZE);
6062 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6063 if (!is_startline(scode, new_map, backref_map)) return FALSE;
6064 }
6065
6066 /* Other brackets */
6067
6068 else if (op == OP_ASSERT || op == OP_ONCE)
6069 {
6070 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6071 }
6072
6073 /* .* means "start at start or after \n" if it isn't in brackets that
6074 may be referenced. */
6075
6076 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6077 {
6078 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6079 }
6080
6081 /* Check for explicit circumflex */
6082
6083 else if (op != OP_CIRC) return FALSE;
6084
6085 /* Move on to the next alternative */
6086
6087 code += GET(code, 1);
6088 }
6089 while (*code == OP_ALT); /* Loop for each alternative */
6090 return TRUE;
6091 }
6092
6093
6094
6095 /*************************************************
6096 * Check for asserted fixed first char *
6097 *************************************************/
6098
6099 /* During compilation, the "first char" settings from forward assertions are
6100 discarded, because they can cause conflicts with actual literals that follow.
6101 However, if we end up without a first char setting for an unanchored pattern,
6102 it is worth scanning the regex to see if there is an initial asserted first
6103 char. If all branches start with the same asserted char, or with a bracket all
6104 of whose alternatives start with the same asserted char (recurse ad lib), then
6105 we return that char, otherwise -1.
6106
6107 Arguments:
6108 code points to start of expression (the bracket)
6109 options pointer to the options (used to check casing changes)
6110 inassert TRUE if in an assertion
6111
6112 Returns: -1 or the fixed first char
6113 */
6114
6115 static int
6116 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6117 {
6118 register int c = -1;
6119 do {
6120 int d;
6121 const uschar *scode =
6122 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6123 register int op = *scode;
6124
6125 switch(op)
6126 {
6127 default:
6128 return -1;
6129
6130 case OP_BRA:
6131 case OP_CBRA:
6132 case OP_ASSERT:
6133 case OP_ONCE:
6134 case OP_COND:
6135 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6136 return -1;
6137 if (c < 0) c = d; else if (c != d) return -1;
6138 break;
6139
6140 case OP_EXACT: /* Fall through */
6141 scode += 2;
6142
6143 case OP_CHAR:
6144 case OP_CHARNC:
6145 case OP_PLUS:
6146 case OP_MINPLUS:
6147 case OP_POSPLUS:
6148 if (!inassert) return -1;
6149 if (c < 0)
6150 {
6151 c = scode[1];
6152 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6153 }
6154 else if (c != scode[1]) return -1;
6155 break;
6156 }
6157
6158 code += GET(code, 1);
6159 }
6160 while (*code == OP_ALT);
6161 return c;
6162 }
6163
6164
6165
6166 /*************************************************
6167 * Compile a Regular Expression *
6168 *************************************************/
6169
6170 /* This function takes a string and returns a pointer to a block of store
6171 holding a compiled version of the expression. The original API for this
6172 function had no error code return variable; it is retained for backwards
6173 compatibility. The new function is given a new name.
6174
6175 Arguments:
6176 pattern the regular expression
6177 options various option bits
6178 errorcodeptr pointer to error code variable (pcre_compile2() only)
6179 can be NULL if you don't want a code value
6180 errorptr pointer to pointer to error text
6181 erroroffset ptr offset in pattern where error was detected
6182 tables pointer to character tables or NULL
6183
6184 Returns: pointer to compiled data block, or NULL on error,
6185 with errorptr and erroroffset set
6186 */
6187
6188 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6189 pcre_compile(const char *pattern, int options, const char **errorptr,
6190 int *erroroffset, const unsigned char *tables)
6191 {
6192 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6193 }
6194
6195
6196 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6197 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6198 const char **errorptr, int *erroroffset, const unsigned char *tables)
6199 {
6200 real_pcre *re;
6201 int length = 1; /* For final END opcode */
6202 int firstbyte, reqbyte, newline;
6203 int errorcode = 0;
6204 int skipatstart = 0;
6205 #ifdef SUPPORT_UTF8
6206 BOOL utf8;
6207 #endif
6208 size_t size;
6209 uschar *code;
6210 const uschar *codestart;
6211 const uschar *ptr;
6212 compile_data compile_block;
6213 compile_data *cd = &compile_block;
6214
6215 /* This space is used for "compiling" into during the first phase, when we are
6216 computing the amount of memory that is needed. Compiled items are thrown away
6217 as soon as possible, so that a fairly large buffer should be sufficient for
6218 this purpose. The same space is used in the second phase for remembering where
6219 to fill in forward references to subpatterns. */
6220
6221 uschar cworkspace[COMPILE_WORK_SIZE];
6222
6223 /* Set this early so that early errors get offset 0. */
6224
6225 ptr = (const uschar *)pattern;
6226
6227 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6228 can do is just return NULL, but we can set a code value if there is a code
6229 pointer. */
6230
6231 if (errorptr == NULL)
6232 {
6233 if (errorcodeptr != NULL) *errorcodeptr = 99;
6234 return NULL;
6235 }
6236
6237 *errorptr = NULL;
6238 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6239
6240 /* However, we can give a message for this error */
6241
6242 if (erroroffset == NULL)
6243 {
6244 errorcode = ERR16;
6245 goto PCRE_EARLY_ERROR_RETURN2;
6246 }
6247
6248 *erroroffset = 0;
6249
6250 /* Set up pointers to the individual character tables */
6251
6252 if (tables == NULL) tables = _pcre_default_tables;
6253 cd->lcc = tables + lcc_offset;
6254 cd->fcc = tables + fcc_offset;
6255 cd->cbits = tables + cbits_offset;
6256 cd->ctypes = tables + ctypes_offset;
6257
6258 /* Check that all undefined public option bits are zero */
6259
6260 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6261 {
6262 errorcode = ERR17;
6263 goto PCRE_EARLY_ERROR_RETURN;
6264 }
6265
6266 /* Check for global one-time settings at the start of the pattern, and remember
6267 the offset for later. */
6268
6269 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6270 ptr[skipatstart+1] == CHAR_ASTERISK)
6271 {
6272 int newnl = 0;
6273 int newbsr = 0;
6274
6275 if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6276 { skipatstart += 7; options |= PCRE_UTF8; continue; }
6277
6278 if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6279 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6280 else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0)
6281 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6282 else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0)
6283 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6284 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6285 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6286 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6287 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6288
6289 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6290 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6291 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6292 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6293
6294 if (newnl != 0)
6295 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6296 else if (newbsr != 0)
6297 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6298 else break;
6299 }
6300
6301 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6302
6303 #ifdef SUPPORT_UTF8
6304 utf8 = (options & PCRE_UTF8) != 0;
6305 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6306 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6307 {
6308 errorcode = ERR44;
6309 goto PCRE_EARLY_ERROR_RETURN2;
6310 }
6311 #else
6312 if ((options & PCRE_UTF8) != 0)
6313 {
6314 errorcode = ERR32;
6315 goto PCRE_EARLY_ERROR_RETURN;
6316 }
6317 #endif
6318
6319 /* Check validity of \R options. */
6320
6321 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6322 {
6323 case 0:
6324 case PCRE_BSR_ANYCRLF:
6325 case PCRE_BSR_UNICODE:
6326 break;
6327 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6328 }
6329
6330 /* Handle different types of newline. The three bits give seven cases. The
6331 current code allows for fixed one- or two-byte sequences, plus "any" and
6332 "anycrlf". */
6333
6334 switch (options & PCRE_NEWLINE_BITS)
6335 {
6336 case 0: newline = NEWLINE; break; /* Build-time default */
6337 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6338 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6339 case PCRE_NEWLINE_CR+
6340 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6341 case PCRE_NEWLINE_ANY: newline = -1; break;
6342 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6343 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6344 }
6345
6346 if (newline == -2)
6347 {
6348 cd->nltype = NLTYPE_ANYCRLF;
6349 }
6350 else if (newline < 0)
6351 {
6352 cd->nltype = NLTYPE_ANY;
6353 }
6354 else
6355 {
6356 cd->nltype = NLTYPE_FIXED;
6357 if (newline > 255)
6358 {
6359 cd->nllen = 2;
6360 cd->nl[0] = (newline >> 8) & 255;
6361 cd->nl[1] = newline & 255;
6362 }
6363 else
6364 {
6365 cd->nllen = 1;
6366 cd->nl[0] = newline;
6367 }
6368 }
6369
6370 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6371 references to help in deciding whether (.*) can be treated as anchored or not.
6372 */
6373
6374 cd->top_backref = 0;
6375 cd->backref_map = 0;
6376
6377 /* Reflect pattern for debugging output */
6378
6379 DPRINTF(("------------------------------------------------------------------\n"));
6380 DPRINTF(("%s\n", pattern));
6381
6382 /* Pretend to compile the pattern while actually just accumulating the length
6383 of memory required. This behaviour is triggered by passing a non-NULL final
6384 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6385 to compile parts of the pattern into; the compiled code is discarded when it is
6386 no longer needed, so hopefully this workspace will never overflow, though there
6387 is a test for its doing so. */
6388
6389 cd->bracount = cd->final_bracount = 0;
6390 cd->names_found = 0;
6391 cd->name_entry_size = 0;
6392 cd->name_table = NULL;
6393 cd->start_workspace = cworkspace;
6394 cd->start_code = cworkspace;
6395 cd->hwm = cworkspace;
6396 cd->start_pattern = (const uschar *)pattern;
6397 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6398 cd->req_varyopt = 0;
6399 cd->external_options = options;
6400 cd->external_flags = 0;
6401
6402 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6403 don't need to look at the result of the function here. The initial options have
6404 been put into the cd block so that they can be changed if an option setting is
6405 found within the regex right at the beginning. Bringing initial option settings
6406 outside can help speed up starting point checks. */
6407
6408 ptr += skipatstart;
6409 code = cworkspace;
6410 *code = OP_BRA;
6411 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6412 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6413 &length);
6414 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6415
6416 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6417 cd->hwm - cworkspace));
6418
6419 if (length > MAX_PATTERN_SIZE)
6420 {
6421 errorcode = ERR20;
6422 goto PCRE_EARLY_ERROR_RETURN;
6423 }
6424
6425 /* Compute the size of data block needed and get it, either from malloc or
6426 externally provided function. Integer overflow should no longer be possible
6427 because nowadays we limit the maximum value of cd->names_found and
6428 cd->name_entry_size. */
6429
6430 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6431 re = (real_pcre *)(pcre_malloc)(size);
6432
6433 if (re == NULL)
6434 {
6435 errorcode = ERR21;
6436 goto PCRE_EARLY_ERROR_RETURN;
6437 }
6438
6439 /* Put in the magic number, and save the sizes, initial options, internal
6440 flags, and character table pointer. NULL is used for the default character
6441 tables. The nullpad field is at the end; it's there to help in the case when a
6442 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6443 pointers. */
6444
6445 re->magic_number = MAGIC_NUMBER;
6446 re->size = size;
6447 re->options = cd->external_options;
6448 re->flags = cd->external_flags;
6449 re->dummy1 = 0;
6450 re->first_byte = 0;
6451 re->req_byte = 0;
6452 re->name_table_offset = sizeof(real_pcre);
6453 re->name_entry_size = cd->name_entry_size;
6454 re->name_count = cd->names_found;
6455 re->ref_count = 0;
6456 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6457 re->nullpad = NULL;
6458
6459 /* The starting points of the name/number translation table and of the code are
6460 passed around in the compile data block. The start/end pattern and initial
6461 options are already set from the pre-compile phase, as is the name_entry_size
6462 field. Reset the bracket count and the names_found field. Also reset the hwm
6463 field; this time it's used for remembering forward references to subpatterns.
6464 */
6465
6466 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6467 cd->bracount = 0;
6468 cd->names_found = 0;
6469 cd->name_table = (uschar *)re + re->name_table_offset;
6470 codestart = cd->name_table + re->name_entry_size * re->name_count;
6471 cd->start_code = codestart;
6472 cd->hwm = cworkspace;
6473 cd->req_varyopt = 0;
6474 cd->had_accept = FALSE;
6475
6476 /* Set up a starting, non-extracting bracket, then compile the expression. On
6477 error, errorcode will be set non-zero, so we don't need to look at the result
6478 of the function here. */
6479
6480 ptr = (const uschar *)pattern + skipatstart;
6481 code = (uschar *)codestart;
6482 *code = OP_BRA;
6483 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6484 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6485 re->top_bracket = cd->bracount;
6486 re->top_backref = cd->top_backref;
6487 re->flags = cd->external_flags;
6488
6489 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6490
6491 /* If not reached end of pattern on success, there's an excess bracket. */
6492
6493 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6494
6495 /* Fill in the terminating state and check for disastrous overflow, but
6496 if debugging, leave the test till after things are printed out. */
6497
6498 *code++ = OP_END;
6499
6500 #ifndef DEBUG
6501 if (code - codestart > length) errorcode = ERR23;
6502 #endif
6503
6504 /* Fill in any forward references that are required. */
6505
6506 while (errorcode == 0 && cd->hwm > cworkspace)
6507 {
6508 int offset, recno;
6509 const uschar *groupptr;
6510 cd->hwm -= LINK_SIZE;
6511 offset = GET(cd->hwm, 0);
6512 recno = GET(codestart, offset);
6513 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6514 if (groupptr == NULL) errorcode = ERR53;
6515 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6516 }
6517
6518 /* Give an error if there's back reference to a non-existent capturing
6519 subpattern. */
6520
6521 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6522
6523 /* Failed to compile, or error while post-processing */
6524
6525 if (errorcode != 0)
6526 {
6527 (pcre_free)(re);
6528 PCRE_EARLY_ERROR_RETURN:
6529 *erroroffset = ptr - (const uschar *)pattern;
6530 PCRE_EARLY_ERROR_RETURN2:
6531 *errorptr = find_error_text(errorcode);
6532 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6533 return NULL;
6534 }
6535
6536 /* If the anchored option was not passed, set the flag if we can determine that
6537 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6538 as starting with .* when DOTALL is set).
6539
6540 Otherwise, if we know what the first byte has to be, save it, because that
6541 speeds up unanchored matches no end. If not, see if we can set the
6542 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6543 start with ^. and also when all branches start with .* for non-DOTALL matches.
6544 */
6545
6546 if ((re->options & PCRE_ANCHORED) == 0)
6547 {
6548 int temp_options = re->options; /* May get changed during these scans */
6549 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6550 re->options |= PCRE_ANCHORED;
6551 else
6552 {
6553 if (firstbyte < 0)
6554 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6555 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6556 {
6557 int ch = firstbyte & 255;
6558 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6559 cd->fcc[ch] == ch)? ch : firstbyte;
6560 re->flags |= PCRE_FIRSTSET;
6561 }
6562 else if (is_startline(codestart, 0, cd->backref_map))
6563 re->flags |= PCRE_STARTLINE;
6564 }
6565 }
6566
6567 /* For an anchored pattern, we use the "required byte" only if it follows a
6568 variable length item in the regex. Remove the caseless flag for non-caseable
6569 bytes. */
6570
6571 if (reqbyte >= 0 &&
6572 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6573 {
6574 int ch = reqbyte & 255;
6575 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6576 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6577 re->flags |= PCRE_REQCHSET;
6578 }
6579
6580 /* Print out the compiled data if debugging is enabled. This is never the
6581 case when building a production library. */
6582
6583 #ifdef DEBUG
6584
6585 printf("Length = %d top_bracket = %d top_backref = %d\n",
6586 length, re->top_bracket, re->top_backref);
6587
6588 printf("Options=%08x\n", re->options);
6589
6590 if ((re->flags & PCRE_FIRSTSET) != 0)
6591 {
6592 int ch = re->first_byte & 255;
6593 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6594 "" : " (caseless)";
6595 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6596 else printf("First char = \\x%02x%s\n", ch, caseless);
6597 }
6598
6599 if ((re->flags & PCRE_REQCHSET) != 0)
6600 {
6601 int ch = re->req_byte & 255;
6602 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6603 "" : " (caseless)";
6604 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6605 else printf("Req char = \\x%02x%s\n", ch, caseless);
6606 }
6607
6608 pcre_printint(re, stdout, TRUE);
6609
6610 /* This check is done here in the debugging case so that the code that
6611 was compiled can be seen. */
6612
6613 if (code - codestart > length)
6614 {
6615 (pcre_free)(re);
6616 *errorptr = find_error_text(ERR23);
6617 *erroroffset = ptr - (uschar *)pattern;
6618 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6619 return NULL;
6620 }
6621 #endif /* DEBUG */
6622
6623 return (pcre *)re;
6624 }
6625
6626 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12