/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory | Revision Log


Revision 447 - (show annotations) (download)
Tue Sep 15 18:17:54 2009 UTC (4 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 213658 byte(s)
Capture data when (*ACCEPT) is inside capturing parentheses.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps. Both arguments and the
whole expansion are parenthesized, so that an expression argument such as
SETBIT(map, c + 1) indexes and shifts by the intended value and the result can
be used safely inside a larger expression. Note that b is evaluated twice, so
it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))

/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)


/*************************************************
*      Code parameters and static tables         *
*************************************************/

/* This value specifies the size of stack workspace that is used during the
first pre-compile phase that determines how much memory is required. The regex
is partly compiled into this space, but the compiled parts are discarded as
soon as they can be, so that hopefully there will never be an overrun. The code
does, however, check for an overrun. The largest amount I've seen used is 218,
so this number is very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room. */

#define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC
101
102 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103 in UTF-8 mode. */
104
/* check_escape() indexes this table with (c - CHAR_0), after rejecting any c
outside CHAR_0..CHAR_z, so the first entry is for '0' and the last one is for
'z'. The entries are listed two per line in ascending character order: the
first five lines are the ten digits, and the final line holds 'z' on its own. */
105 static const short int escapes[] = {
106 0, 0,
107 0, 0,
108 0, 0,
109 0, 0,
110 0, 0,
111 CHAR_COLON, CHAR_SEMICOLON,
112 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
113 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
114 CHAR_COMMERCIAL_AT, -ESC_A,
115 -ESC_B, -ESC_C,
116 -ESC_D, -ESC_E,
117 0, -ESC_G,
118 -ESC_H, 0,
119 0, -ESC_K,
120 0, 0,
121 0, 0,
122 -ESC_P, -ESC_Q,
123 -ESC_R, -ESC_S,
124 0, 0,
125 -ESC_V, -ESC_W,
126 -ESC_X, 0,
127 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
128 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
129 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
130 CHAR_GRAVE_ACCENT, 7,
131 -ESC_b, 0,
132 -ESC_d, ESC_e,
133 ESC_f, 0,
134 -ESC_h, 0,
135 0, -ESC_k,
136 0, 0,
137 ESC_n, 0,
138 -ESC_p, 0,
139 ESC_r, -ESC_s,
140 ESC_tee, 0,
141 -ESC_v, -ESC_w,
142 0, 0,
143 -ESC_z
144 };
145
146 #else
147
148 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149
/* check_escape() indexes this table with (c - 0x48). Each row holds eight
entries; the comment at the start of a row is the hexadecimal character code of
the row's first entry. */
150 static const short int escapes[] = {
151 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
152 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
153 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
154 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
155 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
156 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
157 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
158 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
159 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
160 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
161 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
162 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
163 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
164 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
165 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
166 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
167 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
168 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
169 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
170 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
171 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
172 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
173 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
174 };
175 #endif
176
177
178 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179 searched linearly. Put all the names into a single string, in order to reduce
180 the number of relocations when a shared library is dynamically linked. The
181 string is built from string macros so that it works in UTF-8 mode on EBCDIC
182 platforms. */
183
184 typedef struct verbitem {
185 int len;
186 int op;
187 } verbitem;
188
189 static const char verbnames[] =
190 STRING_ACCEPT0
191 STRING_COMMIT0
192 STRING_F0
193 STRING_FAIL0
194 STRING_PRUNE0
195 STRING_SKIP0
196 STRING_THEN;
197
198 static const verbitem verbs[] = {
199 { 6, OP_ACCEPT },
200 { 6, OP_COMMIT },
201 { 1, OP_FAIL },
202 { 4, OP_FAIL },
203 { 5, OP_PRUNE },
204 { 4, OP_SKIP },
205 { 4, OP_THEN }
206 };
207
208 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
209
210
211 /* Tables of names of POSIX character classes and their lengths. The names are
212 now all in a single string, to reduce the number of relocations when a shared
213 library is dynamically loaded. The list of lengths is terminated by a zero
214 length entry. The first three must be alpha, lower, upper, as this is assumed
215 for handling case independence. */
216
217 static const char posix_names[] =
218 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221 STRING_word0 STRING_xdigit;
222
223 static const uschar posix_name_lengths[] = {
224 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
225
226 /* Table of class bit maps for each POSIX class. Each class is formed from a
227 base map, with an optional addition or removal of another map. Then, for some
228 classes, there is some additional tweaking: for [:blank:] the vertical space
229 characters are removed, and for [:alpha:] and [:alnum:] the underscore
230 character is removed. The triples in the table consist of the base map offset,
231 second map offset or -1 if no second map, and a non-negative value for map
232 addition or a negative value for map subtraction (if there are two maps). The
233 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
234 remove vertical space characters, 2 => remove underscore. */
235
236 static const int posix_class_maps[] = {
237 cbit_word, cbit_digit, -2, /* alpha */
238 cbit_lower, -1, 0, /* lower */
239 cbit_upper, -1, 0, /* upper */
240 cbit_word, -1, 2, /* alnum - word without underscore */
241 cbit_print, cbit_cntrl, 0, /* ascii */
242 cbit_space, -1, 1, /* blank - a GNU extension */
243 cbit_cntrl, -1, 0, /* cntrl */
244 cbit_digit, -1, 0, /* digit */
245 cbit_graph, -1, 0, /* graph */
246 cbit_print, -1, 0, /* print */
247 cbit_punct, -1, 0, /* punct */
248 cbit_space, -1, 0, /* space */
249 cbit_word, -1, 0, /* word - a Perl extension */
250 cbit_xdigit,-1, 0 /* xdigit */
251 };
252
253
/* STRING(x) turns its argument into a string literal exactly as written;
XSTRING(x) macro-expands the argument first and then stringifies the result. */

#define STRING(x)     #x
#define XSTRING(x)    STRING(x)
256
257 /* The texts of compile-time error messages. These are "char *" because they
258 are passed to the outside world. Do not ever re-use any error number, because
259 they are documented. Always add a new error instead. Messages marked DEAD below
260 are no longer used. This used to be a table of strings, but in order to reduce
261 the number of relocations needed when a shared library is loaded dynamically,
262 it is now one long string. We cannot use a table of offsets, because the
263 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264 simply count through to the one we want - this isn't a performance issue
265 because these strings are used only when there is a compilation error. */
266
/* Layout: the messages are consecutive zero-terminated strings inside a single
array. find_error_text(n) reaches message n by skipping over n terminators from
the start, so the position of a message in this list is its error number. The
numeric marker comments give the number of the message that follows them. The
last message has no explicit \0; it is terminated by the end of the literal. */
267 static const char error_texts[] =
268 "no error\0"
269 "\\ at end of pattern\0"
270 "\\c at end of pattern\0"
271 "unrecognized character follows \\\0"
272 "numbers out of order in {} quantifier\0"
273 /* 5 */
274 "number too big in {} quantifier\0"
275 "missing terminating ] for character class\0"
276 "invalid escape sequence in character class\0"
277 "range out of order in character class\0"
278 "nothing to repeat\0"
279 /* 10 */
280 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
281 "internal error: unexpected repeat\0"
282 "unrecognized character after (? or (?-\0"
283 "POSIX named classes are supported only within a class\0"
284 "missing )\0"
285 /* 15 */
286 "reference to non-existent subpattern\0"
287 "erroffset passed as NULL\0"
288 "unknown option bit(s) set\0"
289 "missing ) after comment\0"
290 "parentheses nested too deeply\0" /** DEAD **/
291 /* 20 */
292 "regular expression is too large\0"
293 "failed to get memory\0"
294 "unmatched parentheses\0"
295 "internal error: code overflow\0"
296 "unrecognized character after (?<\0"
297 /* 25 */
298 "lookbehind assertion is not fixed length\0"
299 "malformed number or name after (?(\0"
300 "conditional group contains more than two branches\0"
301 "assertion expected after (?(\0"
302 "(?R or (?[+-]digits must be followed by )\0"
303 /* 30 */
304 "unknown POSIX class name\0"
305 "POSIX collating elements are not supported\0"
306 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307 "spare error\0" /** DEAD **/
308 "character value in \\x{...} sequence is too large\0"
309 /* 35 */
310 "invalid condition (?(0)\0"
311 "\\C not allowed in lookbehind assertion\0"
312 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313 "number after (?C is > 255\0"
314 "closing ) for (?C expected\0"
315 /* 40 */
316 "recursive call could loop indefinitely\0"
317 "unrecognized character after (?P\0"
318 "syntax error in subpattern name (missing terminator)\0"
319 "two named subpatterns have the same name\0"
320 "invalid UTF-8 string\0"
321 /* 45 */
322 "support for \\P, \\p, and \\X has not been compiled\0"
323 "malformed \\P or \\p sequence\0"
324 "unknown property name after \\P or \\p\0"
325 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327 /* 50 */
328 "repeated subpattern is too long\0" /** DEAD **/
329 "octal value is greater than \\377 (not in UTF-8 mode)\0"
330 "internal error: overran compiling workspace\0"
331 "internal error: previously-checked referenced subpattern not found\0"
332 "DEFINE group contains more than one branch\0"
333 /* 55 */
334 "repeating a DEFINE group is not allowed\0"
335 "inconsistent NEWLINE options\0"
336 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337 "a numbered reference must not be zero\0"
338 "(*VERB) with an argument is not supported\0"
339 /* 60 */
340 "(*VERB) not recognized\0"
341 "number is too big\0"
342 "subpattern name expected\0"
343 "digit expected after (?+\0"
344 "] is an invalid data character in JavaScript compatibility mode";
345
346
347 /* Table to identify digits and hex digits. This is used when compiling
348 patterns. Note that the tables in chartables are dependent on the locale, and
349 may mark arbitrary characters as digits - but the PCRE compiling code expects
350 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
351 a private table here. It costs 256 bytes, but it is a lot faster than doing
352 character value tests (at least in some simple cases I timed), and in some
353 applications one wants PCRE to compile efficiently as well as match
354 efficiently.
355
356 For convenience, we use the same bit definitions as in chartables:
357
358 0x04 decimal digit
359 0x08 hexadecimal digit
360
361 Then we can use ctype_digit and ctype_xdigit in the code. */
362
363 #ifndef EBCDIC
364
365 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
366 UTF-8 mode. */
367
/* The table is indexed directly by a byte of the pattern (for example
digitab[*p]) and the entry is tested against ctype_digit or ctype_xdigit. Each
row covers eight consecutive character codes, as shown in the row comment:
0x0c marks a decimal digit (which is also a hex digit) and 0x08 marks a letter
that is a hex digit only. */
368 static const unsigned char digitab[] =
369 {
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
376 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
377 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
378 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
382 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
395 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
396 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
397 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
399 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
402
403 #else
404
405 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
406
/* Same use and bit values as the table above, laid out for EBCDIC character
codes. The trailing hex number in every second row comment is the code of that
row's first entry. */
407 static const unsigned char digitab[] =
408 {
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
413 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
414 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
415 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
416 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
417 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
418 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
419 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
420 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
421 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
422 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
423 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
424 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
425 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
426 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
427 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
428 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
429 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
430 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
431 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
432 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
433 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
434 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
435 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
436 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
437 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
438 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
439 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
440 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
441
/* In the EBCDIC build, check_escape() tests (ebcdic_chartab[c] & 0x0E) and
treats a zero result as "not alphanumeric", so that the escapes table is only
consulted for letters and digits. */
442 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
443 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
444 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
445 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
446 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
447 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
448 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
449 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
450 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
451 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
452 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
453 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
454 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
455 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
456 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
457 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
458 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
459 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
460 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
461 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
462 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
463 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
464 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
465 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
466 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
467 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
468 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
469 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
470 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
471 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
472 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
473 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
474 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
475 #endif
476
477
478 /* Definition to allow mutual recursion */
479
480 static BOOL
481 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
482 int *, int *, branch_chain *, compile_data *, int *);
483
484
485
486 /*************************************************
487 * Find an error text *
488 *************************************************/
489
490 /* The error texts are now all in one long string, to save on relocations. As
491 some of the text is of unknown length, we can't use a table of offsets.
492 Instead, just count through the strings. This is not a performance issue
493 because it happens only when there has been a compilation error.
494
495 Argument: the error number
496 Returns: pointer to the error string
497 */
498
499 static const char *
500 find_error_text(int n)
501 {
502 const char *s = error_texts;
503 for (; n > 0; n--) while (*s++ != 0) {};
504 return s;
505 }
506
507
508 /*************************************************
509 * Handle escapes *
510 *************************************************/
511
512 /* This function is called when a \ has been encountered. It either returns a
513 positive value for a simple escape such as \n, or a negative value which
514 encodes one of the more complicated things such as \d. A backreference to group
515 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
516 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
517 ptr is pointing at the \. On exit, it is on the final character of the escape
518 sequence.
519
520 Arguments:
521 ptrptr points to the pattern position pointer
522 errorcodeptr points to the errorcode variable
523 bracount number of previous extracting brackets
524 options the options bits
525 isclass TRUE if inside a character class
526
527 Returns: zero or positive => a data character
528 negative => a special escape sequence
529 on error, errorcodeptr is set
530 */
531
532 static int
533 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
534 int options, BOOL isclass)
535 {
536 BOOL utf8 = (options & PCRE_UTF8) != 0;
537 const uschar *ptr = *ptrptr + 1;
538 int c, i;
539
540 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
541 ptr--; /* Set pointer back to the last byte */
542
543 /* If backslash is at the end of the pattern, it's an error. */
544
545 if (c == 0) *errorcodeptr = ERR1;
546
547 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
548 in a table. A non-zero result is something that can be returned immediately.
549 Otherwise further processing may be required. */
550
551 #ifndef EBCDIC /* ASCII/UTF-8 coding */
552 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
553 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
554
555 #else /* EBCDIC coding */
556 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
557 else if ((i = escapes[c - 0x48]) != 0) c = i;
558 #endif
559
560 /* Escapes that need further processing, or are illegal. */
561
562 else
563 {
564 const uschar *oldptr;
565 BOOL braced, negated;
566
567 switch (c)
568 {
569 /* A number of Perl escapes are not handled by PCRE. We give an explicit
570 error. */
571
572 case CHAR_l:
573 case CHAR_L:
574 case CHAR_N:
575 case CHAR_u:
576 case CHAR_U:
577 *errorcodeptr = ERR37;
578 break;
579
580 /* \g must be followed by one of a number of specific things:
581
582 (1) A number, either plain or braced. If positive, it is an absolute
583 backreference. If negative, it is a relative backreference. This is a Perl
584 5.10 feature.
585
586 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
587 is part of Perl's movement towards a unified syntax for back references. As
588 this is synonymous with \k{name}, we fudge it up by pretending it really
589 was \k.
590
591 (3) For Oniguruma compatibility we also support \g followed by a name or a
592 number either in angle brackets or in single quotes. However, these are
593 (possibly recursive) subroutine calls, _not_ backreferences. Just return
594 the -ESC_g code (cf \k). */
595
596 case CHAR_g:
597 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
598 {
599 c = -ESC_g;
600 break;
601 }
602
603 /* Handle the Perl-compatible cases */
604
605 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
606 {
607 const uschar *p;
608 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
609 if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
610 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
611 {
612 c = -ESC_k;
613 break;
614 }
615 braced = TRUE;
616 ptr++;
617 }
618 else braced = FALSE;
619
620 if (ptr[1] == CHAR_MINUS)
621 {
622 negated = TRUE;
623 ptr++;
624 }
625 else negated = FALSE;
626
627 c = 0;
628 while ((digitab[ptr[1]] & ctype_digit) != 0)
629 c = c * 10 + *(++ptr) - CHAR_0;
630
631 if (c < 0) /* Integer overflow */
632 {
633 *errorcodeptr = ERR61;
634 break;
635 }
636
637 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
638 {
639 *errorcodeptr = ERR57;
640 break;
641 }
642
643 if (c == 0)
644 {
645 *errorcodeptr = ERR58;
646 break;
647 }
648
649 if (negated)
650 {
651 if (c > bracount)
652 {
653 *errorcodeptr = ERR15;
654 break;
655 }
656 c = bracount - (c - 1);
657 }
658
659 c = -(ESC_REF + c);
660 break;
661
662 /* The handling of escape sequences consisting of a string of digits
663 starting with one that is not zero is not straightforward. By experiment,
664 the way Perl works seems to be as follows:
665
666 Outside a character class, the digits are read as a decimal number. If the
667 number is less than 10, or if there are that many previous extracting
668 left brackets, then it is a back reference. Otherwise, up to three octal
669 digits are read to form an escaped byte. Thus \123 is likely to be octal
670 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
671 value is greater than 377, the least significant 8 bits are taken. Inside a
672 character class, \ followed by a digit is always an octal number. */
673
674 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
675 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
676
677 if (!isclass)
678 {
679 oldptr = ptr;
680 c -= CHAR_0;
681 while ((digitab[ptr[1]] & ctype_digit) != 0)
682 c = c * 10 + *(++ptr) - CHAR_0;
683 if (c < 0) /* Integer overflow */
684 {
685 *errorcodeptr = ERR61;
686 break;
687 }
688 if (c < 10 || c <= bracount)
689 {
690 c = -(ESC_REF + c);
691 break;
692 }
693 ptr = oldptr; /* Put the pointer back and fall through */
694 }
695
696 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
697 generates a binary zero byte and treats the digit as a following literal.
698 Thus we have to pull back the pointer by one. */
699
700 if ((c = *ptr) >= CHAR_8)
701 {
702 ptr--;
703 c = 0;
704 break;
705 }
706
707 /* \0 always starts an octal number, but we may drop through to here with a
708 larger first octal digit. The original code used just to take the least
709 significant 8 bits of octal numbers (I think this is what early Perls used
710 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
711 than 3 octal digits. */
712
713 case CHAR_0:
714 c -= CHAR_0;
715 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
716 c = c * 8 + *(++ptr) - CHAR_0;
717 if (!utf8 && c > 255) *errorcodeptr = ERR51;
718 break;
719
720 /* \x is complicated. \x{ddd} is a character number which can be greater
721 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
722 treated as a data character. */
723
724 case CHAR_x:
725 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
726 {
727 const uschar *pt = ptr + 2;
728 int count = 0;
729
730 c = 0;
731 while ((digitab[*pt] & ctype_xdigit) != 0)
732 {
733 register int cc = *pt++;
734 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
735 count++;
736
737 #ifndef EBCDIC /* ASCII/UTF-8 coding */
738 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
739 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
740 #else /* EBCDIC coding */
741 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
742 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
743 #endif
744 }
745
746 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
747 {
748 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
749 ptr = pt;
750 break;
751 }
752
753 /* If the sequence of hex digits does not end with '}', then we don't
754 recognize this construct; fall through to the normal \x handling. */
755 }
756
757 /* Read just a single-byte hex-defined char */
758
759 c = 0;
760 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
761 {
762 int cc; /* Some compilers don't like */
763 cc = *(++ptr); /* ++ in initializers */
764 #ifndef EBCDIC /* ASCII/UTF-8 coding */
765 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
766 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
767 #else /* EBCDIC coding */
768 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
769 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
770 #endif
771 }
772 break;
773
774 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
775 This coding is ASCII-specific, but then the whole concept of \cx is
776 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
777
778 case CHAR_c:
779 c = *(++ptr);
780 if (c == 0)
781 {
782 *errorcodeptr = ERR2;
783 break;
784 }
785
786 #ifndef EBCDIC /* ASCII/UTF-8 coding */
787 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
788 c ^= 0x40;
789 #else /* EBCDIC coding */
790 if (c >= CHAR_a && c <= CHAR_z) c += 64;
791 c ^= 0xC0;
792 #endif
793 break;
794
795 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
796 other alphanumeric following \ is an error if PCRE_EXTRA was set;
797 otherwise, for Perl compatibility, it is a literal. This code looks a bit
798 odd, but there used to be some cases other than the default, and there may
799 be again in future, so I haven't "optimized" it. */
800
801 default:
802 if ((options & PCRE_EXTRA) != 0) switch(c)
803 {
804 default:
805 *errorcodeptr = ERR3;
806 break;
807 }
808 break;
809 }
810 }
811
812 *ptrptr = ptr;
813 return c;
814 }
815
816
817
818 #ifdef SUPPORT_UCP
819 /*************************************************
820 * Handle \P and \p *
821 *************************************************/
822
823 /* This function is called after \P or \p has been encountered, provided that
824 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
825 pointing at the P or p. On exit, it is pointing at the final character of the
826 escape sequence.
827
828 Argument:
829 ptrptr points to the pattern position pointer
830 negptr points to a boolean that is set TRUE for negation else FALSE
831 dptr points to an int that is set to the detailed property value
832 errorcodeptr points to the error code variable
833
834 Returns: type value from ucp_type_table, or -1 for an invalid type
835 */
836
837 static int
838 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
839 {
840 int c, i, bot, top;
841 const uschar *ptr = *ptrptr;
842 char name[32];
843
844 c = *(++ptr);
845 if (c == 0) goto ERROR_RETURN;
846
847 *negptr = FALSE;
848
849 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
850 negation. */
851
852 if (c == CHAR_LEFT_CURLY_BRACKET)
853 {
854 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
855 {
856 *negptr = TRUE;
857 ptr++;
858 }
859 for (i = 0; i < (int)sizeof(name) - 1; i++)
860 {
861 c = *(++ptr);
862 if (c == 0) goto ERROR_RETURN;
863 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
864 name[i] = c;
865 }
866 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
867 name[i] = 0;
868 }
869
870 /* Otherwise there is just one following character */
871
872 else
873 {
874 name[0] = c;
875 name[1] = 0;
876 }
877
878 *ptrptr = ptr;
879
880 /* Search for a recognized property name using binary chop */
881
882 bot = 0;
883 top = _pcre_utt_size;
884
885 while (bot < top)
886 {
887 i = (bot + top) >> 1;
888 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
889 if (c == 0)
890 {
891 *dptr = _pcre_utt[i].value;
892 return _pcre_utt[i].type;
893 }
894 if (c > 0) bot = i + 1; else top = i;
895 }
896
897 *errorcodeptr = ERR47;
898 *ptrptr = ptr;
899 return -1;
900
901 ERROR_RETURN:
902 *errorcodeptr = ERR46;
903 *ptrptr = ptr;
904 return -1;
905 }
906 #endif
907
908
909
910
911 /*************************************************
912 * Check for counted repeat *
913 *************************************************/
914
915 /* This function is called when a '{' is encountered in a place where it might
916 start a quantifier. It looks ahead to see if it really is a quantifier or not.
917 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
918 where the ddds are digits.
919
920 Arguments:
921 p pointer to the first char after '{'
922
923 Returns: TRUE or FALSE
924 */
925
926 static BOOL
927 is_counted_repeat(const uschar *p)
928 {
929 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
930 while ((digitab[*p] & ctype_digit) != 0) p++;
931 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
932
933 if (*p++ != CHAR_COMMA) return FALSE;
934 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935
936 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
937 while ((digitab[*p] & ctype_digit) != 0) p++;
938
939 return (*p == CHAR_RIGHT_CURLY_BRACKET);
940 }
941
942
943
944 /*************************************************
945 * Read repeat counts *
946 *************************************************/
947
948 /* Read an item of the form {n,m} and return the values. This is called only
949 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
950 so the syntax is guaranteed to be correct, but we need to check the values.
951
952 Arguments:
953 p pointer to first char after '{'
954 minp pointer to int for min
955 maxp pointer to int for max
956 returned as -1 if no max
957 errorcodeptr points to error code variable
958
959 Returns: pointer to '}' on success;
960 current ptr on error, with errorcodeptr set non-zero
961 */
962
963 static const uschar *
964 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
965 {
966 int min = 0;
967 int max = -1;
968
969 /* Read the minimum value and do a paranoid check: a negative value indicates
970 an integer overflow. */
971
972 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
973 if (min < 0 || min > 65535)
974 {
975 *errorcodeptr = ERR5;
976 return p;
977 }
978
979 /* Read the maximum value if there is one, and again do a paranoid on its size.
980 Also, max must not be less than min. */
981
982 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
983 {
984 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
985 {
986 max = 0;
987 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
988 if (max < 0 || max > 65535)
989 {
990 *errorcodeptr = ERR5;
991 return p;
992 }
993 if (max < min)
994 {
995 *errorcodeptr = ERR4;
996 return p;
997 }
998 }
999 }
1000
1001 /* Fill in the required variables, and pass back the pointer to the terminating
1002 '}'. */
1003
1004 *minp = min;
1005 *maxp = max;
1006 return p;
1007 }
1008
1009
1010
1011 /*************************************************
1012 * Subroutine for finding forward reference *
1013 *************************************************/
1014
1015 /* This recursive function is called only from find_parens() below. The
1016 top-level call starts at the beginning of the pattern. All other calls must
1017 start at a parenthesis. It scans along a pattern's text looking for capturing
1018 subpatterns, and counting them. If it finds a named pattern that matches the
1019 name it is given, it returns its number. Alternatively, if the name is NULL, it
1020 returns when it reaches a given numbered subpattern. We know that if (?P< is
1021 encountered, the name will be terminated by '>' because that is checked in the
1022 first pass. Recursion is used to keep track of subpatterns that reset the
1023 capturing group numbers - the (?| feature.
1024
1025 Arguments:
1026 ptrptr address of the current character pointer (updated)
1027 cd compile background data
1028 name name to seek, or NULL if seeking a numbered subpattern
1029 lorn name length, or subpattern number if name is NULL
1030 xmode TRUE if we are in /x mode
1031 count pointer to the current capturing subpattern number (updated)
1032
1033 Returns: the number of the named subpattern, or -1 if not found
1034 */
1035
1036 static int
1037 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1038 BOOL xmode, int *count)
1039 {
1040 uschar *ptr = *ptrptr;
1041 int start_count = *count;
1042 int hwm_count = start_count;
1043 BOOL dup_parens = FALSE;
1044
1045 /* If the first character is a parenthesis, check on the type of group we are
1046 dealing with. The very first call may not start with a parenthesis. */
1047
1048 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1049 {
1050 if (ptr[1] == CHAR_QUESTION_MARK &&
1051 ptr[2] == CHAR_VERTICAL_LINE)
1052 {
1053 ptr += 3;
1054 dup_parens = TRUE;
1055 }
1056
1057 /* Handle a normal, unnamed capturing parenthesis */
1058
1059 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1060 {
1061 *count += 1;
1062 if (name == NULL && *count == lorn) return *count;
1063 ptr++;
1064 }
1065
1066 /* Handle a condition. If it is an assertion, just carry on so that it
1067 is processed as normal. If not, skip to the closing parenthesis of the
1068 condition (there can't be any nested parens. */
1069
1070 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1071 {
1072 ptr += 2;
1073 if (ptr[1] != CHAR_QUESTION_MARK)
1074 {
1075 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1076 if (*ptr != 0) ptr++;
1077 }
1078 }
1079
1080 /* We have either (? or (* and not a condition */
1081
1082 else
1083 {
1084 ptr += 2;
1085 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1086
1087 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1088
1089 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1090 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1091 {
1092 int term;
1093 const uschar *thisname;
1094 *count += 1;
1095 if (name == NULL && *count == lorn) return *count;
1096 term = *ptr++;
1097 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1098 thisname = ptr;
1099 while (*ptr != term) ptr++;
1100 if (name != NULL && lorn == ptr - thisname &&
1101 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1102 return *count;
1103 term++;
1104 }
1105 }
1106 }
1107
1108 /* Past any initial parenthesis handling, scan for parentheses or vertical
1109 bars. */
1110
1111 for (; *ptr != 0; ptr++)
1112 {
1113 /* Skip over backslashed characters and also entire \Q...\E */
1114
1115 if (*ptr == CHAR_BACKSLASH)
1116 {
1117 if (*(++ptr) == 0) goto FAIL_EXIT;
1118 if (*ptr == CHAR_Q) for (;;)
1119 {
1120 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1121 if (*ptr == 0) goto FAIL_EXIT;
1122 if (*(++ptr) == CHAR_E) break;
1123 }
1124 continue;
1125 }
1126
1127 /* Skip over character classes; this logic must be similar to the way they
1128 are handled for real. If the first character is '^', skip it. Also, if the
1129 first few characters (either before or after ^) are \Q\E or \E we skip them
1130 too. This makes for compatibility with Perl. Note the use of STR macros to
1131 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1132
1133 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1134 {
1135 BOOL negate_class = FALSE;
1136 for (;;)
1137 {
1138 if (ptr[1] == CHAR_BACKSLASH)
1139 {
1140 if (ptr[2] == CHAR_E)
1141 ptr+= 2;
1142 else if (strncmp((const char *)ptr+2,
1143 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1144 ptr += 4;
1145 else
1146 break;
1147 }
1148 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1149 {
1150 negate_class = TRUE;
1151 ptr++;
1152 }
1153 else break;
1154 }
1155
1156 /* If the next character is ']', it is a data character that must be
1157 skipped, except in JavaScript compatibility mode. */
1158
1159 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1160 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1161 ptr++;
1162
1163 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1164 {
1165 if (*ptr == 0) return -1;
1166 if (*ptr == CHAR_BACKSLASH)
1167 {
1168 if (*(++ptr) == 0) goto FAIL_EXIT;
1169 if (*ptr == CHAR_Q) for (;;)
1170 {
1171 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1172 if (*ptr == 0) goto FAIL_EXIT;
1173 if (*(++ptr) == CHAR_E) break;
1174 }
1175 continue;
1176 }
1177 }
1178 continue;
1179 }
1180
1181 /* Skip comments in /x mode */
1182
1183 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1184 {
1185 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1186 if (*ptr == 0) goto FAIL_EXIT;
1187 continue;
1188 }
1189
1190 /* Check for the special metacharacters */
1191
1192 if (*ptr == CHAR_LEFT_PARENTHESIS)
1193 {
1194 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1195 if (rc > 0) return rc;
1196 if (*ptr == 0) goto FAIL_EXIT;
1197 }
1198
1199 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1200 {
1201 if (dup_parens && *count < hwm_count) *count = hwm_count;
1202 *ptrptr = ptr;
1203 return -1;
1204 }
1205
1206 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1207 {
1208 if (*count > hwm_count) hwm_count = *count;
1209 *count = start_count;
1210 }
1211 }
1212
1213 FAIL_EXIT:
1214 *ptrptr = ptr;
1215 return -1;
1216 }
1217
1218
1219
1220
1221 /*************************************************
1222 * Find forward referenced subpattern *
1223 *************************************************/
1224
1225 /* This function scans along a pattern's text looking for capturing
1226 subpatterns, and counting them. If it finds a named pattern that matches the
1227 name it is given, it returns its number. Alternatively, if the name is NULL, it
1228 returns when it reaches a given numbered subpattern. This is used for forward
1229 references to subpatterns. We used to be able to start this scan from the
1230 current compiling point, using the current count value from cd->bracount, and
1231 do it all in a single loop, but the addition of the possibility of duplicate
1232 subpattern numbers means that we have to scan from the very start, in order to
1233 take account of such duplicates, and to use a recursive function to keep track
1234 of the different types of group.
1235
1236 Arguments:
1237 cd compile background data
1238 name name to seek, or NULL if seeking a numbered subpattern
1239 lorn name length, or subpattern number if name is NULL
1240 xmode TRUE if we are in /x mode
1241
1242 Returns: the number of the found subpattern, or -1 if not found
1243 */
1244
1245 static int
1246 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1247 {
1248 uschar *ptr = (uschar *)cd->start_pattern;
1249 int count = 0;
1250 int rc;
1251
1252 /* If the pattern does not start with an opening parenthesis, the first call
1253 to find_parens_sub() will scan right to the end (if necessary). However, if it
1254 does start with a parenthesis, find_parens_sub() will return when it hits the
1255 matching closing parens. That is why we have to have a loop. */
1256
1257 for (;;)
1258 {
1259 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1260 if (rc > 0 || *ptr++ == 0) break;
1261 }
1262
1263 return rc;
1264 }
1265
1266
1267
1268
1269 /*************************************************
1270 * Find first significant op code *
1271 *************************************************/
1272
1273 /* This is called by several functions that scan a compiled expression looking
1274 for a fixed first character, or an anchoring op code etc. It skips over things
1275 that do not influence this. For some calls, a change of option is important.
1276 For some calls, it makes sense to skip negative forward and all backward
1277 assertions, and also the \b assertion; for others it does not.
1278
1279 Arguments:
1280 code pointer to the start of the group
1281 options pointer to external options
1282 optbit the option bit whose changing is significant, or
1283 zero if none are
1284 skipassert TRUE if certain assertions are to be skipped
1285
1286 Returns: pointer to the first significant opcode
1287 */
1288
1289 static const uschar*
1290 first_significant_code(const uschar *code, int *options, int optbit,
1291 BOOL skipassert)
1292 {
1293 for (;;)
1294 {
1295 switch ((int)*code)
1296 {
1297 case OP_OPT:
1298 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1299 *options = (int)code[1];
1300 code += 2;
1301 break;
1302
1303 case OP_ASSERT_NOT:
1304 case OP_ASSERTBACK:
1305 case OP_ASSERTBACK_NOT:
1306 if (!skipassert) return code;
1307 do code += GET(code, 1); while (*code == OP_ALT);
1308 code += _pcre_OP_lengths[*code];
1309 break;
1310
1311 case OP_WORD_BOUNDARY:
1312 case OP_NOT_WORD_BOUNDARY:
1313 if (!skipassert) return code;
1314 /* Fall through */
1315
1316 case OP_CALLOUT:
1317 case OP_CREF:
1318 case OP_RREF:
1319 case OP_DEF:
1320 code += _pcre_OP_lengths[*code];
1321 break;
1322
1323 default:
1324 return code;
1325 }
1326 }
1327 /* Control never reaches here */
1328 }
1329
1330
1331
1332
1333 /*************************************************
1334 * Find the fixed length of a pattern *
1335 *************************************************/
1336
1337 /* Scan a pattern and compute the fixed length of subject that will match it,
1338 if the length is fixed. This is needed for dealing with backward assertions.
1339 In UTF8 mode, the result is in characters rather than bytes.
1340
1341 Arguments:
1342 code points to the start of the pattern (the bracket)
1343 options the compiling options
1344
1345 Returns: the fixed length, or -1 if there is no fixed length,
1346 or -2 if \C was encountered
1347 */
1348
1349 static int
1350 find_fixedlength(uschar *code, int options)
1351 {
1352 int length = -1;
1353
1354 register int branchlength = 0;
1355 register uschar *cc = code + 1 + LINK_SIZE;
1356
1357 /* Scan along the opcodes for this branch. If we get to the end of the
1358 branch, check the length against that of the other branches. */
1359
1360 for (;;)
1361 {
1362 int d;
1363 register int op = *cc;
1364 switch (op)
1365 {
1366 case OP_CBRA:
1367 case OP_BRA:
1368 case OP_ONCE:
1369 case OP_COND:
1370 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1371 if (d < 0) return d;
1372 branchlength += d;
1373 do cc += GET(cc, 1); while (*cc == OP_ALT);
1374 cc += 1 + LINK_SIZE;
1375 break;
1376
1377 /* Reached end of a branch; if it's a ket it is the end of a nested
1378 call. If it's ALT it is an alternation in a nested call. If it is
1379 END it's the end of the outer call. All can be handled by the same code. */
1380
1381 case OP_ALT:
1382 case OP_KET:
1383 case OP_KETRMAX:
1384 case OP_KETRMIN:
1385 case OP_END:
1386 if (length < 0) length = branchlength;
1387 else if (length != branchlength) return -1;
1388 if (*cc != OP_ALT) return length;
1389 cc += 1 + LINK_SIZE;
1390 branchlength = 0;
1391 break;
1392
1393 /* Skip over assertive subpatterns */
1394
1395 case OP_ASSERT:
1396 case OP_ASSERT_NOT:
1397 case OP_ASSERTBACK:
1398 case OP_ASSERTBACK_NOT:
1399 do cc += GET(cc, 1); while (*cc == OP_ALT);
1400 /* Fall through */
1401
1402 /* Skip over things that don't match chars */
1403
1404 case OP_REVERSE:
1405 case OP_CREF:
1406 case OP_RREF:
1407 case OP_DEF:
1408 case OP_OPT:
1409 case OP_CALLOUT:
1410 case OP_SOD:
1411 case OP_SOM:
1412 case OP_EOD:
1413 case OP_EODN:
1414 case OP_CIRC:
1415 case OP_DOLL:
1416 case OP_NOT_WORD_BOUNDARY:
1417 case OP_WORD_BOUNDARY:
1418 cc += _pcre_OP_lengths[*cc];
1419 break;
1420
1421 /* Handle literal characters */
1422
1423 case OP_CHAR:
1424 case OP_CHARNC:
1425 case OP_NOT:
1426 branchlength++;
1427 cc += 2;
1428 #ifdef SUPPORT_UTF8
1429 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1430 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1431 #endif
1432 break;
1433
1434 /* Handle exact repetitions. The count is already in characters, but we
1435 need to skip over a multibyte character in UTF8 mode. */
1436
1437 case OP_EXACT:
1438 branchlength += GET2(cc,1);
1439 cc += 4;
1440 #ifdef SUPPORT_UTF8
1441 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1442 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1443 #endif
1444 break;
1445
1446 case OP_TYPEEXACT:
1447 branchlength += GET2(cc,1);
1448 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1449 cc += 4;
1450 break;
1451
1452 /* Handle single-char matchers */
1453
1454 case OP_PROP:
1455 case OP_NOTPROP:
1456 cc += 2;
1457 /* Fall through */
1458
1459 case OP_NOT_DIGIT:
1460 case OP_DIGIT:
1461 case OP_NOT_WHITESPACE:
1462 case OP_WHITESPACE:
1463 case OP_NOT_WORDCHAR:
1464 case OP_WORDCHAR:
1465 case OP_ANY:
1466 case OP_ALLANY:
1467 branchlength++;
1468 cc++;
1469 break;
1470
1471 /* The single-byte matcher isn't allowed */
1472
1473 case OP_ANYBYTE:
1474 return -2;
1475
1476 /* Check a class for variable quantification */
1477
1478 #ifdef SUPPORT_UTF8
1479 case OP_XCLASS:
1480 cc += GET(cc, 1) - 33;
1481 /* Fall through */
1482 #endif
1483
1484 case OP_CLASS:
1485 case OP_NCLASS:
1486 cc += 33;
1487
1488 switch (*cc)
1489 {
1490 case OP_CRSTAR:
1491 case OP_CRMINSTAR:
1492 case OP_CRQUERY:
1493 case OP_CRMINQUERY:
1494 return -1;
1495
1496 case OP_CRRANGE:
1497 case OP_CRMINRANGE:
1498 if (GET2(cc,1) != GET2(cc,3)) return -1;
1499 branchlength += GET2(cc,1);
1500 cc += 5;
1501 break;
1502
1503 default:
1504 branchlength++;
1505 }
1506 break;
1507
1508 /* Anything else is variable length */
1509
1510 default:
1511 return -1;
1512 }
1513 }
1514 /* Control never gets here */
1515 }
1516
1517
1518
1519
1520 /*************************************************
1521 * Scan compiled regex for numbered bracket *
1522 *************************************************/
1523
1524 /* This little function scans through a compiled pattern until it finds a
1525 capturing bracket with the given number.
1526
1527 Arguments:
1528 code points to start of expression
1529 utf8 TRUE in UTF-8 mode
1530 number the required bracket number
1531
1532 Returns: pointer to the opcode for the bracket, or NULL if not found
1533 */
1534
1535 static const uschar *
1536 find_bracket(const uschar *code, BOOL utf8, int number)
1537 {
1538 for (;;)
1539 {
1540 register int c = *code;
1541 if (c == OP_END) return NULL;
1542
1543 /* XCLASS is used for classes that cannot be represented just by a bit
1544 map. This includes negated single high-valued characters. The length in
1545 the table is zero; the actual length is stored in the compiled code. */
1546
1547 if (c == OP_XCLASS) code += GET(code, 1);
1548
1549 /* Handle capturing bracket */
1550
1551 else if (c == OP_CBRA)
1552 {
1553 int n = GET2(code, 1+LINK_SIZE);
1554 if (n == number) return (uschar *)code;
1555 code += _pcre_OP_lengths[c];
1556 }
1557
1558 /* Otherwise, we can get the item's length from the table, except that for
1559 repeated character types, we have to test for \p and \P, which have an extra
1560 two bytes of parameters. */
1561
1562 else
1563 {
1564 switch(c)
1565 {
1566 case OP_TYPESTAR:
1567 case OP_TYPEMINSTAR:
1568 case OP_TYPEPLUS:
1569 case OP_TYPEMINPLUS:
1570 case OP_TYPEQUERY:
1571 case OP_TYPEMINQUERY:
1572 case OP_TYPEPOSSTAR:
1573 case OP_TYPEPOSPLUS:
1574 case OP_TYPEPOSQUERY:
1575 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1576 break;
1577
1578 case OP_TYPEUPTO:
1579 case OP_TYPEMINUPTO:
1580 case OP_TYPEEXACT:
1581 case OP_TYPEPOSUPTO:
1582 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1583 break;
1584 }
1585
1586 /* Add in the fixed length from the table */
1587
1588 code += _pcre_OP_lengths[c];
1589
1590 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1591 a multi-byte character. The length in the table is a minimum, so we have to
1592 arrange to skip the extra bytes. */
1593
1594 #ifdef SUPPORT_UTF8
1595 if (utf8) switch(c)
1596 {
1597 case OP_CHAR:
1598 case OP_CHARNC:
1599 case OP_EXACT:
1600 case OP_UPTO:
1601 case OP_MINUPTO:
1602 case OP_POSUPTO:
1603 case OP_STAR:
1604 case OP_MINSTAR:
1605 case OP_POSSTAR:
1606 case OP_PLUS:
1607 case OP_MINPLUS:
1608 case OP_POSPLUS:
1609 case OP_QUERY:
1610 case OP_MINQUERY:
1611 case OP_POSQUERY:
1612 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1613 break;
1614 }
1615 #else
1616 (void)(utf8); /* Keep compiler happy by referencing function argument */
1617 #endif
1618 }
1619 }
1620 }
1621
1622
1623
1624 /*************************************************
1625 * Scan compiled regex for recursion reference *
1626 *************************************************/
1627
1628 /* This little function scans through a compiled pattern until it finds an
1629 instance of OP_RECURSE.
1630
1631 Arguments:
1632 code points to start of expression
1633 utf8 TRUE in UTF-8 mode
1634
1635 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1636 */
1637
1638 static const uschar *
1639 find_recurse(const uschar *code, BOOL utf8)
1640 {
1641 for (;;)
1642 {
1643 register int c = *code;
1644 if (c == OP_END) return NULL;
1645 if (c == OP_RECURSE) return code;
1646
1647 /* XCLASS is used for classes that cannot be represented just by a bit
1648 map. This includes negated single high-valued characters. The length in
1649 the table is zero; the actual length is stored in the compiled code. */
1650
1651 if (c == OP_XCLASS) code += GET(code, 1);
1652
1653 /* Otherwise, we can get the item's length from the table, except that for
1654 repeated character types, we have to test for \p and \P, which have an extra
1655 two bytes of parameters. */
1656
1657 else
1658 {
1659 switch(c)
1660 {
1661 case OP_TYPESTAR:
1662 case OP_TYPEMINSTAR:
1663 case OP_TYPEPLUS:
1664 case OP_TYPEMINPLUS:
1665 case OP_TYPEQUERY:
1666 case OP_TYPEMINQUERY:
1667 case OP_TYPEPOSSTAR:
1668 case OP_TYPEPOSPLUS:
1669 case OP_TYPEPOSQUERY:
1670 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1671 break;
1672
1673 case OP_TYPEPOSUPTO:
1674 case OP_TYPEUPTO:
1675 case OP_TYPEMINUPTO:
1676 case OP_TYPEEXACT:
1677 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1678 break;
1679 }
1680
1681 /* Add in the fixed length from the table */
1682
1683 code += _pcre_OP_lengths[c];
1684
1685 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1686 by a multi-byte character. The length in the table is a minimum, so we have
1687 to arrange to skip the extra bytes. */
1688
1689 #ifdef SUPPORT_UTF8
1690 if (utf8) switch(c)
1691 {
1692 case OP_CHAR:
1693 case OP_CHARNC:
1694 case OP_EXACT:
1695 case OP_UPTO:
1696 case OP_MINUPTO:
1697 case OP_POSUPTO:
1698 case OP_STAR:
1699 case OP_MINSTAR:
1700 case OP_POSSTAR:
1701 case OP_PLUS:
1702 case OP_MINPLUS:
1703 case OP_POSPLUS:
1704 case OP_QUERY:
1705 case OP_MINQUERY:
1706 case OP_POSQUERY:
1707 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1708 break;
1709 }
1710 #else
1711 (void)(utf8); /* Keep compiler happy by referencing function argument */
1712 #endif
1713 }
1714 }
1715 }
1716
1717
1718
1719 /*************************************************
1720 * Scan compiled branch for non-emptiness *
1721 *************************************************/
1722
1723 /* This function scans through a branch of a compiled pattern to see whether it
1724 can match the empty string or not. It is called from could_be_empty()
1725 below and from compile_branch() when checking for an unlimited repeat of a
1726 group that can match nothing. Note that first_significant_code() skips over
1727 backward and negative forward assertions when its final argument is TRUE. If we
1728 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1729 bracket whose current branch will already have been scanned.
1730
1731 Arguments:
1732 code points to start of search
1733 endcode points to where to stop
1734 utf8 TRUE if in UTF8 mode
1735
1736 Returns: TRUE if what is matched could be empty
1737 */
1738
1739 static BOOL
1740 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1741 {
1742 register int c;
1743 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1744 code < endcode;
1745 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1746 {
1747 const uschar *ccode;
1748
1749 c = *code;
1750
1751 /* Skip over forward assertions; the other assertions are skipped by
1752 first_significant_code() with a TRUE final argument. */
1753
1754 if (c == OP_ASSERT)
1755 {
1756 do code += GET(code, 1); while (*code == OP_ALT);
1757 c = *code;
1758 continue;
1759 }
1760
1761 /* Groups with zero repeats can of course be empty; skip them. */
1762
1763 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1764 {
1765 code += _pcre_OP_lengths[c];
1766 do code += GET(code, 1); while (*code == OP_ALT);
1767 c = *code;
1768 continue;
1769 }
1770
1771 /* For other groups, scan the branches. */
1772
1773 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1774 {
1775 BOOL empty_branch;
1776 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1777
1778 /* If a conditional group has only one branch, there is a second, implied,
1779 empty branch, so just skip over the conditional, because it could be empty.
1780 Otherwise, scan the individual branches of the group. */
1781
1782 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1783 code += GET(code, 1);
1784 else
1785 {
1786 empty_branch = FALSE;
1787 do
1788 {
1789 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1790 empty_branch = TRUE;
1791 code += GET(code, 1);
1792 }
1793 while (*code == OP_ALT);
1794 if (!empty_branch) return FALSE; /* All branches are non-empty */
1795 }
1796
1797 c = *code;
1798 continue;
1799 }
1800
1801 /* Handle the other opcodes */
1802
1803 switch (c)
1804 {
1805 /* Check for quantifiers after a class. XCLASS is used for classes that
1806 cannot be represented just by a bit map. This includes negated single
1807 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1808 actual length is stored in the compiled code, so we must update "code"
1809 here. */
1810
1811 #ifdef SUPPORT_UTF8
1812 case OP_XCLASS:
1813 ccode = code += GET(code, 1);
1814 goto CHECK_CLASS_REPEAT;
1815 #endif
1816
1817 case OP_CLASS:
1818 case OP_NCLASS:
1819 ccode = code + 33;
1820
1821 #ifdef SUPPORT_UTF8
1822 CHECK_CLASS_REPEAT:
1823 #endif
1824
1825 switch (*ccode)
1826 {
1827 case OP_CRSTAR: /* These could be empty; continue */
1828 case OP_CRMINSTAR:
1829 case OP_CRQUERY:
1830 case OP_CRMINQUERY:
1831 break;
1832
1833 default: /* Non-repeat => class must match */
1834 case OP_CRPLUS: /* These repeats aren't empty */
1835 case OP_CRMINPLUS:
1836 return FALSE;
1837
1838 case OP_CRRANGE:
1839 case OP_CRMINRANGE:
1840 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1841 break;
1842 }
1843 break;
1844
1845 /* Opcodes that must match a character */
1846
1847 case OP_PROP:
1848 case OP_NOTPROP:
1849 case OP_EXTUNI:
1850 case OP_NOT_DIGIT:
1851 case OP_DIGIT:
1852 case OP_NOT_WHITESPACE:
1853 case OP_WHITESPACE:
1854 case OP_NOT_WORDCHAR:
1855 case OP_WORDCHAR:
1856 case OP_ANY:
1857 case OP_ALLANY:
1858 case OP_ANYBYTE:
1859 case OP_CHAR:
1860 case OP_CHARNC:
1861 case OP_NOT:
1862 case OP_PLUS:
1863 case OP_MINPLUS:
1864 case OP_POSPLUS:
1865 case OP_EXACT:
1866 case OP_NOTPLUS:
1867 case OP_NOTMINPLUS:
1868 case OP_NOTPOSPLUS:
1869 case OP_NOTEXACT:
1870 case OP_TYPEPLUS:
1871 case OP_TYPEMINPLUS:
1872 case OP_TYPEPOSPLUS:
1873 case OP_TYPEEXACT:
1874 return FALSE;
1875
1876 /* These are going to continue, as they may be empty, but we have to
1877 fudge the length for the \p and \P cases. */
1878
1879 case OP_TYPESTAR:
1880 case OP_TYPEMINSTAR:
1881 case OP_TYPEPOSSTAR:
1882 case OP_TYPEQUERY:
1883 case OP_TYPEMINQUERY:
1884 case OP_TYPEPOSQUERY:
1885 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1886 break;
1887
1888 /* Same for these */
1889
1890 case OP_TYPEUPTO:
1891 case OP_TYPEMINUPTO:
1892 case OP_TYPEPOSUPTO:
1893 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1894 break;
1895
1896 /* End of branch */
1897
1898 case OP_KET:
1899 case OP_KETRMAX:
1900 case OP_KETRMIN:
1901 case OP_ALT:
1902 return TRUE;
1903
1904 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1905 MINUPTO, and POSUPTO may be followed by a multibyte character */
1906
1907 #ifdef SUPPORT_UTF8
1908 case OP_STAR:
1909 case OP_MINSTAR:
1910 case OP_POSSTAR:
1911 case OP_QUERY:
1912 case OP_MINQUERY:
1913 case OP_POSQUERY:
1914 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1915 break;
1916
1917 case OP_UPTO:
1918 case OP_MINUPTO:
1919 case OP_POSUPTO:
1920 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1921 break;
1922 #endif
1923 }
1924 }
1925
1926 return TRUE;
1927 }
1928
1929
1930
1931 /*************************************************
1932 * Scan compiled regex for non-emptiness *
1933 *************************************************/
1934
1935 /* This function is called to check for left recursive calls. We want to check
1936 the current branch of the current pattern to see if it could match the empty
1937 string. If it could, we must look outwards for branches at other levels,
1938 stopping when we pass beyond the bracket which is the subject of the recursion.
1939
1940 Arguments:
1941 code points to start of the recursion
1942 endcode points to where to stop (current RECURSE item)
1943 bcptr points to the chain of current (unclosed) branch starts
1944 utf8 TRUE if in UTF-8 mode
1945
1946 Returns: TRUE if what is matched could be empty
1947 */
1948
1949 static BOOL
1950 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1951 BOOL utf8)
1952 {
1953 while (bcptr != NULL && bcptr->current >= code)
1954 {
1955 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1956 bcptr = bcptr->outer;
1957 }
1958 return TRUE;
1959 }
1960
1961
1962
1963 /*************************************************
1964 * Check for POSIX class syntax *
1965 *************************************************/
1966
1967 /* This function is called when the sequence "[:" or "[." or "[=" is
1968 encountered in a character class. It checks whether this is followed by a
1969 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1970 reach an unescaped ']' without the special preceding character, return FALSE.
1971
1972 Originally, this function only recognized a sequence of letters between the
1973 terminators, but it seems that Perl recognizes any sequence of characters,
1974 though of course unknown POSIX names are subsequently rejected. Perl gives an
1975 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1976 didn't consider this to be a POSIX class. Likewise for [:1234:].
1977
1978 The problem in trying to be exactly like Perl is in the handling of escapes. We
1979 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1980 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1981 below handles the special case of \], but does not try to do any other escape
1982 processing. This makes it different from Perl for cases such as [:l\ower:]
1983 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1984 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1985 I think.
1986
1987 Arguments:
1988 ptr pointer to the initial [
1989 endptr where to return the end pointer
1990
1991 Returns: TRUE or FALSE
1992 */
1993
1994 static BOOL
1995 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1996 {
1997 int terminator; /* Don't combine these lines; the Solaris cc */
1998 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1999 for (++ptr; *ptr != 0; ptr++)
2000 {
2001 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2002 {
2003 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2004 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2005 {
2006 *endptr = ptr;
2007 return TRUE;
2008 }
2009 }
2010 }
2011 return FALSE;
2012 }
2013
2014
2015
2016
2017 /*************************************************
2018 * Check POSIX class name *
2019 *************************************************/
2020
2021 /* This function is called to check the name given in a POSIX-style class entry
2022 such as [:alnum:].
2023
2024 Arguments:
2025 ptr points to the first letter
2026 len the length of the name
2027
2028 Returns: a value representing the name, or -1 if unknown
2029 */
2030
2031 static int
2032 check_posix_name(const uschar *ptr, int len)
2033 {
2034 const char *pn = posix_names;
2035 register int yield = 0;
2036 while (posix_name_lengths[yield] != 0)
2037 {
2038 if (len == posix_name_lengths[yield] &&
2039 strncmp((const char *)ptr, pn, len) == 0) return yield;
2040 pn += posix_name_lengths[yield] + 1;
2041 yield++;
2042 }
2043 return -1;
2044 }
2045
2046
2047 /*************************************************
2048 * Adjust OP_RECURSE items in repeated group *
2049 *************************************************/
2050
2051 /* OP_RECURSE items contain an offset from the start of the regex to the group
2052 that is referenced. This means that groups can be replicated for fixed
2053 repetition simply by copying (because the recursion is allowed to refer to
2054 earlier groups that are outside the current group). However, when a group is
2055 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2056 inserted before it, after it has been compiled. This means that any OP_RECURSE
2057 items within it that refer to the group itself or any contained groups have to
2058 have their offsets adjusted. That is one of the jobs of this function. Before it
2059 is called, the partially compiled regex must be temporarily terminated with
2060 OP_END.
2061
2062 This function has been extended with the possibility of forward references for
2063 recursions and subroutine calls. It must also check the list of such references
2064 for the group we are dealing with. If it finds that one of the recursions in
2065 the current group is on this list, it adjusts the offset in the list, not the
2066 value in the reference (which is a group number).
2067
2068 Arguments:
2069 group points to the start of the group
2070 adjust the amount by which the group is to be moved
2071 utf8 TRUE in UTF-8 mode
2072 cd contains pointers to tables etc.
2073 save_hwm the hwm forward reference pointer at the start of the group
2074
2075 Returns: nothing
2076 */
2077
2078 static void
2079 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2080 uschar *save_hwm)
2081 {
2082 uschar *ptr = group;
2083
2084 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2085 {
2086 int offset;
2087 uschar *hc;
2088
2089 /* See if this recursion is on the forward reference list. If so, adjust the
2090 reference. */
2091
2092 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2093 {
2094 offset = GET(hc, 0);
2095 if (cd->start_code + offset == ptr + 1)
2096 {
2097 PUT(hc, 0, offset + adjust);
2098 break;
2099 }
2100 }
2101
2102 /* Otherwise, adjust the recursion offset if it's after the start of this
2103 group. */
2104
2105 if (hc >= cd->hwm)
2106 {
2107 offset = GET(ptr, 1);
2108 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2109 }
2110
2111 ptr += 1 + LINK_SIZE;
2112 }
2113 }
2114
2115
2116
2117 /*************************************************
2118 * Insert an automatic callout point *
2119 *************************************************/
2120
2121 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2122 callout points before each pattern item.
2123
2124 Arguments:
2125 code current code pointer
2126 ptr current pattern pointer
2127 cd pointers to tables etc
2128
2129 Returns: new code pointer
2130 */
2131
2132 static uschar *
2133 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2134 {
2135 *code++ = OP_CALLOUT;
2136 *code++ = 255;
2137 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2138 PUT(code, LINK_SIZE, 0); /* Default length */
2139 return code + 2*LINK_SIZE;
2140 }
2141
2142
2143
2144 /*************************************************
2145 * Complete a callout item *
2146 *************************************************/
2147
2148 /* A callout item contains the length of the next item in the pattern, which
2149 we can't fill in till after we have reached the relevant point. This is used
2150 for both automatic and manual callouts.
2151
2152 Arguments:
2153 previous_callout points to previous callout item
2154 ptr current pattern pointer
2155 cd pointers to tables etc
2156
2157 Returns: nothing
2158 */
2159
2160 static void
2161 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2162 {
2163 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2164 PUT(previous_callout, 2 + LINK_SIZE, length);
2165 }
2166
2167
2168
2169 #ifdef SUPPORT_UCP
2170 /*************************************************
2171 * Get othercase range *
2172 *************************************************/
2173
2174 /* This function is passed the start and end of a class range, in UTF-8 mode
2175 with UCP support. It searches up the characters, looking for internal ranges of
2176 characters in the "other" case. Each call returns the next one, updating the
2177 start address.
2178
2179 Arguments:
2180 cptr points to starting character value; updated
2181 d end value
2182 ocptr where to put start of othercase range
2183 odptr where to put end of othercase range
2184
2185 Yield: TRUE when range returned; FALSE when no more
2186 */
2187
2188 static BOOL
2189 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2190 unsigned int *odptr)
2191 {
2192 unsigned int c, othercase, next;
2193
2194 for (c = *cptr; c <= d; c++)
2195 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2196
2197 if (c > d) return FALSE;
2198
2199 *ocptr = othercase;
2200 next = othercase + 1;
2201
2202 for (++c; c <= d; c++)
2203 {
2204 if (UCD_OTHERCASE(c) != next) break;
2205 next++;
2206 }
2207
2208 *odptr = next - 1;
2209 *cptr = c;
2210
2211 return TRUE;
2212 }
2213 #endif /* SUPPORT_UCP */
2214
2215
2216
2217 /*************************************************
2218 * Check if auto-possessifying is possible *
2219 *************************************************/
2220
2221 /* This function is called for unlimited repeats of certain items, to see
2222 whether the next thing could possibly match the repeated item. If not, it makes
2223 sense to automatically possessify the repeated item.
2224
2225 Arguments:
2226 op_code the repeated op code
2227 item data for this item, depends on the opcode
2228 utf8 TRUE in UTF-8 mode
2229 utf8_char used for utf8 character bytes, NULL if not relevant
2230 ptr next character in pattern
2231 options options bits
2232 cd contains pointers to tables etc.
2233
2234 Returns: TRUE if possessifying is wanted
2235 */
2236
2237 static BOOL
2238 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2239 const uschar *ptr, int options, compile_data *cd)
2240 {
/* Overview: read the next pattern item (a literal character or a backslash
escape), give up if that item is a metacharacter or is itself optional, and
otherwise consult the switches below. TRUE is returned only when the tables
say the next item cannot match what is being repeated; every unhandled
combination returns FALSE. */
2241 int next;
2242
2243 /* Skip whitespace and comments in extended mode */
2244
2245 if ((options & PCRE_EXTENDED) != 0)
2246 {
2247 for (;;)
2248 {
2249 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2250 if (*ptr == CHAR_NUMBER_SIGN)
2251 {
2252 while (*(++ptr) != 0)
2253 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2254 }
2255 else break;
2256 }
2257 }
2258
2259 /* If the next item is one that we can handle, get its value. A non-negative
2260 value is a character, a negative value is an escape value. */
2261
2262 if (*ptr == CHAR_BACKSLASH)
2263 {
/* The error code goes into a local and is discarded: a bad escape is not
reported from here, it just prevents possessification. */
2264 int temperrorcode = 0;
2265 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2266 if (temperrorcode != 0) return FALSE;
2267 ptr++; /* Point after the escape sequence */
2268 }
2269
2270 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2271 {
2272 #ifdef SUPPORT_UTF8
2273 if (utf8) { GETCHARINC(next, ptr); } else
2274 #endif
2275 next = *ptr++;
2276 }
2277
/* The next item is a metacharacter: not handled. */
2278 else return FALSE;
2279
2280 /* Skip whitespace and comments in extended mode */
2281
2282 if ((options & PCRE_EXTENDED) != 0)
2283 {
2284 for (;;)
2285 {
2286 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2287 if (*ptr == CHAR_NUMBER_SIGN)
2288 {
2289 while (*(++ptr) != 0)
2290 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2291 }
2292 else break;
2293 }
2294 }
2295
2296 /* If the next thing is itself optional, we have to give up. */
2297
/* "Optional" here means followed by '*', '?' or a counted repeat that begins
with "{0,". */
2298 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2299 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2300 return FALSE;
2301
2302 /* Now compare the next item with the previous opcode. If the previous is a
2303 positive single character match, "item" either contains the character or, if
2304 "item" is greater than 127 in utf8 mode, the character's bytes are in
2305 utf8_char. */
2306
2307
2308 /* Handle cases when the next item is a character. */
2309
2310 if (next >= 0) switch(op_code)
2311 {
2312 case OP_CHAR:
2313 #ifdef SUPPORT_UTF8
2314 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2315 #else
2316 (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2317 #endif
2318 return item != next;
2319
2320 /* For CHARNC (caseless character) we must check the other case. If we have
2321 Unicode property support, we can use it to test the other case of
2322 high-valued characters. */
2323
2324 case OP_CHARNC:
2325 #ifdef SUPPORT_UTF8
2326 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2327 #endif
2328 if (item == next) return FALSE;
2329 #ifdef SUPPORT_UTF8
2330 if (utf8)
2331 {
2332 unsigned int othercase;
2333 if (next < 128) othercase = cd->fcc[next]; else
2334 #ifdef SUPPORT_UCP
2335 othercase = UCD_OTHERCASE((unsigned int)next);
2336 #else
2337 othercase = NOTACHAR;
2338 #endif
2339 return (unsigned int)item != othercase;
2340 }
2341 else
2342 #endif /* SUPPORT_UTF8 */
2343 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2344
2345 /* For OP_NOT, "item" must be a single-byte character. */
2346
/* The sense is inverted relative to OP_CHAR: a repeated "not item" can be
possessified only when the next character is item itself (or, when caseless,
its other case). */
2347 case OP_NOT:
2348 if (item == next) return TRUE;
2349 if ((options & PCRE_CASELESS) == 0) return FALSE;
2350 #ifdef SUPPORT_UTF8
2351 if (utf8)
2352 {
2353 unsigned int othercase;
2354 if (next < 128) othercase = cd->fcc[next]; else
2355 #ifdef SUPPORT_UCP
2356 othercase = UCD_OTHERCASE(next);
2357 #else
2358 othercase = NOTACHAR;
2359 #endif
2360 return (unsigned int)item == othercase;
2361 }
2362 else
2363 #endif /* SUPPORT_UTF8 */
2364 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2365
/* In the six type cases that follow, a next character above 127 is treated as
never being a digit, space or word character; the ctypes table is consulted
only for values up to 127. */
2366 case OP_DIGIT:
2367 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2368
2369 case OP_NOT_DIGIT:
2370 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2371
2372 case OP_WHITESPACE:
2373 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2374
2375 case OP_NOT_WHITESPACE:
2376 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2377
2378 case OP_WORDCHAR:
2379 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2380
2381 case OP_NOT_WORDCHAR:
2382 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2383
/* Horizontal space is defined by the explicit list of code points below. */
2384 case OP_HSPACE:
2385 case OP_NOT_HSPACE:
2386 switch(next)
2387 {
2388 case 0x09:
2389 case 0x20:
2390 case 0xa0:
2391 case 0x1680:
2392 case 0x180e:
2393 case 0x2000:
2394 case 0x2001:
2395 case 0x2002:
2396 case 0x2003:
2397 case 0x2004:
2398 case 0x2005:
2399 case 0x2006:
2400 case 0x2007:
2401 case 0x2008:
2402 case 0x2009:
2403 case 0x200A:
2404 case 0x202f:
2405 case 0x205f:
2406 case 0x3000:
2407 return op_code != OP_HSPACE;
2408 default:
2409 return op_code == OP_HSPACE;
2410 }
2411
/* Vertical space is likewise an explicit list of code points. */
2412 case OP_VSPACE:
2413 case OP_NOT_VSPACE:
2414 switch(next)
2415 {
2416 case 0x0a:
2417 case 0x0b:
2418 case 0x0c:
2419 case 0x0d:
2420 case 0x85:
2421 case 0x2028:
2422 case 0x2029:
2423 return op_code != OP_VSPACE;
2424 default:
2425 return op_code == OP_VSPACE;
2426 }
2427
2428 default:
2429 return FALSE;
2430 }
2431
2432
2433 /* Handle the case when the next item is \d, \s, etc. */
2434
/* This point is reached only with a negative "next", because every path
through the switch above returns. */
2435 switch(op_code)
2436 {
2437 case OP_CHAR:
2438 case OP_CHARNC:
2439 #ifdef SUPPORT_UTF8
2440 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2441 #endif
/* Same tests as in the character section above, with the roles swapped: the
repeated thing is the character "item" and the follower is the escape. */
2442 switch(-next)
2443 {
2444 case ESC_d:
2445 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2446
2447 case ESC_D:
2448 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2449
2450 case ESC_s:
2451 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2452
2453 case ESC_S:
2454 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2455
2456 case ESC_w:
2457 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2458
2459 case ESC_W:
2460 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2461
2462 case ESC_h:
2463 case ESC_H:
2464 switch(item)
2465 {
2466 case 0x09:
2467 case 0x20:
2468 case 0xa0:
2469 case 0x1680:
2470 case 0x180e:
2471 case 0x2000:
2472 case 0x2001:
2473 case 0x2002:
2474 case 0x2003:
2475 case 0x2004:
2476 case 0x2005:
2477 case 0x2006:
2478 case 0x2007:
2479 case 0x2008:
2480 case 0x2009:
2481 case 0x200A:
2482 case 0x202f:
2483 case 0x205f:
2484 case 0x3000:
2485 return -next != ESC_h;
2486 default:
2487 return -next == ESC_h;
2488 }
2489
2490 case ESC_v:
2491 case ESC_V:
2492 switch(item)
2493 {
2494 case 0x0a:
2495 case 0x0b:
2496 case 0x0c:
2497 case 0x0d:
2498 case 0x85:
2499 case 0x2028:
2500 case 0x2029:
2501 return -next != ESC_v;
2502 default:
2503 return -next == ESC_v;
2504 }
2505
2506 default:
2507 return FALSE;
2508 }
2509
/* The remaining cases pair a repeated character type with a following
escape; each lists the escapes taken to be unable to match what the repeated
type matches. */
2510 case OP_DIGIT:
2511 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2512 next == -ESC_h || next == -ESC_v;
2513
2514 case OP_NOT_DIGIT:
2515 return next == -ESC_d;
2516
2517 case OP_WHITESPACE:
2518 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2519
/* NOTE(review): the character section above lists 0xa0 as horizontal space and
0x85 as vertical space, and for OP_NOT_WHITESPACE it returns FALSE for any
character above 127 (i.e. a non-space repeat is assumed able to match it). The
comment on OP_VSPACE below also says VT matches a non-space. Accepting -ESC_h
and -ESC_v here therefore looks inconsistent - confirm against the matcher. */
2520 case OP_NOT_WHITESPACE:
2521 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2522
/* NOTE(review): same concern in the other direction for -ESC_S here, since
0xa0 is in the horizontal space list - confirm. */
2523 case OP_HSPACE:
2524 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2525
2526 case OP_NOT_HSPACE:
2527 return next == -ESC_h;
2528
2529 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2530 case OP_VSPACE:
2531 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2532
2533 case OP_NOT_VSPACE:
2534 return next == -ESC_v;
2535
2536 case OP_WORDCHAR:
2537 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2538
2539 case OP_NOT_WORDCHAR:
2540 return next == -ESC_w || next == -ESC_d;
2541
2542 default:
2543 return FALSE;
2544 }
2545
2546 /* Control does not reach here */
2547 }
2548
2549
2550
2551 /*************************************************
2552 * Compile one branch *
2553 *************************************************/
2554
2555 /* Scan the pattern, compiling it into a vector. If the options are
2556 changed during the branch, the pointer is used to change the external options
2557 bits. This function is used during the pre-compile phase when we are trying
2558 to find out the amount of memory needed, as well as during the real compile
2559 phase. The value of lengthptr distinguishes the two phases.
2560
2561 Arguments:
2562 optionsptr pointer to the option bits
2563 codeptr points to the pointer to the current code point
2564 ptrptr points to the current pattern pointer
2565 errorcodeptr points to error code variable
2566 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2567 reqbyteptr set to the last literal character required, else < 0
2568 bcptr points to current branch chain
2569 cd contains pointers to tables etc.
2570 lengthptr NULL during the real compile phase
2571 points to length accumulator during pre-compile phase
2572
2573 Returns: TRUE on success
2574 FALSE, with *errorcodeptr set non-zero on error
2575 */
2576
2577 static BOOL
2578 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2579 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2580 compile_data *cd, int *lengthptr)
2581 {
2582 int repeat_type, op_type;
2583 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2584 int bravalue = 0;
2585 int greedy_default, greedy_non_default;
2586 int firstbyte, reqbyte;
2587 int zeroreqbyte, zerofirstbyte;
2588 int req_caseopt, reqvary, tempreqvary;
2589 int options = *optionsptr;
2590 int after_manual_callout = 0;
2591 int length_prevgroup = 0;
2592 register int c;
2593 register uschar *code = *codeptr;
2594 uschar *last_code = code;
2595 uschar *orig_code = code;
2596 uschar *tempcode;
2597 BOOL inescq = FALSE;
2598 BOOL groupsetfirstbyte = FALSE;
2599 const uschar *ptr = *ptrptr;
2600 const uschar *tempptr;
2601 uschar *previous = NULL;
2602 uschar *previous_callout = NULL;
2603 uschar *save_hwm = NULL;
2604 uschar classbits[32];
2605
2606 #ifdef SUPPORT_UTF8
2607 BOOL class_utf8;
2608 BOOL utf8 = (options & PCRE_UTF8) != 0;
2609 uschar *class_utf8data;
2610 uschar *class_utf8data_base;
2611 uschar utf8_char[6];
2612 #else
2613 BOOL utf8 = FALSE;
2614 uschar *utf8_char = NULL;
2615 #endif
2616
2617 #ifdef DEBUG
2618 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2619 #endif
2620
2621 /* Set up the default and non-default settings for greediness */
2622
2623 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2624 greedy_non_default = greedy_default ^ 1;
2625
2626 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2627 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2628 matches a non-fixed char first char; reqbyte just remains unset if we never
2629 find one.
2630
2631 When we hit a repeat whose minimum is zero, we may have to adjust these values
2632 to take the zero repeat into account. This is implemented by setting them to
2633 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2634 item types that can be repeated set these backoff variables appropriately. */
2635
2636 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2637
2638 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2639 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2640 value > 255. It is added into the firstbyte or reqbyte variables to record the
2641 case status of the value. This is used only for ASCII characters. */
2642
2643 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2644
2645 /* Switch on next character until the end of the branch */
2646
2647 for (;; ptr++)
2648 {
2649 BOOL negate_class;
2650 BOOL should_flip_negation;
2651 BOOL possessive_quantifier;
2652 BOOL is_quantifier;
2653 BOOL is_recurse;
2654 BOOL reset_bracount;
2655 int class_charcount;
2656 int class_lastchar;
2657 int newoptions;
2658 int recno;
2659 int refsign;
2660 int skipbytes;
2661 int subreqbyte;
2662 int subfirstbyte;
2663 int terminator;
2664 int mclength;
2665 uschar mcbuffer[8];
2666
2667 /* Get next byte in the pattern */
2668
2669 c = *ptr;
2670
2671 /* If we are in the pre-compile phase, accumulate the length used for the
2672 previous cycle of this loop. */
2673
2674 if (lengthptr != NULL)
2675 {
2676 #ifdef DEBUG
2677 if (code > cd->hwm) cd->hwm = code; /* High water info */
2678 #endif
2679 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2680 {
2681 *errorcodeptr = ERR52;
2682 goto FAILED;
2683 }
2684
2685 /* There is at least one situation where code goes backwards: this is the
2686 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2687 the class is simply eliminated. However, it is created first, so we have to
2688 allow memory for it. Therefore, don't ever reduce the length at this point.
2689 */
2690
2691 if (code < last_code) code = last_code;
2692
2693 /* Paranoid check for integer overflow */
2694
2695 if (OFLOW_MAX - *lengthptr < code - last_code)
2696 {
2697 *errorcodeptr = ERR20;
2698 goto FAILED;
2699 }
2700
2701 *lengthptr += code - last_code;
2702 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2703
2704 /* If "previous" is set and it is not at the start of the work space, move
2705 it back to there, in order to avoid filling up the work space. Otherwise,
2706 if "previous" is NULL, reset the current code pointer to the start. */
2707
2708 if (previous != NULL)
2709 {
2710 if (previous > orig_code)
2711 {
2712 memmove(orig_code, previous, code - previous);
2713 code -= previous - orig_code;
2714 previous = orig_code;
2715 }
2716 }
2717 else code = orig_code;
2718
2719 /* Remember where this code item starts so we can pick up the length
2720 next time round. */
2721
2722 last_code = code;
2723 }
2724
2725 /* In the real compile phase, just check the workspace used by the forward
2726 reference list. */
2727
2728 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2729 {
2730 *errorcodeptr = ERR52;
2731 goto FAILED;
2732 }
2733
2734 /* If in \Q...\E, check for the end; if not, we have a literal */
2735
2736 if (inescq && c != 0)
2737 {
2738 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2739 {
2740 inescq = FALSE;
2741 ptr++;
2742 continue;
2743 }
2744 else
2745 {
2746 if (previous_callout != NULL)
2747 {
2748 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2749 complete_callout(previous_callout, ptr, cd);
2750 previous_callout = NULL;
2751 }
2752 if ((options & PCRE_AUTO_CALLOUT) != 0)
2753 {
2754 previous_callout = code;
2755 code = auto_callout(code, ptr, cd);
2756 }
2757 goto NORMAL_CHAR;
2758 }
2759 }
2760
2761 /* Fill in length of a previous callout, except when the next thing is
2762 a quantifier. */
2763
2764 is_quantifier =
2765 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2766 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2767
2768 if (!is_quantifier && previous_callout != NULL &&
2769 after_manual_callout-- <= 0)
2770 {
2771 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2772 complete_callout(previous_callout, ptr, cd);
2773 previous_callout = NULL;
2774 }
2775
2776 /* In extended mode, skip white space and comments */
2777
2778 if ((options & PCRE_EXTENDED) != 0)
2779 {
2780 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2781 if (c == CHAR_NUMBER_SIGN)
2782 {
2783 while (*(++ptr) != 0)
2784 {
2785 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2786 }
2787 if (*ptr != 0) continue;
2788
2789 /* Else fall through to handle end of string */
2790 c = 0;
2791 }
2792 }
2793
2794 /* No auto callout for quantifiers. */
2795
2796 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2797 {
2798 previous_callout = code;
2799 code = auto_callout(code, ptr, cd);
2800 }
2801
2802 switch(c)
2803 {
2804 /* ===================================================================*/
2805 case 0: /* The branch terminates at string end */
2806 case CHAR_VERTICAL_LINE: /* or | or ) */
2807 case CHAR_RIGHT_PARENTHESIS:
2808 *firstbyteptr = firstbyte;
2809 *reqbyteptr = reqbyte;
2810 *codeptr = code;
2811 *ptrptr = ptr;
2812 if (lengthptr != NULL)
2813 {
2814 if (OFLOW_MAX - *lengthptr < code - last_code)
2815 {
2816 *errorcodeptr = ERR20;
2817 goto FAILED;
2818 }
2819 *lengthptr += code - last_code; /* To include callout length */
2820 DPRINTF((">> end branch\n"));
2821 }
2822 return TRUE;
2823
2824
2825 /* ===================================================================*/
2826 /* Handle single-character metacharacters. In multiline mode, ^ disables
2827 the setting of any following char as a first character. */
2828
2829 case CHAR_CIRCUMFLEX_ACCENT:
2830 if ((options & PCRE_MULTILINE) != 0)
2831 {
2832 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2833 }
2834 previous = NULL;
2835 *code++ = OP_CIRC;
2836 break;
2837
2838 case CHAR_DOLLAR_SIGN:
2839 previous = NULL;
2840 *code++ = OP_DOLL;
2841 break;
2842
2843 /* There can never be a first char if '.' is first, whatever happens about
2844 repeats. The value of reqbyte doesn't change either. */
2845
2846 case CHAR_DOT:
2847 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2848 zerofirstbyte = firstbyte;
2849 zeroreqbyte = reqbyte;
2850 previous = code;
2851 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2852 break;
2853
2854
2855 /* ===================================================================*/
2856 /* Character classes. If the included characters are all < 256, we build a
2857 32-byte bitmap of the permitted characters, except in the special case
2858 where there is only one such character. For negated classes, we build the
2859 map as usual, then invert it at the end. However, we use a different opcode
2860 so that data characters > 255 can be handled correctly.
2861
2862 If the class contains characters outside the 0-255 range, a different
2863 opcode is compiled. It may optionally have a bit map for characters < 256,
2864 but those above are are explicitly listed afterwards. A flag byte tells
2865 whether the bitmap is present, and whether this is a negated class or not.
2866
2867 In JavaScript compatibility mode, an isolated ']' causes an error. In
2868 default (Perl) mode, it is treated as a data character. */
2869
2870 case CHAR_RIGHT_SQUARE_BRACKET:
2871 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2872 {
2873 *errorcodeptr = ERR64;
2874 goto FAILED;
2875 }
2876 goto NORMAL_CHAR;
2877
2878 case CHAR_LEFT_SQUARE_BRACKET:
2879 previous = code;
2880
2881 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2882 they are encountered at the top level, so we'll do that too. */
2883
2884 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2885 ptr[1] == CHAR_EQUALS_SIGN) &&
2886 check_posix_syntax(ptr, &tempptr))
2887 {
2888 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2889 goto FAILED;
2890 }
2891
2892 /* If the first character is '^', set the negation flag and skip it. Also,
2893 if the first few characters (either before or after ^) are \Q\E or \E we
2894 skip them too. This makes for compatibility with Perl. */
2895
2896 negate_class = FALSE;
2897 for (;;)
2898 {
2899 c = *(++ptr);
2900 if (c == CHAR_BACKSLASH)
2901 {
2902 if (ptr[1] == CHAR_E)
2903 ptr++;
2904 else if (strncmp((const char *)ptr+1,
2905 STR_Q STR_BACKSLASH STR_E, 3) == 0)
2906 ptr += 3;
2907 else
2908 break;
2909 }
2910 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2911 negate_class = TRUE;
2912 else break;
2913 }
2914
2915 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2916 an initial ']' is taken as a data character -- the code below handles
2917 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2918 [^] must match any character, so generate OP_ALLANY. */
2919
2920 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2921 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2922 {
2923 *code++ = negate_class? OP_ALLANY : OP_FAIL;
2924 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2925 zerofirstbyte = firstbyte;
2926 break;
2927 }
2928
2929 /* If a class contains a negative special such as \S, we need to flip the
2930 negation flag at the end, so that support for characters > 255 works
2931 correctly (they are all included in the class). */
2932
2933 should_flip_negation = FALSE;
2934
2935 /* Keep a count of chars with values < 256 so that we can optimize the case
2936 of just a single character (as long as it's < 256). However, For higher
2937 valued UTF-8 characters, we don't yet do any optimization. */
2938
2939 class_charcount = 0;
2940 class_lastchar = -1;
2941
2942 /* Initialize the 32-char bit map to all zeros. We build the map in a
2943 temporary bit of memory, in case the class contains only 1 character (less
2944 than 256), because in that case the compiled code doesn't use the bit map.
2945 */
2946
2947 memset(classbits, 0, 32 * sizeof(uschar));
2948
2949 #ifdef SUPPORT_UTF8
2950 class_utf8 = FALSE; /* No chars >= 256 */
2951 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2952 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2953 #endif
2954
2955 /* Process characters until ] is reached. By writing this as a "do" it
2956 means that an initial ] is taken as a data character. At the start of the
2957 loop, c contains the first byte of the character. */
2958
2959 if (c != 0) do
2960 {
2961 const uschar *oldptr;
2962
2963 #ifdef SUPPORT_UTF8
2964 if (utf8 && c > 127)
2965 { /* Braces are required because the */
2966 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2967 }
2968
2969 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2970 data and reset the pointer. This is so that very large classes that
2971 contain a zillion UTF-8 characters no longer overwrite the work space
2972 (which is on the stack). */
2973
2974 if (lengthptr != NULL)
2975 {
2976 *lengthptr += class_utf8data - class_utf8data_base;
2977 class_utf8data = class_utf8data_base;
2978 }
2979
2980 #endif
2981
2982 /* Inside \Q...\E everything is literal except \E */
2983
2984 if (inescq)
2985 {
2986 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
2987 {
2988 inescq = FALSE; /* Reset literal state */
2989 ptr++; /* Skip the 'E' */
2990 continue; /* Carry on with next */
2991 }
2992 goto CHECK_RANGE; /* Could be range if \E follows */
2993 }
2994
2995 /* Handle POSIX class names. Perl allows a negation extension of the
2996 form [:^name:]. A square bracket that doesn't match the syntax is
2997 treated as a literal. We also recognize the POSIX constructions
2998 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2999 5.6 and 5.8 do. */
3000
3001 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3002 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3003 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3004 {
3005 BOOL local_negate = FALSE;
3006 int posix_class, taboffset, tabopt;
3007 register const uschar *cbits = cd->cbits;
3008 uschar pbits[32];
3009
3010 if (ptr[1] != CHAR_COLON)
3011 {
3012 *errorcodeptr = ERR31;
3013 goto FAILED;
3014 }
3015
3016 ptr += 2;
3017 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3018 {
3019 local_negate = TRUE;
3020 should_flip_negation = TRUE; /* Note negative special */
3021 ptr++;
3022 }
3023
3024 posix_class = check_posix_name(ptr, tempptr - ptr);
3025 if (posix_class < 0)
3026 {
3027 *errorcodeptr = ERR30;
3028 goto FAILED;
3029 }
3030
3031 /* If matching is caseless, upper and lower are converted to
3032 alpha. This relies on the fact that the class table starts with
3033 alpha, lower, upper as the first 3 entries. */
3034
3035 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3036 posix_class = 0;
3037
3038 /* We build the bit map for the POSIX class in a chunk of local store
3039 because we may be adding and subtracting from it, and we don't want to
3040 subtract bits that may be in the main map already. At the end we or the
3041 result into the bit map that is being built. */
3042
3043 posix_class *= 3;
3044
3045 /* Copy in the first table (always present) */
3046
3047 memcpy(pbits, cbits + posix_class_maps[posix_class],
3048 32 * sizeof(uschar));
3049
3050 /* If there is a second table, add or remove it as required. */
3051
3052 taboffset = posix_class_maps[posix_class + 1];
3053 tabopt = posix_class_maps[posix_class + 2];
3054
3055 if (taboffset >= 0)
3056 {
3057 if (tabopt >= 0)
3058 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3059 else
3060 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3061 }
3062
3063 /* Now see if we need to remove any special characters. An option
3064 value of 1 removes vertical space and 2 removes underscore. */
3065
3066 if (tabopt < 0) tabopt = -tabopt;
3067 if (tabopt == 1) pbits[1] &= ~0x3c;
3068 else if (tabopt == 2) pbits[11] &= 0x7f;
3069
3070 /* Add the POSIX table or its complement into the main table that is
3071 being built and we are done. */
3072
3073 if (local_negate)
3074 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3075 else
3076 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3077
3078 ptr = tempptr + 1;
3079 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3080 continue; /* End of POSIX syntax handling */
3081 }
3082
3083 /* Backslash may introduce a single character, or it may introduce one
3084 of the specials, which just set a flag. The sequence \b is a special
3085 case. Inside a class (and only there) it is treated as backspace.
3086 Elsewhere it marks a word boundary. Other escapes have preset maps ready
3087 to 'or' into the one we are building. We assume they have more than one
3088 character in them, so set class_charcount bigger than one. */
3089
3090 if (c == CHAR_BACKSLASH)
3091 {
3092 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3093 if (*errorcodeptr != 0) goto FAILED;
3094
3095 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3096 else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3097 else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3098 else if (-c == ESC_Q) /* Handle start of quoted string */
3099 {
3100 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3101 {
3102 ptr += 2; /* avoid empty string */
3103 }
3104 else inescq = TRUE;
3105 continue;
3106 }
3107 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3108
3109 if (c < 0)
3110 {
3111 register const uschar *cbits = cd->cbits;
3112 class_charcount += 2; /* Greater than 1 is what matters */
3113
3114 /* Save time by not doing this in the pre-compile phase. */
3115
3116 if (lengthptr == NULL) switch (-c)
3117 {
3118 case ESC_d:
3119 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3120 continue;
3121
3122 case ESC_D:
3123 should_flip_negation = TRUE;
3124 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3125 continue;
3126
3127 case ESC_w:
3128 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3129 continue;
3130
3131 case ESC_W:
3132 should_flip_negation = TRUE;
3133 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3134 continue;
3135
3136 case ESC_s:
3137 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3138 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3139 continue;
3140
3141 case ESC_S:
3142 should_flip_negation = TRUE;
3143 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3144 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3145 continue;
3146
3147 default: /* Not recognized; fall through */
3148 break; /* Need "default" setting to stop compiler warning. */
3149 }
3150
3151 /* In the pre-compile phase, just do the recognition. */
3152
3153 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3154 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3155
3156 /* We need to deal with \H, \h, \V, and \v in both phases because
3157 they use extra memory. */
3158
3159 if (-c == ESC_h)
3160 {
3161 SETBIT(classbits, 0x09); /* HT */
3162 SETBIT(classbits, 0x20); /* SPACE */
3163 SETBIT(classbits, 0xa0); /* NSBP */
3164 #ifdef SUPPORT_UTF8
3165 if (utf8)
3166 {
3167 class_utf8 = TRUE;
3168 *class_utf8data++ = XCL_SINGLE;
3169 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3170 *class_utf8data++ = XCL_SINGLE;
3171 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3172 *class_utf8data++ = XCL_RANGE;
3173 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3174 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3175 *class_utf8data++ = XCL_SINGLE;
3176 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3177 *class_utf8data++ = XCL_SINGLE;
3178 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3179 *class_utf8data++ = XCL_SINGLE;
3180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3181 }
3182 #endif
3183 continue;
3184 }
3185
3186 if (-c == ESC_H)
3187 {
3188 for (c = 0; c < 32; c++)
3189 {
3190 int x = 0xff;
3191 switch (c)
3192 {
3193 case 0x09/8: x ^= 1 << (0x09%8); break;
3194 case 0x20/8: x ^= 1 << (0x20%8); break;
3195 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3196 default: break;
3197 }
3198 classbits[c] |= x;
3199 }
3200
3201 #ifdef SUPPORT_UTF8
3202 if (utf8)
3203 {
3204 class_utf8 = TRUE;
3205 *class_utf8data++ = XCL_RANGE;
3206 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3207 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3208 *class_utf8data++ = XCL_RANGE;
3209 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3210 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3211 *class_utf8data++ = XCL_RANGE;
3212 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3213 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3214 *class_utf8data++ = XCL_RANGE;
3215 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3216 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3217 *class_utf8data++ = XCL_RANGE;
3218 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3219 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3220 *class_utf8data++ = XCL_RANGE;
3221 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3222 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3223 *class_utf8data++ = XCL_RANGE;
3224 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3225 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3226 }
3227 #endif
3228 continue;
3229 }
3230
3231 if (-c == ESC_v)
3232 {
3233 SETBIT(classbits, 0x0a); /* LF */
3234 SETBIT(classbits, 0x0b); /* VT */
3235 SETBIT(classbits, 0x0c); /* FF */
3236 SETBIT(classbits, 0x0d); /* CR */
3237 SETBIT(classbits, 0x85); /* NEL */
3238 #ifdef SUPPORT_UTF8
3239 if (utf8)
3240 {
3241 class_utf8 = TRUE;
3242 *class_utf8data++ = XCL_RANGE;
3243 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3244 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3245 }
3246 #endif
3247 continue;
3248 }
3249
3250 if (-c == ESC_V)
3251 {
3252 for (c = 0; c < 32; c++)
3253 {
3254 int x = 0xff;
3255 switch (c)
3256 {
3257 case 0x0a/8: x ^= 1 << (0x0a%8);
3258 x ^= 1 << (0x0b%8);
3259 x ^= 1 << (0x0c%8);
3260 x ^= 1 << (0x0d%8);
3261 break;
3262 case 0x85/8: x ^= 1 << (0x85%8); break;
3263 default: break;
3264 }
3265 classbits[c] |= x;
3266 }
3267
3268 #ifdef SUPPORT_UTF8
3269 if (utf8)
3270 {
3271 class_utf8 = TRUE;
3272 *class_utf8data++ = XCL_RANGE;
3273 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3274 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3275 *class_utf8data++ = XCL_RANGE;
3276 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3277 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3278 }
3279 #endif
3280 continue;
3281 }
3282
3283 /* We need to deal with \P and \p in both phases. */
3284
3285 #ifdef SUPPORT_UCP
3286 if (-c == ESC_p || -c == ESC_P)
3287 {
3288 BOOL negated;
3289 int pdata;
3290 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3291 if (ptype < 0) goto FAILED;
3292 class_utf8 = TRUE;
3293 *class_utf8data++ = ((-c == ESC_p) != negated)?
3294 XCL_PROP : XCL_NOTPROP;
3295 *class_utf8data++ = ptype;
3296 *class_utf8data++ = pdata;
3297 class_charcount -= 2; /* Not a < 256 character */
3298 continue;
3299 }
3300 #endif
3301 /* Unrecognized escapes are faulted if PCRE is running in its
3302 strict mode. By default, for compatibility with Perl, they are
3303 treated as literals. */
3304
3305 if ((options & PCRE_EXTRA) != 0)
3306 {
3307 *errorcodeptr = ERR7;
3308 goto FAILED;
3309 }
3310
3311 class_charcount -= 2; /* Undo the default count from above */
3312 c = *ptr; /* Get the final character and fall through */
3313 }
3314
3315 /* Fall through if we have a single character (c >= 0). This may be
3316 greater than 256 in UTF-8 mode. */
3317
3318 } /* End of backslash handling */
3319
3320 /* A single character may be followed by '-' to form a range. However,
3321 Perl does not permit ']' to be the end of the range. A '-' character
3322 at the end is treated as a literal. Perl ignores orphaned \E sequences
3323 entirely. The code for handling \Q and \E is messy. */
3324
3325 CHECK_RANGE:
3326 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3327 {
3328 inescq = FALSE;
3329 ptr += 2;
3330 }
3331
3332 oldptr = ptr;
3333
3334 /* Remember \r or \n */
3335
3336 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3337
3338 /* Check for range */
3339
3340 if (!inescq && ptr[1] == CHAR_MINUS)
3341 {
3342 int d;
3343 ptr += 2;
3344 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3345
3346 /* If we hit \Q (not followed by \E) at this point, go into escaped
3347 mode. */
3348
3349 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3350 {
3351 ptr += 2;
3352 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3353 { ptr += 2; continue; }
3354 inescq = TRUE;
3355 break;
3356 }
3357
3358 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3359 {
3360 ptr = oldptr;
3361 goto LONE_SINGLE_CHARACTER;
3362 }
3363
3364 #ifdef SUPPORT_UTF8
3365 if (utf8)
3366 { /* Braces are required because the */
3367 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3368 }
3369 else
3370 #endif
3371 d = *ptr; /* Not UTF-8 mode */
3372
3373 /* The second part of a range can be a single-character escape, but
3374 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3375 in such circumstances. */
3376
3377 if (!inescq && d == CHAR_BACKSLASH)
3378 {
3379 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3380 if (*errorcodeptr != 0) goto FAILED;
3381
3382 /* \b is backspace; \X is literal X; \R is literal R; any other
3383 special means the '-' was literal */
3384
3385 if (d < 0)
3386 {
3387 if (d == -ESC_b) d = CHAR_BS;
3388 else if (d == -ESC_X) d = CHAR_X;
3389 else if (d == -ESC_R) d = CHAR_R; else
3390 {
3391 ptr = oldptr;
3392 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3393 }
3394 }
3395 }
3396
3397 /* Check that the two values are in the correct order. Optimize
3398 one-character ranges */
3399
3400 if (d < c)
3401 {
3402 *errorcodeptr = ERR8;
3403 goto FAILED;
3404 }
3405
3406 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3407
3408 /* Remember \r or \n */
3409
3410 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3411
3412 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3413 matching, we have to use an XCLASS with extra data items. Caseless
3414 matching for characters > 127 is available only if UCP support is
3415 available. */
3416
3417 #ifdef SUPPORT_UTF8
3418 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3419 {
3420 class_utf8 = TRUE;
3421
3422 /* With UCP support, we can find the other case equivalents of
3423 the relevant characters. There may be several ranges. Optimize how
3424 they fit with the basic range. */
3425
3426 #ifdef SUPPORT_UCP
3427 if ((options & PCRE_CASELESS) != 0)
3428 {
3429 unsigned int occ, ocd;
3430 unsigned int cc = c;
3431 unsigned int origd = d;
3432 while (get_othercase_range(&cc, origd, &occ, &ocd))
3433 {
3434 if (occ >= (unsigned int)c &&
3435 ocd <= (unsigned int)d)
3436 continue; /* Skip embedded ranges */
3437
3438 if (occ < (unsigned int)c &&
3439 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3440 { /* if there is overlap, */
3441 c = occ; /* noting that if occ < c */
3442 continue; /* we can't have ocd > d */
3443 } /* because a subrange is */
3444 if (ocd > (unsigned int)d &&
3445 occ <= (unsigned int)d + 1) /* always shorter than */
3446 { /* the basic range. */
3447 d = ocd;
3448 continue;
3449 }
3450
3451 if (occ == ocd)
3452 {
3453 *class_utf8data++ = XCL_SINGLE;
3454 }
3455 else
3456 {
3457 *class_utf8data++ = XCL_RANGE;
3458 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3459 }
3460 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3461 }
3462 }
3463 #endif /* SUPPORT_UCP */
3464
3465 /* Now record the original range, possibly modified for UCP caseless
3466 overlapping ranges. */
3467
3468 *class_utf8data++ = XCL_RANGE;
3469 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3470 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3471
3472 /* With UCP support, we are done. Without UCP support, there is no
3473 caseless matching for UTF-8 characters > 127; we can use the bit map
3474 for the smaller ones. */
3475
3476 #ifdef SUPPORT_UCP
3477 continue; /* With next character in the class */
3478 #else
3479 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3480
3481 /* Adjust upper limit and fall through to set up the map */
3482
3483 d = 127;
3484
3485 #endif /* SUPPORT_UCP */
3486 }
3487 #endif /* SUPPORT_UTF8 */
3488
3489 /* We use the bit map for all cases when not in UTF-8 mode; else
3490 ranges that lie entirely within 0-127 when there is UCP support; else
3491 for partial ranges without UCP support. */
3492
3493 class_charcount += d - c + 1;
3494 class_lastchar = d;
3495
3496 /* We can save a bit of time by skipping this in the pre-compile. */
3497
3498 if (lengthptr == NULL) for (; c <= d; c++)
3499 {
3500 classbits[c/8] |= (1 << (c&7));
3501 if ((options & PCRE_CASELESS) != 0)
3502 {
3503 int uc = cd->fcc[c]; /* flip case */
3504 classbits[uc/8] |= (1 << (uc&7));
3505 }
3506 }
3507
3508 continue; /* Go get the next char in the class */
3509 }
3510
3511 /* Handle a lone single character - we can get here for a normal
3512 non-escape char, or after \ that introduces a single character or for an
3513 apparent range that isn't. */
3514
3515 LONE_SINGLE_CHARACTER:
3516
3517 /* Handle a character that cannot go in the bit map */
3518
3519 #ifdef SUPPORT_UTF8
3520 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3521 {
3522 class_utf8 = TRUE;
3523 *class_utf8data++ = XCL_SINGLE;
3524 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3525
3526 #ifdef SUPPORT_UCP
3527 if ((options & PCRE_CASELESS) != 0)
3528 {
3529 unsigned int othercase;
3530 if ((othercase = UCD_OTHERCASE(c)) != c)
3531 {
3532 *class_utf8data++ = XCL_SINGLE;
3533 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3534 }
3535 }
3536 #endif /* SUPPORT_UCP */
3537
3538 }
3539 else
3540 #endif /* SUPPORT_UTF8 */
3541
3542 /* Handle a single-byte character */
3543 {
3544 classbits[c/8] |= (1 << (c&7));
3545 if ((options & PCRE_CASELESS) != 0)
3546 {
3547 c = cd->fcc[c]; /* flip case */
3548 classbits[c/8] |= (1 << (c&7));
3549 }
3550 class_charcount++;
3551 class_lastchar = c;
3552 }
3553 }
3554
3555 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3556
3557 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3558
3559 if (c == 0) /* Missing terminating ']' */
3560 {
3561 *errorcodeptr = ERR6;
3562 goto FAILED;
3563 }
3564
3565
3566 /* This code has been disabled because it would mean that \s counts as
3567 an explicit \r or \n reference, and that's not really what is wanted. Now
3568 we set the flag only if there is a literal "\r" or "\n" in the class. */
3569
3570 #if 0
3571 /* Remember whether \r or \n are in this class */
3572
3573 if (negate_class)
3574 {
3575 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3576 }
3577 else
3578 {
3579 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3580 }
3581 #endif
3582
3583
3584 /* If class_charcount is 1, we saw precisely one character whose value is
3585 less than 256. As long as there were no characters >= 128 and there was no
3586 use of \p or \P, in other words, no use of any XCLASS features, we can
3587 optimize.
3588
3589 In UTF-8 mode, we can optimize the negative case only if there were no
3590 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3591 operate on single-bytes only. This is an historical hangover. Maybe one day
3592 we can tidy these opcodes to handle multi-byte characters.
3593
3594 The optimization throws away the bit map. We turn the item into a
3595 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3596 that OP_NOT does not support multibyte characters. In the positive case, it
3597 can cause firstbyte to be set. Otherwise, there can be no first char if
3598 this item is first, whatever repeat count may follow. In the case of
3599 reqbyte, save the previous value for reinstating. */
3600
3601 #ifdef SUPPORT_UTF8
3602 if (class_charcount == 1 && !class_utf8 &&
3603 (!utf8 || !negate_class || class_lastchar < 128))
3604 #else
3605 if (class_charcount == 1)
3606 #endif
3607 {
3608 zeroreqbyte = reqbyte;
3609
3610 /* The OP_NOT opcode works on one-byte characters only. */
3611
3612 if (negate_class)
3613 {
3614 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3615 zerofirstbyte = firstbyte;
3616 *code++ = OP_NOT;
3617 *code++ = class_lastchar;
3618 break;
3619 }
3620
3621 /* For a single, positive character, get the value into mcbuffer, and
3622 then we can handle this with the normal one-character code. */
3623
3624 #ifdef SUPPORT_UTF8
3625 if (utf8 && class_lastchar > 127)
3626 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3627 else
3628 #endif
3629 {
3630 mcbuffer[0] = class_lastchar;
3631 mclength = 1;
3632 }
3633 goto ONE_CHAR;
3634 } /* End of 1-char optimization */
3635
3636 /* The general case - not the one-char optimization. If this is the first
3637 thing in the branch, there can be no first char setting, whatever the
3638 repeat count. Any reqbyte setting must remain unchanged after any kind of
3639 repeat. */
3640
3641 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3642 zerofirstbyte = firstbyte;
3643 zeroreqbyte = reqbyte;
3644
3645 /* If there are characters with values > 255, we have to compile an
3646 extended class, with its own opcode, unless there was a negated special
3647 such as \S in the class, because in that case all characters > 255 are in
3648 the class, so any that were explicitly given as well can be ignored. If
3649 (when there are explicit characters > 255 that must be listed) there are no
3650 characters < 256, we can omit the bitmap in the actual compiled code. */
3651
3652 #ifdef SUPPORT_UTF8
3653 if (class_utf8 && !should_flip_negation)
3654 {
3655 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3656 *code++ = OP_XCLASS;
3657 code += LINK_SIZE;
3658 *code = negate_class? XCL_NOT : 0;
3659
3660 /* If the map is required, move up the extra data to make room for it;
3661 otherwise just move the code pointer to the end of the extra data. */
3662
3663 if (class_charcount > 0)
3664 {
3665 *code++ |= XCL_MAP;
3666 memmove(code + 32, code, class_utf8data - code);
3667 memcpy(code, classbits, 32);
3668 code = class_utf8data + 32;
3669 }
3670 else code = class_utf8data;
3671
3672 /* Now fill in the complete length of the item */
3673
3674 PUT(previous, 1, code - previous);
3675 break; /* End of class handling */
3676 }
3677 #endif
3678
3679 /* If there are no characters > 255, set the opcode to OP_CLASS or
3680 OP_NCLASS, depending on whether the whole class was negated and whether
3681 there were negative specials such as \S in the class. Then copy the 32-byte
3682 map into the code vector, negating it if necessary. */
3683
3684 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3685 if (negate_class)
3686 {
3687 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3688 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3689 }
3690 else
3691 {
3692 memcpy(code, classbits, 32);
3693 }
3694 code += 32;
3695 break;
3696
3697
3698 /* ===================================================================*/
3699 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3700 has been tested above. */
3701
3702 case CHAR_LEFT_CURLY_BRACKET:
3703 if (!is_quantifier) goto NORMAL_CHAR;
3704 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3705 if (*errorcodeptr != 0) goto FAILED;
3706 goto REPEAT;
3707
3708 case CHAR_ASTERISK:
3709 repeat_min = 0;
3710 repeat_max = -1;
3711 goto REPEAT;
3712
3713 case CHAR_PLUS:
3714 repeat_min = 1;
3715 repeat_max = -1;
3716 goto REPEAT;
3717
3718 case CHAR_QUESTION_MARK:
3719 repeat_min = 0;
3720 repeat_max = 1;
3721
3722 REPEAT:
3723 if (previous == NULL)
3724 {
3725 *errorcodeptr = ERR9;
3726 goto FAILED;
3727 }
3728
3729 if (repeat_min == 0)
3730 {
3731 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3732 reqbyte = zeroreqbyte; /* Ditto */
3733 }
3734
3735 /* Remember whether this is a variable length repeat */
3736
3737 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3738
3739 op_type = 0; /* Default single-char op codes */
3740 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3741
3742 /* Save start of previous item, in case we have to move it up to make space
3743 for an inserted OP_ONCE for the additional '+' extension. */
3744
3745 tempcode = previous;
3746
3747 /* If the next character is '+', we have a possessive quantifier. This
3748 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3749 If the next character is '?' this is a minimizing repeat, by default,
3750 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3751 repeat type to the non-default. */
3752
3753 if (ptr[1] == CHAR_PLUS)
3754 {
3755 repeat_type = 0; /* Force greedy */
3756 possessive_quantifier = TRUE;
3757 ptr++;
3758 }
3759 else if (ptr[1] == CHAR_QUESTION_MARK)
3760 {
3761 repeat_type = greedy_non_default;
3762 ptr++;
3763 }
3764 else repeat_type = greedy_default;
3765
3766 /* If previous was a character match, abolish the item and generate a
3767 repeat item instead. If a char item has a minimum of more than one, ensure
3768 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3769 the first thing in a branch because the x will have gone into firstbyte
3770 instead. */
3771
3772 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3773 {
3774 /* Deal with UTF-8 characters that take up more than one byte. It's
3775 easier to write this out separately than try to macrify it. Use c to
3776 hold the length of the character in bytes, plus 0x80 to flag that it's a
3777 length rather than a small character. */
3778
3779 #ifdef SUPPORT_UTF8
3780 if (utf8 && (code[-1] & 0x80) != 0)
3781 {
3782 uschar *lastchar = code - 1;
3783 while((*lastchar & 0xc0) == 0x80) lastchar--;
3784 c = code - lastchar; /* Length of UTF-8 character */
3785 memcpy(utf8_char, lastchar, c); /* Save the char */
3786 c |= 0x80; /* Flag c as a length */
3787 }
3788 else
3789 #endif
3790
3791 /* Handle the case of a single byte - either with no UTF8 support, or
3792 with UTF-8 disabled, or for a UTF-8 character < 128. */
3793
3794 {
3795 c = code[-1];
3796 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3797 }
3798
3799 /* If the repetition is unlimited, it pays to see if the next thing on
3800 the line is something that cannot possibly match this character. If so,
3801 automatically possessifying this item gains some performance in the case
3802 where the match fails. */
3803
3804 if (!possessive_quantifier &&
3805 repeat_max < 0 &&
3806 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3807 options, cd))
3808 {
3809 repeat_type = 0; /* Force greedy */
3810 possessive_quantifier = TRUE;
3811 }
3812
3813 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3814 }
3815
3816 /* If previous was a single negated character ([^a] or similar), we use
3817 one of the special opcodes, replacing it. The code is shared with single-
3818 character repeats by setting op_type to add a suitable offset into
3819 repeat_type. We can also test for auto-possessification. OP_NOT is
3820 currently used only for single-byte chars. */
3821
3822 else if (*previous == OP_NOT)
3823 {
3824 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3825 c = previous[1];
3826 if (!possessive_quantifier &&
3827 repeat_max < 0 &&
3828 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3829 {
3830 repeat_type = 0; /* Force greedy */
3831 possessive_quantifier = TRUE;
3832 }
3833 goto OUTPUT_SINGLE_REPEAT;
3834 }
3835
3836 /* If previous was a character type match (\d or similar), abolish it and
3837 create a suitable repeat item. The code is shared with single-character
3838 repeats by setting op_type to add a suitable offset into repeat_type. Note
3839 that the Unicode property types will be present only when SUPPORT_UCP is
3840 defined, but we don't wrap the little bits of code here because it just
3841 makes it horribly messy. */
3842
3843 else if (*previous < OP_EODN)
3844 {
3845 uschar *oldcode;
3846 int prop_type, prop_value;
3847 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3848 c = *previous;
3849
3850 if (!possessive_quantifier &&
3851 repeat_max < 0 &&
3852 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3853 {
3854 repeat_type = 0; /* Force greedy */
3855 possessive_quantifier = TRUE;
3856 }
3857
3858 OUTPUT_SINGLE_REPEAT:
3859 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3860 {
3861 prop_type = previous[1];
3862 prop_value = previous[2];
3863 }
3864 else prop_type = prop_value = -1;
3865
3866 oldcode = code;
3867 code = previous; /* Usually overwrite previous item */
3868
3869 /* If the maximum is zero then the minimum must also be zero; Perl allows
3870 this case, so we do too - by simply omitting the item altogether. */
3871
3872 if (repeat_max == 0) goto END_REPEAT;
3873
3874 /*--------------------------------------------------------------------*/
3875 /* This code is obsolete from release 8.00; the restriction was finally
3876 removed: */
3877
3878 /* All real repeats make it impossible to handle partial matching (maybe
3879 one day we will be able to remove this restriction). */
3880
3881 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3882 /*--------------------------------------------------------------------*/
3883
3884 /* Combine the op_type with the repeat_type */
3885
3886 repeat_type += op_type;
3887
3888 /* A minimum of zero is handled either as the special case * or ?, or as
3889 an UPTO, with the maximum given. */
3890
3891 if (repeat_min == 0)
3892 {
3893 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3894 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3895 else
3896 {
3897 *code++ = OP_UPTO + repeat_type;
3898 PUT2INC(code, 0, repeat_max);
3899 }
3900 }
3901
3902 /* A repeat minimum of 1 is optimized into some special cases. If the
3903 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3904 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3905 one less than the maximum. */
3906
3907 else if (repeat_min == 1)
3908 {
3909 if (repeat_max == -1)
3910 *code++ = OP_PLUS + repeat_type;
3911 else
3912 {
3913 code = oldcode; /* leave previous item in place */
3914 if (repeat_max == 1) goto END_REPEAT;
3915 *code++ = OP_UPTO + repeat_type;
3916 PUT2INC(code, 0, repeat_max - 1);
3917 }
3918 }
3919
3920 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3921 handled as an EXACT followed by an UPTO. */
3922
3923 else
3924 {
3925 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3926 PUT2INC(code, 0, repeat_min);
3927
3928 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3929 we have to insert the character for the previous code. For a repeated
3930 Unicode property match, there are two extra bytes that define the
3931 required property. In UTF-8 mode, long characters have their length in
3932 c, with the 0x80 bit as a flag. */
3933
3934 if (repeat_max < 0)
3935 {
3936 #ifdef SUPPORT_UTF8
3937 if (utf8 && c >= 128)
3938 {
3939 memcpy(code, utf8_char, c & 7);
3940 code += c & 7;
3941 }
3942 else
3943 #endif
3944 {
3945 *code++ = c;
3946 if (prop_type >= 0)
3947 {
3948 *code++ = prop_type;
3949 *code++ = prop_value;
3950 }
3951 }
3952 *code++ = OP_STAR + repeat_type;
3953 }
3954
3955 /* Else insert an UPTO if the max is greater than the min, again
3956 preceded by the character, for the previously inserted code. If the
3957 UPTO is just for 1 instance, we can use QUERY instead. */
3958
3959 else if (repeat_max != repeat_min)
3960 {
3961 #ifdef SUPPORT_UTF8
3962 if (utf8 && c >= 128)
3963 {
3964 memcpy(code, utf8_char, c & 7);
3965 code += c & 7;
3966 }
3967 else
3968 #endif
3969 *code++ = c;
3970 if (prop_type >= 0)
3971 {
3972 *code++ = prop_type;
3973 *code++ = prop_value;
3974 }
3975 repeat_max -= repeat_min;
3976
3977 if (repeat_max == 1)
3978 {
3979 *code++ = OP_QUERY + repeat_type;
3980 }
3981 else
3982 {
3983 *code++ = OP_UPTO + repeat_type;
3984 PUT2INC(code, 0, repeat_max);
3985 }
3986 }
3987 }
3988
3989 /* The character or character type itself comes last in all cases. */
3990
3991 #ifdef SUPPORT_UTF8
3992 if (utf8 && c >= 128)
3993 {
3994 memcpy(code, utf8_char, c & 7);
3995 code += c & 7;
3996 }
3997 else
3998 #endif
3999 *code++ = c;
4000
4001 /* For a repeated Unicode property match, there are two extra bytes that
4002 define the required property. */
4003
4004 #ifdef SUPPORT_UCP
4005 if (prop_type >= 0)
4006 {
4007 *code++ = prop_type;
4008 *code++ = prop_value;
4009 }
4010 #endif
4011 }
4012
4013 /* If previous was a character class or a back reference, we put the repeat
4014 stuff after it, but just skip the item if the repeat was {0,0}. */
4015
4016 else if (*previous == OP_CLASS ||
4017 *previous == OP_NCLASS ||
4018 #ifdef SUPPORT_UTF8
4019 *previous == OP_XCLASS ||
4020 #endif
4021 *previous == OP_REF)
4022 {
4023 if (repeat_max == 0)
4024 {
4025 code = previous;
4026 goto END_REPEAT;
4027 }
4028
4029 /*--------------------------------------------------------------------*/
4030 /* This code is obsolete from release 8.00; the restriction was finally
4031 removed: */
4032
4033 /* All real repeats make it impossible to handle partial matching (maybe
4034 one day we will be able to remove this restriction). */
4035
4036 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4037 /*--------------------------------------------------------------------*/
4038
4039 if (repeat_min == 0 && repeat_max == -1)
4040 *code++ = OP_CRSTAR + repeat_type;
4041 else if (repeat_min == 1 && repeat_max == -1)
4042 *code++ = OP_CRPLUS + repeat_type;
4043 else if (repeat_min == 0 && repeat_max == 1)
4044 *code++ = OP_CRQUERY + repeat_type;
4045 else
4046 {
4047 *code++ = OP_CRRANGE + repeat_type;
4048 PUT2INC(code, 0, repeat_min);
4049 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4050 PUT2INC(code, 0, repeat_max);
4051 }
4052 }
4053
4054 /* If previous was a bracket group, we may have to replicate it in certain
4055 cases. */
4056
4057 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4058 *previous == OP_ONCE || *previous == OP_COND)
4059 {
4060 register int i;
4061 int ketoffset = 0;
4062 int len = code - previous;
4063 uschar *bralink = NULL;
4064
4065 /* Repeating a DEFINE group is pointless */
4066
4067 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4068 {
4069 *errorcodeptr = ERR55;
4070 goto FAILED;
4071 }
4072
4073 /* If the maximum repeat count is unlimited, find the end of the bracket
4074 by scanning through from the start, and compute the offset back to it
4075 from the current code pointer. There may be an OP_OPT setting following
4076 the final KET, so we can't find the end just by going back from the code
4077 pointer. */
4078
4079 if (repeat_max == -1)
4080 {
4081 register uschar *ket = previous;
4082 do ket += GET(ket, 1); while (*ket != OP_KET);
4083 ketoffset = code - ket;
4084 }
4085
4086 /* The case of a zero minimum is special because of the need to stick
4087 OP_BRAZERO in front of it, and because the group appears once in the
4088 data, whereas in other cases it appears the minimum number of times. For
4089 this reason, it is simplest to treat this case separately, as otherwise
4090 the code gets far too messy. There are several special subcases when the
4091 minimum is zero. */
4092
4093 if (repeat_min == 0)
4094 {
4095 /* If the maximum is also zero, we used to just omit the group from the
4096 output altogether, like this:
4097
4098 ** if (repeat_max == 0)
4099 ** {
4100 ** code = previous;
4101 ** goto END_REPEAT;
4102 ** }
4103
4104 However, that fails when a group is referenced as a subroutine from
4105 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4106 so that it is skipped on execution. As we don't have a list of which
4107 groups are referenced, we cannot do this selectively.
4108
4109 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4110 and do no more at this point. However, we do need to adjust any
4111 OP_RECURSE calls inside the group that refer to the group itself or any
4112 internal or forward referenced group, because the offset is from the
4113 start of the whole regex. Temporarily terminate the pattern while doing
4114 this. */
4115
4116 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4117 {
4118 *code = OP_END;
4119 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4120 memmove(previous+1, previous, len);
4121 code++;
4122 if (repeat_max == 0)
4123 {
4124 *previous++ = OP_SKIPZERO;
4125 goto END_REPEAT;
4126 }
4127 *previous++ = OP_BRAZERO + repeat_type;
4128 }
4129
4130 /* If the maximum is greater than 1 and limited, we have to replicate
4131 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4132 The first one has to be handled carefully because it's the original
4133 copy, which has to be moved up. The remainder can be handled by code
4134 that is common with the non-zero minimum case below. We have to
4135 adjust the value of repeat_max, since one less copy is required. Once
4136 again, we may have to adjust any OP_RECURSE calls inside the group. */
4137
4138 else
4139 {
4140 int offset;
4141 *code = OP_END;
4142 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4143 memmove(previous + 2 + LINK_SIZE, previous, len);
4144 code += 2 + LINK_SIZE;
4145 *previous++ = OP_BRAZERO + repeat_type;
4146 *previous++ = OP_BRA;
4147
4148 /* We chain together the bracket offset fields that have to be
4149 filled in later when the ends of the brackets are reached. */
4150
4151 offset = (bralink == NULL)? 0 : previous - bralink;
4152 bralink = previous;
4153 PUTINC(previous, 0, offset);
4154 }
4155
4156 repeat_max--;
4157 }
4158
4159 /* If the minimum is greater than zero, replicate the group as many
4160 times as necessary, and adjust the maximum to the number of subsequent
4161 copies that we need. If we set a first char from the group, and didn't
4162 set a required char, copy the latter from the former. If there are any
4163 forward reference subroutine calls in the group, there will be entries on
4164 the workspace list; replicate these with an appropriate increment. */
4165
4166 else
4167 {
4168 if (repeat_min > 1)
4169 {
4170 /* In the pre-compile phase, we don't actually do the replication. We
4171 just adjust the length as if we had. Do some paranoid checks for
4172 potential integer overflow. */
4173
4174 if (lengthptr != NULL)
4175 {
4176 int delta = (repeat_min - 1)*length_prevgroup;
4177 if ((double)(repeat_min - 1)*(double)length_prevgroup >
4178 (double)INT_MAX ||
4179 OFLOW_MAX - *lengthptr < delta)
4180 {
4181 *errorcodeptr = ERR20;
4182 goto FAILED;
4183 }
4184 *lengthptr += delta;
4185 }
4186
4187 /* This is compiling for real */
4188
4189 else
4190 {
4191 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4192 for (i = 1; i < repeat_min; i++)
4193 {
4194 uschar *hc;
4195 uschar *this_hwm = cd->hwm;
4196 memcpy(code, previous, len);
4197 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4198 {
4199 PUT(cd->hwm, 0, GET(hc, 0) + len);
4200 cd->hwm += LINK_SIZE;
4201 }
4202 save_hwm = this_hwm;
4203 code += len;
4204 }
4205 }
4206 }
4207
4208 if (repeat_max > 0) repeat_max -= repeat_min;
4209 }
4210
4211 /* This code is common to both the zero and non-zero minimum cases. If
4212 the maximum is limited, it replicates the group in a nested fashion,
4213 remembering the bracket starts on a stack. In the case of a zero minimum,
4214 the first one was set up above. In all cases the repeat_max now specifies
4215 the number of additional copies needed. Again, we must remember to
4216 replicate entries on the forward reference list. */
4217
4218 if (repeat_max >= 0)
4219 {
4220 /* In the pre-compile phase, we don't actually do the replication. We
4221 just adjust the length as if we had. For each repetition we must add 1
4222 to the length for BRAZERO and for all but the last repetition we must
4223 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
4224 paranoid checks to avoid integer overflow. */
4225
4226 if (lengthptr != NULL && repeat_max > 0)
4227 {
4228 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4229 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4230 if ((double)repeat_max *
4231 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4232 > (double)INT_MAX ||
4233 OFLOW_MAX - *lengthptr < delta)
4234 {
4235 *errorcodeptr = ERR20;
4236 goto FAILED;
4237 }
4238 *lengthptr += delta;
4239 }
4240
4241 /* This is compiling for real */
4242
4243 else for (i = repeat_max - 1; i >= 0; i--)
4244 {
4245 uschar *hc;
4246 uschar *this_hwm = cd->hwm;
4247
4248 *code++ = OP_BRAZERO + repeat_type;
4249
4250 /* All but the final copy start a new nesting, maintaining the
4251 chain of brackets outstanding. */
4252
4253 if (i != 0)
4254 {
4255 int offset;
4256 *code++ = OP_BRA;
4257 offset = (bralink == NULL)? 0 : code - bralink;
4258 bralink = code;
4259 PUTINC(code, 0, offset);
4260 }
4261
4262 memcpy(code, previous, len);
4263 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4264 {
4265 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4266 cd->hwm += LINK_SIZE;
4267 }
4268 save_hwm = this_hwm;
4269 code += len;
4270 }
4271
4272 /* Now chain through the pending brackets, and fill in their length
4273 fields (which are holding the chain links pro tem). */
4274
4275 while (bralink != NULL)
4276 {
4277 int oldlinkoffset;
4278 int offset = code - bralink + 1;
4279 uschar *bra = code - offset;
4280 oldlinkoffset = GET(bra, 1);
4281 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4282 *code++ = OP_KET;
4283 PUTINC(code, 0, offset);
4284 PUT(bra, 1, offset);
4285 }
4286 }
4287
4288 /* If the maximum is unlimited, set a repeater in the final copy. We
4289 can't just offset backwards from the current code point, because we
4290 don't know if there's been an options resetting after the ket. The
4291 correct offset was computed above.
4292
4293 Then, when we are doing the actual compile phase, check to see whether
4294 this group is a non-atomic one that could match an empty string. If so,
4295 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4296 that runtime checking can be done. [This check is also applied to
4297 atomic groups at runtime, but in a different way.] */
4298
4299 else
4300 {
4301 uschar *ketcode = code - ketoffset;
4302 uschar *bracode = ketcode - GET(ketcode, 1);
4303 *ketcode = OP_KETRMAX + repeat_type;
4304 if (lengthptr == NULL && *bracode != OP_ONCE)
4305 {
4306 uschar *scode = bracode;
4307 do
4308 {
4309 if (could_be_empty_branch(scode, ketcode, utf8))
4310 {
4311 *bracode += OP_SBRA - OP_BRA;
4312 break;
4313 }
4314 scode += GET(scode, 1);
4315 }
4316 while (*scode == OP_ALT);
4317 }
4318 }
4319 }
4320
4321 /* If previous is OP_FAIL, it was generated by an empty class [] in
4322 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4323 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4324 error above. We can just ignore the repeat in JS case. */
4325
4326 else if (*previous == OP_FAIL) goto END_REPEAT;
4327
4328 /* Else there's some kind of shambles */
4329
4330 else
4331 {
4332 *errorcodeptr = ERR11;
4333 goto FAILED;
4334 }
4335
4336 /* If the character following a repeat is '+', or if certain optimization
4337 tests above succeeded, possessive_quantifier is TRUE. For some of the
4338 simpler opcodes, there is a special alternative opcode for this. For
4339 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4340 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4341 but the special opcodes can optimize it a bit. The repeated item starts at
4342 tempcode, not at previous, which might be the first part of a string whose
4343 (former) last char we repeated.
4344
4345 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4346 an 'upto' may follow. We skip over an 'exact' item, and then test the
4347 length of what remains before proceeding. */
4348
4349 if (possessive_quantifier)
4350 {
4351 int len;
4352
4353 if (*tempcode == OP_TYPEEXACT)
4354 tempcode += _pcre_OP_lengths[*tempcode] +
4355 ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4356
4357 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4358 {
4359 tempcode += _pcre_OP_lengths[*tempcode];
4360 #ifdef SUPPORT_UTF8
4361 if (utf8 && tempcode[-1] >= 0xc0)
4362 tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4363 #endif
4364 }
4365
4366 len = code - tempcode;
4367 if (len > 0) switch (*tempcode)
4368 {
4369 case OP_STAR: *tempcode = OP_POSSTAR; break;
4370 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4371 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4372 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4373
4374 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4375 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4376 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4377 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4378
4379 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4380 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4381 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4382 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4383
4384 default:
4385 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4386 code += 1 + LINK_SIZE;
4387 len += 1 + LINK_SIZE;
4388 tempcode[0] = OP_ONCE;
4389 *code++ = OP_KET;
4390 PUTINC(code, 0, len);
4391 PUT(tempcode, 1, len);
4392 break;
4393 }
4394 }
4395
4396 /* In all cases we no longer have a previous item. We also set the
4397 "follows varying string" flag for subsequently encountered reqbytes if
4398 it isn't already set and we have just passed a varying length item. */
4399
4400 END_REPEAT:
4401 previous = NULL;
4402 cd->req_varyopt |= reqvary;
4403 break;
4404
4405
4406 /* ===================================================================*/
4407 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4408 lookbehind or option setting or condition or all the other extended
4409 parenthesis forms. */
4410
4411 case CHAR_LEFT_PARENTHESIS:
4412 newoptions = options;
4413 skipbytes = 0;
4414 bravalue = OP_CBRA;
4415 save_hwm = cd->hwm;
4416 reset_bracount = FALSE;
4417
4418 /* First deal with various "verbs" that can be introduced by '*'. */
4419
4420 if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4421 {
4422 int i, namelen;
4423 const char *vn = verbnames;
4424 const uschar *name = ++ptr;
4425 previous = NULL;
4426 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4427 if (*ptr == CHAR_COLON)
4428 {
4429 *errorcodeptr = ERR59; /* Not supported */
4430 goto FAILED;
4431 }
4432 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4433 {
4434 *errorcodeptr = ERR60;
4435 goto FAILED;
4436 }
4437 namelen = ptr - name;
4438 for (i = 0; i < verbcount; i++)
4439 {
4440 if (namelen == verbs[i].len &&
4441 strncmp((char *)name, vn, namelen) == 0)
4442 {
4443 /* Check for open captures before ACCEPT */
4444
4445 if (verbs[i].op == OP_ACCEPT)
4446 {
4447 open_capitem *oc;
4448 cd->had_accept = TRUE;
4449 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4450 {
4451 *code++ = OP_CLOSE;
4452 PUT2INC(code, 0, oc->number);
4453 }
4454 }
4455 *code++ = verbs[i].op;
4456 break;
4457 }
4458 vn += verbs[i].len + 1;
4459 }
4460 if (i < verbcount) continue;
4461 *errorcodeptr = ERR60;
4462 goto FAILED;
4463 }
4464
4465 /* Deal with the extended parentheses; all are introduced by '?', and the
4466 appearance of any of them means that this is not a capturing group. */
4467
4468 else if (*ptr == CHAR_QUESTION_MARK)
4469 {
4470 int i, set, unset, namelen;
4471 int *optset;
4472 const uschar *name;
4473 uschar *slot;
4474
4475 switch (*(++ptr))
4476 {
4477 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4478 ptr++;
4479 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4480 if (*ptr == 0)
4481 {
4482 *errorcodeptr = ERR18;
4483 goto FAILED;
4484 }
4485 continue;
4486
4487
4488 /* ------------------------------------------------------------ */
4489 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4490 reset_bracount = TRUE;
4491 /* Fall through */
4492
4493 /* ------------------------------------------------------------ */
4494 case CHAR_COLON: /* Non-capturing bracket */
4495 bravalue = OP_BRA;
4496 ptr++;
4497 break;
4498
4499
4500 /* ------------------------------------------------------------ */
4501 case CHAR_LEFT_PARENTHESIS:
4502 bravalue = OP_COND; /* Conditional group */
4503
4504 /* A condition can be an assertion, a number (referring to a numbered
4505 group), a name (referring to a named group), or 'R', referring to
4506 recursion. R<digits> and R&name are also permitted for recursion tests.
4507
4508 There are several syntaxes for testing a named group: (?(name)) is used
4509 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4510
4511 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4512 be the recursive thing or the name 'R' (and similarly for 'R' followed
4513 by digits), and (b) a number could be a name that consists of digits.
4514 In both cases, we look for a name first; if not found, we try the other
4515 cases. */
4516
4517 /* For conditions that are assertions, check the syntax, and then exit
4518 the switch. This will take control down to where bracketed groups,
4519 including assertions, are processed. */
4520
4521 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4522 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4523 break;
4524
4525 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4526 below), and all need to skip 3 bytes at the start of the group. */
4527
4528 code[1+LINK_SIZE] = OP_CREF;
4529 skipbytes = 3;
4530 refsign = -1;
4531
4532 /* Check for a test for recursion in a named group. */
4533
4534 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4535 {
4536 terminator = -1;
4537 ptr += 2;
4538 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4539 }
4540
4541 /* Check for a test for a named group's having been set, using the Perl
4542 syntax (?(<name>) or (?('name') */
4543
4544 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4545 {
4546 terminator = CHAR_GREATER_THAN_SIGN;
4547 ptr++;
4548 }
4549 else if (ptr[1] == CHAR_APOSTROPHE)
4550 {
4551 terminator = CHAR_APOSTROPHE;
4552 ptr++;
4553 }
4554 else
4555 {
4556 terminator = 0;
4557 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4558 }
4559
4560 /* We now expect to read a name; anything else is an error */
4561
4562 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4563 {
4564 ptr += 1; /* To get the right offset */
4565 *errorcodeptr = ERR28;
4566 goto FAILED;
4567 }
4568
4569 /* Read the name, but also get it as a number if it's all digits */
4570
4571 recno = 0;
4572 name = ++ptr;
4573 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4574 {
4575 if (recno >= 0)
4576 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4577 recno * 10 + *ptr - CHAR_0 : -1;
4578 ptr++;
4579 }
4580 namelen = ptr - name;
4581
4582 if ((terminator > 0 && *ptr++ != terminator) ||
4583 *ptr++ != CHAR_RIGHT_PARENTHESIS)
4584 {
4585 ptr--; /* Error offset */
4586 *errorcodeptr = ERR26;
4587 goto FAILED;
4588 }
4589
4590 /* Do no further checking in the pre-compile phase. */
4591
4592 if (lengthptr != NULL) break;
4593
4594 /* In the real compile we do the work of looking for the actual
4595 reference. If the string started with "+" or "-" we require the rest to
4596 be digits, in which case recno will be set. */
4597
4598 if (refsign > 0)
4599 {
4600 if (recno <= 0)
4601 {
4602 *errorcodeptr = ERR58;
4603 goto FAILED;
4604 }
4605 recno = (refsign == CHAR_MINUS)?
4606 cd->bracount - recno + 1 : recno +cd->bracount;
4607 if (recno <= 0 || recno > cd->final_bracount)
4608 {
4609 *errorcodeptr = ERR15;
4610 goto FAILED;
4611 }
4612 PUT2(code, 2+LINK_SIZE, recno);
4613 break;
4614 }
4615
4616 /* Otherwise (did not start with "+" or "-"), start by looking for the
4617 name. */
4618
4619 slot = cd->name_table;
4620 for (i = 0; i < cd->names_found; i++)
4621 {
4622 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4623 slot += cd->name_entry_size;
4624 }
4625
4626 /* Found a previous named subpattern */
4627
4628 if (i < cd->names_found)
4629 {
4630 recno = GET2(slot, 0);
4631 PUT2(code, 2+LINK_SIZE, recno);
4632 }
4633
4634 /* Search the pattern for a forward reference */
4635
4636 else if ((i = find_parens(cd, name, namelen,
4637 (options & PCRE_EXTENDED) != 0)) > 0)
4638 {
4639 PUT2(code, 2+LINK_SIZE, i);
4640 }
4641
4642 /* If terminator == 0 it means that the name followed directly after
4643 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4644 some further alternatives to try. For the cases where terminator != 0
4645 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4646 now checked all the possibilities, so give an error. */
4647
4648 else if (terminator != 0)
4649 {
4650 *errorcodeptr = ERR15;
4651 goto FAILED;
4652 }
4653
4654 /* Check for (?(R) for recursion. Allow digits after R to specify a
4655 specific group number. */
4656
4657 else if (*name == CHAR_R)
4658 {
4659 recno = 0;
4660 for (i = 1; i < namelen; i++)
4661 {
4662 if ((digitab[name[i]] & ctype_digit) == 0)
4663 {
4664 *errorcodeptr = ERR15;
4665 goto FAILED;
4666 }
4667 recno = recno * 10 + name[i] - CHAR_0;
4668 }
4669 if (recno == 0) recno = RREF_ANY;
4670 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4671 PUT2(code, 2+LINK_SIZE, recno);
4672 }
4673
4674 /* Similarly, check for the (?(DEFINE) "condition", which is always
4675 false. */
4676
4677 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4678 {
4679 code[1+LINK_SIZE] = OP_DEF;
4680 skipbytes = 1;
4681 }
4682
4683 /* Check for the "name" actually being a subpattern number. We are
4684 in the second pass here, so final_bracount is set. */
4685
4686 else if (recno > 0 && recno <= cd->final_bracount)
4687 {
4688 PUT2(code, 2+LINK_SIZE, recno);
4689 }
4690
4691 /* Either an unidentified subpattern, or a reference to (?(0) */
4692
4693 else
4694 {
4695 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4696 goto FAILED;
4697 }
4698 break;
4699
4700
4701 /* ------------------------------------------------------------ */
4702 case CHAR_EQUALS_SIGN: /* Positive lookahead */
4703 bravalue = OP_ASSERT;
4704 ptr++;
4705 break;
4706
4707
4708 /* ------------------------------------------------------------ */
4709 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
4710 ptr++;
4711 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
4712 {
4713 *code++ = OP_FAIL;
4714 previous = NULL;
4715 continue;
4716 }
4717 bravalue = OP_ASSERT_NOT;
4718 break;
4719
4720
4721 /* ------------------------------------------------------------ */
4722 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
4723 switch (ptr[1])
4724 {
4725 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
4726 bravalue = OP_ASSERTBACK;
4727 ptr += 2;
4728 break;
4729
4730 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
4731 bravalue = OP_ASSERTBACK_NOT;
4732 ptr += 2;
4733 break;
4734
4735 default: /* Could be name define, else bad */
4736 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4737 ptr++; /* Correct offset for error */
4738 *errorcodeptr = ERR24;
4739 goto FAILED;
4740 }
4741 break;
4742
4743
4744 /* ------------------------------------------------------------ */
4745 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
4746 bravalue = OP_ONCE;
4747 ptr++;
4748 break;
4749
4750
4751 /* ------------------------------------------------------------ */
4752 case CHAR_C: /* Callout - may be followed by digits; */
4753 previous_callout = code; /* Save for later completion */
4754 after_manual_callout = 1; /* Skip one item before completing */
4755 *code++ = OP_CALLOUT;
4756 {
4757 int n = 0;
4758 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4759 n = n * 10 + *ptr - CHAR_0;
4760 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4761 {
4762 *errorcodeptr = ERR39;
4763 goto FAILED;
4764 }
4765 if (n > 255)
4766 {
4767 *errorcodeptr = ERR38;
4768 goto FAILED;
4769 }
4770 *code++ = n;
4771 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4772 PUT(code, LINK_SIZE, 0); /* Default length */
4773 code += 2 * LINK_SIZE;
4774 }
4775 previous = NULL;
4776 continue;
4777
4778
4779 /* ------------------------------------------------------------ */
4780 case CHAR_P: /* Python-style named subpattern handling */
4781 if (*(++ptr) == CHAR_EQUALS_SIGN ||
4782 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
4783 {
4784 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4785 terminator = CHAR_RIGHT_PARENTHESIS;
4786 goto NAMED_REF_OR_RECURSE;
4787 }
4788 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
4789 {
4790 *errorcodeptr = ERR41;
4791 goto FAILED;
4792 }
4793 /* Fall through to handle (?P< as (?< is handled */
4794
4795
4796 /* ------------------------------------------------------------ */
4797 DEFINE_NAME: /* Come here from (?< handling */
4798 case CHAR_APOSTROPHE:
4799 {
4800 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4801 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4802 name = ++ptr;
4803
4804 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4805 namelen = ptr - name;
4806
4807 /* In the pre-compile phase, just do a syntax check. */
4808
4809 if (lengthptr != NULL)
4810 {
4811 if (*ptr != terminator)
4812 {
4813 *errorcodeptr = ERR42;
4814 goto FAILED;
4815 }
4816 if (cd->names_found >= MAX_NAME_COUNT)
4817 {
4818 *errorcodeptr = ERR49;
4819 goto FAILED;
4820 }
4821 if (namelen + 3 > cd->name_entry_size)
4822 {
4823 cd->name_entry_size = namelen + 3;
4824 if (namelen > MAX_NAME_SIZE)
4825 {
4826 *errorcodeptr = ERR48;
4827 goto FAILED;
4828 }
4829 }
4830 }
4831
4832 /* In the real compile, create the entry in the table */
4833
4834 else
4835 {
4836 slot = cd->name_table;
4837 for (i = 0; i < cd->names_found; i++)
4838 {
4839 int crc = memcmp(name, slot+2, namelen);
4840 if (crc == 0)
4841 {
4842 if (slot[2+namelen] == 0)
4843 {
4844 if ((options & PCRE_DUPNAMES) == 0)
4845 {
4846 *errorcodeptr = ERR43;
4847 goto FAILED;
4848 }
4849 }
4850 else crc = -1; /* Current name is substring */
4851 }
4852 if (crc < 0)
4853 {
4854 memmove(slot + cd->name_entry_size, slot,
4855 (cd->names_found - i) * cd->name_entry_size);
4856 break;
4857 }
4858 slot += cd->name_entry_size;
4859 }
4860
4861 PUT2(slot, 0, cd->bracount + 1);
4862 memcpy(slot + 2, name, namelen);
4863 slot[2+namelen] = 0;
4864 }
4865 }
4866
4867 /* In both cases, count the number of names we've encountered. */
4868
4869 ptr++; /* Move past > or ' */
4870 cd->names_found++;
4871 goto NUMBERED_GROUP;
4872
4873
4874 /* ------------------------------------------------------------ */
4875 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
4876 terminator = CHAR_RIGHT_PARENTHESIS;
4877 is_recurse = TRUE;
4878 /* Fall through */
4879
4880 /* We come here from the Python syntax above that handles both
4881 references (?P=name) and recursion (?P>name), as well as falling
4882 through from the Perl recursion syntax (?&name). We also come here from
4883 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4884 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4885
4886 NAMED_REF_OR_RECURSE:
4887 name = ++ptr;
4888 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4889 namelen = ptr - name;
4890
4891 /* In the pre-compile phase, do a syntax check and set a dummy
4892 reference number. */
4893
4894 if (lengthptr != NULL)
4895 {
4896 if (namelen == 0)
4897 {
4898 *errorcodeptr = ERR62;
4899 goto FAILED;
4900 }
4901 if (*ptr != terminator)
4902 {
4903 *errorcodeptr = ERR42;
4904 goto FAILED;
4905 }
4906 if (namelen > MAX_NAME_SIZE)
4907 {
4908 *errorcodeptr = ERR48;
4909 goto FAILED;
4910 }
4911 recno = 0;
4912 }
4913
4914 /* In the real compile, seek the name in the table. We check the name
4915 first, and then check that we have reached the end of the name in the
4916 table. That way, if the name is longer than any in the table,
4917 the comparison will fail without reading beyond the table entry. */
4918
4919 else
4920 {
4921 slot = cd->name_table;
4922 for (i = 0; i < cd->names_found; i++)
4923 {
4924 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4925 slot[2+namelen] == 0)
4926 break;
4927 slot += cd->name_entry_size;
4928 }
4929
4930 if (i < cd->names_found) /* Back reference */
4931 {
4932 recno = GET2(slot, 0);
4933 }
4934 else if ((recno = /* Forward back reference */
4935 find_parens(cd, name, namelen,
4936 (options & PCRE_EXTENDED) != 0)) <= 0)
4937 {
4938 *errorcodeptr = ERR15;
4939 goto FAILED;
4940 }
4941 }
4942
4943 /* In both phases, we can now go to the code that handles numerical
4944 recursion or backreferences. */
4945
4946 if (is_recurse) goto HANDLE_RECURSION;
4947 else goto HANDLE_REFERENCE;
4948
4949
4950 /* ------------------------------------------------------------ */
4951 case CHAR_R: /* Recursion */
4952 ptr++; /* Same as (?0) */
4953 /* Fall through */
4954
4955
4956 /* ------------------------------------------------------------ */
4957 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
4958 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4959 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4960 {
4961 const uschar *called;
4962 terminator = CHAR_RIGHT_PARENTHESIS;
4963
4964 /* Come here from the \g<...> and \g'...' code (Oniguruma
4965 compatibility). However, the syntax has been checked to ensure that
4966 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4967 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4968 ever be taken. */
4969
4970 HANDLE_NUMERICAL_RECURSION:
4971
4972 if ((refsign = *ptr) == CHAR_PLUS)
4973 {
4974 ptr++;
4975 if ((digitab[*ptr] & ctype_digit) == 0)
4976 {
4977 *errorcodeptr = ERR63;
4978 goto FAILED;
4979 }
4980 }
4981 else if (refsign == CHAR_MINUS)
4982 {
4983 if ((digitab[ptr[1]] & ctype_digit) == 0)
4984 goto OTHER_CHAR_AFTER_QUERY;
4985 ptr++;
4986 }
4987
4988 recno = 0;
4989 while((digitab[*ptr] & ctype_digit) != 0)
4990 recno = recno * 10 + *ptr++ - CHAR_0;
4991
4992 if (*ptr != terminator)
4993 {
4994 *errorcodeptr = ERR29;
4995 goto FAILED;
4996 }
4997
4998 if (refsign == CHAR_MINUS)
4999 {
5000 if (recno == 0)
5001 {
5002 *errorcodeptr = ERR58;
5003 goto FAILED;
5004 }
5005 recno = cd->bracount - recno + 1;
5006 if (recno <= 0)
5007 {
5008 *errorcodeptr = ERR15;
5009 goto FAILED;
5010 }
5011 }
5012 else if (refsign == CHAR_PLUS)
5013 {
5014 if (recno == 0)
5015 {
5016 *errorcodeptr = ERR58;
5017 goto FAILED;
5018 }
5019 recno += cd->bracount;
5020 }
5021
5022 /* Come here from code above that handles a named recursion */
5023
5024 HANDLE_RECURSION:
5025
5026 previous = code;
5027 called = cd->start_code;
5028
5029 /* When we are actually compiling, find the bracket that is being
5030 referenced. Temporarily end the regex in case it doesn't exist before
5031 this point. If we end up with a forward reference, first check that
5032 the bracket does occur later so we can give the error (and position)
5033 now. Then remember this forward reference in the workspace so it can
5034 be filled in at the end. */
5035
5036 if (lengthptr == NULL)
5037 {
5038 *code = OP_END;
5039 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
5040
5041 /* Forward reference */
5042
5043 if (called == NULL)
5044 {
5045 if (find_parens(cd, NULL, recno,
5046 (options & PCRE_EXTENDED) != 0) < 0)
5047 {
5048 *errorcodeptr = ERR15;
5049 goto FAILED;
5050 }
5051 called = cd->start_code + recno;
5052 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5053 }
5054
5055 /* If not a forward reference, and the subpattern is still open,
5056 this is a recursive call. We check to see if this is a left
5057 recursion that could loop for ever, and diagnose that case. */
5058
5059 else if (GET(called, 1) == 0 &&
5060 could_be_empty(called, code, bcptr, utf8))
5061 {
5062 *errorcodeptr = ERR40;
5063 goto FAILED;
5064 }
5065 }
5066
5067 /* Insert the recursion/subroutine item, automatically wrapped inside
5068 "once" brackets. Set up a "previous group" length so that a
5069 subsequent quantifier will work. */
5070
5071 *code = OP_ONCE;
5072 PUT(code, 1, 2 + 2*LINK_SIZE);
5073 code += 1 + LINK_SIZE;
5074
5075 *code = OP_RECURSE;
5076 PUT(code, 1, called - cd->start_code);
5077 code += 1 + LINK_SIZE;
5078
5079 *code = OP_KET;
5080 PUT(code, 1, 2 + 2*LINK_SIZE);
5081 code += 1 + LINK_SIZE;
5082
5083 length_prevgroup = 3 + 3*LINK_SIZE;
5084 }
5085
5086 /* Can't determine a first byte now */
5087
5088 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5089 continue;
5090
5091
5092 /* ------------------------------------------------------------ */
5093 default: /* Other characters: check option setting */
5094 OTHER_CHAR_AFTER_QUERY:
5095 set = unset = 0;
5096 optset = &set;
5097
5098 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5099 {
5100 switch (*ptr++)
5101 {
5102 case CHAR_MINUS: optset = &unset; break;
5103
5104 case CHAR_J: /* Record that it changed in the external options */
5105 *optset |= PCRE_DUPNAMES;
5106 cd->external_flags |= PCRE_JCHANGED;
5107 break;
5108
5109 case CHAR_i: *optset |= PCRE_CASELESS; break;
5110 case CHAR_m: *optset |= PCRE_MULTILINE; break;
5111 case CHAR_s: *optset |= PCRE_DOTALL; break;
5112 case CHAR_x: *optset |= PCRE_EXTENDED; break;
5113 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5114 case CHAR_X: *optset |= PCRE_EXTRA; break;
5115
5116 default: *errorcodeptr = ERR12;
5117 ptr--; /* Correct the offset */
5118 goto FAILED;
5119 }
5120 }
5121
5122 /* Set up the changed option bits, but don't change anything yet. */
5123
5124 newoptions = (options | set) & (~unset);
5125
5126 /* If the options ended with ')' this is not the start of a nested
5127 group with option changes, so the options change at this level. If this
5128 item is right at the start of the pattern, the options can be
5129 abstracted and made external in the pre-compile phase, and ignored in
5130 the compile phase. This can be helpful when matching -- for instance in
5131 caseless checking of required bytes.
5132
5133 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5134 definitely *not* at the start of the pattern because something has been
5135 compiled. In the pre-compile phase, however, the code pointer can have
5136 that value after the start, because it gets reset as code is discarded
5137 during the pre-compile. However, this can happen only at top level - if
5138 we are within parentheses, the starting BRA will still be present. At
5139 any parenthesis level, the length value can be used to test if anything
5140 has been compiled at that level. Thus, a test for both these conditions
5141 is necessary to ensure we correctly detect the start of the pattern in
5142 both phases.
5143
5144 If we are not at the pattern start, compile code to change the ims
5145 options if this setting actually changes any of them, and reset the
5146 greedy defaults and the case value for firstbyte and reqbyte. */
5147
5148 if (*ptr == CHAR_RIGHT_PARENTHESIS)
5149 {
5150 if (code == cd->start_code + 1 + LINK_SIZE &&
5151 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5152 {
5153 cd->external_options = newoptions;
5154 }
5155 else
5156 {
5157 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5158 {
5159 *code++ = OP_OPT;
5160 *code++ = newoptions & PCRE_IMS;
5161 }
5162 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5163 greedy_non_default = greedy_default ^ 1;
5164 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5165 }
5166
5167 /* Change options at this level, and pass them back for use
5168 in subsequent branches. When not at the start of the pattern, this
5169 information is also necessary so that a resetting item can be
5170 compiled at the end of a group (if we are in a group). */
5171
5172 *optionsptr = options = newoptions;
5173 previous = NULL; /* This item can't be repeated */
5174 continue; /* It is complete */
5175 }
5176
5177 /* If the options ended with ':' we are heading into a nested group
5178 with possible change of options. Such groups are non-capturing and are
5179 not assertions of any kind. All we need to do is skip over the ':';
5180 the newoptions value is handled below. */
5181
5182 bravalue = OP_BRA;
5183 ptr++;
5184 } /* End of switch for character following (? */
5185 } /* End of (? handling */
5186
5187 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5188 all unadorned brackets become non-capturing and behave like (?:...)
5189 brackets. */
5190
5191 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5192 {
5193 bravalue = OP_BRA;
5194 }
5195
5196 /* Else we have a capturing group. */
5197
5198 else
5199 {
5200 NUMBERED_GROUP:
5201 cd->bracount += 1;
5202 PUT2(code, 1+LINK_SIZE, cd->bracount);
5203 skipbytes = 2;
5204 }
5205
5206 /* Process nested bracketed regex. Assertions may not be repeated, but
5207 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5208 non-register variable in order to be able to pass its address because some
5209 compilers complain otherwise. Pass in a new setting for the ims options if
5210 they have changed. */
5211
5212 previous = (bravalue >= OP_ONCE)? code : NULL;
5213 *code = bravalue;
5214 tempcode = code;
5215 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5216 length_prevgroup = 0; /* Initialize for pre-compile phase */
5217
5218 if (!compile_regex(
5219 newoptions, /* The complete new option state */
5220 options & PCRE_IMS, /* The previous ims option state */
5221 &tempcode, /* Where to put code (updated) */
5222 &ptr, /* Input pointer (updated) */
5223 errorcodeptr, /* Where to put an error message */
5224 (bravalue == OP_ASSERTBACK ||
5225 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5226 reset_bracount, /* True if (?| group */
5227 skipbytes, /* Skip over bracket number */
5228 &subfirstbyte, /* For possible first char */
5229 &subreqbyte, /* For possible last char */
5230 bcptr, /* Current branch chain */
5231 cd, /* Tables block */
5232 (lengthptr == NULL)? NULL : /* Actual compile phase */
5233 &length_prevgroup /* Pre-compile phase */
5234 ))
5235 goto FAILED;
5236
5237 /* At the end of compiling, code is still pointing to the start of the
5238 group, while tempcode has been updated to point past the end of the group
5239 and any option resetting that may follow it. The pattern pointer (ptr)
5240 is on the bracket. */
5241
5242 /* If this is a conditional bracket, check that there are no more than
5243 two branches in the group, or just one if it's a DEFINE group. We do this
5244 in the real compile phase, not in the pre-pass, where the whole group may
5245 not be available. */
5246
5247 if (bravalue == OP_COND && lengthptr == NULL)
5248 {
5249 uschar *tc = code;
5250 int condcount = 0;
5251
5252 do {
5253 condcount++;
5254 tc += GET(tc,1);
5255 }
5256 while (*tc != OP_KET);
5257
5258 /* A DEFINE group is never obeyed inline (the "condition" is always
5259 false). It must have only one branch. */
5260
5261 if (code[LINK_SIZE+1] == OP_DEF)
5262 {
5263 if (condcount > 1)
5264 {
5265 *errorcodeptr = ERR54;
5266 goto FAILED;
5267 }
5268 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5269 }
5270
5271 /* A "normal" conditional group. If there is just one branch, we must not
5272 make use of its firstbyte or reqbyte, because this is equivalent to an
5273 empty second branch. */
5274
5275 else
5276 {
5277 if (condcount > 2)
5278 {
5279 *errorcodeptr = ERR27;
5280 goto FAILED;
5281 }
5282 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5283 }
5284 }
5285
5286 /* Error if hit end of pattern */
5287
5288 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5289 {
5290 *errorcodeptr = ERR14;
5291 goto FAILED;
5292 }
5293
5294 /* In the pre-compile phase, update the length by the length of the group,
5295 less the brackets at either end. Then reduce the compiled code to just a
5296 set of non-capturing brackets so that it doesn't use much memory if it is
5297 duplicated by a quantifier.*/
5298
5299 if (lengthptr != NULL)
5300 {
5301 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5302 {
5303 *errorcodeptr = ERR20;
5304 goto FAILED;
5305 }
5306 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5307 *code++ = OP_BRA;
5308 PUTINC(code, 0, 1 + LINK_SIZE);
5309 *code++ = OP_KET;
5310 PUTINC(code, 0, 1 + LINK_SIZE);
5311 break; /* No need to waste time with special character handling */
5312 }
5313
5314 /* Otherwise update the main code pointer to the end of the group. */
5315
5316 code = tempcode;
5317
5318 /* For a DEFINE group, required and first character settings are not
5319 relevant. */
5320
5321 if (bravalue == OP_DEF) break;
5322
5323 /* Handle updating of the required and first characters for other types of
5324 group. Update for normal brackets of all kinds, and conditions with two
5325 branches (see code above). If the bracket is followed by a quantifier with
5326 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5327 zerofirstbyte outside the main loop so that they can be accessed for the
5328 back off. */
5329
5330 zeroreqbyte = reqbyte;
5331 zerofirstbyte = firstbyte;
5332 groupsetfirstbyte = FALSE;
5333
5334 if (bravalue >= OP_ONCE)
5335 {
5336 /* If we have not yet set a firstbyte in this branch, take it from the
5337 subpattern, remembering that it was set here so that a repeat of more
5338 than one can replicate it as reqbyte if necessary. If the subpattern has
5339 no firstbyte, set "none" for the whole branch. In both cases, a zero
5340 repeat forces firstbyte to "none". */
5341
5342 if (firstbyte == REQ_UNSET)
5343 {
5344 if (subfirstbyte >= 0)
5345 {
5346 firstbyte = subfirstbyte;
5347 groupsetfirstbyte = TRUE;
5348 }
5349 else firstbyte = REQ_NONE;
5350 zerofirstbyte = REQ_NONE;
5351 }
5352
5353 /* If firstbyte was previously set, convert the subpattern's firstbyte
5354 into reqbyte if there wasn't one, using the vary flag that was in
5355 existence beforehand. */
5356
5357 else if (subfirstbyte >= 0 && subreqbyte < 0)
5358 subreqbyte = subfirstbyte | tempreqvary;
5359
5360 /* If the subpattern set a required byte (or set a first byte that isn't
5361 really the first byte - see above), set it. */
5362
5363 if (subreqbyte >= 0) reqbyte = subreqbyte;
5364 }
5365
5366 /* For a forward assertion, we take the reqbyte, if set. This can be
5367 helpful if the pattern that follows the assertion doesn't set a different
5368 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5369 for an assertion, however because it leads to incorrect effect for patterns
5370 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5371 of a firstbyte. This is overcome by a scan at the end if there's no
5372 firstbyte, looking for an asserted first char. */
5373
5374 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5375 break; /* End of processing '(' */
5376
5377
5378 /* ===================================================================*/
5379 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5380 are arranged to be the negation of the corresponding OP_values. For the
5381 back references, the values are ESC_REF plus the reference number. Only
5382 back references and those types that consume a character may be repeated.
5383 We can test for values between ESC_b and ESC_Z for the latter; this may
5384 have to change if any new ones are ever created. */
5385
5386 case CHAR_BACKSLASH:
5387 tempptr = ptr;
5388 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5389 if (*errorcodeptr != 0) goto FAILED;
5390
5391 if (c < 0)
5392 {
5393 if (-c == ESC_Q) /* Handle start of quoted string */
5394 {
5395 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5396 ptr += 2; /* avoid empty string */
5397 else inescq = TRUE;
5398 continue;
5399 }
5400
5401 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5402
5403 /* For metasequences that actually match a character, we disable the
5404 setting of a first character if it hasn't already been set. */
5405
5406 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5407 firstbyte = REQ_NONE;
5408
5409 /* Set values to reset to if this is followed by a zero repeat. */
5410
5411 zerofirstbyte = firstbyte;
5412 zeroreqbyte = reqbyte;
5413
5414 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5415 is a subroutine call by number (Oniguruma syntax). In fact, the value
5416 -ESC_g is returned only for these cases. So we don't need to check for <
5417 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5418 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5419 that is a synonym for a named back reference). */
5420
5421 if (-c == ESC_g)
5422 {
5423 const uschar *p;
5424 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5425 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5426 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5427
5428 /* These two statements stop the compiler from warning about possibly
5429 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5430 fact, because we actually check for a number below, the paths that
5431 would actually be in error are never taken. */
5432
5433 skipbytes = 0;
5434 reset_bracount = FALSE;
5435
5436 /* Test for a name */
5437
5438 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5439 {
5440 BOOL isnumber = TRUE;
5441 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5442 {
5443 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5444 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5445 }
5446 if (*p != terminator)
5447 {
5448 *errorcodeptr = ERR57;
5449 break;
5450 }
5451 if (isnumber)
5452 {
5453 ptr++;
5454 goto HANDLE_NUMERICAL_RECURSION;
5455 }
5456 is_recurse = TRUE;
5457 goto NAMED_REF_OR_RECURSE;
5458 }
5459
5460 /* Test a signed number in angle brackets or quotes. */
5461
5462 p = ptr + 2;
5463 while ((digitab[*p] & ctype_digit) != 0) p++;
5464 if (*p != terminator)
5465 {
5466 *errorcodeptr = ERR57;
5467 break;
5468 }
5469 ptr++;
5470 goto HANDLE_NUMERICAL_RECURSION;
5471 }
5472
5473 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5474 We also support \k{name} (.NET syntax) */
5475
5476 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5477 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5478 {
5479 is_recurse = FALSE;
5480 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5481 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5482 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5483 goto NAMED_REF_OR_RECURSE;
5484 }
5485
5486 /* Back references are handled specially; must disable firstbyte if
5487 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5488 ':' later. */
5489
5490 if (-c >= ESC_REF)
5491 {
5492 recno = -c - ESC_REF;
5493
5494 HANDLE_REFERENCE: /* Come here from named backref handling */
5495 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5496 previous = code;
5497 *code++ = OP_REF;
5498 PUT2INC(code, 0, recno);
5499 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5500 if (recno > cd->top_backref) cd->top_backref = recno;
5501 }
5502
5503 /* So are Unicode property matches, if supported. */
5504
5505 #ifdef SUPPORT_UCP
5506 else if (-c == ESC_P || -c == ESC_p)
5507 {
5508 BOOL negated;
5509 int pdata;
5510 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5511 if (ptype < 0) goto FAILED;
5512 previous = code;
5513 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5514 *code++ = ptype;
5515 *code++ = pdata;
5516 }
5517 #else
5518
5519 /* If Unicode properties are not supported, \X, \P, and \p are not
5520 allowed. */
5521
5522 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5523 {
5524 *errorcodeptr = ERR45;
5525 goto FAILED;
5526 }
5527 #endif
5528
5529 /* For the rest (including \X when Unicode properties are supported), we
5530 can obtain the OP value by negating the escape value. */
5531
5532 else
5533 {
5534 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5535 *code++ = -c;
5536 }
5537 continue;
5538 }
5539
5540 /* We have a data character whose value is in c. In UTF-8 mode it may have
5541 a value > 127. We set its representation in the length/buffer, and then
5542 handle it as a data character. */
5543
5544 #ifdef SUPPORT_UTF8
5545 if (utf8 && c > 127)
5546 mclength = _pcre_ord2utf8(c, mcbuffer);
5547 else
5548 #endif
5549
5550 {
5551 mcbuffer[0] = c;
5552 mclength = 1;
5553 }
5554 goto ONE_CHAR;
5555
5556
5557 /* ===================================================================*/
5558 /* Handle a literal character. It is guaranteed not to be whitespace or #
5559 when the extended flag is set. If we are in UTF-8 mode, it may be a
5560 multi-byte literal character. */
5561
5562 default:
5563 NORMAL_CHAR:
5564 mclength = 1;
5565 mcbuffer[0] = c;
5566
5567 #ifdef SUPPORT_UTF8
5568 if (utf8 && c >= 0xc0)
5569 {
5570 while ((ptr[1] & 0xc0) == 0x80)
5571 mcbuffer[mclength++] = *(++ptr);
5572 }
5573 #endif
5574
5575 /* At this point we have the character's bytes in mcbuffer, and the length
5576 in mclength. When not in UTF-8 mode, the length is always 1. */
5577
5578 ONE_CHAR:
5579 previous = code;
5580 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5581 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5582
5583 /* Remember if \r or \n were seen */
5584
5585 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5586 cd->external_flags |= PCRE_HASCRORLF;
5587
5588 /* Set the first and required bytes appropriately. If no previous first
5589 byte, set it from this character, but revert to none on a zero repeat.
5590 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5591 repeat. */
5592
5593 if (firstbyte == REQ_UNSET)
5594 {
5595 zerofirstbyte = REQ_NONE;
5596 zeroreqbyte = reqbyte;
5597
5598 /* If the character is more than one byte long, we can set firstbyte
5599 only if it is not to be matched caselessly. */
5600
5601 if (mclength == 1 || req_caseopt == 0)
5602 {
5603 firstbyte = mcbuffer[0] | req_caseopt;
5604 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5605 }
5606 else firstbyte = reqbyte = REQ_NONE;
5607 }
5608
5609 /* firstbyte was previously set; we can set reqbyte only if the length is
5610 1 or the matching is caseful. */
5611
5612 else
5613 {
5614 zerofirstbyte = firstbyte;
5615 zeroreqbyte = reqbyte;
5616 if (mclength == 1 || req_caseopt == 0)
5617 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5618 }
5619
5620 break; /* End of literal character handling */
5621 }
5622 } /* end of big loop */
5623
5624
5625 /* Control never reaches here by falling through, only by a goto for all the
5626 error states. Pass back the position in the pattern so that it can be displayed
5627 to the user for diagnosing the error. */
5628
5629 FAILED:
5630 *ptrptr = ptr;
5631 return FALSE;
5632 }
5633
5634
5635
5636
5637 /*************************************************
5638 * Compile sequence of alternatives *
5639 *************************************************/
5640
5641 /* On entry, ptr is pointing past the bracket character, but on return it
5642 points to the closing bracket, or vertical bar, or end of string. The code
5643 variable is pointing at the byte into which the BRA operator has been stored.
5644 If the ims options are changed at the start (for a (?ims: group) or during any
5645 branch, we need to insert an OP_OPT item at the start of every following branch
5646 to ensure they get set correctly at run time, and also pass the new options
5647 into every subsequent branch compile.
5648
5649 This function is used during the pre-compile phase when we are trying to find
5650 out the amount of memory needed, as well as during the real compile phase. The
5651 value of lengthptr distinguishes the two phases.
5652
5653 Arguments:
5654 options option bits, including any changes for this subpattern
5655 oldims previous settings of ims option bits
5656 codeptr -> the address of the current code pointer
5657 ptrptr -> the address of the current pattern pointer
5658 errorcodeptr -> pointer to error code variable
5659 lookbehind TRUE if this is a lookbehind assertion
5660 reset_bracount TRUE to reset the count for each branch
5661 skipbytes skip this many bytes at start (for brackets and OP_COND)
5662 firstbyteptr place to put the first required character, or a negative number
5663 reqbyteptr place to put the last required character, or a negative number
5664 bcptr pointer to the chain of currently open branches
5665 cd points to the data block with tables pointers etc.
5666 lengthptr NULL during the real compile phase
5667 points to length accumulator during pre-compile phase
5668
5669 Returns: TRUE on success
5670 */
5671
5672 static BOOL
5673 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5674 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5675 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5676 int *lengthptr)
5677 {
5678 const uschar *ptr = *ptrptr;
5679 uschar *code = *codeptr;
5680 uschar *last_branch = code;
5681 uschar *start_bracket = code;
5682 uschar *reverse_count = NULL;
5683 open_capitem capitem;
5684 int capnumber = 0;
5685 int firstbyte, reqbyte;
5686 int branchfirstbyte, branchreqbyte;
5687 int length;
5688 int orig_bracount;
5689 int max_bracount;
5690 branch_chain bc;
5691
5692 bc.outer = bcptr;
5693 bc.current = code;
5694
5695 firstbyte = reqbyte = REQ_UNSET;
5696
5697 /* Accumulate the length for use in the pre-compile phase. Start with the
5698 length of the BRA and KET and any extra bytes that are required at the
5699 beginning. We accumulate in a local variable to save frequent testing of
5700 lenthptr for NULL. We cannot do this by looking at the value of code at the
5701 start and end of each alternative, because compiled items are discarded during
5702 the pre-compile phase so that the work space is not exceeded. */
5703
5704 length = 2 + 2*LINK_SIZE + skipbytes;
5705
5706 /* WARNING: If the above line is changed for any reason, you must also change
5707 the code that abstracts option settings at the start of the pattern and makes
5708 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5709 pre-compile phase to find out whether anything has yet been compiled or not. */
5710
5711 /* If this is a capturing subpattern, add to the chain of open capturing items
5712 so that we can detect them if (*ACCEPT) is encountered. */
5713
5714 if (*code == OP_CBRA)
5715 {
5716 capnumber = GET2(code, 1 + LINK_SIZE);
5717 capitem.number = capnumber;
5718 capitem.next = cd->open_caps;
5719 cd->open_caps = &capitem;
5720 }
5721
5722 /* Offset is set zero to mark that this bracket is still open */
5723
5724 PUT(code, 1, 0);
5725 code += 1 + LINK_SIZE + skipbytes;
5726
5727 /* Loop for each alternative branch */
5728
5729 orig_bracount = max_bracount = cd->bracount;
5730 for (;;)
5731 {
5732 /* For a (?| group, reset the capturing bracket count so that each branch
5733 uses the same numbers. */
5734
5735 if (reset_bracount) cd->bracount = orig_bracount;
5736
5737 /* Handle a change of ims options at the start of the branch */
5738
5739 if ((options & PCRE_IMS) != oldims)
5740 {
5741 *code++ = OP_OPT;
5742 *code++ = options & PCRE_IMS;
5743 length += 2;
5744 }
5745
5746 /* Set up dummy OP_REVERSE if lookbehind assertion */
5747
5748 if (lookbehind)
5749 {
5750 *code++ = OP_REVERSE;
5751 reverse_count = code;
5752 PUTINC(code, 0, 0);
5753 length += 1 + LINK_SIZE;
5754 }
5755
5756 /* Now compile the branch; in the pre-compile phase its length gets added
5757 into the length. */
5758
5759 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5760 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5761 {
5762 *ptrptr = ptr;
5763 return FALSE;
5764 }
5765
5766 /* Keep the highest bracket count in case (?| was used and some branch
5767 has fewer than the rest. */
5768
5769 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5770
5771 /* In the real compile phase, there is some post-processing to be done. */
5772
5773 if (lengthptr == NULL)
5774 {
5775 /* If this is the first branch, the firstbyte and reqbyte values for the
5776 branch become the values for the regex. */
5777
5778 if (*last_branch != OP_ALT)
5779 {
5780 firstbyte = branchfirstbyte;
5781 reqbyte = branchreqbyte;
5782 }
5783
5784 /* If this is not the first branch, the first char and reqbyte have to
5785 match the values from all the previous branches, except that if the
5786 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5787 and we set REQ_VARY for the regex. */
5788
5789 else
5790 {
5791 /* If we previously had a firstbyte, but it doesn't match the new branch,
5792 we have to abandon the firstbyte for the regex, but if there was
5793 previously no reqbyte, it takes on the value of the old firstbyte. */
5794
5795 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5796 {
5797 if (reqbyte < 0) reqbyte = firstbyte;
5798 firstbyte = REQ_NONE;
5799 }
5800
5801 /* If we (now or from before) have no firstbyte, a firstbyte from the
5802 branch becomes a reqbyte if there isn't a branch reqbyte. */
5803
5804 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5805 branchreqbyte = branchfirstbyte;
5806
5807 /* Now ensure that the reqbytes match */
5808
5809 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5810 reqbyte = REQ_NONE;
5811 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5812 }
5813
5814 /* If lookbehind, check that this branch matches a fixed-length string, and
5815 put the length into the OP_REVERSE item. Temporarily mark the end of the
5816 branch with OP_END. */
5817
5818 if (lookbehind)
5819 {
5820 int fixed_length;
5821 *code = OP_END;
5822 fixed_length = find_fixedlength(last_branch, options);
5823 DPRINTF(("fixed length = %d\n", fixed_length));
5824 if (fixed_length < 0)
5825 {
5826 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5827 *ptrptr = ptr;
5828 return FALSE;
5829 }
5830 PUT(reverse_count, 0, fixed_length);
5831 }
5832 }
5833
5834 /* Reached end of expression, either ')' or end of pattern. In the real
5835 compile phase, go back through the alternative branches and reverse the chain
5836 of offsets, with the field in the BRA item now becoming an offset to the
5837 first alternative. If there are no alternatives, it points to the end of the
5838 group. The length in the terminating ket is always the length of the whole
5839 bracketed item. If any of the ims options were changed inside the group,
5840 compile a resetting op-code following, except at the very end of the pattern.
5841 Return leaving the pointer at the terminating char. */
5842
5843 if (*ptr != CHAR_VERTICAL_LINE)
5844 {
5845 if (lengthptr == NULL)
5846 {
5847 int branch_length = code - last_branch;
5848 do
5849 {
5850 int prev_length = GET(last_branch, 1);
5851 PUT(last_branch, 1, branch_length);
5852 branch_length = prev_length;
5853 last_branch -= branch_length;
5854 }
5855 while (branch_length > 0);
5856 }
5857
5858 /* If it was a capturing subpattern, remove it from the chain. */
5859
5860 if (capnumber > 0) cd->open_caps = cd->open_caps->next;
5861
5862 /* Fill in the ket */
5863
5864 *code = OP_KET;
5865 PUT(code, 1, code - start_bracket);
5866 code += 1 + LINK_SIZE;
5867
5868 /* Resetting option if needed */
5869
5870 if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
5871 {
5872 *code++ = OP_OPT;
5873 *code++ = oldims;
5874 length += 2;
5875 }
5876
5877 /* Retain the highest bracket number, in case resetting was used. */
5878
5879 cd->bracount = max_bracount;
5880
5881 /* Set values to pass back */
5882
5883 *codeptr = code;
5884 *ptrptr = ptr;
5885 *firstbyteptr = firstbyte;
5886 *reqbyteptr = reqbyte;
5887 if (lengthptr != NULL)
5888 {
5889 if (OFLOW_MAX - *lengthptr < length)
5890 {
5891 *errorcodeptr = ERR20;
5892 return FALSE;
5893 }
5894 *lengthptr += length;
5895 }
5896 return TRUE;
5897 }
5898
5899 /* Another branch follows. In the pre-compile phase, we can move the code
5900 pointer back to where it was for the start of the first branch. (That is,
5901 pretend that each branch is the only one.)
5902
5903 In the real compile phase, insert an ALT node. Its length field points back
5904 to the previous branch while the bracket remains open. At the end the chain
5905 is reversed. It's done like this so that the start of the bracket has a
5906 zero offset until it is closed, making it possible to detect recursion. */
5907
5908 if (lengthptr != NULL)
5909 {
5910 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5911 length += 1 + LINK_SIZE;
5912 }
5913 else
5914 {
5915 *code = OP_ALT;
5916 PUT(code, 1, code - last_branch);
5917 bc.current = last_branch = code;
5918 code += 1 + LINK_SIZE;
5919 }
5920
5921 ptr++;
5922 }
5923 /* Control never reaches here */
5924 }
5925
5926
5927
5928
5929 /*************************************************
5930 * Check for anchored expression *
5931 *************************************************/
5932
5933 /* Try to find out if this is an anchored regular expression. Consider each
5934 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5935 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5936 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5937 counts, since OP_CIRC can match in the middle.
5938
5939 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5940 This is the code for \G, which means "match at start of match position, taking
5941 into account the match offset".
5942
5943 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5944 because that will try the rest of the pattern at all possible matching points,
5945 so there is no point trying again.... er ....
5946
5947 .... except when the .* appears inside capturing parentheses, and there is a
5948 subsequent back reference to those parentheses. We haven't enough information
5949 to catch that case precisely.
5950
5951 At first, the best we could do was to detect when .* was in capturing brackets
5952 and the highest back reference was greater than or equal to that level.
5953 However, by keeping a bitmap of the first 31 back references, we can catch some
5954 of the more common cases more precisely.
5955
5956 Arguments:
5957 code points to start of expression (the bracket)
5958 options points to the options setting
5959 bracket_map a bitmap of which brackets we are inside while testing; this
5960 handles up to substring 31; after that we just have to take
5961 the less precise approach
5962 backref_map the back reference bitmap
5963
5964 Returns: TRUE or FALSE
5965 */
5966
5967 static BOOL
5968 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5969 unsigned int backref_map)
5970 {
5971 do {
5972 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5973 options, PCRE_MULTILINE, FALSE);
5974 register int op = *scode;
5975
5976 /* Non-capturing brackets */
5977
5978 if (op == OP_BRA)
5979 {
5980 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5981 }
5982
5983 /* Capturing brackets */
5984
5985 else if (op == OP_CBRA)
5986 {
5987 int n = GET2(scode, 1+LINK_SIZE);
5988 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5989 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5990 }
5991
5992 /* Other brackets */
5993
5994 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5995 {
5996 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5997 }
5998
5999 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6000 it isn't in brackets that are or may be referenced. */
6001
6002 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6003 op == OP_TYPEPOSSTAR))
6004 {
6005 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6006 return FALSE;
6007 }
6008
6009 /* Check for explicit anchoring */
6010
6011 else if (op != OP_SOD && op != OP_SOM &&
6012 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6013 return FALSE;
6014 code += GET(code, 1);
6015 }
6016 while (*code == OP_ALT); /* Loop for each alternative */
6017 return TRUE;
6018 }
6019
6020
6021
6022 /*************************************************
6023 * Check for starting with ^ or .* *
6024 *************************************************/
6025
6026 /* This is called to find out if every branch starts with ^ or .* so that
6027 "first char" processing can be done to speed things up in multiline
6028 matching and for non-DOTALL patterns that start with .* (which must start at
6029 the beginning or after \n). As in the case of is_anchored() (see above), we
6030 have to take account of back references to capturing brackets that contain .*
6031 because in that case we can't make the assumption.
6032
6033 Arguments:
6034 code points to start of expression (the bracket)
6035 bracket_map a bitmap of which brackets we are inside while testing; this
6036 handles up to substring 31; after that we just have to take
6037 the less precise approach
6038 backref_map the back reference bitmap
6039
6040 Returns: TRUE or FALSE
6041 */
6042
6043 static BOOL
6044 is_startline(const uschar *code, unsigned int bracket_map,
6045 unsigned int backref_map)
6046 {
6047 do {
6048 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6049 NULL, 0, FALSE);
6050 register int op = *scode;
6051
6052 /* If we are at the start of a conditional assertion group, *both* the
6053 conditional assertion *and* what follows the condition must satisfy the test
6054 for start of line. Other kinds of condition fail. Note that there may be an
6055 auto-callout at the start of a condition. */
6056
6057 if (op == OP_COND)
6058 {
6059 scode += 1 + LINK_SIZE;
6060 if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6061 switch (*scode)
6062 {
6063 case OP_CREF:
6064 case OP_RREF:
6065 case OP_DEF:
6066 return FALSE;
6067
6068 default: /* Assertion */
6069 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6070 do scode += GET(scode, 1); while (*scode == OP_ALT);
6071 scode += 1 + LINK_SIZE;
6072 break;
6073 }
6074 scode = first_significant_code(scode, NULL, 0, FALSE);
6075 op = *scode;
6076 }
6077
6078 /* Non-capturing brackets */
6079
6080 if (op == OP_BRA)
6081 {
6082 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6083 }
6084
6085 /* Capturing brackets */
6086
6087 else if (op == OP_CBRA)
6088 {
6089 int n = GET2(scode, 1+LINK_SIZE);
6090 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6091 if (!is_startline(scode, new_map, backref_map)) return FALSE;
6092 }
6093
6094 /* Other brackets */
6095
6096 else if (op == OP_ASSERT || op == OP_ONCE)
6097 {
6098 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6099 }
6100
6101 /* .* means "start at start or after \n" if it isn't in brackets that
6102 may be referenced. */
6103
6104 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6105 {
6106 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6107 }
6108
6109 /* Check for explicit circumflex */
6110
6111 else if (op != OP_CIRC) return FALSE;
6112
6113 /* Move on to the next alternative */
6114
6115 code += GET(code, 1);
6116 }
6117 while (*code == OP_ALT); /* Loop for each alternative */
6118 return TRUE;
6119 }
6120
6121
6122
6123 /*************************************************
6124 * Check for asserted fixed first char *
6125 *************************************************/
6126
6127 /* During compilation, the "first char" settings from forward assertions are
6128 discarded, because they can cause conflicts with actual literals that follow.
6129 However, if we end up without a first char setting for an unanchored pattern,
6130 it is worth scanning the regex to see if there is an initial asserted first
6131 char. If all branches start with the same asserted char, or with a bracket all
6132 of whose alternatives start with the same asserted char (recurse ad lib), then
6133 we return that char, otherwise -1.
6134
6135 Arguments:
6136 code points to start of expression (the bracket)
6137 options pointer to the options (used to check casing changes)
6138 inassert TRUE if in an assertion
6139
6140 Returns: -1 or the fixed first char
6141 */
6142
6143 static int
6144 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6145 {
6146 register int c = -1;
6147 do {
6148 int d;
6149 const uschar *scode =
6150 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6151 register int op = *scode;
6152
6153 switch(op)
6154 {
6155 default:
6156 return -1;
6157
6158 case OP_BRA:
6159 case OP_CBRA:
6160 case OP_ASSERT:
6161 case OP_ONCE:
6162 case OP_COND:
6163 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6164 return -1;
6165 if (c < 0) c = d; else if (c != d) return -1;
6166 break;
6167
6168 case OP_EXACT: /* Fall through */
6169 scode += 2;
6170
6171 case OP_CHAR:
6172 case OP_CHARNC:
6173 case OP_PLUS:
6174 case OP_MINPLUS:
6175 case OP_POSPLUS:
6176 if (!inassert) return -1;
6177 if (c < 0)
6178 {
6179 c = scode[1];
6180 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6181 }
6182 else if (c != scode[1]) return -1;
6183 break;
6184 }
6185
6186 code += GET(code, 1);
6187 }
6188 while (*code == OP_ALT);
6189 return c;
6190 }
6191
6192
6193
6194 /*************************************************
6195 * Compile a Regular Expression *
6196 *************************************************/
6197
6198 /* This function takes a string and returns a pointer to a block of store
6199 holding a compiled version of the expression. The original API for this
6200 function had no error code return variable; it is retained for backwards
6201 compatibility. The new function is given a new name.
6202
6203 Arguments:
6204 pattern the regular expression
6205 options various option bits
6206 errorcodeptr pointer to error code variable (pcre_compile2() only)
6207 can be NULL if you don't want a code value
6208 errorptr pointer to pointer to error text
6209 erroroffset ptr offset in pattern where error was detected
6210 tables pointer to character tables or NULL
6211
6212 Returns: pointer to compiled data block, or NULL on error,
6213 with errorptr and erroroffset set
6214 */
6215
6216 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6217 pcre_compile(const char *pattern, int options, const char **errorptr,
6218 int *erroroffset, const unsigned char *tables)
6219 {
6220 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6221 }
6222
6223
6224 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6225 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6226 const char **errorptr, int *erroroffset, const unsigned char *tables)
6227 {
6228 real_pcre *re;
6229 int length = 1; /* For final END opcode */
6230 int firstbyte, reqbyte, newline;
6231 int errorcode = 0;
6232 int skipatstart = 0;
6233 #ifdef SUPPORT_UTF8
6234 BOOL utf8;
6235 #endif
6236 size_t size;
6237 uschar *code;
6238 const uschar *codestart;
6239 const uschar *ptr;
6240 compile_data compile_block;
6241 compile_data *cd = &compile_block;
6242
6243 /* This space is used for "compiling" into during the first phase, when we are
6244 computing the amount of memory that is needed. Compiled items are thrown away
6245 as soon as possible, so that a fairly large buffer should be sufficient for
6246 this purpose. The same space is used in the second phase for remembering where
6247 to fill in forward references to subpatterns. */
6248
6249 uschar cworkspace[COMPILE_WORK_SIZE];
6250
6251 /* Set this early so that early errors get offset 0. */
6252
6253 ptr = (const uschar *)pattern;
6254
6255 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6256 can do is just return NULL, but we can set a code value if there is a code
6257 pointer. */
6258
6259 if (errorptr == NULL)
6260 {
6261 if (errorcodeptr != NULL) *errorcodeptr = 99;
6262 return NULL;
6263 }
6264
6265 *errorptr = NULL;
6266 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6267
6268 /* However, we can give a message for this error */
6269
6270 if (erroroffset == NULL)
6271 {
6272 errorcode = ERR16;
6273 goto PCRE_EARLY_ERROR_RETURN2;
6274 }
6275
6276 *erroroffset = 0;
6277
6278 /* Set up pointers to the individual character tables */
6279
6280 if (tables == NULL) tables = _pcre_default_tables;
6281 cd->lcc = tables + lcc_offset;
6282 cd->fcc = tables + fcc_offset;
6283 cd->cbits = tables + cbits_offset;
6284 cd->ctypes = tables + ctypes_offset;
6285
6286 /* Check that all undefined public option bits are zero */
6287
6288 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6289 {
6290 errorcode = ERR17;
6291 goto PCRE_EARLY_ERROR_RETURN;
6292 }
6293
6294 /* Check for global one-time settings at the start of the pattern, and remember
6295 the offset for later. */
6296
6297 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6298 ptr[skipatstart+1] == CHAR_ASTERISK)
6299 {
6300 int newnl = 0;
6301 int newbsr = 0;
6302
6303 if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6304 { skipatstart += 7; options |= PCRE_UTF8; continue; }
6305
6306 if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6307 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6308 else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0)
6309 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6310 else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0)
6311 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6312 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6313 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6314 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6315 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6316
6317 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6318 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6319 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6320 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6321
6322 if (newnl != 0)
6323 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6324 else if (newbsr != 0)
6325 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6326 else break;
6327 }
6328
6329 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6330
6331 #ifdef SUPPORT_UTF8
6332 utf8 = (options & PCRE_UTF8) != 0;
6333 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6334 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6335 {
6336 errorcode = ERR44;
6337 goto PCRE_EARLY_ERROR_RETURN2;
6338 }
6339 #else
6340 if ((options & PCRE_UTF8) != 0)
6341 {
6342 errorcode = ERR32;
6343 goto PCRE_EARLY_ERROR_RETURN;
6344 }
6345 #endif
6346
6347 /* Check validity of \R options. */
6348
6349 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6350 {
6351 case 0:
6352 case PCRE_BSR_ANYCRLF:
6353 case PCRE_BSR_UNICODE:
6354 break;
6355 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6356 }
6357
6358 /* Handle different types of newline. The three bits give seven cases. The
6359 current code allows for fixed one- or two-byte sequences, plus "any" and
6360 "anycrlf". */
6361
6362 switch (options & PCRE_NEWLINE_BITS)
6363 {
6364 case 0: newline = NEWLINE; break; /* Build-time default */
6365 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6366 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6367 case PCRE_NEWLINE_CR+
6368 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6369 case PCRE_NEWLINE_ANY: newline = -1; break;
6370 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6371 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6372 }
6373
6374 if (newline == -2)
6375 {
6376 cd->nltype = NLTYPE_ANYCRLF;
6377 }
6378 else if (newline < 0)
6379 {
6380 cd->nltype = NLTYPE_ANY;
6381 }
6382 else
6383 {
6384 cd->nltype = NLTYPE_FIXED;
6385 if (newline > 255)
6386 {
6387 cd->nllen = 2;
6388 cd->nl[0] = (newline >> 8) & 255;
6389 cd->nl[1] = newline & 255;
6390 }
6391 else
6392 {
6393 cd->nllen = 1;
6394 cd->nl[0] = newline;
6395 }
6396 }
6397
6398 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6399 references to help in deciding whether (.*) can be treated as anchored or not.
6400 */
6401
6402 cd->top_backref = 0;
6403 cd->backref_map = 0;
6404
6405 /* Reflect pattern for debugging output */
6406
6407 DPRINTF(("------------------------------------------------------------------\n"));
6408 DPRINTF(("%s\n", pattern));
6409
6410 /* Pretend to compile the pattern while actually just accumulating the length
6411 of memory required. This behaviour is triggered by passing a non-NULL final
6412 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6413 to compile parts of the pattern into; the compiled code is discarded when it is
6414 no longer needed, so hopefully this workspace will never overflow, though there
6415 is a test for its doing so. */
6416
6417 cd->bracount = cd->final_bracount = 0;
6418 cd->names_found = 0;
6419 cd->name_entry_size = 0;
6420 cd->name_table = NULL;
6421 cd->start_workspace = cworkspace;
6422 cd->start_code = cworkspace;
6423 cd->hwm = cworkspace;
6424 cd->start_pattern = (const uschar *)pattern;
6425 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6426 cd->req_varyopt = 0;
6427 cd->external_options = options;
6428 cd->external_flags = 0;
6429 cd->open_caps = NULL;
6430
6431 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6432 don't need to look at the result of the function here. The initial options have
6433 been put into the cd block so that they can be changed if an option setting is
6434 found within the regex right at the beginning. Bringing initial option settings
6435 outside can help speed up starting point checks. */
6436
6437 ptr += skipatstart;
6438 code = cworkspace;
6439 *code = OP_BRA;
6440 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6441 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6442 &length);
6443 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6444
6445 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6446 cd->hwm - cworkspace));
6447
6448 if (length > MAX_PATTERN_SIZE)
6449 {
6450 errorcode = ERR20;
6451 goto PCRE_EARLY_ERROR_RETURN;
6452 }
6453
6454 /* Compute the size of data block needed and get it, either from malloc or
6455 externally provided function. Integer overflow should no longer be possible
6456 because nowadays we limit the maximum value of cd->names_found and
6457 cd->name_entry_size. */
6458
6459 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6460 re = (real_pcre *)(pcre_malloc)(size);
6461
6462 if (re == NULL)
6463 {
6464 errorcode = ERR21;
6465 goto PCRE_EARLY_ERROR_RETURN;
6466 }
6467
6468 /* Put in the magic number, and save the sizes, initial options, internal
6469 flags, and character table pointer. NULL is used for the default character
6470 tables. The nullpad field is at the end; it's there to help in the case when a
6471 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6472 pointers. */
6473
6474 re->magic_number = MAGIC_NUMBER;
6475 re->size = size;
6476 re->options = cd->external_options;
6477 re->flags = cd->external_flags;
6478 re->dummy1 = 0;
6479 re->first_byte = 0;
6480 re->req_byte = 0;
6481 re->name_table_offset = sizeof(real_pcre);
6482 re->name_entry_size = cd->name_entry_size;
6483 re->name_count = cd->names_found;
6484 re->ref_count = 0;
6485 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6486 re->nullpad = NULL;
6487
6488 /* The starting points of the name/number translation table and of the code are
6489 passed around in the compile data block. The start/end pattern and initial
6490 options are already set from the pre-compile phase, as is the name_entry_size
6491 field. Reset the bracket count and the names_found field. Also reset the hwm
6492 field; this time it's used for remembering forward references to subpatterns.
6493 */
6494
6495 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6496 cd->bracount = 0;
6497 cd->names_found = 0;
6498 cd->name_table = (uschar *)re + re->name_table_offset;
6499 codestart = cd->name_table + re->name_entry_size * re->name_count;
6500 cd->start_code = codestart;
6501 cd->hwm = cworkspace;
6502 cd->req_varyopt = 0;
6503 cd->had_accept = FALSE;
6504 cd->open_caps = NULL;
6505
6506 /* Set up a starting, non-extracting bracket, then compile the expression. On
6507 error, errorcode will be set non-zero, so we don't need to look at the result
6508 of the function here. */
6509
6510 ptr = (const uschar *)pattern + skipatstart;
6511 code = (uschar *)codestart;
6512 *code = OP_BRA;
6513 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6514 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6515 re->top_bracket = cd->bracount;
6516 re->top_backref = cd->top_backref;
6517 re->flags = cd->external_flags;
6518
6519 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6520
6521 /* If not reached end of pattern on success, there's an excess bracket. */
6522
6523 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6524
6525 /* Fill in the terminating state and check for disastrous overflow, but
6526 if debugging, leave the test till after things are printed out. */
6527
6528 *code++ = OP_END;
6529
6530 #ifndef DEBUG
6531 if (code - codestart > length) errorcode = ERR23;
6532 #endif
6533
6534 /* Fill in any forward references that are required. */
6535
6536 while (errorcode == 0 && cd->hwm > cworkspace)
6537 {
6538 int offset, recno;
6539 const uschar *groupptr;
6540 cd->hwm -= LINK_SIZE;
6541 offset = GET(cd->hwm, 0);
6542 recno = GET(codestart, offset);
6543 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6544 if (groupptr == NULL) errorcode = ERR53;
6545 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6546 }
6547
6548 /* Give an error if there's back reference to a non-existent capturing
6549 subpattern. */
6550
6551 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6552
6553 /* Failed to compile, or error while post-processing */
6554
6555 if (errorcode != 0)
6556 {
6557 (pcre_free)(re);
6558 PCRE_EARLY_ERROR_RETURN:
6559 *erroroffset = ptr - (const uschar *)pattern;
6560 PCRE_EARLY_ERROR_RETURN2:
6561 *errorptr = find_error_text(errorcode);
6562 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6563 return NULL;
6564 }
6565
6566 /* If the anchored option was not passed, set the flag if we can determine that
6567 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6568 as starting with .* when DOTALL is set).
6569
6570 Otherwise, if we know what the first byte has to be, save it, because that
6571 speeds up unanchored matches no end. If not, see if we can set the
6572 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6573 start with ^. and also when all branches start with .* for non-DOTALL matches.
6574 */
6575
6576 if ((re->options & PCRE_ANCHORED) == 0)
6577 {
6578 int temp_options = re->options; /* May get changed during these scans */
6579 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6580 re->options |= PCRE_ANCHORED;
6581 else
6582 {
6583 if (firstbyte < 0)
6584 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6585 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6586 {
6587 int ch = firstbyte & 255;
6588 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6589 cd->fcc[ch] == ch)? ch : firstbyte;
6590 re->flags |= PCRE_FIRSTSET;
6591 }
6592 else if (is_startline(codestart, 0, cd->backref_map))
6593 re->flags |= PCRE_STARTLINE;
6594 }
6595 }
6596
6597 /* For an anchored pattern, we use the "required byte" only if it follows a
6598 variable length item in the regex. Remove the caseless flag for non-caseable
6599 bytes. */
6600
6601 if (reqbyte >= 0 &&
6602 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6603 {
6604 int ch = reqbyte & 255;
6605 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6606 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6607 re->flags |= PCRE_REQCHSET;
6608 }
6609
6610 /* Print out the compiled data if debugging is enabled. This is never the
6611 case when building a production library. */
6612
6613 #ifdef DEBUG
6614
6615 printf("Length = %d top_bracket = %d top_backref = %d\n",
6616 length, re->top_bracket, re->top_backref);
6617
6618 printf("Options=%08x\n", re->options);
6619
6620 if ((re->flags & PCRE_FIRSTSET) != 0)
6621 {
6622 int ch = re->first_byte & 255;
6623 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6624 "" : " (caseless)";
6625 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6626 else printf("First char = \\x%02x%s\n", ch, caseless);
6627 }
6628
6629 if ((re->flags & PCRE_REQCHSET) != 0)
6630 {
6631 int ch = re->req_byte & 255;
6632 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6633 "" : " (caseless)";
6634 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6635 else printf("Req char = \\x%02x%s\n", ch, caseless);
6636 }
6637
6638 pcre_printint(re, stdout, TRUE);
6639
6640 /* This check is done here in the debugging case so that the code that
6641 was compiled can be seen. */
6642
6643 if (code - codestart > length)
6644 {
6645 (pcre_free)(re);
6646 *errorptr = find_error_text(ERR23);
6647 *erroroffset = ptr - (uschar *)pattern;
6648 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6649 return NULL;
6650 }
6651 #endif /* DEBUG */
6652
6653 return (pcre *)re;
6654 }
6655
6656 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12