/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory | Revision Log


Revision 412 - (show annotations) (download)
Sat Apr 11 10:34:37 2009 UTC (5 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 211898 byte(s)
Add support for (*UTF8).

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a, using eight bits per byte with bit (b%8) of byte (b/8). Both
arguments and the whole expansion are parenthesized so that an argument which
is an expression (for example SETBIT(map, c+1)) expands as intended. Note that
b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC
101
102 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103 in UTF-8 mode. */
104
/* Escape lookup table, indexed by (character - CHAR_0), so it covers the
characters '0' to 'z' in ASCII order. The trailing comment on each row names
the two characters that the row's entries belong to. A positive entry is the
data value of the escape, a negative entry is minus an ESC_xxx code, and zero
means that the caller has to do further processing (or the escape is
invalid). */

static const short int escapes[] = {
     0,                       0,                          /* 0 1 */
     0,                       0,                          /* 2 3 */
     0,                       0,                          /* 4 5 */
     0,                       0,                          /* 6 7 */
     0,                       0,                          /* 8 9 */
     CHAR_COLON,              CHAR_SEMICOLON,             /* : ; */
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* < = */
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* > ? */
     CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @ A */
     -ESC_B,                  -ESC_C,                     /* B C */
     -ESC_D,                  -ESC_E,                     /* D E */
     0,                       -ESC_G,                     /* F G */
     -ESC_H,                  0,                          /* H I */
     0,                       -ESC_K,                     /* J K */
     0,                       0,                          /* L M */
     0,                       0,                          /* N O */
     -ESC_P,                  -ESC_Q,                     /* P Q */
     -ESC_R,                  -ESC_S,                     /* R S */
     0,                       0,                          /* T U */
     -ESC_V,                  -ESC_W,                     /* V W */
     -ESC_X,                  0,                          /* X Y */
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z [ */
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* \ ] */
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^ _ */
     CHAR_GRAVE_ACCENT,       7,                          /* ` a */
     -ESC_b,                  0,                          /* b c */
     -ESC_d,                  ESC_e,                      /* d e */
     ESC_f,                   0,                          /* f g */
     -ESC_h,                  0,                          /* h i */
     0,                       -ESC_k,                     /* j k */
     0,                       0,                          /* l m */
     ESC_n,                   0,                          /* n o */
     -ESC_p,                  0,                          /* p q */
     ESC_r,                   -ESC_s,                     /* r s */
     ESC_tee,                 0,                          /* t u */
     -ESC_v,                  -ESC_w,                     /* v w */
     0,                       0,                          /* x y */
     -ESC_z                                               /* z */
};
145
146 #else
147
148 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149
150 static const short int escapes[] = {
151 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
152 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
153 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
154 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
155 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
156 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
157 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
158 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
159 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
160 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
161 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
162 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
163 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
164 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
165 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
166 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
167 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
168 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
169 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
170 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
171 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
172 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
173 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
174 };
175 #endif
176
177
178 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179 searched linearly. Put all the names into a single string, in order to reduce
180 the number of relocations when a shared library is dynamically linked. The
181 string is built from string macros so that it works in UTF-8 mode on EBCDIC
182 platforms. */
183
/* One entry of the verb table. The entries in verbs[] below are listed in the
same order as the names in verbnames[]. */

typedef struct verbitem {
  int   len;   /* Length of the verb's name, e.g. 6 for ACCEPT */
  int   op;    /* The OP_xxx opcode associated with the verb */
} verbitem;

/* All the verb names, concatenated into one string by the STRING_xxx macros.
NOTE(review): the "0" suffix on all but the last macro presumably means that
the name carries its own terminating zero - confirm against the definitions of
these macros, which are not in this file. */

static const char verbnames[] =
  STRING_ACCEPT0
  STRING_COMMIT0
  STRING_F0
  STRING_FAIL0
  STRING_PRUNE0
  STRING_SKIP0
  STRING_THEN;

/* Name length and opcode for each verb. Note that both F and FAIL map to
OP_FAIL. */

static const verbitem verbs[] = {
  { 6, OP_ACCEPT },
  { 6, OP_COMMIT },
  { 1, OP_FAIL },
  { 4, OP_FAIL },
  { 5, OP_PRUNE },
  { 4, OP_SKIP  },
  { 4, OP_THEN  }
};

/* Number of entries in verbs[] */

static const int verbcount = sizeof(verbs)/sizeof(verbitem);
209
210
211 /* Tables of names of POSIX character classes and their lengths. The names are
212 now all in a single string, to reduce the number of relocations when a shared
213 library is dynamically loaded. The list of lengths is terminated by a zero
214 length entry. The first three must be alpha, lower, upper, as this is assumed
215 for handling case independence. */
216
/* The POSIX class names, concatenated into a single string by the STRING_xxx
macros. The order here fixes the order of the two tables that follow. */

static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  STRING_word0  STRING_xdigit;

/* The length of each name, in the same order as posix_names: twelve names of
length 5, then "word" (4) and "xdigit" (6). The final zero terminates the
list. */

static const uschar posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };

/* Table of class bit maps for each POSIX class. Each class is formed from a
base map, with an optional addition or removal of another map. Then, for some
classes, there is some additional tweaking: for [:blank:] the vertical space
characters are removed, and for [:alpha:] and [:alnum:] the underscore
character is removed. The triples in the table consist of the base map offset,
second map offset or -1 if no second map, and a non-negative value for map
addition or a negative value for map subtraction (if there are two maps). The
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
remove vertical space characters, 2 => remove underscore. There is one triple
per class, in the same order as the names in posix_names. */

static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
252
253
254 #define STRING(a) # a
255 #define XSTRING(s) STRING(s)
256
257 /* The texts of compile-time error messages. These are "char *" because they
258 are passed to the outside world. Do not ever re-use any error number, because
259 they are documented. Always add a new error instead. Messages marked DEAD below
260 are no longer used. This used to be a table of strings, but in order to reduce
261 the number of relocations needed when a shared library is loaded dynamically,
262 it is now one long string. We cannot use a table of offsets, because the
263 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264 simply count through to the one we want - this isn't a performance issue
265 because these strings are used only when there is a compilation error. */
266
267 static const char error_texts[] =
268 "no error\0"
269 "\\ at end of pattern\0"
270 "\\c at end of pattern\0"
271 "unrecognized character follows \\\0"
272 "numbers out of order in {} quantifier\0"
273 /* 5 */
274 "number too big in {} quantifier\0"
275 "missing terminating ] for character class\0"
276 "invalid escape sequence in character class\0"
277 "range out of order in character class\0"
278 "nothing to repeat\0"
279 /* 10 */
280 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
281 "internal error: unexpected repeat\0"
282 "unrecognized character after (? or (?-\0"
283 "POSIX named classes are supported only within a class\0"
284 "missing )\0"
285 /* 15 */
286 "reference to non-existent subpattern\0"
287 "erroffset passed as NULL\0"
288 "unknown option bit(s) set\0"
289 "missing ) after comment\0"
290 "parentheses nested too deeply\0" /** DEAD **/
291 /* 20 */
292 "regular expression is too large\0"
293 "failed to get memory\0"
294 "unmatched parentheses\0"
295 "internal error: code overflow\0"
296 "unrecognized character after (?<\0"
297 /* 25 */
298 "lookbehind assertion is not fixed length\0"
299 "malformed number or name after (?(\0"
300 "conditional group contains more than two branches\0"
301 "assertion expected after (?(\0"
302 "(?R or (?[+-]digits must be followed by )\0"
303 /* 30 */
304 "unknown POSIX class name\0"
305 "POSIX collating elements are not supported\0"
306 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307 "spare error\0" /** DEAD **/
308 "character value in \\x{...} sequence is too large\0"
309 /* 35 */
310 "invalid condition (?(0)\0"
311 "\\C not allowed in lookbehind assertion\0"
312 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313 "number after (?C is > 255\0"
314 "closing ) for (?C expected\0"
315 /* 40 */
316 "recursive call could loop indefinitely\0"
317 "unrecognized character after (?P\0"
318 "syntax error in subpattern name (missing terminator)\0"
319 "two named subpatterns have the same name\0"
320 "invalid UTF-8 string\0"
321 /* 45 */
322 "support for \\P, \\p, and \\X has not been compiled\0"
323 "malformed \\P or \\p sequence\0"
324 "unknown property name after \\P or \\p\0"
325 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327 /* 50 */
328 "repeated subpattern is too long\0" /** DEAD **/
329 "octal value is greater than \\377 (not in UTF-8 mode)\0"
330 "internal error: overran compiling workspace\0"
331 "internal error: previously-checked referenced subpattern not found\0"
332 "DEFINE group contains more than one branch\0"
333 /* 55 */
334 "repeating a DEFINE group is not allowed\0"
335 "inconsistent NEWLINE options\0"
336 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337 "a numbered reference must not be zero\0"
338 "(*VERB) with an argument is not supported\0"
339 /* 60 */
340 "(*VERB) not recognized\0"
341 "number is too big\0"
342 "subpattern name expected\0"
343 "digit expected after (?+\0"
344 "] is an invalid data character in JavaScript compatibility mode";
345
346
347 /* Table to identify digits and hex digits. This is used when compiling
348 patterns. Note that the tables in chartables are dependent on the locale, and
349 may mark arbitrary characters as digits - but the PCRE compiling code expects
350 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
351 a private table here. It costs 256 bytes, but it is a lot faster than doing
352 character value tests (at least in some simple cases I timed), and in some
353 applications one wants PCRE to compile efficiently as well as match
354 efficiently.
355
356 For convenience, we use the same bit definitions as in chartables:
357
358 0x04 decimal digit
359 0x08 hexadecimal digit
360
361 Then we can use ctype_digit and ctype_xdigit in the code. */
362
363 #ifndef EBCDIC
364
365 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
366 UTF-8 mode. */
367
368 static const unsigned char digitab[] =
369 {
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
376 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
377 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
378 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
382 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
395 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
396 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
397 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
399 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
402
403 #else
404
405 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
406
407 static const unsigned char digitab[] =
408 {
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
413 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
414 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
415 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
416 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
417 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
418 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
419 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
420 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
421 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
422 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
423 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
424 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
425 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
426 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
427 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
428 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
429 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
430 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
431 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
432 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
433 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
434 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
435 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
436 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
437 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
438 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
439 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
440 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
441
442 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
443 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
444 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
445 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
446 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
447 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
448 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
449 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
450 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
451 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
452 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
453 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
454 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
455 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
456 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
457 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
458 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
459 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
460 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
461 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
462 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
463 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
464 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
465 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
466 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
467 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
468 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
469 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
470 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
471 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
472 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
473 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
474 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
475 #endif
476
477
/* Forward declaration of compile_regex(), so that functions that appear before
its definition can call it; this is what allows the mutual recursion.
NOTE(review): the meaning of each parameter is not visible from this part of
the file - see the comment at the function's definition. */

static BOOL
  compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
    int *, int *, branch_chain *, compile_data *, int *);
483
484
485
486 /*************************************************
487 * Find an error text *
488 *************************************************/
489
490 /* The error texts are now all in one long string, to save on relocations. As
491 some of the text is of unknown length, we can't use a table of offsets.
492 Instead, just count through the strings. This is not a performance issue
493 because it happens only when there has been a compilation error.
494
495 Argument: the error number
496 Returns: pointer to the error string
497 */
498
499 static const char *
500 find_error_text(int n)
501 {
502 const char *s = error_texts;
503 for (; n > 0; n--) while (*s++ != 0) {};
504 return s;
505 }
506
507
508 /*************************************************
509 * Handle escapes *
510 *************************************************/
511
512 /* This function is called when a \ has been encountered. It either returns a
513 positive value for a simple escape such as \n, or a negative value which
514 encodes one of the more complicated things such as \d. A backreference to group
515 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
516 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
517 ptr is pointing at the \. On exit, it is on the final character of the escape
518 sequence.
519
520 Arguments:
521 ptrptr points to the pattern position pointer
522 errorcodeptr points to the errorcode variable
523 bracount number of previous extracting brackets
524 options the options bits
525 isclass TRUE if inside a character class
526
527 Returns: zero or positive => a data character
528 negative => a special escape sequence
529 on error, errorcodeptr is set
530 */
531
532 static int
533 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
534 int options, BOOL isclass)
535 {
536 BOOL utf8 = (options & PCRE_UTF8) != 0;
537 const uschar *ptr = *ptrptr + 1;
538 int c, i;
539
540 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
541 ptr--; /* Set pointer back to the last byte */
542
543 /* If backslash is at the end of the pattern, it's an error. */
544
545 if (c == 0) *errorcodeptr = ERR1;
546
547 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
548 in a table. A non-zero result is something that can be returned immediately.
549 Otherwise further processing may be required. */
550
551 #ifndef EBCDIC /* ASCII/UTF-8 coding */
552 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
553 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
554
555 #else /* EBCDIC coding */
556 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
557 else if ((i = escapes[c - 0x48]) != 0) c = i;
558 #endif
559
560 /* Escapes that need further processing, or are illegal. */
561
562 else
563 {
564 const uschar *oldptr;
565 BOOL braced, negated;
566
567 switch (c)
568 {
569 /* A number of Perl escapes are not handled by PCRE. We give an explicit
570 error. */
571
572 case CHAR_l:
573 case CHAR_L:
574 case CHAR_N:
575 case CHAR_u:
576 case CHAR_U:
577 *errorcodeptr = ERR37;
578 break;
579
580 /* \g must be followed by one of a number of specific things:
581
582 (1) A number, either plain or braced. If positive, it is an absolute
583 backreference. If negative, it is a relative backreference. This is a Perl
584 5.10 feature.
585
586 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
587 is part of Perl's movement towards a unified syntax for back references. As
588 this is synonymous with \k{name}, we fudge it up by pretending it really
589 was \k.
590
591 (3) For Oniguruma compatibility we also support \g followed by a name or a
592 number either in angle brackets or in single quotes. However, these are
593 (possibly recursive) subroutine calls, _not_ backreferences. Just return
594 the -ESC_g code (cf \k). */
595
596 case CHAR_g:
597 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
598 {
599 c = -ESC_g;
600 break;
601 }
602
603 /* Handle the Perl-compatible cases */
604
605 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
606 {
607 const uschar *p;
608 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
609 if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
610 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
611 {
612 c = -ESC_k;
613 break;
614 }
615 braced = TRUE;
616 ptr++;
617 }
618 else braced = FALSE;
619
620 if (ptr[1] == CHAR_MINUS)
621 {
622 negated = TRUE;
623 ptr++;
624 }
625 else negated = FALSE;
626
627 c = 0;
628 while ((digitab[ptr[1]] & ctype_digit) != 0)
629 c = c * 10 + *(++ptr) - CHAR_0;
630
631 if (c < 0) /* Integer overflow */
632 {
633 *errorcodeptr = ERR61;
634 break;
635 }
636
637 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
638 {
639 *errorcodeptr = ERR57;
640 break;
641 }
642
643 if (c == 0)
644 {
645 *errorcodeptr = ERR58;
646 break;
647 }
648
649 if (negated)
650 {
651 if (c > bracount)
652 {
653 *errorcodeptr = ERR15;
654 break;
655 }
656 c = bracount - (c - 1);
657 }
658
659 c = -(ESC_REF + c);
660 break;
661
662 /* The handling of escape sequences consisting of a string of digits
663 starting with one that is not zero is not straightforward. By experiment,
664 the way Perl works seems to be as follows:
665
666 Outside a character class, the digits are read as a decimal number. If the
667 number is less than 10, or if there are that many previous extracting
668 left brackets, then it is a back reference. Otherwise, up to three octal
669 digits are read to form an escaped byte. Thus \123 is likely to be octal
670 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
671 value is greater than 377, the least significant 8 bits are taken. Inside a
672 character class, \ followed by a digit is always an octal number. */
673
674 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
675 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
676
677 if (!isclass)
678 {
679 oldptr = ptr;
680 c -= CHAR_0;
681 while ((digitab[ptr[1]] & ctype_digit) != 0)
682 c = c * 10 + *(++ptr) - CHAR_0;
683 if (c < 0) /* Integer overflow */
684 {
685 *errorcodeptr = ERR61;
686 break;
687 }
688 if (c < 10 || c <= bracount)
689 {
690 c = -(ESC_REF + c);
691 break;
692 }
693 ptr = oldptr; /* Put the pointer back and fall through */
694 }
695
696 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
697 generates a binary zero byte and treats the digit as a following literal.
698 Thus we have to pull back the pointer by one. */
699
700 if ((c = *ptr) >= CHAR_8)
701 {
702 ptr--;
703 c = 0;
704 break;
705 }
706
707 /* \0 always starts an octal number, but we may drop through to here with a
708 larger first octal digit. The original code used just to take the least
709 significant 8 bits of octal numbers (I think this is what early Perls used
710 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
711 than 3 octal digits. */
712
713 case CHAR_0:
714 c -= CHAR_0;
715 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
716 c = c * 8 + *(++ptr) - CHAR_0;
717 if (!utf8 && c > 255) *errorcodeptr = ERR51;
718 break;
719
720 /* \x is complicated. \x{ddd} is a character number which can be greater
721 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
722 treated as a data character. */
723
724 case CHAR_x:
725 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
726 {
727 const uschar *pt = ptr + 2;
728 int count = 0;
729
730 c = 0;
731 while ((digitab[*pt] & ctype_xdigit) != 0)
732 {
733 register int cc = *pt++;
734 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
735 count++;
736
737 #ifndef EBCDIC /* ASCII/UTF-8 coding */
738 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
739 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
740 #else /* EBCDIC coding */
741 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
742 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
743 #endif
744 }
745
746 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
747 {
748 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
749 ptr = pt;
750 break;
751 }
752
753 /* If the sequence of hex digits does not end with '}', then we don't
754 recognize this construct; fall through to the normal \x handling. */
755 }
756
757 /* Read just a single-byte hex-defined char */
758
759 c = 0;
760 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
761 {
762 int cc; /* Some compilers don't like */
763 cc = *(++ptr); /* ++ in initializers */
764 #ifndef EBCDIC /* ASCII/UTF-8 coding */
765 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
766 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
767 #else /* EBCDIC coding */
768 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
769 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
770 #endif
771 }
772 break;
773
774 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
775 This coding is ASCII-specific, but then the whole concept of \cx is
776 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
777
778 case CHAR_c:
779 c = *(++ptr);
780 if (c == 0)
781 {
782 *errorcodeptr = ERR2;
783 break;
784 }
785
786 #ifndef EBCDIC /* ASCII/UTF-8 coding */
787 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
788 c ^= 0x40;
789 #else /* EBCDIC coding */
790 if (c >= CHAR_a && c <= CHAR_z) c += 64;
791 c ^= 0xC0;
792 #endif
793 break;
794
795 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
796 other alphanumeric following \ is an error if PCRE_EXTRA was set;
797 otherwise, for Perl compatibility, it is a literal. This code looks a bit
798 odd, but there used to be some cases other than the default, and there may
799 be again in future, so I haven't "optimized" it. */
800
801 default:
802 if ((options & PCRE_EXTRA) != 0) switch(c)
803 {
804 default:
805 *errorcodeptr = ERR3;
806 break;
807 }
808 break;
809 }
810 }
811
812 *ptrptr = ptr;
813 return c;
814 }
815
816
817
818 #ifdef SUPPORT_UCP
819 /*************************************************
820 * Handle \P and \p *
821 *************************************************/
822
823 /* This function is called after \P or \p has been encountered, provided that
824 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
825 pointing at the P or p. On exit, it is pointing at the final character of the
826 escape sequence.
827
828 Argument:
829 ptrptr points to the pattern position pointer
830 negptr points to a boolean that is set TRUE for negation else FALSE
831 dptr points to an int that is set to the detailed property value
832 errorcodeptr points to the error code variable
833
834 Returns: type value from ucp_type_table, or -1 for an invalid type
835 */
836
837 static int
838 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
839 {
840 int c, i, bot, top;
841 const uschar *ptr = *ptrptr;
842 char name[32];
843
844 c = *(++ptr);
845 if (c == 0) goto ERROR_RETURN;
846
847 *negptr = FALSE;
848
849 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
850 negation. */
851
852 if (c == CHAR_LEFT_CURLY_BRACKET)
853 {
854 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
855 {
856 *negptr = TRUE;
857 ptr++;
858 }
859 for (i = 0; i < (int)sizeof(name) - 1; i++)
860 {
861 c = *(++ptr);
862 if (c == 0) goto ERROR_RETURN;
863 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
864 name[i] = c;
865 }
866 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
867 name[i] = 0;
868 }
869
870 /* Otherwise there is just one following character */
871
872 else
873 {
874 name[0] = c;
875 name[1] = 0;
876 }
877
878 *ptrptr = ptr;
879
880 /* Search for a recognized property name using binary chop */
881
882 bot = 0;
883 top = _pcre_utt_size;
884
885 while (bot < top)
886 {
887 i = (bot + top) >> 1;
888 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
889 if (c == 0)
890 {
891 *dptr = _pcre_utt[i].value;
892 return _pcre_utt[i].type;
893 }
894 if (c > 0) bot = i + 1; else top = i;
895 }
896
897 *errorcodeptr = ERR47;
898 *ptrptr = ptr;
899 return -1;
900
901 ERROR_RETURN:
902 *errorcodeptr = ERR46;
903 *ptrptr = ptr;
904 return -1;
905 }
906 #endif
907
908
909
910
911 /*************************************************
912 * Check for counted repeat *
913 *************************************************/
914
915 /* This function is called when a '{' is encountered in a place where it might
916 start a quantifier. It looks ahead to see if it really is a quantifier or not.
917 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
918 where the ddds are digits.
919
920 Arguments:
921 p pointer to the first char after '{'
922
923 Returns: TRUE or FALSE
924 */
925
926 static BOOL
927 is_counted_repeat(const uschar *p)
928 {
929 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
930 while ((digitab[*p] & ctype_digit) != 0) p++;
931 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
932
933 if (*p++ != CHAR_COMMA) return FALSE;
934 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935
936 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
937 while ((digitab[*p] & ctype_digit) != 0) p++;
938
939 return (*p == CHAR_RIGHT_CURLY_BRACKET);
940 }
941
942
943
944 /*************************************************
945 * Read repeat counts *
946 *************************************************/
947
948 /* Read an item of the form {n,m} and return the values. This is called only
949 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
950 so the syntax is guaranteed to be correct, but we need to check the values.
951
952 Arguments:
953 p pointer to first char after '{'
954 minp pointer to int for min
955 maxp pointer to int for max
956 returned as -1 if no max
957 errorcodeptr points to error code variable
958
959 Returns: pointer to '}' on success;
960 current ptr on error, with errorcodeptr set non-zero
961 */
962
963 static const uschar *
964 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
965 {
966 int min = 0;
967 int max = -1;
968
969 /* Read the minimum value and do a paranoid check: a negative value indicates
970 an integer overflow. */
971
972 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
973 if (min < 0 || min > 65535)
974 {
975 *errorcodeptr = ERR5;
976 return p;
977 }
978
979 /* Read the maximum value if there is one, and again do a paranoid on its size.
980 Also, max must not be less than min. */
981
982 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
983 {
984 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
985 {
986 max = 0;
987 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
988 if (max < 0 || max > 65535)
989 {
990 *errorcodeptr = ERR5;
991 return p;
992 }
993 if (max < min)
994 {
995 *errorcodeptr = ERR4;
996 return p;
997 }
998 }
999 }
1000
1001 /* Fill in the required variables, and pass back the pointer to the terminating
1002 '}'. */
1003
1004 *minp = min;
1005 *maxp = max;
1006 return p;
1007 }
1008
1009
1010
1011 /*************************************************
1012 * Subroutine for finding forward reference *
1013 *************************************************/
1014
1015 /* This recursive function is called only from find_parens() below. The
1016 top-level call starts at the beginning of the pattern. All other calls must
1017 start at a parenthesis. It scans along a pattern's text looking for capturing
1018 subpatterns, and counting them. If it finds a named pattern that matches the
1019 name it is given, it returns its number. Alternatively, if the name is NULL, it
1020 returns when it reaches a given numbered subpattern. We know that if (?P< is
1021 encountered, the name will be terminated by '>' because that is checked in the
1022 first pass. Recursion is used to keep track of subpatterns that reset the
1023 capturing group numbers - the (?| feature.
1024
1025 Arguments:
1026 ptrptr address of the current character pointer (updated)
1027 cd compile background data
1028 name name to seek, or NULL if seeking a numbered subpattern
1029 lorn name length, or subpattern number if name is NULL
1030 xmode TRUE if we are in /x mode
1031 count pointer to the current capturing subpattern number (updated)
1032
1033 Returns: the number of the named subpattern, or -1 if not found
1034 */
1035
1036 static int
1037 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1038 BOOL xmode, int *count)
1039 {
1040 uschar *ptr = *ptrptr;
1041 int start_count = *count;
1042 int hwm_count = start_count;
1043 BOOL dup_parens = FALSE;
1044
1045 /* If the first character is a parenthesis, check on the type of group we are
1046 dealing with. The very first call may not start with a parenthesis. */
1047
1048 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1049 {
1050 if (ptr[1] == CHAR_QUESTION_MARK &&
1051 ptr[2] == CHAR_VERTICAL_LINE)
1052 {
1053 ptr += 3;
1054 dup_parens = TRUE;
1055 }
1056
1057 /* Handle a normal, unnamed capturing parenthesis */
1058
1059 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1060 {
1061 *count += 1;
1062 if (name == NULL && *count == lorn) return *count;
1063 ptr++;
1064 }
1065
1066 /* Handle a condition. If it is an assertion, just carry on so that it
1067 is processed as normal. If not, skip to the closing parenthesis of the
1068 condition (there can't be any nested parens. */
1069
1070 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1071 {
1072 ptr += 2;
1073 if (ptr[1] != CHAR_QUESTION_MARK)
1074 {
1075 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1076 if (*ptr != 0) ptr++;
1077 }
1078 }
1079
1080 /* We have either (? or (* and not a condition */
1081
1082 else
1083 {
1084 ptr += 2;
1085 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1086
1087 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1088
1089 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1090 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1091 {
1092 int term;
1093 const uschar *thisname;
1094 *count += 1;
1095 if (name == NULL && *count == lorn) return *count;
1096 term = *ptr++;
1097 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1098 thisname = ptr;
1099 while (*ptr != term) ptr++;
1100 if (name != NULL && lorn == ptr - thisname &&
1101 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1102 return *count;
1103 }
1104 }
1105 }
1106
1107 /* Past any initial parenthesis handling, scan for parentheses or vertical
1108 bars. */
1109
1110 for (; *ptr != 0; ptr++)
1111 {
1112 /* Skip over backslashed characters and also entire \Q...\E */
1113
1114 if (*ptr == CHAR_BACKSLASH)
1115 {
1116 if (*(++ptr) == 0) goto FAIL_EXIT;
1117 if (*ptr == CHAR_Q) for (;;)
1118 {
1119 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1120 if (*ptr == 0) goto FAIL_EXIT;
1121 if (*(++ptr) == CHAR_E) break;
1122 }
1123 continue;
1124 }
1125
1126 /* Skip over character classes; this logic must be similar to the way they
1127 are handled for real. If the first character is '^', skip it. Also, if the
1128 first few characters (either before or after ^) are \Q\E or \E we skip them
1129 too. This makes for compatibility with Perl. Note the use of STR macros to
1130 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1131
1132 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1133 {
1134 BOOL negate_class = FALSE;
1135 for (;;)
1136 {
1137 int c = *(++ptr);
1138 if (c == CHAR_BACKSLASH)
1139 {
1140 if (ptr[1] == CHAR_E)
1141 ptr++;
1142 else if (strncmp((const char *)ptr+1,
1143 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1144 ptr += 3;
1145 else
1146 break;
1147 }
1148 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
1149 negate_class = TRUE;
1150 else break;
1151 }
1152
1153 /* If the next character is ']', it is a data character that must be
1154 skipped, except in JavaScript compatibility mode. */
1155
1156 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1157 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1158 ptr++;
1159
1160 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1161 {
1162 if (*ptr == 0) return -1;
1163 if (*ptr == CHAR_BACKSLASH)
1164 {
1165 if (*(++ptr) == 0) goto FAIL_EXIT;
1166 if (*ptr == CHAR_Q) for (;;)
1167 {
1168 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1169 if (*ptr == 0) goto FAIL_EXIT;
1170 if (*(++ptr) == CHAR_E) break;
1171 }
1172 continue;
1173 }
1174 }
1175 continue;
1176 }
1177
1178 /* Skip comments in /x mode */
1179
1180 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1181 {
1182 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1183 if (*ptr == 0) goto FAIL_EXIT;
1184 continue;
1185 }
1186
1187 /* Check for the special metacharacters */
1188
1189 if (*ptr == CHAR_LEFT_PARENTHESIS)
1190 {
1191 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1192 if (rc > 0) return rc;
1193 if (*ptr == 0) goto FAIL_EXIT;
1194 }
1195
1196 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1197 {
1198 if (dup_parens && *count < hwm_count) *count = hwm_count;
1199 *ptrptr = ptr;
1200 return -1;
1201 }
1202
1203 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1204 {
1205 if (*count > hwm_count) hwm_count = *count;
1206 *count = start_count;
1207 }
1208 }
1209
1210 FAIL_EXIT:
1211 *ptrptr = ptr;
1212 return -1;
1213 }
1214
1215
1216
1217
1218 /*************************************************
1219 * Find forward referenced subpattern *
1220 *************************************************/
1221
1222 /* This function scans along a pattern's text looking for capturing
1223 subpatterns, and counting them. If it finds a named pattern that matches the
1224 name it is given, it returns its number. Alternatively, if the name is NULL, it
1225 returns when it reaches a given numbered subpattern. This is used for forward
1226 references to subpatterns. We used to be able to start this scan from the
1227 current compiling point, using the current count value from cd->bracount, and
1228 do it all in a single loop, but the addition of the possibility of duplicate
1229 subpattern numbers means that we have to scan from the very start, in order to
1230 take account of such duplicates, and to use a recursive function to keep track
1231 of the different types of group.
1232
1233 Arguments:
1234 cd compile background data
1235 name name to seek, or NULL if seeking a numbered subpattern
1236 lorn name length, or subpattern number if name is NULL
1237 xmode TRUE if we are in /x mode
1238
1239 Returns: the number of the found subpattern, or -1 if not found
1240 */
1241
1242 static int
1243 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1244 {
1245 uschar *ptr = (uschar *)cd->start_pattern;
1246 int count = 0;
1247 int rc;
1248
1249 /* If the pattern does not start with an opening parenthesis, the first call
1250 to find_parens_sub() will scan right to the end (if necessary). However, if it
1251 does start with a parenthesis, find_parens_sub() will return when it hits the
1252 matching closing parens. That is why we have to have a loop. */
1253
1254 for (;;)
1255 {
1256 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1257 if (rc > 0 || *ptr++ == 0) break;
1258 }
1259
1260 return rc;
1261 }
1262
1263
1264
1265
1266 /*************************************************
1267 * Find first significant op code *
1268 *************************************************/
1269
1270 /* This is called by several functions that scan a compiled expression looking
1271 for a fixed first character, or an anchoring op code etc. It skips over things
1272 that do not influence this. For some calls, a change of option is important.
1273 For some calls, it makes sense to skip negative forward and all backward
1274 assertions, and also the \b assertion; for others it does not.
1275
1276 Arguments:
1277 code pointer to the start of the group
1278 options pointer to external options
1279 optbit the option bit whose changing is significant, or
1280 zero if none are
1281 skipassert TRUE if certain assertions are to be skipped
1282
1283 Returns: pointer to the first significant opcode
1284 */
1285
1286 static const uschar*
1287 first_significant_code(const uschar *code, int *options, int optbit,
1288 BOOL skipassert)
1289 {
1290 for (;;)
1291 {
1292 switch ((int)*code)
1293 {
1294 case OP_OPT:
1295 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1296 *options = (int)code[1];
1297 code += 2;
1298 break;
1299
1300 case OP_ASSERT_NOT:
1301 case OP_ASSERTBACK:
1302 case OP_ASSERTBACK_NOT:
1303 if (!skipassert) return code;
1304 do code += GET(code, 1); while (*code == OP_ALT);
1305 code += _pcre_OP_lengths[*code];
1306 break;
1307
1308 case OP_WORD_BOUNDARY:
1309 case OP_NOT_WORD_BOUNDARY:
1310 if (!skipassert) return code;
1311 /* Fall through */
1312
1313 case OP_CALLOUT:
1314 case OP_CREF:
1315 case OP_RREF:
1316 case OP_DEF:
1317 code += _pcre_OP_lengths[*code];
1318 break;
1319
1320 default:
1321 return code;
1322 }
1323 }
1324 /* Control never reaches here */
1325 }
1326
1327
1328
1329
1330 /*************************************************
1331 * Find the fixed length of a pattern *
1332 *************************************************/
1333
1334 /* Scan a pattern and compute the fixed length of subject that will match it,
1335 if the length is fixed. This is needed for dealing with backward assertions.
1336 In UTF8 mode, the result is in characters rather than bytes.
1337
1338 Arguments:
1339 code points to the start of the pattern (the bracket)
1340 options the compiling options
1341
1342 Returns: the fixed length, or -1 if there is no fixed length,
1343 or -2 if \C was encountered
1344 */
1345
1346 static int
1347 find_fixedlength(uschar *code, int options)
1348 {
1349 int length = -1;
1350
1351 register int branchlength = 0;
1352 register uschar *cc = code + 1 + LINK_SIZE;
1353
1354 /* Scan along the opcodes for this branch. If we get to the end of the
1355 branch, check the length against that of the other branches. */
1356
1357 for (;;)
1358 {
1359 int d;
1360 register int op = *cc;
1361 switch (op)
1362 {
1363 case OP_CBRA:
1364 case OP_BRA:
1365 case OP_ONCE:
1366 case OP_COND:
1367 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1368 if (d < 0) return d;
1369 branchlength += d;
1370 do cc += GET(cc, 1); while (*cc == OP_ALT);
1371 cc += 1 + LINK_SIZE;
1372 break;
1373
1374 /* Reached end of a branch; if it's a ket it is the end of a nested
1375 call. If it's ALT it is an alternation in a nested call. If it is
1376 END it's the end of the outer call. All can be handled by the same code. */
1377
1378 case OP_ALT:
1379 case OP_KET:
1380 case OP_KETRMAX:
1381 case OP_KETRMIN:
1382 case OP_END:
1383 if (length < 0) length = branchlength;
1384 else if (length != branchlength) return -1;
1385 if (*cc != OP_ALT) return length;
1386 cc += 1 + LINK_SIZE;
1387 branchlength = 0;
1388 break;
1389
1390 /* Skip over assertive subpatterns */
1391
1392 case OP_ASSERT:
1393 case OP_ASSERT_NOT:
1394 case OP_ASSERTBACK:
1395 case OP_ASSERTBACK_NOT:
1396 do cc += GET(cc, 1); while (*cc == OP_ALT);
1397 /* Fall through */
1398
1399 /* Skip over things that don't match chars */
1400
1401 case OP_REVERSE:
1402 case OP_CREF:
1403 case OP_RREF:
1404 case OP_DEF:
1405 case OP_OPT:
1406 case OP_CALLOUT:
1407 case OP_SOD:
1408 case OP_SOM:
1409 case OP_EOD:
1410 case OP_EODN:
1411 case OP_CIRC:
1412 case OP_DOLL:
1413 case OP_NOT_WORD_BOUNDARY:
1414 case OP_WORD_BOUNDARY:
1415 cc += _pcre_OP_lengths[*cc];
1416 break;
1417
1418 /* Handle literal characters */
1419
1420 case OP_CHAR:
1421 case OP_CHARNC:
1422 case OP_NOT:
1423 branchlength++;
1424 cc += 2;
1425 #ifdef SUPPORT_UTF8
1426 if ((options & PCRE_UTF8) != 0)
1427 {
1428 while ((*cc & 0xc0) == 0x80) cc++;
1429 }
1430 #endif
1431 break;
1432
1433 /* Handle exact repetitions. The count is already in characters, but we
1434 need to skip over a multibyte character in UTF8 mode. */
1435
1436 case OP_EXACT:
1437 branchlength += GET2(cc,1);
1438 cc += 4;
1439 #ifdef SUPPORT_UTF8
1440 if ((options & PCRE_UTF8) != 0)
1441 {
1442 while((*cc & 0x80) == 0x80) cc++;
1443 }
1444 #endif
1445 break;
1446
1447 case OP_TYPEEXACT:
1448 branchlength += GET2(cc,1);
1449 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1450 cc += 4;
1451 break;
1452
1453 /* Handle single-char matchers */
1454
1455 case OP_PROP:
1456 case OP_NOTPROP:
1457 cc += 2;
1458 /* Fall through */
1459
1460 case OP_NOT_DIGIT:
1461 case OP_DIGIT:
1462 case OP_NOT_WHITESPACE:
1463 case OP_WHITESPACE:
1464 case OP_NOT_WORDCHAR:
1465 case OP_WORDCHAR:
1466 case OP_ANY:
1467 case OP_ALLANY:
1468 branchlength++;
1469 cc++;
1470 break;
1471
1472 /* The single-byte matcher isn't allowed */
1473
1474 case OP_ANYBYTE:
1475 return -2;
1476
1477 /* Check a class for variable quantification */
1478
1479 #ifdef SUPPORT_UTF8
1480 case OP_XCLASS:
1481 cc += GET(cc, 1) - 33;
1482 /* Fall through */
1483 #endif
1484
1485 case OP_CLASS:
1486 case OP_NCLASS:
1487 cc += 33;
1488
1489 switch (*cc)
1490 {
1491 case OP_CRSTAR:
1492 case OP_CRMINSTAR:
1493 case OP_CRQUERY:
1494 case OP_CRMINQUERY:
1495 return -1;
1496
1497 case OP_CRRANGE:
1498 case OP_CRMINRANGE:
1499 if (GET2(cc,1) != GET2(cc,3)) return -1;
1500 branchlength += GET2(cc,1);
1501 cc += 5;
1502 break;
1503
1504 default:
1505 branchlength++;
1506 }
1507 break;
1508
1509 /* Anything else is variable length */
1510
1511 default:
1512 return -1;
1513 }
1514 }
1515 /* Control never gets here */
1516 }
1517
1518
1519
1520
1521 /*************************************************
1522 * Scan compiled regex for numbered bracket *
1523 *************************************************/
1524
1525 /* This little function scans through a compiled pattern until it finds a
1526 capturing bracket with the given number.
1527
1528 Arguments:
1529 code points to start of expression
1530 utf8 TRUE in UTF-8 mode
1531 number the required bracket number
1532
1533 Returns: pointer to the opcode for the bracket, or NULL if not found
1534 */
1535
1536 static const uschar *
1537 find_bracket(const uschar *code, BOOL utf8, int number)
1538 {
1539 for (;;)
1540 {
1541 register int c = *code;
1542 if (c == OP_END) return NULL;
1543
1544 /* XCLASS is used for classes that cannot be represented just by a bit
1545 map. This includes negated single high-valued characters. The length in
1546 the table is zero; the actual length is stored in the compiled code. */
1547
1548 if (c == OP_XCLASS) code += GET(code, 1);
1549
1550 /* Handle capturing bracket */
1551
1552 else if (c == OP_CBRA)
1553 {
1554 int n = GET2(code, 1+LINK_SIZE);
1555 if (n == number) return (uschar *)code;
1556 code += _pcre_OP_lengths[c];
1557 }
1558
1559 /* Otherwise, we can get the item's length from the table, except that for
1560 repeated character types, we have to test for \p and \P, which have an extra
1561 two bytes of parameters. */
1562
1563 else
1564 {
1565 switch(c)
1566 {
1567 case OP_TYPESTAR:
1568 case OP_TYPEMINSTAR:
1569 case OP_TYPEPLUS:
1570 case OP_TYPEMINPLUS:
1571 case OP_TYPEQUERY:
1572 case OP_TYPEMINQUERY:
1573 case OP_TYPEPOSSTAR:
1574 case OP_TYPEPOSPLUS:
1575 case OP_TYPEPOSQUERY:
1576 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1577 break;
1578
1579 case OP_TYPEUPTO:
1580 case OP_TYPEMINUPTO:
1581 case OP_TYPEEXACT:
1582 case OP_TYPEPOSUPTO:
1583 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1584 break;
1585 }
1586
1587 /* Add in the fixed length from the table */
1588
1589 code += _pcre_OP_lengths[c];
1590
1591 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1592 a multi-byte character. The length in the table is a minimum, so we have to
1593 arrange to skip the extra bytes. */
1594
1595 #ifdef SUPPORT_UTF8
1596 if (utf8) switch(c)
1597 {
1598 case OP_CHAR:
1599 case OP_CHARNC:
1600 case OP_EXACT:
1601 case OP_UPTO:
1602 case OP_MINUPTO:
1603 case OP_POSUPTO:
1604 case OP_STAR:
1605 case OP_MINSTAR:
1606 case OP_POSSTAR:
1607 case OP_PLUS:
1608 case OP_MINPLUS:
1609 case OP_POSPLUS:
1610 case OP_QUERY:
1611 case OP_MINQUERY:
1612 case OP_POSQUERY:
1613 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1614 break;
1615 }
1616 #else
1617 (void)(utf8); /* Keep compiler happy by referencing function argument */
1618 #endif
1619 }
1620 }
1621 }
1622
1623
1624
1625 /*************************************************
1626 * Scan compiled regex for recursion reference *
1627 *************************************************/
1628
1629 /* This little function scans through a compiled pattern until it finds an
1630 instance of OP_RECURSE.
1631
1632 Arguments:
1633 code points to start of expression
1634 utf8 TRUE in UTF-8 mode
1635
1636 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1637 */
1638
1639 static const uschar *
1640 find_recurse(const uschar *code, BOOL utf8)
1641 {
1642 for (;;)
1643 {
1644 register int c = *code;
1645 if (c == OP_END) return NULL;
1646 if (c == OP_RECURSE) return code;
1647
1648 /* XCLASS is used for classes that cannot be represented just by a bit
1649 map. This includes negated single high-valued characters. The length in
1650 the table is zero; the actual length is stored in the compiled code. */
1651
1652 if (c == OP_XCLASS) code += GET(code, 1);
1653
1654 /* Otherwise, we can get the item's length from the table, except that for
1655 repeated character types, we have to test for \p and \P, which have an extra
1656 two bytes of parameters. */
1657
1658 else
1659 {
1660 switch(c)
1661 {
1662 case OP_TYPESTAR:
1663 case OP_TYPEMINSTAR:
1664 case OP_TYPEPLUS:
1665 case OP_TYPEMINPLUS:
1666 case OP_TYPEQUERY:
1667 case OP_TYPEMINQUERY:
1668 case OP_TYPEPOSSTAR:
1669 case OP_TYPEPOSPLUS:
1670 case OP_TYPEPOSQUERY:
1671 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1672 break;
1673
1674 case OP_TYPEPOSUPTO:
1675 case OP_TYPEUPTO:
1676 case OP_TYPEMINUPTO:
1677 case OP_TYPEEXACT:
1678 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1679 break;
1680 }
1681
1682 /* Add in the fixed length from the table */
1683
1684 code += _pcre_OP_lengths[c];
1685
1686 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1687 by a multi-byte character. The length in the table is a minimum, so we have
1688 to arrange to skip the extra bytes. */
1689
1690 #ifdef SUPPORT_UTF8
1691 if (utf8) switch(c)
1692 {
1693 case OP_CHAR:
1694 case OP_CHARNC:
1695 case OP_EXACT:
1696 case OP_UPTO:
1697 case OP_MINUPTO:
1698 case OP_POSUPTO:
1699 case OP_STAR:
1700 case OP_MINSTAR:
1701 case OP_POSSTAR:
1702 case OP_PLUS:
1703 case OP_MINPLUS:
1704 case OP_POSPLUS:
1705 case OP_QUERY:
1706 case OP_MINQUERY:
1707 case OP_POSQUERY:
1708 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1709 break;
1710 }
1711 #else
1712 (void)(utf8); /* Keep compiler happy by referencing function argument */
1713 #endif
1714 }
1715 }
1716 }
1717
1718
1719
1720 /*************************************************
1721 * Scan compiled branch for non-emptiness *
1722 *************************************************/
1723
1724 /* This function scans through a branch of a compiled pattern to see whether it
1725 can match the empty string or not. It is called from could_be_empty()
1726 below and from compile_branch() when checking for an unlimited repeat of a
1727 group that can match nothing. Note that first_significant_code() skips over
1728 backward and negative forward assertions when its final argument is TRUE. If we
1729 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1730 bracket whose current branch will already have been scanned.
1731
1732 Arguments:
1733 code points to start of search
1734 endcode points to where to stop
1735 utf8 TRUE if in UTF8 mode
1736
1737 Returns: TRUE if what is matched could be empty
1738 */
1739
1740 static BOOL
1741 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1742 {
1743 register int c;
1744 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1745 code < endcode;
1746 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1747 {
1748 const uschar *ccode;
1749
1750 c = *code;
1751
1752 /* Skip over forward assertions; the other assertions are skipped by
1753 first_significant_code() with a TRUE final argument. */
1754
1755 if (c == OP_ASSERT)
1756 {
1757 do code += GET(code, 1); while (*code == OP_ALT);
1758 c = *code;
1759 continue;
1760 }
1761
1762 /* Groups with zero repeats can of course be empty; skip them. */
1763
1764 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1765 {
1766 code += _pcre_OP_lengths[c];
1767 do code += GET(code, 1); while (*code == OP_ALT);
1768 c = *code;
1769 continue;
1770 }
1771
1772 /* For other groups, scan the branches. */
1773
1774 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1775 {
1776 BOOL empty_branch;
1777 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1778
1779 /* If a conditional group has only one branch, there is a second, implied,
1780 empty branch, so just skip over the conditional, because it could be empty.
1781 Otherwise, scan the individual branches of the group. */
1782
1783 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1784 code += GET(code, 1);
1785 else
1786 {
1787 empty_branch = FALSE;
1788 do
1789 {
1790 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1791 empty_branch = TRUE;
1792 code += GET(code, 1);
1793 }
1794 while (*code == OP_ALT);
1795 if (!empty_branch) return FALSE; /* All branches are non-empty */
1796 }
1797
1798 c = *code;
1799 continue;
1800 }
1801
1802 /* Handle the other opcodes */
1803
1804 switch (c)
1805 {
1806 /* Check for quantifiers after a class. XCLASS is used for classes that
1807 cannot be represented just by a bit map. This includes negated single
1808 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1809 actual length is stored in the compiled code, so we must update "code"
1810 here. */
1811
1812 #ifdef SUPPORT_UTF8
1813 case OP_XCLASS:
1814 ccode = code += GET(code, 1);
1815 goto CHECK_CLASS_REPEAT;
1816 #endif
1817
1818 case OP_CLASS:
1819 case OP_NCLASS:
1820 ccode = code + 33;
1821
1822 #ifdef SUPPORT_UTF8
1823 CHECK_CLASS_REPEAT:
1824 #endif
1825
1826 switch (*ccode)
1827 {
1828 case OP_CRSTAR: /* These could be empty; continue */
1829 case OP_CRMINSTAR:
1830 case OP_CRQUERY:
1831 case OP_CRMINQUERY:
1832 break;
1833
1834 default: /* Non-repeat => class must match */
1835 case OP_CRPLUS: /* These repeats aren't empty */
1836 case OP_CRMINPLUS:
1837 return FALSE;
1838
1839 case OP_CRRANGE:
1840 case OP_CRMINRANGE:
1841 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1842 break;
1843 }
1844 break;
1845
1846 /* Opcodes that must match a character */
1847
1848 case OP_PROP:
1849 case OP_NOTPROP:
1850 case OP_EXTUNI:
1851 case OP_NOT_DIGIT:
1852 case OP_DIGIT:
1853 case OP_NOT_WHITESPACE:
1854 case OP_WHITESPACE:
1855 case OP_NOT_WORDCHAR:
1856 case OP_WORDCHAR:
1857 case OP_ANY:
1858 case OP_ALLANY:
1859 case OP_ANYBYTE:
1860 case OP_CHAR:
1861 case OP_CHARNC:
1862 case OP_NOT:
1863 case OP_PLUS:
1864 case OP_MINPLUS:
1865 case OP_POSPLUS:
1866 case OP_EXACT:
1867 case OP_NOTPLUS:
1868 case OP_NOTMINPLUS:
1869 case OP_NOTPOSPLUS:
1870 case OP_NOTEXACT:
1871 case OP_TYPEPLUS:
1872 case OP_TYPEMINPLUS:
1873 case OP_TYPEPOSPLUS:
1874 case OP_TYPEEXACT:
1875 return FALSE;
1876
1877 /* These are going to continue, as they may be empty, but we have to
1878 fudge the length for the \p and \P cases. */
1879
1880 case OP_TYPESTAR:
1881 case OP_TYPEMINSTAR:
1882 case OP_TYPEPOSSTAR:
1883 case OP_TYPEQUERY:
1884 case OP_TYPEMINQUERY:
1885 case OP_TYPEPOSQUERY:
1886 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1887 break;
1888
1889 /* Same for these */
1890
1891 case OP_TYPEUPTO:
1892 case OP_TYPEMINUPTO:
1893 case OP_TYPEPOSUPTO:
1894 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1895 break;
1896
1897 /* End of branch */
1898
1899 case OP_KET:
1900 case OP_KETRMAX:
1901 case OP_KETRMIN:
1902 case OP_ALT:
1903 return TRUE;
1904
1905 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1906 MINUPTO, and POSUPTO may be followed by a multibyte character */
1907
1908 #ifdef SUPPORT_UTF8
1909 case OP_STAR:
1910 case OP_MINSTAR:
1911 case OP_POSSTAR:
1912 case OP_QUERY:
1913 case OP_MINQUERY:
1914 case OP_POSQUERY:
1915 case OP_UPTO:
1916 case OP_MINUPTO:
1917 case OP_POSUPTO:
1918 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1919 break;
1920 #endif
1921 }
1922 }
1923
1924 return TRUE;
1925 }
1926
1927
1928
1929 /*************************************************
1930 * Scan compiled regex for non-emptiness *
1931 *************************************************/
1932
1933 /* This function is called to check for left recursive calls. We want to check
1934 the current branch of the current pattern to see if it could match the empty
1935 string. If it could, we must look outwards for branches at other levels,
1936 stopping when we pass beyond the bracket which is the subject of the recursion.
1937
1938 Arguments:
1939 code points to start of the recursion
1940 endcode points to where to stop (current RECURSE item)
1941 bcptr points to the chain of current (unclosed) branch starts
1942 utf8 TRUE if in UTF-8 mode
1943
1944 Returns: TRUE if what is matched could be empty
1945 */
1946
1947 static BOOL
1948 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1949 BOOL utf8)
1950 {
1951 while (bcptr != NULL && bcptr->current >= code)
1952 {
1953 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1954 bcptr = bcptr->outer;
1955 }
1956 return TRUE;
1957 }
1958
1959
1960
1961 /*************************************************
1962 * Check for POSIX class syntax *
1963 *************************************************/
1964
1965 /* This function is called when the sequence "[:" or "[." or "[=" is
1966 encountered in a character class. It checks whether this is followed by a
1967 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1968 reach an unescaped ']' without the special preceding character, return FALSE.
1969
1970 Originally, this function only recognized a sequence of letters between the
1971 terminators, but it seems that Perl recognizes any sequence of characters,
1972 though of course unknown POSIX names are subsequently rejected. Perl gives an
1973 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1974 didn't consider this to be a POSIX class. Likewise for [:1234:].
1975
1976 The problem in trying to be exactly like Perl is in the handling of escapes. We
1977 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1978 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1979 below handles the special case of \], but does not try to do any other escape
1980 processing. This makes it different from Perl for cases such as [:l\ower:]
1981 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1982 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1983 I think.
1984
1985 Arguments:
1986 ptr pointer to the initial [
1987 endptr where to return the end pointer
1988
1989 Returns: TRUE or FALSE
1990 */
1991
1992 static BOOL
1993 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1994 {
1995 int terminator; /* Don't combine these lines; the Solaris cc */
1996 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1997 for (++ptr; *ptr != 0; ptr++)
1998 {
1999 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2000 {
2001 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2002 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2003 {
2004 *endptr = ptr;
2005 return TRUE;
2006 }
2007 }
2008 }
2009 return FALSE;
2010 }
2011
2012
2013
2014
2015 /*************************************************
2016 * Check POSIX class name *
2017 *************************************************/
2018
2019 /* This function is called to check the name given in a POSIX-style class entry
2020 such as [:alnum:].
2021
2022 Arguments:
2023 ptr points to the first letter
2024 len the length of the name
2025
2026 Returns: a value representing the name, or -1 if unknown
2027 */
2028
2029 static int
2030 check_posix_name(const uschar *ptr, int len)
2031 {
2032 const char *pn = posix_names;
2033 register int yield = 0;
2034 while (posix_name_lengths[yield] != 0)
2035 {
2036 if (len == posix_name_lengths[yield] &&
2037 strncmp((const char *)ptr, pn, len) == 0) return yield;
2038 pn += posix_name_lengths[yield] + 1;
2039 yield++;
2040 }
2041 return -1;
2042 }
2043
2044
2045 /*************************************************
2046 * Adjust OP_RECURSE items in repeated group *
2047 *************************************************/
2048
2049 /* OP_RECURSE items contain an offset from the start of the regex to the group
2050 that is referenced. This means that groups can be replicated for fixed
2051 repetition simply by copying (because the recursion is allowed to refer to
2052 earlier groups that are outside the current group). However, when a group is
2053 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2054 inserted before it, after it has been compiled. This means that any OP_RECURSE
2055 items within it that refer to the group itself or any contained groups have to
2056 have their offsets adjusted. That one of the jobs of this function. Before it
2057 is called, the partially compiled regex must be temporarily terminated with
2058 OP_END.
2059
2060 This function has been extended with the possibility of forward references for
2061 recursions and subroutine calls. It must also check the list of such references
2062 for the group we are dealing with. If it finds that one of the recursions in
2063 the current group is on this list, it adjusts the offset in the list, not the
2064 value in the reference (which is a group number).
2065
2066 Arguments:
2067 group points to the start of the group
2068 adjust the amount by which the group is to be moved
2069 utf8 TRUE in UTF-8 mode
2070 cd contains pointers to tables etc.
2071 save_hwm the hwm forward reference pointer at the start of the group
2072
2073 Returns: nothing
2074 */
2075
2076 static void
2077 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2078 uschar *save_hwm)
2079 {
2080 uschar *ptr = group;
2081
2082 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2083 {
2084 int offset;
2085 uschar *hc;
2086
2087 /* See if this recursion is on the forward reference list. If so, adjust the
2088 reference. */
2089
2090 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2091 {
2092 offset = GET(hc, 0);
2093 if (cd->start_code + offset == ptr + 1)
2094 {
2095 PUT(hc, 0, offset + adjust);
2096 break;
2097 }
2098 }
2099
2100 /* Otherwise, adjust the recursion offset if it's after the start of this
2101 group. */
2102
2103 if (hc >= cd->hwm)
2104 {
2105 offset = GET(ptr, 1);
2106 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2107 }
2108
2109 ptr += 1 + LINK_SIZE;
2110 }
2111 }
2112
2113
2114
2115 /*************************************************
2116 * Insert an automatic callout point *
2117 *************************************************/
2118
2119 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2120 callout points before each pattern item.
2121
2122 Arguments:
2123 code current code pointer
2124 ptr current pattern pointer
2125 cd pointers to tables etc
2126
2127 Returns: new code pointer
2128 */
2129
2130 static uschar *
2131 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2132 {
2133 *code++ = OP_CALLOUT;
2134 *code++ = 255;
2135 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2136 PUT(code, LINK_SIZE, 0); /* Default length */
2137 return code + 2*LINK_SIZE;
2138 }
2139
2140
2141
2142 /*************************************************
2143 * Complete a callout item *
2144 *************************************************/
2145
2146 /* A callout item contains the length of the next item in the pattern, which
2147 we can't fill in till after we have reached the relevant point. This is used
2148 for both automatic and manual callouts.
2149
2150 Arguments:
2151 previous_callout points to previous callout item
2152 ptr current pattern pointer
2153 cd pointers to tables etc
2154
2155 Returns: nothing
2156 */
2157
2158 static void
2159 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2160 {
2161 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2162 PUT(previous_callout, 2 + LINK_SIZE, length);
2163 }
2164
2165
2166
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is given the start and end of a class range, in UTF-8 mode
with UCP support. It works upwards through the characters, looking for internal
ranges of characters in the "other" case. Each call yields the next such range
and updates the start value so that a following call carries on from where this
one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int first_oc = 0;
unsigned int expected;

/* Find the first character in the range whose other case differs from the
character itself. */

while (c <= d)
  {
  first_oc = UCD_OTHERCASE(c);
  if (first_oc != c) break;
  c++;
  }

if (c > d) return FALSE;     /* Nothing (more) in the range has another case */

/* That character's other case starts the output range. Extend the range for
as long as the following characters have consecutive other-case values. */

*ocptr = first_oc;
expected = first_oc + 1;

c++;
while (c <= d && UCD_OTHERCASE(c) == expected)
  {
  c++;
  expected++;
  }

*odptr = expected - 1;
*cptr = c;                   /* Where the next call is to start */

return TRUE;
}
#endif  /* SUPPORT_UCP */
2212
2213
2214
2215 /*************************************************
2216 * Check if auto-possessifying is possible *
2217 *************************************************/
2218
2219 /* This function is called for unlimited repeats of certain items, to see
2220 whether the next thing could possibly match the repeated item. If not, it makes
2221 sense to automatically possessify the repeated item.
2222
2223 Arguments:
2224 op_code the repeated op code
2225 this data for this item, depends on the opcode
2226 utf8 TRUE in UTF-8 mode
2227 utf8_char used for utf8 character bytes, NULL if not relevant
2228 ptr next character in pattern
2229 options options bits
2230 cd contains pointers to tables etc.
2231
2232 Returns: TRUE if possessifying is wanted
2233 */
2234
2235 static BOOL
2236 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2237 const uschar *ptr, int options, compile_data *cd)
2238 {
2239 int next;
2240
2241 /* Skip whitespace and comments in extended mode */
2242
2243 if ((options & PCRE_EXTENDED) != 0)
2244 {
2245 for (;;)
2246 {
2247 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2248 if (*ptr == CHAR_NUMBER_SIGN)
2249 {
2250 while (*(++ptr) != 0)
2251 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2252 }
2253 else break;
2254 }
2255 }
2256
2257 /* If the next item is one that we can handle, get its value. A non-negative
2258 value is a character, a negative value is an escape value. */
2259
2260 if (*ptr == CHAR_BACKSLASH)
2261 {
2262 int temperrorcode = 0;
2263 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2264 if (temperrorcode != 0) return FALSE;
2265 ptr++; /* Point after the escape sequence */
2266 }
2267
2268 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2269 {
2270 #ifdef SUPPORT_UTF8
2271 if (utf8) { GETCHARINC(next, ptr); } else
2272 #endif
2273 next = *ptr++;
2274 }
2275
2276 else return FALSE;
2277
2278 /* Skip whitespace and comments in extended mode */
2279
2280 if ((options & PCRE_EXTENDED) != 0)
2281 {
2282 for (;;)
2283 {
2284 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2285 if (*ptr == CHAR_NUMBER_SIGN)
2286 {
2287 while (*(++ptr) != 0)
2288 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2289 }
2290 else break;
2291 }
2292 }
2293
2294 /* If the next thing is itself optional, we have to give up. */
2295
2296 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2297 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2298 return FALSE;
2299
2300 /* Now compare the next item with the previous opcode. If the previous is a
2301 positive single character match, "item" either contains the character or, if
2302 "item" is greater than 127 in utf8 mode, the character's bytes are in
2303 utf8_char. */
2304
2305
2306 /* Handle cases when the next item is a character. */
2307
2308 if (next >= 0) switch(op_code)
2309 {
2310 case OP_CHAR:
2311 #ifdef SUPPORT_UTF8
2312 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2313 #else
2314 (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2315 #endif
2316 return item != next;
2317
2318 /* For CHARNC (caseless character) we must check the other case. If we have
2319 Unicode property support, we can use it to test the other case of
2320 high-valued characters. */
2321
2322 case OP_CHARNC:
2323 #ifdef SUPPORT_UTF8
2324 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2325 #endif
2326 if (item == next) return FALSE;
2327 #ifdef SUPPORT_UTF8
2328 if (utf8)
2329 {
2330 unsigned int othercase;
2331 if (next < 128) othercase = cd->fcc[next]; else
2332 #ifdef SUPPORT_UCP
2333 othercase = UCD_OTHERCASE((unsigned int)next);
2334 #else
2335 othercase = NOTACHAR;
2336 #endif
2337 return (unsigned int)item != othercase;
2338 }
2339 else
2340 #endif /* SUPPORT_UTF8 */
2341 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2342
2343 /* For OP_NOT, "item" must be a single-byte character. */
2344
2345 case OP_NOT:
2346 if (item == next) return TRUE;
2347 if ((options & PCRE_CASELESS) == 0) return FALSE;
2348 #ifdef SUPPORT_UTF8
2349 if (utf8)
2350 {
2351 unsigned int othercase;
2352 if (next < 128) othercase = cd->fcc[next]; else
2353 #ifdef SUPPORT_UCP
2354 othercase = UCD_OTHERCASE(next);
2355 #else
2356 othercase = NOTACHAR;
2357 #endif
2358 return (unsigned int)item == othercase;
2359 }
2360 else
2361 #endif /* SUPPORT_UTF8 */
2362 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2363
2364 case OP_DIGIT:
2365 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2366
2367 case OP_NOT_DIGIT:
2368 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2369
2370 case OP_WHITESPACE:
2371 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2372
2373 case OP_NOT_WHITESPACE:
2374 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2375
2376 case OP_WORDCHAR:
2377 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2378
2379 case OP_NOT_WORDCHAR:
2380 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2381
2382 case OP_HSPACE:
2383 case OP_NOT_HSPACE:
2384 switch(next)
2385 {
2386 case 0x09:
2387 case 0x20:
2388 case 0xa0:
2389 case 0x1680:
2390 case 0x180e:
2391 case 0x2000:
2392 case 0x2001:
2393 case 0x2002:
2394 case 0x2003:
2395 case 0x2004:
2396 case 0x2005:
2397 case 0x2006:
2398 case 0x2007:
2399 case 0x2008:
2400 case 0x2009:
2401 case 0x200A:
2402 case 0x202f:
2403 case 0x205f:
2404 case 0x3000:
2405 return op_code != OP_HSPACE;
2406 default:
2407 return op_code == OP_HSPACE;
2408 }
2409
2410 case OP_VSPACE:
2411 case OP_NOT_VSPACE:
2412 switch(next)
2413 {
2414 case 0x0a:
2415 case 0x0b:
2416 case 0x0c:
2417 case 0x0d:
2418 case 0x85:
2419 case 0x2028:
2420 case 0x2029:
2421 return op_code != OP_VSPACE;
2422 default:
2423 return op_code == OP_VSPACE;
2424 }
2425
2426 default:
2427 return FALSE;
2428 }
2429
2430
2431 /* Handle the case when the next item is \d, \s, etc. */
2432
2433 switch(op_code)
2434 {
2435 case OP_CHAR:
2436 case OP_CHARNC:
2437 #ifdef SUPPORT_UTF8
2438 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2439 #endif
2440 switch(-next)
2441 {
2442 case ESC_d:
2443 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2444
2445 case ESC_D:
2446 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2447
2448 case ESC_s:
2449 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2450
2451 case ESC_S:
2452 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2453
2454 case ESC_w:
2455 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2456
2457 case ESC_W:
2458 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2459
2460 case ESC_h:
2461 case ESC_H:
2462 switch(item)
2463 {
2464 case 0x09:
2465 case 0x20:
2466 case 0xa0:
2467 case 0x1680:
2468 case 0x180e:
2469 case 0x2000:
2470 case 0x2001:
2471 case 0x2002:
2472 case 0x2003:
2473 case 0x2004:
2474 case 0x2005:
2475 case 0x2006:
2476 case 0x2007:
2477 case 0x2008:
2478 case 0x2009:
2479 case 0x200A:
2480 case 0x202f:
2481 case 0x205f:
2482 case 0x3000:
2483 return -next != ESC_h;
2484 default:
2485 return -next == ESC_h;
2486 }
2487
2488 case ESC_v:
2489 case ESC_V:
2490 switch(item)
2491 {
2492 case 0x0a:
2493 case 0x0b:
2494 case 0x0c:
2495 case 0x0d:
2496 case 0x85:
2497 case 0x2028:
2498 case 0x2029:
2499 return -next != ESC_v;
2500 default:
2501 return -next == ESC_v;
2502 }
2503
2504 default:
2505 return FALSE;
2506 }
2507
2508 case OP_DIGIT:
2509 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2510 next == -ESC_h || next == -ESC_v;
2511
2512 case OP_NOT_DIGIT:
2513 return next == -ESC_d;
2514
2515 case OP_WHITESPACE:
2516 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2517
2518 case OP_NOT_WHITESPACE:
2519 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2520
2521 case OP_HSPACE:
2522 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2523
2524 case OP_NOT_HSPACE:
2525 return next == -ESC_h;
2526
2527 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2528 case OP_VSPACE:
2529 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2530
2531 case OP_NOT_VSPACE:
2532 return next == -ESC_v;
2533
2534 case OP_WORDCHAR:
2535 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2536
2537 case OP_NOT_WORDCHAR:
2538 return next == -ESC_w || next == -ESC_d;
2539
2540 default:
2541 return FALSE;
2542 }
2543
2544 /* Control does not reach here */
2545 }
2546
2547
2548
2549 /*************************************************
2550 * Compile one branch *
2551 *************************************************/
2552
2553 /* Scan the pattern, compiling it into a vector. If the options are
2554 changed during the branch, the pointer is used to change the external options
2555 bits. This function is used during the pre-compile phase when we are trying
2556 to find out the amount of memory needed, as well as during the real compile
2557 phase. The value of lengthptr distinguishes the two phases.
2558
2559 Arguments:
2560 optionsptr pointer to the option bits
2561 codeptr points to the pointer to the current code point
2562 ptrptr points to the current pattern pointer
2563 errorcodeptr points to error code variable
2564 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2565 reqbyteptr set to the last literal character required, else < 0
2566 bcptr points to current branch chain
2567 cd contains pointers to tables etc.
2568 lengthptr NULL during the real compile phase
2569 points to length accumulator during pre-compile phase
2570
2571 Returns: TRUE on success
2572 FALSE, with *errorcodeptr set non-zero on error
2573 */
2574
2575 static BOOL
2576 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2577 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2578 compile_data *cd, int *lengthptr)
2579 {
2580 int repeat_type, op_type;
2581 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2582 int bravalue = 0;
2583 int greedy_default, greedy_non_default;
2584 int firstbyte, reqbyte;
2585 int zeroreqbyte, zerofirstbyte;
2586 int req_caseopt, reqvary, tempreqvary;
2587 int options = *optionsptr;
2588 int after_manual_callout = 0;
2589 int length_prevgroup = 0;
2590 register int c;
2591 register uschar *code = *codeptr;
2592 uschar *last_code = code;
2593 uschar *orig_code = code;
2594 uschar *tempcode;
2595 BOOL inescq = FALSE;
2596 BOOL groupsetfirstbyte = FALSE;
2597 const uschar *ptr = *ptrptr;
2598 const uschar *tempptr;
2599 uschar *previous = NULL;
2600 uschar *previous_callout = NULL;
2601 uschar *save_hwm = NULL;
2602 uschar classbits[32];
2603
2604 #ifdef SUPPORT_UTF8
2605 BOOL class_utf8;
2606 BOOL utf8 = (options & PCRE_UTF8) != 0;
2607 uschar *class_utf8data;
2608 uschar *class_utf8data_base;
2609 uschar utf8_char[6];
2610 #else
2611 BOOL utf8 = FALSE;
2612 uschar *utf8_char = NULL;
2613 #endif
2614
2615 #ifdef DEBUG
2616 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2617 #endif
2618
2619 /* Set up the default and non-default settings for greediness */
2620
2621 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2622 greedy_non_default = greedy_default ^ 1;
2623
2624 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2625 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2626 matches a non-fixed char first char; reqbyte just remains unset if we never
2627 find one.
2628
2629 When we hit a repeat whose minimum is zero, we may have to adjust these values
2630 to take the zero repeat into account. This is implemented by setting them to
2631 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2632 item types that can be repeated set these backoff variables appropriately. */
2633
2634 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2635
2636 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2637 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2638 value > 255. It is added into the firstbyte or reqbyte variables to record the
2639 case status of the value. This is used only for ASCII characters. */
2640
2641 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2642
2643 /* Switch on next character until the end of the branch */
2644
2645 for (;; ptr++)
2646 {
2647 BOOL negate_class;
2648 BOOL should_flip_negation;
2649 BOOL possessive_quantifier;
2650 BOOL is_quantifier;
2651 BOOL is_recurse;
2652 BOOL reset_bracount;
2653 int class_charcount;
2654 int class_lastchar;
2655 int newoptions;
2656 int recno;
2657 int refsign;
2658 int skipbytes;
2659 int subreqbyte;
2660 int subfirstbyte;
2661 int terminator;
2662 int mclength;
2663 uschar mcbuffer[8];
2664
2665 /* Get next byte in the pattern */
2666
2667 c = *ptr;
2668
2669 /* If we are in the pre-compile phase, accumulate the length used for the
2670 previous cycle of this loop. */
2671
2672 if (lengthptr != NULL)
2673 {
2674 #ifdef DEBUG
2675 if (code > cd->hwm) cd->hwm = code; /* High water info */
2676 #endif
2677 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2678 {
2679 *errorcodeptr = ERR52;
2680 goto FAILED;
2681 }
2682
2683 /* There is at least one situation where code goes backwards: this is the
2684 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2685 the class is simply eliminated. However, it is created first, so we have to
2686 allow memory for it. Therefore, don't ever reduce the length at this point.
2687 */
2688
2689 if (code < last_code) code = last_code;
2690
2691 /* Paranoid check for integer overflow */
2692
2693 if (OFLOW_MAX - *lengthptr < code - last_code)
2694 {
2695 *errorcodeptr = ERR20;
2696 goto FAILED;
2697 }
2698
2699 *lengthptr += code - last_code;
2700 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2701
2702 /* If "previous" is set and it is not at the start of the work space, move
2703 it back to there, in order to avoid filling up the work space. Otherwise,
2704 if "previous" is NULL, reset the current code pointer to the start. */
2705
2706 if (previous != NULL)
2707 {
2708 if (previous > orig_code)
2709 {
2710 memmove(orig_code, previous, code - previous);
2711 code -= previous - orig_code;
2712 previous = orig_code;
2713 }
2714 }
2715 else code = orig_code;
2716
2717 /* Remember where this code item starts so we can pick up the length
2718 next time round. */
2719
2720 last_code = code;
2721 }
2722
2723 /* In the real compile phase, just check the workspace used by the forward
2724 reference list. */
2725
2726 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2727 {
2728 *errorcodeptr = ERR52;
2729 goto FAILED;
2730 }
2731
2732 /* If in \Q...\E, check for the end; if not, we have a literal */
2733
2734 if (inescq && c != 0)
2735 {
2736 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2737 {
2738 inescq = FALSE;
2739 ptr++;
2740 continue;
2741 }
2742 else
2743 {
2744 if (previous_callout != NULL)
2745 {
2746 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2747 complete_callout(previous_callout, ptr, cd);
2748 previous_callout = NULL;
2749 }
2750 if ((options & PCRE_AUTO_CALLOUT) != 0)
2751 {
2752 previous_callout = code;
2753 code = auto_callout(code, ptr, cd);
2754 }
2755 goto NORMAL_CHAR;
2756 }
2757 }
2758
2759 /* Fill in length of a previous callout, except when the next thing is
2760 a quantifier. */
2761
2762 is_quantifier =
2763 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2764 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2765
2766 if (!is_quantifier && previous_callout != NULL &&
2767 after_manual_callout-- <= 0)
2768 {
2769 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2770 complete_callout(previous_callout, ptr, cd);
2771 previous_callout = NULL;
2772 }
2773
2774 /* In extended mode, skip white space and comments */
2775
2776 if ((options & PCRE_EXTENDED) != 0)
2777 {
2778 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2779 if (c == CHAR_NUMBER_SIGN)
2780 {
2781 while (*(++ptr) != 0)
2782 {
2783 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2784 }
2785 if (*ptr != 0) continue;
2786
2787 /* Else fall through to handle end of string */
2788 c = 0;
2789 }
2790 }
2791
2792 /* No auto callout for quantifiers. */
2793
2794 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2795 {
2796 previous_callout = code;
2797 code = auto_callout(code, ptr, cd);
2798 }
2799
2800 switch(c)
2801 {
2802 /* ===================================================================*/
2803 case 0: /* The branch terminates at string end */
2804 case CHAR_VERTICAL_LINE: /* or | or ) */
2805 case CHAR_RIGHT_PARENTHESIS:
2806 *firstbyteptr = firstbyte;
2807 *reqbyteptr = reqbyte;
2808 *codeptr = code;
2809 *ptrptr = ptr;
2810 if (lengthptr != NULL)
2811 {
2812 if (OFLOW_MAX - *lengthptr < code - last_code)
2813 {
2814 *errorcodeptr = ERR20;
2815 goto FAILED;
2816 }
2817 *lengthptr += code - last_code; /* To include callout length */
2818 DPRINTF((">> end branch\n"));
2819 }
2820 return TRUE;
2821
2822
2823 /* ===================================================================*/
2824 /* Handle single-character metacharacters. In multiline mode, ^ disables
2825 the setting of any following char as a first character. */
2826
2827 case CHAR_CIRCUMFLEX_ACCENT:
2828 if ((options & PCRE_MULTILINE) != 0)
2829 {
2830 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2831 }
2832 previous = NULL;
2833 *code++ = OP_CIRC;
2834 break;
2835
2836 case CHAR_DOLLAR_SIGN:
2837 previous = NULL;
2838 *code++ = OP_DOLL;
2839 break;
2840
2841 /* There can never be a first char if '.' is first, whatever happens about
2842 repeats. The value of reqbyte doesn't change either. */
2843
2844 case CHAR_DOT:
2845 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2846 zerofirstbyte = firstbyte;
2847 zeroreqbyte = reqbyte;
2848 previous = code;
2849 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2850 break;
2851
2852
2853 /* ===================================================================*/
2854 /* Character classes. If the included characters are all < 256, we build a
2855 32-byte bitmap of the permitted characters, except in the special case
2856 where there is only one such character. For negated classes, we build the
2857 map as usual, then invert it at the end. However, we use a different opcode
2858 so that data characters > 255 can be handled correctly.
2859
2860 If the class contains characters outside the 0-255 range, a different
2861 opcode is compiled. It may optionally have a bit map for characters < 256,
2862 but those above are explicitly listed afterwards. A flag byte tells
2863 whether the bitmap is present, and whether this is a negated class or not.
2864
2865 In JavaScript compatibility mode, an isolated ']' causes an error. In
2866 default (Perl) mode, it is treated as a data character. */
2867
2868 case CHAR_RIGHT_SQUARE_BRACKET:
2869 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2870 {
2871 *errorcodeptr = ERR64;
2872 goto FAILED;
2873 }
2874 goto NORMAL_CHAR;
2875
2876 case CHAR_LEFT_SQUARE_BRACKET:
2877 previous = code;
2878
2879 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2880 they are encountered at the top level, so we'll do that too. */
2881
2882 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2883 ptr[1] == CHAR_EQUALS_SIGN) &&
2884 check_posix_syntax(ptr, &tempptr))
2885 {
2886 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2887 goto FAILED;
2888 }
2889
2890 /* If the first character is '^', set the negation flag and skip it. Also,
2891 if the first few characters (either before or after ^) are \Q\E or \E we
2892 skip them too. This makes for compatibility with Perl. */
2893
2894 negate_class = FALSE;
2895 for (;;)
2896 {
2897 c = *(++ptr);
2898 if (c == CHAR_BACKSLASH)
2899 {
2900 if (ptr[1] == CHAR_E)
2901 ptr++;
2902 else if (strncmp((const char *)ptr+1,
2903 STR_Q STR_BACKSLASH STR_E, 3) == 0)
2904 ptr += 3;
2905 else
2906 break;
2907 }
2908 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2909 negate_class = TRUE;
2910 else break;
2911 }
2912
2913 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2914 an initial ']' is taken as a data character -- the code below handles
2915 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2916 [^] must match any character, so generate OP_ALLANY. */
2917
2918 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2919 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2920 {
2921 *code++ = negate_class? OP_ALLANY : OP_FAIL;
2922 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2923 zerofirstbyte = firstbyte;
2924 break;
2925 }
2926
2927 /* If a class contains a negative special such as \S, we need to flip the
2928 negation flag at the end, so that support for characters > 255 works
2929 correctly (they are all included in the class). */
2930
2931 should_flip_negation = FALSE;
2932
2933 /* Keep a count of chars with values < 256 so that we can optimize the case
2934 of just a single character (as long as it's < 256). However, For higher
2935 valued UTF-8 characters, we don't yet do any optimization. */
2936
2937 class_charcount = 0;
2938 class_lastchar = -1;
2939
2940 /* Initialize the 32-char bit map to all zeros. We build the map in a
2941 temporary bit of memory, in case the class contains only 1 character (less
2942 than 256), because in that case the compiled code doesn't use the bit map.
2943 */
2944
2945 memset(classbits, 0, 32 * sizeof(uschar));
2946
2947 #ifdef SUPPORT_UTF8
2948 class_utf8 = FALSE; /* No chars >= 256 */
2949 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2950 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2951 #endif
2952
2953 /* Process characters until ] is reached. By writing this as a "do" it
2954 means that an initial ] is taken as a data character. At the start of the
2955 loop, c contains the first byte of the character. */
2956
2957 if (c != 0) do
2958 {
2959 const uschar *oldptr;
2960
2961 #ifdef SUPPORT_UTF8
2962 if (utf8 && c > 127)
2963 { /* Braces are required because the */
2964 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2965 }
2966
2967 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2968 data and reset the pointer. This is so that very large classes that
2969 contain a zillion UTF-8 characters no longer overwrite the work space
2970 (which is on the stack). */
2971
2972 if (lengthptr != NULL)
2973 {
2974 *lengthptr += class_utf8data - class_utf8data_base;
2975 class_utf8data = class_utf8data_base;
2976 }
2977
2978 #endif
2979
2980 /* Inside \Q...\E everything is literal except \E */
2981
2982 if (inescq)
2983 {
2984 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
2985 {
2986 inescq = FALSE; /* Reset literal state */
2987 ptr++; /* Skip the 'E' */
2988 continue; /* Carry on with next */
2989 }
2990 goto CHECK_RANGE; /* Could be range if \E follows */
2991 }
2992
2993 /* Handle POSIX class names. Perl allows a negation extension of the
2994 form [:^name:]. A square bracket that doesn't match the syntax is
2995 treated as a literal. We also recognize the POSIX constructions
2996 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2997 5.6 and 5.8 do. */
2998
2999 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3000 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3001 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3002 {
3003 BOOL local_negate = FALSE;
3004 int posix_class, taboffset, tabopt;
3005 register const uschar *cbits = cd->cbits;
3006 uschar pbits[32];
3007
3008 if (ptr[1] != CHAR_COLON)
3009 {
3010 *errorcodeptr = ERR31;
3011 goto FAILED;
3012 }
3013
3014 ptr += 2;
3015 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3016 {
3017 local_negate = TRUE;
3018 should_flip_negation = TRUE; /* Note negative special */
3019 ptr++;
3020 }
3021
3022 posix_class = check_posix_name(ptr, tempptr - ptr);
3023 if (posix_class < 0)
3024 {
3025 *errorcodeptr = ERR30;
3026 goto FAILED;
3027 }
3028
3029 /* If matching is caseless, upper and lower are converted to
3030 alpha. This relies on the fact that the class table starts with
3031 alpha, lower, upper as the first 3 entries. */
3032
3033 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3034 posix_class = 0;
3035
3036 /* We build the bit map for the POSIX class in a chunk of local store
3037 because we may be adding and subtracting from it, and we don't want to
3038 subtract bits that may be in the main map already. At the end we or the
3039 result into the bit map that is being built. */
3040
3041 posix_class *= 3;
3042
3043 /* Copy in the first table (always present) */
3044
3045 memcpy(pbits, cbits + posix_class_maps[posix_class],
3046 32 * sizeof(uschar));
3047
3048 /* If there is a second table, add or remove it as required. */
3049
3050 taboffset = posix_class_maps[posix_class + 1];
3051 tabopt = posix_class_maps[posix_class + 2];
3052
3053 if (taboffset >= 0)
3054 {
3055 if (tabopt >= 0)
3056 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3057 else
3058 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3059 }
3060
3061 /* Now see if we need to remove any special characters. An option
3062 value of 1 removes vertical space and 2 removes underscore. */
3063
3064 if (tabopt < 0) tabopt = -tabopt;
3065 if (tabopt == 1) pbits[1] &= ~0x3c;
3066 else if (tabopt == 2) pbits[11] &= 0x7f;
3067
3068 /* Add the POSIX table or its complement into the main table that is
3069 being built and we are done. */
3070
3071 if (local_negate)
3072 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3073 else
3074 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3075
3076 ptr = tempptr + 1;
3077 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3078 continue; /* End of POSIX syntax handling */
3079 }
3080
3081 /* Backslash may introduce a single character, or it may introduce one
3082 of the specials, which just set a flag. The sequence \b is a special
3083 case. Inside a class (and only there) it is treated as backspace.
3084 Elsewhere it marks a word boundary. Other escapes have preset maps ready
3085 to 'or' into the one we are building. We assume they have more than one
3086 character in them, so set class_charcount bigger than one. */
3087
3088 if (c == CHAR_BACKSLASH)
3089 {
3090 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3091 if (*errorcodeptr != 0) goto FAILED;
3092
3093 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3094 else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3095 else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3096 else if (-c == ESC_Q) /* Handle start of quoted string */
3097 {
3098 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3099 {
3100 ptr += 2; /* avoid empty string */
3101 }
3102 else inescq = TRUE;
3103 continue;
3104 }
3105 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3106
3107 if (c < 0)
3108 {
3109 register const uschar *cbits = cd->cbits;
3110 class_charcount += 2; /* Greater than 1 is what matters */
3111
3112 /* Save time by not doing this in the pre-compile phase. */
3113
3114 if (lengthptr == NULL) switch (-c)
3115 {
3116 case ESC_d:
3117 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3118 continue;
3119
3120 case ESC_D:
3121 should_flip_negation = TRUE;
3122 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3123 continue;
3124
3125 case ESC_w:
3126 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3127 continue;
3128
3129 case ESC_W:
3130 should_flip_negation = TRUE;
3131 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3132 continue;
3133
3134 case ESC_s:
3135 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3136 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3137 continue;
3138
3139 case ESC_S:
3140 should_flip_negation = TRUE;
3141 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3142 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3143 continue;
3144
3145 default: /* Not recognized; fall through */
3146 break; /* Need "default" setting to stop compiler warning. */
3147 }
3148
3149 /* In the pre-compile phase, just do the recognition. */
3150
3151 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3152 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3153
3154 /* We need to deal with \H, \h, \V, and \v in both phases because
3155 they use extra memory. */
3156
3157 if (-c == ESC_h)
3158 {
3159 SETBIT(classbits, 0x09); /* HT */
3160 SETBIT(classbits, 0x20); /* SPACE */
3161 SETBIT(classbits, 0xa0); /* NBSP */
3162 #ifdef SUPPORT_UTF8
3163 if (utf8)
3164 {
3165 class_utf8 = TRUE;
3166 *class_utf8data++ = XCL_SINGLE;
3167 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3168 *class_utf8data++ = XCL_SINGLE;
3169 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3170 *class_utf8data++ = XCL_RANGE;
3171 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3172 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3173 *class_utf8data++ = XCL_SINGLE;
3174 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3175 *class_utf8data++ = XCL_SINGLE;
3176 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3177 *class_utf8data++ = XCL_SINGLE;
3178 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3179 }
3180 #endif
3181 continue;
3182 }
3183
3184 if (-c == ESC_H)
3185 {
3186 for (c = 0; c < 32; c++)
3187 {
3188 int x = 0xff;
3189 switch (c)
3190 {
3191 case 0x09/8: x ^= 1 << (0x09%8); break;
3192 case 0x20/8: x ^= 1 << (0x20%8); break;
3193 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3194 default: break;
3195 }
3196 classbits[c] |= x;
3197 }
3198
3199 #ifdef SUPPORT_UTF8
3200 if (utf8)
3201 {
3202 class_utf8 = TRUE;
3203 *class_utf8data++ = XCL_RANGE;
3204 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3205 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3206 *class_utf8data++ = XCL_RANGE;
3207 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3208 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3209 *class_utf8data++ = XCL_RANGE;
3210 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3211 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3212 *class_utf8data++ = XCL_RANGE;
3213 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3214 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3215 *class_utf8data++ = XCL_RANGE;
3216 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3217 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3218 *class_utf8data++ = XCL_RANGE;
3219 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3220 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3221 *class_utf8data++ = XCL_RANGE;
3222 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3223 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3224 }
3225 #endif
3226 continue;
3227 }
3228
3229 if (-c == ESC_v)
3230 {
3231 SETBIT(classbits, 0x0a); /* LF */
3232 SETBIT(classbits, 0x0b); /* VT */
3233 SETBIT(classbits, 0x0c); /* FF */
3234 SETBIT(classbits, 0x0d); /* CR */
3235 SETBIT(classbits, 0x85); /* NEL */
3236 #ifdef SUPPORT_UTF8
3237 if (utf8)
3238 {
3239 class_utf8 = TRUE;
3240 *class_utf8data++ = XCL_RANGE;
3241 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3242 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3243 }
3244 #endif
3245 continue;
3246 }
3247
3248 if (-c == ESC_V)
3249 {
3250 for (c = 0; c < 32; c++)
3251 {
3252 int x = 0xff;
3253 switch (c)
3254 {
3255 case 0x0a/8: x ^= 1 << (0x0a%8);
3256 x ^= 1 << (0x0b%8);
3257 x ^= 1 << (0x0c%8);
3258 x ^= 1 << (0x0d%8);
3259 break;
3260 case 0x85/8: x ^= 1 << (0x85%8); break;
3261 default: break;
3262 }
3263 classbits[c] |= x;
3264 }
3265
3266 #ifdef SUPPORT_UTF8
3267 if (utf8)
3268 {
3269 class_utf8 = TRUE;
3270 *class_utf8data++ = XCL_RANGE;
3271 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3272 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3273 *class_utf8data++ = XCL_RANGE;
3274 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3275 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3276 }
3277 #endif
3278 continue;
3279 }
3280
3281 /* We need to deal with \P and \p in both phases. */
3282
3283 #ifdef SUPPORT_UCP
3284 if (-c == ESC_p || -c == ESC_P)
3285 {
3286 BOOL negated;
3287 int pdata;
3288 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3289 if (ptype < 0) goto FAILED;
3290 class_utf8 = TRUE;
3291 *class_utf8data++ = ((-c == ESC_p) != negated)?
3292 XCL_PROP : XCL_NOTPROP;
3293 *class_utf8data++ = ptype;
3294 *class_utf8data++ = pdata;
3295 class_charcount -= 2; /* Not a < 256 character */
3296 continue;
3297 }
3298 #endif
3299 /* Unrecognized escapes are faulted if PCRE is running in its
3300 strict mode. By default, for compatibility with Perl, they are
3301 treated as literals. */
3302
3303 if ((options & PCRE_EXTRA) != 0)
3304 {
3305 *errorcodeptr = ERR7;
3306 goto FAILED;
3307 }
3308
3309 class_charcount -= 2; /* Undo the default count from above */
3310 c = *ptr; /* Get the final character and fall through */
3311 }
3312
3313 /* Fall through if we have a single character (c >= 0). This may be
3314 greater than 256 in UTF-8 mode. */
3315
3316 } /* End of backslash handling */
3317
3318 /* A single character may be followed by '-' to form a range. However,
3319 Perl does not permit ']' to be the end of the range. A '-' character
3320 at the end is treated as a literal. Perl ignores orphaned \E sequences
3321 entirely. The code for handling \Q and \E is messy. */
3322
3323 CHECK_RANGE:
3324 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3325 {
3326 inescq = FALSE;
3327 ptr += 2;
3328 }
3329
3330 oldptr = ptr;
3331
3332 /* Remember \r or \n */
3333
3334 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3335
3336 /* Check for range */
3337
3338 if (!inescq && ptr[1] == CHAR_MINUS)
3339 {
3340 int d;
3341 ptr += 2;
3342 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3343
3344 /* If we hit \Q (not followed by \E) at this point, go into escaped
3345 mode. */
3346
3347 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3348 {
3349 ptr += 2;
3350 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3351 { ptr += 2; continue; }
3352 inescq = TRUE;
3353 break;
3354 }
3355
3356 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3357 {
3358 ptr = oldptr;
3359 goto LONE_SINGLE_CHARACTER;
3360 }
3361
3362 #ifdef SUPPORT_UTF8
3363 if (utf8)
3364 { /* Braces are required because the */
3365 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3366 }
3367 else
3368 #endif
3369 d = *ptr; /* Not UTF-8 mode */
3370
3371 /* The second part of a range can be a single-character escape, but
3372 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3373 in such circumstances. */
3374
3375 if (!inescq && d == CHAR_BACKSLASH)
3376 {
3377 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3378 if (*errorcodeptr != 0) goto FAILED;
3379
3380 /* \b is backspace; \X is literal X; \R is literal R; any other
3381 special means the '-' was literal */
3382
3383 if (d < 0)
3384 {
3385 if (d == -ESC_b) d = CHAR_BS;
3386 else if (d == -ESC_X) d = CHAR_X;
3387 else if (d == -ESC_R) d = CHAR_R; else
3388 {
3389 ptr = oldptr;
3390 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3391 }
3392 }
3393 }
3394
3395 /* Check that the two values are in the correct order. Optimize
3396 one-character ranges */
3397
3398 if (d < c)
3399 {
3400 *errorcodeptr = ERR8;
3401 goto FAILED;
3402 }
3403
3404 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3405
3406 /* Remember \r or \n */
3407
3408 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3409
3410 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3411 matching, we have to use an XCLASS with extra data items. Caseless
3412 matching for characters > 127 is available only if UCP support is
3413 available. */
3414
3415 #ifdef SUPPORT_UTF8
3416 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3417 {
3418 class_utf8 = TRUE;
3419
3420 /* With UCP support, we can find the other case equivalents of
3421 the relevant characters. There may be several ranges. Optimize how
3422 they fit with the basic range. */
3423
3424 #ifdef SUPPORT_UCP
3425 if ((options & PCRE_CASELESS) != 0)
3426 {
3427 unsigned int occ, ocd;
3428 unsigned int cc = c;
3429 unsigned int origd = d;
3430 while (get_othercase_range(&cc, origd, &occ, &ocd))
3431 {
3432 if (occ >= (unsigned int)c &&
3433 ocd <= (unsigned int)d)
3434 continue; /* Skip embedded ranges */
3435
3436 if (occ < (unsigned int)c &&
3437 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3438 { /* if there is overlap, */
3439 c = occ; /* noting that if occ < c */
3440 continue; /* we can't have ocd > d */
3441 } /* because a subrange is */
3442 if (ocd > (unsigned int)d &&
3443 occ <= (unsigned int)d + 1) /* always shorter than */
3444 { /* the basic range. */
3445 d = ocd;
3446 continue;
3447 }
3448
3449 if (occ == ocd)
3450 {
3451 *class_utf8data++ = XCL_SINGLE;
3452 }
3453 else
3454 {
3455 *class_utf8data++ = XCL_RANGE;
3456 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3457 }
3458 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3459 }
3460 }
3461 #endif /* SUPPORT_UCP */
3462
3463 /* Now record the original range, possibly modified for UCP caseless
3464 overlapping ranges. */
3465
3466 *class_utf8data++ = XCL_RANGE;
3467 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3468 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3469
3470 /* With UCP support, we are done. Without UCP support, there is no
3471 caseless matching for UTF-8 characters > 127; we can use the bit map
3472 for the smaller ones. */
3473
3474 #ifdef SUPPORT_UCP
3475 continue; /* With next character in the class */
3476 #else
3477 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3478
3479 /* Adjust upper limit and fall through to set up the map */
3480
3481 d = 127;
3482
3483 #endif /* SUPPORT_UCP */
3484 }
3485 #endif /* SUPPORT_UTF8 */
3486
3487 /* We use the bit map for all cases when not in UTF-8 mode; else
3488 ranges that lie entirely within 0-127 when there is UCP support; else
3489 for partial ranges without UCP support. */
3490
3491 class_charcount += d - c + 1;
3492 class_lastchar = d;
3493
3494 /* We can save a bit of time by skipping this in the pre-compile. */
3495
3496 if (lengthptr == NULL) for (; c <= d; c++)
3497 {
3498 classbits[c/8] |= (1 << (c&7));
3499 if ((options & PCRE_CASELESS) != 0)
3500 {
3501 int uc = cd->fcc[c]; /* flip case */
3502 classbits[uc/8] |= (1 << (uc&7));
3503 }
3504 }
3505
3506 continue; /* Go get the next char in the class */
3507 }
3508
3509 /* Handle a lone single character - we can get here for a normal
3510 non-escape char, or after \ that introduces a single character or for an
3511 apparent range that isn't. */
3512
3513 LONE_SINGLE_CHARACTER:
3514
3515 /* Handle a character that cannot go in the bit map */
3516
3517 #ifdef SUPPORT_UTF8
3518 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3519 {
3520 class_utf8 = TRUE;
3521 *class_utf8data++ = XCL_SINGLE;
3522 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3523
3524 #ifdef SUPPORT_UCP
3525 if ((options & PCRE_CASELESS) != 0)
3526 {
3527 unsigned int othercase;
3528 if ((othercase = UCD_OTHERCASE(c)) != c)
3529 {
3530 *class_utf8data++ = XCL_SINGLE;
3531 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3532 }
3533 }
3534 #endif /* SUPPORT_UCP */
3535
3536 }
3537 else
3538 #endif /* SUPPORT_UTF8 */
3539
3540 /* Handle a single-byte character */
3541 {
3542 classbits[c/8] |= (1 << (c&7));
3543 if ((options & PCRE_CASELESS) != 0)
3544 {
3545 c = cd->fcc[c]; /* flip case */
3546 classbits[c/8] |= (1 << (c&7));
3547 }
3548 class_charcount++;
3549 class_lastchar = c;
3550 }
3551 }
3552
3553 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3554
3555 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3556
3557 if (c == 0) /* Missing terminating ']' */
3558 {
3559 *errorcodeptr = ERR6;
3560 goto FAILED;
3561 }
3562
3563
3564 /* This code has been disabled because it would mean that \s counts as
3565 an explicit \r or \n reference, and that's not really what is wanted. Now
3566 we set the flag only if there is a literal "\r" or "\n" in the class. */
3567
3568 #if 0
3569 /* Remember whether \r or \n are in this class */
3570
3571 if (negate_class)
3572 {
3573 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3574 }
3575 else
3576 {
3577 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3578 }
3579 #endif
3580
3581
3582 /* If class_charcount is 1, we saw precisely one character whose value is
3583 less than 256. As long as there were no characters >= 128 and there was no
3584 use of \p or \P, in other words, no use of any XCLASS features, we can
3585 optimize.
3586
3587 In UTF-8 mode, we can optimize the negative case only if there were no
3588 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3589 operate on single-bytes only. This is an historical hangover. Maybe one day
3590 we can tidy these opcodes to handle multi-byte characters.
3591
3592 The optimization throws away the bit map. We turn the item into a
3593 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3594 that OP_NOT does not support multibyte characters. In the positive case, it
3595 can cause firstbyte to be set. Otherwise, there can be no first char if
3596 this item is first, whatever repeat count may follow. In the case of
3597 reqbyte, save the previous value for reinstating. */
3598
3599 #ifdef SUPPORT_UTF8
3600 if (class_charcount == 1 && !class_utf8 &&
3601 (!utf8 || !negate_class || class_lastchar < 128))
3602 #else
3603 if (class_charcount == 1)
3604 #endif
3605 {
3606 zeroreqbyte = reqbyte;
3607
3608 /* The OP_NOT opcode works on one-byte characters only. */
3609
3610 if (negate_class)
3611 {
3612 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3613 zerofirstbyte = firstbyte;
3614 *code++ = OP_NOT;
3615 *code++ = class_lastchar;
3616 break;
3617 }
3618
3619 /* For a single, positive character, get the value into mcbuffer, and
3620 then we can handle this with the normal one-character code. */
3621
3622 #ifdef SUPPORT_UTF8
3623 if (utf8 && class_lastchar > 127)
3624 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3625 else
3626 #endif
3627 {
3628 mcbuffer[0] = class_lastchar;
3629 mclength = 1;
3630 }
3631 goto ONE_CHAR;
3632 } /* End of 1-char optimization */
3633
3634 /* The general case - not the one-char optimization. If this is the first
3635 thing in the branch, there can be no first char setting, whatever the
3636 repeat count. Any reqbyte setting must remain unchanged after any kind of
3637 repeat. */
3638
3639 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3640 zerofirstbyte = firstbyte;
3641 zeroreqbyte = reqbyte;
3642
3643 /* If there are characters with values > 255, we have to compile an
3644 extended class, with its own opcode, unless there was a negated special
3645 such as \S in the class, because in that case all characters > 255 are in
3646 the class, so any that were explicitly given as well can be ignored. If
3647 (when there are explicit characters > 255 that must be listed) there are no
3648 characters < 256, we can omit the bitmap in the actual compiled code. */
3649
3650 #ifdef SUPPORT_UTF8
3651 if (class_utf8 && !should_flip_negation)
3652 {
3653 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3654 *code++ = OP_XCLASS;
3655 code += LINK_SIZE;
3656 *code = negate_class? XCL_NOT : 0;
3657
3658 /* If the map is required, move up the extra data to make room for it;
3659 otherwise just move the code pointer to the end of the extra data. */
3660
3661 if (class_charcount > 0)
3662 {
3663 *code++ |= XCL_MAP;
3664 memmove(code + 32, code, class_utf8data - code);
3665 memcpy(code, classbits, 32);
3666 code = class_utf8data + 32;
3667 }
3668 else code = class_utf8data;
3669
3670 /* Now fill in the complete length of the item */
3671
3672 PUT(previous, 1, code - previous);
3673 break; /* End of class handling */
3674 }
3675 #endif
3676
3677 /* If there are no characters > 255, set the opcode to OP_CLASS or
3678 OP_NCLASS, depending on whether the whole class was negated and whether
3679 there were negative specials such as \S in the class. Then copy the 32-byte
3680 map into the code vector, negating it if necessary. */
3681
3682 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3683 if (negate_class)
3684 {
3685 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3686 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3687 }
3688 else
3689 {
3690 memcpy(code, classbits, 32);
3691 }
3692 code += 32;
3693 break;
3694
3695
3696 /* ===================================================================*/
3697 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3698 has been tested above. */
3699
3700 case CHAR_LEFT_CURLY_BRACKET:
3701 if (!is_quantifier) goto NORMAL_CHAR;
3702 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3703 if (*errorcodeptr != 0) goto FAILED;
3704 goto REPEAT;
3705
3706 case CHAR_ASTERISK:
3707 repeat_min = 0;
3708 repeat_max = -1;
3709 goto REPEAT;
3710
3711 case CHAR_PLUS:
3712 repeat_min = 1;
3713 repeat_max = -1;
3714 goto REPEAT;
3715
3716 case CHAR_QUESTION_MARK:
3717 repeat_min = 0;
3718 repeat_max = 1;
3719
3720 REPEAT:
3721 if (previous == NULL)
3722 {
3723 *errorcodeptr = ERR9;
3724 goto FAILED;
3725 }
3726
3727 if (repeat_min == 0)
3728 {
3729 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3730 reqbyte = zeroreqbyte; /* Ditto */
3731 }
3732
3733 /* Remember whether this is a variable length repeat */
3734
3735 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3736
3737 op_type = 0; /* Default single-char op codes */
3738 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3739
3740 /* Save start of previous item, in case we have to move it up to make space
3741 for an inserted OP_ONCE for the additional '+' extension. */
3742
3743 tempcode = previous;
3744
3745 /* If the next character is '+', we have a possessive quantifier. This
3746 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3747 If the next character is '?' this is a minimizing repeat, by default,
3748 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3749 repeat type to the non-default. */
3750
3751 if (ptr[1] == CHAR_PLUS)
3752 {
3753 repeat_type = 0; /* Force greedy */
3754 possessive_quantifier = TRUE;
3755 ptr++;
3756 }
3757 else if (ptr[1] == CHAR_QUESTION_MARK)
3758 {
3759 repeat_type = greedy_non_default;
3760 ptr++;
3761 }
3762 else repeat_type = greedy_default;
3763
3764 /* If previous was a character match, abolish the item and generate a
3765 repeat item instead. If a char item has a minimum of more than one, ensure
3766 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3767 the first thing in a branch because the x will have gone into firstbyte
3768 instead. */
3769
3770 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3771 {
3772 /* Deal with UTF-8 characters that take up more than one byte. It's
3773 easier to write this out separately than try to macrify it. Use c to
3774 hold the length of the character in bytes, plus 0x80 to flag that it's a
3775 length rather than a small character. */
3776
3777 #ifdef SUPPORT_UTF8
3778 if (utf8 && (code[-1] & 0x80) != 0)
3779 {
3780 uschar *lastchar = code - 1;
3781 while((*lastchar & 0xc0) == 0x80) lastchar--;
3782 c = code - lastchar; /* Length of UTF-8 character */
3783 memcpy(utf8_char, lastchar, c); /* Save the char */
3784 c |= 0x80; /* Flag c as a length */
3785 }
3786 else
3787 #endif
3788
3789 /* Handle the case of a single byte - either with no UTF8 support, or
3790 with UTF-8 disabled, or for a UTF-8 character < 128. */
3791
3792 {
3793 c = code[-1];
3794 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3795 }
3796
3797 /* If the repetition is unlimited, it pays to see if the next thing on
3798 the line is something that cannot possibly match this character. If so,
3799 automatically possessifying this item gains some performance in the case
3800 where the match fails. */
3801
3802 if (!possessive_quantifier &&
3803 repeat_max < 0 &&
3804 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3805 options, cd))
3806 {
3807 repeat_type = 0; /* Force greedy */
3808 possessive_quantifier = TRUE;
3809 }
3810
3811 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3812 }
3813
3814 /* If previous was a single negated character ([^a] or similar), we use
3815 one of the special opcodes, replacing it. The code is shared with single-
3816 character repeats by setting op_type to add a suitable offset into
3817 repeat_type. We can also test for auto-possessification. OP_NOT is
3818 currently used only for single-byte chars. */
3819
3820 else if (*previous == OP_NOT)
3821 {
3822 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3823 c = previous[1];
3824 if (!possessive_quantifier &&
3825 repeat_max < 0 &&
3826 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3827 {
3828 repeat_type = 0; /* Force greedy */
3829 possessive_quantifier = TRUE;
3830 }
3831 goto OUTPUT_SINGLE_REPEAT;
3832 }
3833
3834 /* If previous was a character type match (\d or similar), abolish it and
3835 create a suitable repeat item. The code is shared with single-character
3836 repeats by setting op_type to add a suitable offset into repeat_type. Note
3837 that the Unicode property types will be present only when SUPPORT_UCP is
3838 defined, but we don't wrap the little bits of code here because it just
3839 makes it horribly messy. */
3840
3841 else if (*previous < OP_EODN)
3842 {
3843 uschar *oldcode;
3844 int prop_type, prop_value;
3845 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3846 c = *previous;
3847
3848 if (!possessive_quantifier &&
3849 repeat_max < 0 &&
3850 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3851 {
3852 repeat_type = 0; /* Force greedy */
3853 possessive_quantifier = TRUE;
3854 }
3855
3856 OUTPUT_SINGLE_REPEAT:
3857 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3858 {
3859 prop_type = previous[1];
3860 prop_value = previous[2];
3861 }
3862 else prop_type = prop_value = -1;
3863
3864 oldcode = code;
3865 code = previous; /* Usually overwrite previous item */
3866
3867 /* If the maximum is zero then the minimum must also be zero; Perl allows
3868 this case, so we do too - by simply omitting the item altogether. */
3869
3870 if (repeat_max == 0) goto END_REPEAT;
3871
3872 /* All real repeats make it impossible to handle partial matching (maybe
3873 one day we will be able to remove this restriction). */
3874
3875 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3876
3877 /* Combine the op_type with the repeat_type */
3878
3879 repeat_type += op_type;
3880
3881 /* A minimum of zero is handled either as the special case * or ?, or as
3882 an UPTO, with the maximum given. */
3883
3884 if (repeat_min == 0)
3885 {
3886 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3887 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3888 else
3889 {
3890 *code++ = OP_UPTO + repeat_type;
3891 PUT2INC(code, 0, repeat_max);
3892 }
3893 }
3894
3895 /* A repeat minimum of 1 is optimized into some special cases. If the
3896 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3897 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3898 one less than the maximum. */
3899
3900 else if (repeat_min == 1)
3901 {
3902 if (repeat_max == -1)
3903 *code++ = OP_PLUS + repeat_type;
3904 else
3905 {
3906 code = oldcode; /* leave previous item in place */
3907 if (repeat_max == 1) goto END_REPEAT;
3908 *code++ = OP_UPTO + repeat_type;
3909 PUT2INC(code, 0, repeat_max - 1);
3910 }
3911 }
3912
3913 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3914 handled as an EXACT followed by an UPTO. */
3915
3916 else
3917 {
3918 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3919 PUT2INC(code, 0, repeat_min);
3920
3921 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3922 we have to insert the character for the previous code. For a repeated
3923 Unicode property match, there are two extra bytes that define the
3924 required property. In UTF-8 mode, long characters have their length in
3925 c, with the 0x80 bit as a flag. */
3926
3927 if (repeat_max < 0)
3928 {
3929 #ifdef SUPPORT_UTF8
3930 if (utf8 && c >= 128)
3931 {
3932 memcpy(code, utf8_char, c & 7);
3933 code += c & 7;
3934 }
3935 else
3936 #endif
3937 {
3938 *code++ = c;
3939 if (prop_type >= 0)
3940 {
3941 *code++ = prop_type;
3942 *code++ = prop_value;
3943 }
3944 }
3945 *code++ = OP_STAR + repeat_type;
3946 }
3947
3948 /* Else insert an UPTO if the max is greater than the min, again
3949 preceded by the character, for the previously inserted code. If the
3950 UPTO is just for 1 instance, we can use QUERY instead. */
3951
3952 else if (repeat_max != repeat_min)
3953 {
3954 #ifdef SUPPORT_UTF8
3955 if (utf8 && c >= 128)
3956 {
3957 memcpy(code, utf8_char, c & 7);
3958 code += c & 7;
3959 }
3960 else
3961 #endif
3962 *code++ = c;
3963 if (prop_type >= 0)
3964 {
3965 *code++ = prop_type;
3966 *code++ = prop_value;
3967 }
3968 repeat_max -= repeat_min;
3969
3970 if (repeat_max == 1)
3971 {
3972 *code++ = OP_QUERY + repeat_type;
3973 }
3974 else
3975 {
3976 *code++ = OP_UPTO + repeat_type;
3977 PUT2INC(code, 0, repeat_max);
3978 }
3979 }
3980 }
3981
3982 /* The character or character type itself comes last in all cases. */
3983
3984 #ifdef SUPPORT_UTF8
3985 if (utf8 && c >= 128)
3986 {
3987 memcpy(code, utf8_char, c & 7);
3988 code += c & 7;
3989 }
3990 else
3991 #endif
3992 *code++ = c;
3993
3994 /* For a repeated Unicode property match, there are two extra bytes that
3995 define the required property. */
3996
3997 #ifdef SUPPORT_UCP
3998 if (prop_type >= 0)
3999 {
4000 *code++ = prop_type;
4001 *code++ = prop_value;
4002 }
4003 #endif
4004 }
4005
4006 /* If previous was a character class or a back reference, we put the repeat
4007 stuff after it, but just skip the item if the repeat was {0,0}. */
4008
4009 else if (*previous == OP_CLASS ||
4010 *previous == OP_NCLASS ||
4011 #ifdef SUPPORT_UTF8
4012 *previous == OP_XCLASS ||
4013 #endif
4014 *previous == OP_REF)
4015 {
4016 if (repeat_max == 0)
4017 {
4018 code = previous;
4019 goto END_REPEAT;
4020 }
4021
4022 /* All real repeats make it impossible to handle partial matching (maybe
4023 one day we will be able to remove this restriction). */
4024
4025 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
4026
4027 if (repeat_min == 0 && repeat_max == -1)
4028 *code++ = OP_CRSTAR + repeat_type;
4029 else if (repeat_min == 1 && repeat_max == -1)
4030 *code++ = OP_CRPLUS + repeat_type;
4031 else if (repeat_min == 0 && repeat_max == 1)
4032 *code++ = OP_CRQUERY + repeat_type;
4033 else
4034 {
4035 *code++ = OP_CRRANGE + repeat_type;
4036 PUT2INC(code, 0, repeat_min);
4037 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4038 PUT2INC(code, 0, repeat_max);
4039 }
4040 }
4041
4042 /* If previous was a bracket group, we may have to replicate it in certain
4043 cases. */
4044
4045 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4046 *previous == OP_ONCE || *previous == OP_COND)
4047 {
4048 register int i;
4049 int ketoffset = 0;
4050 int len = code - previous;
4051 uschar *bralink = NULL;
4052
4053 /* Repeating a DEFINE group is pointless */
4054
4055 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4056 {
4057 *errorcodeptr = ERR55;
4058 goto FAILED;
4059 }
4060
4061 /* If the maximum repeat count is unlimited, find the end of the bracket
4062 by scanning through from the start, and compute the offset back to it
4063 from the current code pointer. There may be an OP_OPT setting following
4064 the final KET, so we can't find the end just by going back from the code
4065 pointer. */
4066
4067 if (repeat_max == -1)
4068 {
4069 register uschar *ket = previous;
4070 do ket += GET(ket, 1); while (*ket != OP_KET);
4071 ketoffset = code - ket;
4072 }
4073
4074 /* The case of a zero minimum is special because of the need to stick
4075 OP_BRAZERO in front of it, and because the group appears once in the
4076 data, whereas in other cases it appears the minimum number of times. For
4077 this reason, it is simplest to treat this case separately, as otherwise
4078 the code gets far too messy. There are several special subcases when the
4079 minimum is zero. */
4080
4081 if (repeat_min == 0)
4082 {
4083 /* If the maximum is also zero, we used to just omit the group from the
4084 output altogether, like this:
4085
4086 ** if (repeat_max == 0)
4087 ** {
4088 ** code = previous;
4089 ** goto END_REPEAT;
4090 ** }
4091
4092 However, that fails when a group is referenced as a subroutine from
4093 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4094 so that it is skipped on execution. As we don't have a list of which
4095 groups are referenced, we cannot do this selectively.
4096
4097 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4098 and do no more at this point. However, we do need to adjust any
4099 OP_RECURSE calls inside the group that refer to the group itself or any
4100 internal or forward referenced group, because the offset is from the
4101 start of the whole regex. Temporarily terminate the pattern while doing
4102 this. */
4103
4104 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4105 {
4106 *code = OP_END;
4107 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4108 memmove(previous+1, previous, len);
4109 code++;
4110 if (repeat_max == 0)
4111 {
4112 *previous++ = OP_SKIPZERO;
4113 goto END_REPEAT;
4114 }
4115 *previous++ = OP_BRAZERO + repeat_type;
4116 }
4117
4118 /* If the maximum is greater than 1 and limited, we have to replicate
4119 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4120 The first one has to be handled carefully because it's the original
4121 copy, which has to be moved up. The remainder can be handled by code
4122 that is common with the non-zero minimum case below. We have to
4123 adjust the value of repeat_max, since one less copy is required. Once
4124 again, we may have to adjust any OP_RECURSE calls inside the group. */
4125
4126 else
4127 {
4128 int offset;
4129 *code = OP_END;
4130 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4131 memmove(previous + 2 + LINK_SIZE, previous, len);
4132 code += 2 + LINK_SIZE;
4133 *previous++ = OP_BRAZERO + repeat_type;
4134 *previous++ = OP_BRA;
4135
4136 /* We chain together the bracket offset fields that have to be
4137 filled in later when the ends of the brackets are reached. */
4138
4139 offset = (bralink == NULL)? 0 : previous - bralink;
4140 bralink = previous;
4141 PUTINC(previous, 0, offset);
4142 }
4143
4144 repeat_max--;
4145 }
4146
4147 /* If the minimum is greater than zero, replicate the group as many
4148 times as necessary, and adjust the maximum to the number of subsequent
4149 copies that we need. If we set a first char from the group, and didn't
4150 set a required char, copy the latter from the former. If there are any
4151 forward reference subroutine calls in the group, there will be entries on
4152 the workspace list; replicate these with an appropriate increment. */
4153
4154 else
4155 {
4156 if (repeat_min > 1)
4157 {
4158 /* In the pre-compile phase, we don't actually do the replication. We
4159 just adjust the length as if we had. Do some paranoid checks for
4160 potential integer overflow. */
4161
4162 if (lengthptr != NULL)
4163 {
4164 int delta = (repeat_min - 1)*length_prevgroup;
4165 if ((double)(repeat_min - 1)*(double)length_prevgroup >
4166 (double)INT_MAX ||
4167 OFLOW_MAX - *lengthptr < delta)
4168 {
4169 *errorcodeptr = ERR20;
4170 goto FAILED;
4171 }
4172 *lengthptr += delta;
4173 }
4174
4175 /* This is compiling for real */
4176
4177 else
4178 {
4179 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4180 for (i = 1; i < repeat_min; i++)
4181 {
4182 uschar *hc;
4183 uschar *this_hwm = cd->hwm;
4184 memcpy(code, previous, len);
4185 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4186 {
4187 PUT(cd->hwm, 0, GET(hc, 0) + len);
4188 cd->hwm += LINK_SIZE;
4189 }
4190 save_hwm = this_hwm;
4191 code += len;
4192 }
4193 }
4194 }
4195
4196 if (repeat_max > 0) repeat_max -= repeat_min;
4197 }
4198
4199 /* This code is common to both the zero and non-zero minimum cases. If
4200 the maximum is limited, it replicates the group in a nested fashion,
4201 remembering the bracket starts on a stack. In the case of a zero minimum,
4202 the first one was set up above. In all cases the repeat_max now specifies
4203 the number of additional copies needed. Again, we must remember to
4204 replicate entries on the forward reference list. */
4205
4206 if (repeat_max >= 0)
4207 {
4208 /* In the pre-compile phase, we don't actually do the replication. We
4209 just adjust the length as if we had. For each repetition we must add 1
4210 to the length for BRAZERO and for all but the last repetition we must
4211 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4212 paranoid checks to avoid integer overflow. */
4213
4214 if (lengthptr != NULL && repeat_max > 0)
4215 {
4216 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4217 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4218 if ((double)repeat_max *
4219 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4220 > (double)INT_MAX ||
4221 OFLOW_MAX - *lengthptr < delta)
4222 {
4223 *errorcodeptr = ERR20;
4224 goto FAILED;
4225 }
4226 *lengthptr += delta;
4227 }
4228
4229 /* This is compiling for real */
4230
4231 else for (i = repeat_max - 1; i >= 0; i--)
4232 {
4233 uschar *hc;
4234 uschar *this_hwm = cd->hwm;
4235
4236 *code++ = OP_BRAZERO + repeat_type;
4237
4238 /* All but the final copy start a new nesting, maintaining the
4239 chain of brackets outstanding. */
4240
4241 if (i != 0)
4242 {
4243 int offset;
4244 *code++ = OP_BRA;
4245 offset = (bralink == NULL)? 0 : code - bralink;
4246 bralink = code;
4247 PUTINC(code, 0, offset);
4248 }
4249
4250 memcpy(code, previous, len);
4251 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4252 {
4253 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4254 cd->hwm += LINK_SIZE;
4255 }
4256 save_hwm = this_hwm;
4257 code += len;
4258 }
4259
4260 /* Now chain through the pending brackets, and fill in their length
4261 fields (which are holding the chain links pro tem). */
4262
4263 while (bralink != NULL)
4264 {
4265 int oldlinkoffset;
4266 int offset = code - bralink + 1;
4267 uschar *bra = code - offset;
4268 oldlinkoffset = GET(bra, 1);
4269 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4270 *code++ = OP_KET;
4271 PUTINC(code, 0, offset);
4272 PUT(bra, 1, offset);
4273 }
4274 }
4275
4276 /* If the maximum is unlimited, set a repeater in the final copy. We
4277 can't just offset backwards from the current code point, because we
4278 don't know if there's been an options resetting after the ket. The
4279 correct offset was computed above.
4280
4281 Then, when we are doing the actual compile phase, check to see whether
4282 this group is a non-atomic one that could match an empty string. If so,
4283 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4284 that runtime checking can be done. [This check is also applied to
4285 atomic groups at runtime, but in a different way.] */
4286
4287 else
4288 {
4289 uschar *ketcode = code - ketoffset;
4290 uschar *bracode = ketcode - GET(ketcode, 1);
4291 *ketcode = OP_KETRMAX + repeat_type;
4292 if (lengthptr == NULL && *bracode != OP_ONCE)
4293 {
4294 uschar *scode = bracode;
4295 do
4296 {
4297 if (could_be_empty_branch(scode, ketcode, utf8))
4298 {
4299 *bracode += OP_SBRA - OP_BRA;
4300 break;
4301 }
4302 scode += GET(scode, 1);
4303 }
4304 while (*scode == OP_ALT);
4305 }
4306 }
4307 }
4308
4309 /* If previous is OP_FAIL, it was generated by an empty class [] in
4310 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4311 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4312 error above. We can just ignore the repeat in JS case. */
4313
4314 else if (*previous == OP_FAIL) goto END_REPEAT;
4315
4316 /* Else there's some kind of shambles */
4317
4318 else
4319 {
4320 *errorcodeptr = ERR11;
4321 goto FAILED;
4322 }
4323
4324 /* If the character following a repeat is '+', or if certain optimization
4325 tests above succeeded, possessive_quantifier is TRUE. For some of the
4326 simpler opcodes, there is a special alternative opcode for this. For
4327 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4328 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4329 but the special opcodes can optimize it a bit. The repeated item starts at
4330 tempcode, not at previous, which might be the first part of a string whose
4331 (former) last char we repeated.
4332
4333 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4334 an 'upto' may follow. We skip over an 'exact' item, and then test the
4335 length of what remains before proceeding. */
4336
4337 if (possessive_quantifier)
4338 {
4339 int len;
4340 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4341 *tempcode == OP_NOTEXACT)
4342 tempcode += _pcre_OP_lengths[*tempcode] +
4343 ((*tempcode == OP_TYPEEXACT &&
4344 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4345 len = code - tempcode;
4346 if (len > 0) switch (*tempcode)
4347 {
4348 case OP_STAR: *tempcode = OP_POSSTAR; break;
4349 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4350 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4351 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4352
4353 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4354 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4355 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4356 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4357
4358 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4359 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4360 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4361 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4362
4363 default:
4364 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4365 code += 1 + LINK_SIZE;
4366 len += 1 + LINK_SIZE;
4367 tempcode[0] = OP_ONCE;
4368 *code++ = OP_KET;
4369 PUTINC(code, 0, len);
4370 PUT(tempcode, 1, len);
4371 break;
4372 }
4373 }
4374
4375 /* In all cases we no longer have a previous item. We also set the
4376 "follows varying string" flag for subsequently encountered reqbytes if
4377 it isn't already set and we have just passed a varying length item. */
4378
4379 END_REPEAT:
4380 previous = NULL;
4381 cd->req_varyopt |= reqvary;
4382 break;
4383
4384
4385 /* ===================================================================*/
4386 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4387 lookbehind or option setting or condition or all the other extended
4388 parenthesis forms. */
4389
4390 case CHAR_LEFT_PARENTHESIS:
4391 newoptions = options;
4392 skipbytes = 0;
4393 bravalue = OP_CBRA;
4394 save_hwm = cd->hwm;
4395 reset_bracount = FALSE;
4396
4397 /* First deal with various "verbs" that can be introduced by '*'. */
4398
4399 if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4400 {
4401 int i, namelen;
4402 const char *vn = verbnames;
4403 const uschar *name = ++ptr;
4404 previous = NULL;
4405 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4406 if (*ptr == CHAR_COLON)
4407 {
4408 *errorcodeptr = ERR59; /* Not supported */
4409 goto FAILED;
4410 }
4411 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4412 {
4413 *errorcodeptr = ERR60;
4414 goto FAILED;
4415 }
4416 namelen = ptr - name;
4417 for (i = 0; i < verbcount; i++)
4418 {
4419 if (namelen == verbs[i].len &&
4420 strncmp((char *)name, vn, namelen) == 0)
4421 {
4422 *code = verbs[i].op;
4423 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4424 break;
4425 }
4426 vn += verbs[i].len + 1;
4427 }
4428 if (i < verbcount) continue;
4429 *errorcodeptr = ERR60;
4430 goto FAILED;
4431 }
4432
4433 /* Deal with the extended parentheses; all are introduced by '?', and the
4434 appearance of any of them means that this is not a capturing group. */
4435
4436 else if (*ptr == CHAR_QUESTION_MARK)
4437 {
4438 int i, set, unset, namelen;
4439 int *optset;
4440 const uschar *name;
4441 uschar *slot;
4442
4443 switch (*(++ptr))
4444 {
4445 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4446 ptr++;
4447 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4448 if (*ptr == 0)
4449 {
4450 *errorcodeptr = ERR18;
4451 goto FAILED;
4452 }
4453 continue;
4454
4455
4456 /* ------------------------------------------------------------ */
4457 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4458 reset_bracount = TRUE;
4459 /* Fall through */
4460
4461 /* ------------------------------------------------------------ */
4462 case CHAR_COLON: /* Non-capturing bracket */
4463 bravalue = OP_BRA;
4464 ptr++;
4465 break;
4466
4467
4468 /* ------------------------------------------------------------ */
4469 case CHAR_LEFT_PARENTHESIS:
4470 bravalue = OP_COND; /* Conditional group */
4471
4472 /* A condition can be an assertion, a number (referring to a numbered
4473 group), a name (referring to a named group), or 'R', referring to
4474 recursion. R<digits> and R&name are also permitted for recursion tests.
4475
4476 There are several syntaxes for testing a named group: (?(name)) is used
4477 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4478
4479 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4480 be the recursive thing or the name 'R' (and similarly for 'R' followed
4481 by digits), and (b) a number could be a name that consists of digits.
4482 In both cases, we look for a name first; if not found, we try the other
4483 cases. */
4484
4485 /* For conditions that are assertions, check the syntax, and then exit
4486 the switch. This will take control down to where bracketed groups,
4487 including assertions, are processed. */
4488
4489 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4490 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4491 break;
4492
4493 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4494 below), and all need to skip 3 bytes at the start of the group. */
4495
4496 code[1+LINK_SIZE] = OP_CREF;
4497 skipbytes = 3;
4498 refsign = -1;
4499
4500 /* Check for a test for recursion in a named group. */
4501
4502 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4503 {
4504 terminator = -1;
4505 ptr += 2;
4506 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4507 }
4508
4509 /* Check for a test for a named group's having been set, using the Perl
4510 syntax (?(<name>) or (?('name') */
4511
4512 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4513 {
4514 terminator = CHAR_GREATER_THAN_SIGN;
4515 ptr++;
4516 }
4517 else if (ptr[1] == CHAR_APOSTROPHE)
4518 {
4519 terminator = CHAR_APOSTROPHE;
4520 ptr++;
4521 }
4522 else
4523 {
4524 terminator = 0;
4525 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4526 }
4527
4528 /* We now expect to read a name; anything else is an error */
4529
4530 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4531 {
4532 ptr += 1; /* To get the right offset */
4533 *errorcodeptr = ERR28;
4534 goto FAILED;
4535 }
4536
4537 /* Read the name, but also get it as a number if it's all digits */
4538
4539 recno = 0;
4540 name = ++ptr;
4541 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4542 {
4543 if (recno >= 0)
4544 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4545 recno * 10 + *ptr - CHAR_0 : -1;
4546 ptr++;
4547 }
4548 namelen = ptr - name;
4549
4550 if ((terminator > 0 && *ptr++ != terminator) ||
4551 *ptr++ != CHAR_RIGHT_PARENTHESIS)
4552 {
4553 ptr--; /* Error offset */
4554 *errorcodeptr = ERR26;
4555 goto FAILED;
4556 }
4557
4558 /* Do no further checking in the pre-compile phase. */
4559
4560 if (lengthptr != NULL) break;
4561
4562 /* In the real compile we do the work of looking for the actual
4563 reference. If the string started with "+" or "-" we require the rest to
4564 be digits, in which case recno will be set. */
4565
4566 if (refsign > 0)
4567 {
4568 if (recno <= 0)
4569 {
4570 *errorcodeptr = ERR58;
4571 goto FAILED;
4572 }
4573 recno = (refsign == CHAR_MINUS)?
4574 cd->bracount - recno + 1 : recno +cd->bracount;
4575 if (recno <= 0 || recno > cd->final_bracount)
4576 {
4577 *errorcodeptr = ERR15;
4578 goto FAILED;
4579 }
4580 PUT2(code, 2+LINK_SIZE, recno);
4581 break;
4582 }
4583
4584 /* Otherwise (did not start with "+" or "-"), start by looking for the
4585 name. */
4586
4587 slot = cd->name_table;
4588 for (i = 0; i < cd->names_found; i++)
4589 {
4590 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4591 slot += cd->name_entry_size;
4592 }
4593
4594 /* Found a previous named subpattern */
4595
4596 if (i < cd->names_found)
4597 {
4598 recno = GET2(slot, 0);
4599 PUT2(code, 2+LINK_SIZE, recno);
4600 }
4601
4602 /* Search the pattern for a forward reference */
4603
4604 else if ((i = find_parens(cd, name, namelen,
4605 (options & PCRE_EXTENDED) != 0)) > 0)
4606 {
4607 PUT2(code, 2+LINK_SIZE, i);
4608 }
4609
4610 /* If terminator == 0 it means that the name followed directly after
4611 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4612 some further alternatives to try. For the cases where terminator != 0
4613 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4614 now checked all the possibilities, so give an error. */
4615
4616 else if (terminator != 0)
4617 {
4618 *errorcodeptr = ERR15;
4619 goto FAILED;
4620 }
4621
4622 /* Check for (?(R) for recursion. Allow digits after R to specify a
4623 specific group number. */
4624
4625 else if (*name == CHAR_R)
4626 {
4627 recno = 0;
4628 for (i = 1; i < namelen; i++)
4629 {
4630 if ((digitab[name[i]] & ctype_digit) == 0)
4631 {
4632 *errorcodeptr = ERR15;
4633 goto FAILED;
4634 }
4635 recno = recno * 10 + name[i] - CHAR_0;
4636 }
4637 if (recno == 0) recno = RREF_ANY;
4638 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4639 PUT2(code, 2+LINK_SIZE, recno);
4640 }
4641
4642 /* Similarly, check for the (?(DEFINE) "condition", which is always
4643 false. */
4644
4645 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4646 {
4647 code[1+LINK_SIZE] = OP_DEF;
4648 skipbytes = 1;
4649 }
4650
4651 /* Check for the "name" actually being a subpattern number. We are
4652 in the second pass here, so final_bracount is set. */
4653
4654 else if (recno > 0 && recno <= cd->final_bracount)
4655 {
4656 PUT2(code, 2+LINK_SIZE, recno);
4657 }
4658
4659 /* Either an unidentified subpattern, or a reference to (?(0) */
4660
4661 else
4662 {
4663 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4664 goto FAILED;
4665 }
4666 break;
4667
4668
4669 /* ------------------------------------------------------------ */
4670 case CHAR_EQUALS_SIGN: /* Positive lookahead */
4671 bravalue = OP_ASSERT;
4672 ptr++;
4673 break;
4674
4675
4676 /* ------------------------------------------------------------ */
4677 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
4678 ptr++;
4679 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
4680 {
4681 *code++ = OP_FAIL;
4682 previous = NULL;
4683 continue;
4684 }
4685 bravalue = OP_ASSERT_NOT;
4686 break;
4687
4688
4689 /* ------------------------------------------------------------ */
4690 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
4691 switch (ptr[1])
4692 {
4693 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
4694 bravalue = OP_ASSERTBACK;
4695 ptr += 2;
4696 break;
4697
4698 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
4699 bravalue = OP_ASSERTBACK_NOT;
4700 ptr += 2;
4701 break;
4702
4703 default: /* Could be name define, else bad */
4704 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4705 ptr++; /* Correct offset for error */
4706 *errorcodeptr = ERR24;
4707 goto FAILED;
4708 }
4709 break;
4710
4711
4712 /* ------------------------------------------------------------ */
4713 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
4714 bravalue = OP_ONCE;
4715 ptr++;
4716 break;
4717
4718
4719 /* ------------------------------------------------------------ */
4720 case CHAR_C: /* Callout - may be followed by digits; */
4721 previous_callout = code; /* Save for later completion */
4722 after_manual_callout = 1; /* Skip one item before completing */
4723 *code++ = OP_CALLOUT;
4724 {
4725 int n = 0;
4726 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4727 n = n * 10 + *ptr - CHAR_0;
4728 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4729 {
4730 *errorcodeptr = ERR39;
4731 goto FAILED;
4732 }
4733 if (n > 255)
4734 {
4735 *errorcodeptr = ERR38;
4736 goto FAILED;
4737 }
4738 *code++ = n;
4739 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4740 PUT(code, LINK_SIZE, 0); /* Default length */
4741 code += 2 * LINK_SIZE;
4742 }
4743 previous = NULL;
4744 continue;
4745
4746
4747 /* ------------------------------------------------------------ */
4748 case CHAR_P: /* Python-style named subpattern handling */
4749 if (*(++ptr) == CHAR_EQUALS_SIGN ||
4750 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
4751 {
4752 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4753 terminator = CHAR_RIGHT_PARENTHESIS;
4754 goto NAMED_REF_OR_RECURSE;
4755 }
4756 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
4757 {
4758 *errorcodeptr = ERR41;
4759 goto FAILED;
4760 }
4761 /* Fall through to handle (?P< as (?< is handled */
4762
4763
4764 /* ------------------------------------------------------------ */
4765 DEFINE_NAME: /* Come here from (?< handling */
4766 case CHAR_APOSTROPHE:
4767 {
4768 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4769 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4770 name = ++ptr;
4771
4772 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4773 namelen = ptr - name;
4774
4775 /* In the pre-compile phase, just do a syntax check. */
4776
4777 if (lengthptr != NULL)
4778 {
4779 if (*ptr != terminator)
4780 {
4781 *errorcodeptr = ERR42;
4782 goto FAILED;
4783 }
4784 if (cd->names_found >= MAX_NAME_COUNT)
4785 {
4786 *errorcodeptr = ERR49;
4787 goto FAILED;
4788 }
4789 if (namelen + 3 > cd->name_entry_size)
4790 {
4791 cd->name_entry_size = namelen + 3;
4792 if (namelen > MAX_NAME_SIZE)
4793 {
4794 *errorcodeptr = ERR48;
4795 goto FAILED;
4796 }
4797 }
4798 }
4799
4800 /* In the real compile, create the entry in the table */
4801
4802 else
4803 {
4804 slot = cd->name_table;
4805 for (i = 0; i < cd->names_found; i++)
4806 {
4807 int crc = memcmp(name, slot+2, namelen);
4808 if (crc == 0)
4809 {
4810 if (slot[2+namelen] == 0)
4811 {
4812 if ((options & PCRE_DUPNAMES) == 0)
4813 {
4814 *errorcodeptr = ERR43;
4815 goto FAILED;
4816 }
4817 }
4818 else crc = -1; /* Current name is substring */
4819 }
4820 if (crc < 0)
4821 {
4822 memmove(slot + cd->name_entry_size, slot,
4823 (cd->names_found - i) * cd->name_entry_size);
4824 break;
4825 }
4826 slot += cd->name_entry_size;
4827 }
4828
4829 PUT2(slot, 0, cd->bracount + 1);
4830 memcpy(slot + 2, name, namelen);
4831 slot[2+namelen] = 0;
4832 }
4833 }
4834
4835 /* In both cases, count the number of names we've encountered. */
4836
4837 ptr++; /* Move past > or ' */
4838 cd->names_found++;
4839 goto NUMBERED_GROUP;
4840
4841
4842 /* ------------------------------------------------------------ */
4843 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
4844 terminator = CHAR_RIGHT_PARENTHESIS;
4845 is_recurse = TRUE;
4846 /* Fall through */
4847
4848 /* We come here from the Python syntax above that handles both
4849 references (?P=name) and recursion (?P>name), as well as falling
4850 through from the Perl recursion syntax (?&name). We also come here from
4851 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4852 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4853
4854 NAMED_REF_OR_RECURSE:
4855 name = ++ptr;
4856 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4857 namelen = ptr - name;
4858
4859 /* In the pre-compile phase, do a syntax check and set a dummy
4860 reference number. */
4861
4862 if (lengthptr != NULL)
4863 {
4864 if (namelen == 0)
4865 {
4866 *errorcodeptr = ERR62;
4867 goto FAILED;
4868 }
4869 if (*ptr != terminator)
4870 {
4871 *errorcodeptr = ERR42;
4872 goto FAILED;
4873 }
4874 if (namelen > MAX_NAME_SIZE)
4875 {
4876 *errorcodeptr = ERR48;
4877 goto FAILED;
4878 }
4879 recno = 0;
4880 }
4881
4882 /* In the real compile, seek the name in the table. We check the name
4883 first, and then check that we have reached the end of the name in the
4884 table. That way, if the name is longer than any in the table,
4885 the comparison will fail without reading beyond the table entry. */
4886
4887 else
4888 {
4889 slot = cd->name_table;
4890 for (i = 0; i < cd->names_found; i++)
4891 {
4892 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4893 slot[2+namelen] == 0)
4894 break;
4895 slot += cd->name_entry_size;
4896 }
4897
4898 if (i < cd->names_found) /* Back reference */
4899 {
4900 recno = GET2(slot, 0);
4901 }
4902 else if ((recno = /* Forward back reference */
4903 find_parens(cd, name, namelen,
4904 (options & PCRE_EXTENDED) != 0)) <= 0)
4905 {
4906 *errorcodeptr = ERR15;
4907 goto FAILED;
4908 }
4909 }
4910
4911 /* In both phases, we can now go to the code that handles numerical
4912 recursion or backreferences. */
4913
4914 if (is_recurse) goto HANDLE_RECURSION;
4915 else goto HANDLE_REFERENCE;
4916
4917
4918 /* ------------------------------------------------------------ */
4919 case CHAR_R: /* Recursion */
4920 ptr++; /* Same as (?0) */
4921 /* Fall through */
4922
4923
4924 /* ------------------------------------------------------------ */
4925 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
4926 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4927 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4928 {
4929 const uschar *called;
4930 terminator = CHAR_RIGHT_PARENTHESIS;
4931
4932 /* Come here from the \g<...> and \g'...' code (Oniguruma
4933 compatibility). However, the syntax has been checked to ensure that
4934 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4935 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4936 ever be taken. */
4937
4938 HANDLE_NUMERICAL_RECURSION:
4939
4940 if ((refsign = *ptr) == CHAR_PLUS)
4941 {
4942 ptr++;
4943 if ((digitab[*ptr] & ctype_digit) == 0)
4944 {
4945 *errorcodeptr = ERR63;
4946 goto FAILED;
4947 }
4948 }
4949 else if (refsign == CHAR_MINUS)
4950 {
4951 if ((digitab[ptr[1]] & ctype_digit) == 0)
4952 goto OTHER_CHAR_AFTER_QUERY;
4953 ptr++;
4954 }
4955
4956 recno = 0;
4957 while((digitab[*ptr] & ctype_digit) != 0)
4958 recno = recno * 10 + *ptr++ - CHAR_0;
4959
4960 if (*ptr != terminator)
4961 {
4962 *errorcodeptr = ERR29;
4963 goto FAILED;
4964 }
4965
4966 if (refsign == CHAR_MINUS)
4967 {
4968 if (recno == 0)
4969 {
4970 *errorcodeptr = ERR58;
4971 goto FAILED;
4972 }
4973 recno = cd->bracount - recno + 1;
4974 if (recno <= 0)
4975 {
4976 *errorcodeptr = ERR15;
4977 goto FAILED;
4978 }
4979 }
4980 else if (refsign == CHAR_PLUS)
4981 {
4982 if (recno == 0)
4983 {
4984 *errorcodeptr = ERR58;
4985 goto FAILED;
4986 }
4987 recno += cd->bracount;
4988 }
4989
4990 /* Come here from code above that handles a named recursion */
4991
4992 HANDLE_RECURSION:
4993
4994 previous = code;
4995 called = cd->start_code;
4996
4997 /* When we are actually compiling, find the bracket that is being
4998 referenced. Temporarily end the regex in case it doesn't exist before
4999 this point. If we end up with a forward reference, first check that
5000 the bracket does occur later so we can give the error (and position)
5001 now. Then remember this forward reference in the workspace so it can
5002 be filled in at the end. */
5003
5004 if (lengthptr == NULL)
5005 {
5006 *code = OP_END;
5007 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
5008
5009 /* Forward reference */
5010
5011 if (called == NULL)
5012 {
5013 if (find_parens(cd, NULL, recno,
5014 (options & PCRE_EXTENDED) != 0) < 0)
5015 {
5016 *errorcodeptr = ERR15;
5017 goto FAILED;
5018 }
5019 called = cd->start_code + recno;
5020 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5021 }
5022
5023 /* If not a forward reference, and the subpattern is still open,
5024 this is a recursive call. We check to see if this is a left
5025 recursion that could loop for ever, and diagnose that case. */
5026
5027 else if (GET(called, 1) == 0 &&
5028 could_be_empty(called, code, bcptr, utf8))
5029 {
5030 *errorcodeptr = ERR40;
5031 goto FAILED;
5032 }
5033 }
5034
5035 /* Insert the recursion/subroutine item, automatically wrapped inside
5036 "once" brackets. Set up a "previous group" length so that a
5037 subsequent quantifier will work. */
5038
5039 *code = OP_ONCE;
5040 PUT(code, 1, 2 + 2*LINK_SIZE);
5041 code += 1 + LINK_SIZE;
5042
5043 *code = OP_RECURSE;
5044 PUT(code, 1, called - cd->start_code);
5045 code += 1 + LINK_SIZE;
5046
5047 *code = OP_KET;
5048 PUT(code, 1, 2 + 2*LINK_SIZE);
5049 code += 1 + LINK_SIZE;
5050
5051 length_prevgroup = 3 + 3*LINK_SIZE;
5052 }
5053
5054 /* Can't determine a first byte now */
5055
5056 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5057 continue;
5058
5059
5060 /* ------------------------------------------------------------ */
5061 default: /* Other characters: check option setting */
5062 OTHER_CHAR_AFTER_QUERY:
5063 set = unset = 0;
5064 optset = &set;
5065
5066 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5067 {
5068 switch (*ptr++)
5069 {
5070 case CHAR_MINUS: optset = &unset; break;
5071
5072 case CHAR_J: /* Record that it changed in the external options */
5073 *optset |= PCRE_DUPNAMES;
5074 cd->external_flags |= PCRE_JCHANGED;
5075 break;
5076
5077 case CHAR_i: *optset |= PCRE_CASELESS; break;
5078 case CHAR_m: *optset |= PCRE_MULTILINE; break;
5079 case CHAR_s: *optset |= PCRE_DOTALL; break;
5080 case CHAR_x: *optset |= PCRE_EXTENDED; break;
5081 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5082 case CHAR_X: *optset |= PCRE_EXTRA; break;
5083
5084 default: *errorcodeptr = ERR12;
5085 ptr--; /* Correct the offset */
5086 goto FAILED;
5087 }
5088 }
5089
5090 /* Set up the changed option bits, but don't change anything yet. */
5091
5092 newoptions = (options | set) & (~unset);
5093
5094 /* If the options ended with ')' this is not the start of a nested
5095 group with option changes, so the options change at this level. If this
5096 item is right at the start of the pattern, the options can be
5097 abstracted and made external in the pre-compile phase, and ignored in
5098 the compile phase. This can be helpful when matching -- for instance in
5099 caseless checking of required bytes.
5100
5101 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5102 definitely *not* at the start of the pattern because something has been
5103 compiled. In the pre-compile phase, however, the code pointer can have
5104 that value after the start, because it gets reset as code is discarded
5105 during the pre-compile. However, this can happen only at top level - if
5106 we are within parentheses, the starting BRA will still be present. At
5107 any parenthesis level, the length value can be used to test if anything
5108 has been compiled at that level. Thus, a test for both these conditions
5109 is necessary to ensure we correctly detect the start of the pattern in
5110 both phases.
5111
5112 If we are not at the pattern start, compile code to change the ims
5113 options if this setting actually changes any of them, and reset the
5114 greedy defaults and the case value for firstbyte and reqbyte. */
5115
5116 if (*ptr == CHAR_RIGHT_PARENTHESIS)
5117 {
5118 if (code == cd->start_code + 1 + LINK_SIZE &&
5119 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5120 {
5121 cd->external_options = newoptions;
5122 }
5123 else
5124 {
5125 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5126 {
5127 *code++ = OP_OPT;
5128 *code++ = newoptions & PCRE_IMS;
5129 }
5130 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5131 greedy_non_default = greedy_default ^ 1;
5132 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5133 }
5134
5135 /* Change options at this level, and pass them back for use
5136 in subsequent branches. When not at the start of the pattern, this
5137 information is also necessary so that a resetting item can be
5138 compiled at the end of a group (if we are in a group). */
5139
5140 *optionsptr = options = newoptions;
5141 previous = NULL; /* This item can't be repeated */
5142 continue; /* It is complete */
5143 }
5144
5145 /* If the options ended with ':' we are heading into a nested group
5146 with possible change of options. Such groups are non-capturing and are
5147 not assertions of any kind. All we need to do is skip over the ':';
5148 the newoptions value is handled below. */
5149
5150 bravalue = OP_BRA;
5151 ptr++;
5152 } /* End of switch for character following (? */
5153 } /* End of (? handling */
5154
5155 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5156 all unadorned brackets become non-capturing and behave like (?:...)
5157 brackets. */
5158
5159 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5160 {
5161 bravalue = OP_BRA;
5162 }
5163
5164 /* Else we have a capturing group. */
5165
5166 else
5167 {
5168 NUMBERED_GROUP:
5169 cd->bracount += 1;
5170 PUT2(code, 1+LINK_SIZE, cd->bracount);
5171 skipbytes = 2;
5172 }
5173
5174 /* Process nested bracketed regex. Assertions may not be repeated, but
5175 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5176 non-register variable in order to be able to pass its address because some
5177 compilers complain otherwise. Pass in a new setting for the ims options if
5178 they have changed. */
5179
5180 previous = (bravalue >= OP_ONCE)? code : NULL;
5181 *code = bravalue;
5182 tempcode = code;
5183 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5184 length_prevgroup = 0; /* Initialize for pre-compile phase */
5185
5186 if (!compile_regex(
5187 newoptions, /* The complete new option state */
5188 options & PCRE_IMS, /* The previous ims option state */
5189 &tempcode, /* Where to put code (updated) */
5190 &ptr, /* Input pointer (updated) */
5191 errorcodeptr, /* Where to put an error message */
5192 (bravalue == OP_ASSERTBACK ||
5193 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5194 reset_bracount, /* True if (?| group */
5195 skipbytes, /* Skip over bracket number */
5196 &subfirstbyte, /* For possible first char */
5197 &subreqbyte, /* For possible last char */
5198 bcptr, /* Current branch chain */
5199 cd, /* Tables block */
5200 (lengthptr == NULL)? NULL : /* Actual compile phase */
5201 &length_prevgroup /* Pre-compile phase */
5202 ))
5203 goto FAILED;
5204
5205 /* At the end of compiling, code is still pointing to the start of the
5206 group, while tempcode has been updated to point past the end of the group
5207 and any option resetting that may follow it. The pattern pointer (ptr)
5208 is on the bracket. */
5209
5210 /* If this is a conditional bracket, check that there are no more than
5211 two branches in the group, or just one if it's a DEFINE group. We do this
5212 in the real compile phase, not in the pre-pass, where the whole group may
5213 not be available. */
5214
5215 if (bravalue == OP_COND && lengthptr == NULL)
5216 {
5217 uschar *tc = code;
5218 int condcount = 0;
5219
5220 do {
5221 condcount++;
5222 tc += GET(tc,1);
5223 }
5224 while (*tc != OP_KET);
5225
5226 /* A DEFINE group is never obeyed inline (the "condition" is always
5227 false). It must have only one branch. */
5228
5229 if (code[LINK_SIZE+1] == OP_DEF)
5230 {
5231 if (condcount > 1)
5232 {
5233 *errorcodeptr = ERR54;
5234 goto FAILED;
5235 }
5236 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5237 }
5238
5239 /* A "normal" conditional group. If there is just one branch, we must not
5240 make use of its firstbyte or reqbyte, because this is equivalent to an
5241 empty second branch. */
5242
5243 else
5244 {
5245 if (condcount > 2)
5246 {
5247 *errorcodeptr = ERR27;
5248 goto FAILED;
5249 }
5250 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5251 }
5252 }
5253
5254 /* Error if hit end of pattern */
5255
5256 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5257 {
5258 *errorcodeptr = ERR14;
5259 goto FAILED;
5260 }
5261
5262 /* In the pre-compile phase, update the length by the length of the group,
5263 less the brackets at either end. Then reduce the compiled code to just a
5264 set of non-capturing brackets so that it doesn't use much memory if it is
5265 duplicated by a quantifier.*/
5266
5267 if (lengthptr != NULL)
5268 {
5269 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5270 {
5271 *errorcodeptr = ERR20;
5272 goto FAILED;
5273 }
5274 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5275 *code++ = OP_BRA;
5276 PUTINC(code, 0, 1 + LINK_SIZE);
5277 *code++ = OP_KET;
5278 PUTINC(code, 0, 1 + LINK_SIZE);
5279 break; /* No need to waste time with special character handling */
5280 }
5281
5282 /* Otherwise update the main code pointer to the end of the group. */
5283
5284 code = tempcode;
5285
5286 /* For a DEFINE group, required and first character settings are not
5287 relevant. */
5288
5289 if (bravalue == OP_DEF) break;
5290
5291 /* Handle updating of the required and first characters for other types of
5292 group. Update for normal brackets of all kinds, and conditions with two
5293 branches (see code above). If the bracket is followed by a quantifier with
5294 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5295 zerofirstbyte outside the main loop so that they can be accessed for the
5296 back off. */
5297
5298 zeroreqbyte = reqbyte;
5299 zerofirstbyte = firstbyte;
5300 groupsetfirstbyte = FALSE;
5301
5302 if (bravalue >= OP_ONCE)
5303 {
5304 /* If we have not yet set a firstbyte in this branch, take it from the
5305 subpattern, remembering that it was set here so that a repeat of more
5306 than one can replicate it as reqbyte if necessary. If the subpattern has
5307 no firstbyte, set "none" for the whole branch. In both cases, a zero
5308 repeat forces firstbyte to "none". */
5309
5310 if (firstbyte == REQ_UNSET)
5311 {
5312 if (subfirstbyte >= 0)
5313 {
5314 firstbyte = subfirstbyte;
5315 groupsetfirstbyte = TRUE;
5316 }
5317 else firstbyte = REQ_NONE;
5318 zerofirstbyte = REQ_NONE;
5319 }
5320
5321 /* If firstbyte was previously set, convert the subpattern's firstbyte
5322 into reqbyte if there wasn't one, using the vary flag that was in
5323 existence beforehand. */
5324
5325 else if (subfirstbyte >= 0 && subreqbyte < 0)
5326 subreqbyte = subfirstbyte | tempreqvary;
5327
5328 /* If the subpattern set a required byte (or set a first byte that isn't
5329 really the first byte - see above), set it. */
5330
5331 if (subreqbyte >= 0) reqbyte = subreqbyte;
5332 }
5333
5334 /* For a forward assertion, we take the reqbyte, if set. This can be
5335 helpful if the pattern that follows the assertion doesn't set a different
5336 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5337 for an assertion, however because it leads to incorrect effect for patterns
5338 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5339 of a firstbyte. This is overcome by a scan at the end if there's no
5340 firstbyte, looking for an asserted first char. */
5341
5342 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5343 break; /* End of processing '(' */
5344
5345
5346 /* ===================================================================*/
5347 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5348 are arranged to be the negation of the corresponding OP_values. For the
5349 back references, the values are ESC_REF plus the reference number. Only
5350 back references and those types that consume a character may be repeated.
5351 We can test for values between ESC_b and ESC_Z for the latter; this may
5352 have to change if any new ones are ever created. */
5353
5354 case CHAR_BACKSLASH:
5355 tempptr = ptr;
5356 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5357 if (*errorcodeptr != 0) goto FAILED;
5358
5359 if (c < 0)
5360 {
5361 if (-c == ESC_Q) /* Handle start of quoted string */
5362 {
5363 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5364 ptr += 2; /* avoid empty string */
5365 else inescq = TRUE;
5366 continue;
5367 }
5368
5369 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5370
5371 /* For metasequences that actually match a character, we disable the
5372 setting of a first character if it hasn't already been set. */
5373
5374 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5375 firstbyte = REQ_NONE;
5376
5377 /* Set values to reset to if this is followed by a zero repeat. */
5378
5379 zerofirstbyte = firstbyte;
5380 zeroreqbyte = reqbyte;
5381
5382 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5383 is a subroutine call by number (Oniguruma syntax). In fact, the value
5384 -ESC_g is returned only for these cases. So we don't need to check for <
5385 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5386 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5387 that is a synonym for a named back reference). */
5388
5389 if (-c == ESC_g)
5390 {
5391 const uschar *p;
5392 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5393 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5394 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5395
5396 /* These two statements stop the compiler from warning about possibly
5397 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5398 fact, because we actually check for a number below, the paths that
5399 would actually be in error are never taken. */
5400
5401 skipbytes = 0;
5402 reset_bracount = FALSE;
5403
5404 /* Test for a name */
5405
5406 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5407 {
5408 BOOL isnumber = TRUE;
5409 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5410 {
5411 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5412 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5413 }
5414 if (*p != terminator)
5415 {
5416 *errorcodeptr = ERR57;
5417 break;
5418 }
5419 if (isnumber)
5420 {
5421 ptr++;
5422 goto HANDLE_NUMERICAL_RECURSION;
5423 }
5424 is_recurse = TRUE;
5425 goto NAMED_REF_OR_RECURSE;
5426 }
5427
5428 /* Test a signed number in angle brackets or quotes. */
5429
5430 p = ptr + 2;
5431 while ((digitab[*p] & ctype_digit) != 0) p++;
5432 if (*p != terminator)
5433 {
5434 *errorcodeptr = ERR57;
5435 break;
5436 }
5437 ptr++;
5438 goto HANDLE_NUMERICAL_RECURSION;
5439 }
5440
5441 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5442 We also support \k{name} (.NET syntax) */
5443
5444 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5445 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5446 {
5447 is_recurse = FALSE;
5448 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5449 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5450 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5451 goto NAMED_REF_OR_RECURSE;
5452 }
5453
5454 /* Back references are handled specially; must disable firstbyte if
5455 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5456 ':' later. */
5457
5458 if (-c >= ESC_REF)
5459 {
5460 recno = -c - ESC_REF;
5461
5462 HANDLE_REFERENCE: /* Come here from named backref handling */
5463 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5464 previous = code;
5465 *code++ = OP_REF;
5466 PUT2INC(code, 0, recno);
5467 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5468 if (recno > cd->top_backref) cd->top_backref = recno;
5469 }
5470
5471 /* So are Unicode property matches, if supported. */
5472
5473 #ifdef SUPPORT_UCP
5474 else if (-c == ESC_P || -c == ESC_p)
5475 {
5476 BOOL negated;
5477 int pdata;
5478 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5479 if (ptype < 0) goto FAILED;
5480 previous = code;
5481 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5482 *code++ = ptype;
5483 *code++ = pdata;
5484 }
5485 #else
5486
5487 /* If Unicode properties are not supported, \X, \P, and \p are not
5488 allowed. */
5489
5490 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5491 {
5492 *errorcodeptr = ERR45;
5493 goto FAILED;
5494 }
5495 #endif
5496
5497 /* For the rest (including \X when Unicode properties are supported), we
5498 can obtain the OP value by negating the escape value. */
5499
5500 else
5501 {
5502 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5503 *code++ = -c;
5504 }
5505 continue;
5506 }
5507
5508 /* We have a data character whose value is in c. In UTF-8 mode it may have
5509 a value > 127. We set its representation in the length/buffer, and then
5510 handle it as a data character. */
5511
5512 #ifdef SUPPORT_UTF8
5513 if (utf8 && c > 127)
5514 mclength = _pcre_ord2utf8(c, mcbuffer);
5515 else
5516 #endif
5517
5518 {
5519 mcbuffer[0] = c;
5520 mclength = 1;
5521 }
5522 goto ONE_CHAR;
5523
5524
5525 /* ===================================================================*/
5526 /* Handle a literal character. It is guaranteed not to be whitespace or #
5527 when the extended flag is set. If we are in UTF-8 mode, it may be a
5528 multi-byte literal character. */
5529
5530 default:
5531 NORMAL_CHAR:
5532 mclength = 1;
5533 mcbuffer[0] = c;
5534
5535 #ifdef SUPPORT_UTF8
5536 if (utf8 && c >= 0xc0)
5537 {
5538 while ((ptr[1] & 0xc0) == 0x80)
5539 mcbuffer[mclength++] = *(++ptr);
5540 }
5541 #endif
5542
5543 /* At this point we have the character's bytes in mcbuffer, and the length
5544 in mclength. When not in UTF-8 mode, the length is always 1. */
5545
5546 ONE_CHAR:
5547 previous = code;
5548 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5549 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5550
5551 /* Remember if \r or \n were seen */
5552
5553 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5554 cd->external_flags |= PCRE_HASCRORLF;
5555
5556 /* Set the first and required bytes appropriately. If no previous first
5557 byte, set it from this character, but revert to none on a zero repeat.
5558 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5559 repeat. */
5560
5561 if (firstbyte == REQ_UNSET)
5562 {
5563 zerofirstbyte = REQ_NONE;
5564 zeroreqbyte = reqbyte;
5565
5566 /* If the character is more than one byte long, we can set firstbyte
5567 only if it is not to be matched caselessly. */
5568
5569 if (mclength == 1 || req_caseopt == 0)
5570 {
5571 firstbyte = mcbuffer[0] | req_caseopt;
5572 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5573 }
5574 else firstbyte = reqbyte = REQ_NONE;
5575 }
5576
5577 /* firstbyte was previously set; we can set reqbyte only if the length is
5578 1 or the matching is caseful. */
5579
5580 else
5581 {
5582 zerofirstbyte = firstbyte;
5583 zeroreqbyte = reqbyte;
5584 if (mclength == 1 || req_caseopt == 0)
5585 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5586 }
5587
5588 break; /* End of literal character handling */
5589 }
5590 } /* end of big loop */
5591
5592
5593 /* Control never reaches here by falling through, only by a goto for all the
5594 error states. Pass back the position in the pattern so that it can be displayed
5595 to the user for diagnosing the error. */
5596
5597 FAILED:
5598 *ptrptr = ptr;
5599 return FALSE;
5600 }
5601
5602
5603
5604
5605 /*************************************************
5606 * Compile sequence of alternatives *
5607 *************************************************/
5608
5609 /* On entry, ptr is pointing past the bracket character, but on return it
5610 points to the closing bracket, or vertical bar, or end of string. The code
5611 variable is pointing at the byte into which the BRA operator has been stored.
5612 If the ims options are changed at the start (for a (?ims: group) or during any
5613 branch, we need to insert an OP_OPT item at the start of every following branch
5614 to ensure they get set correctly at run time, and also pass the new options
5615 into every subsequent branch compile.
5616
5617 This function is used during the pre-compile phase when we are trying to find
5618 out the amount of memory needed, as well as during the real compile phase. The
5619 value of lengthptr distinguishes the two phases.
5620
5621 Arguments:
5622 options option bits, including any changes for this subpattern
5623 oldims previous settings of ims option bits
5624 codeptr -> the address of the current code pointer
5625 ptrptr -> the address of the current pattern pointer
5626 errorcodeptr -> pointer to error code variable
5627 lookbehind TRUE if this is a lookbehind assertion
5628 reset_bracount TRUE to reset the count for each branch
5629 skipbytes skip this many bytes at start (for brackets and OP_COND)
5630 firstbyteptr place to put the first required character, or a negative number
5631 reqbyteptr place to put the last required character, or a negative number
5632 bcptr pointer to the chain of currently open branches
5633 cd points to the data block with tables pointers etc.
5634 lengthptr NULL during the real compile phase
5635 points to length accumulator during pre-compile phase
5636
5637 Returns: TRUE on success
5638 */
5639
5640 static BOOL
5641 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5642 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5643 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5644 int *lengthptr)
5645 {
5646 const uschar *ptr = *ptrptr;
5647 uschar *code = *codeptr;
5648 uschar *last_branch = code;
5649 uschar *start_bracket = code;
5650 uschar *reverse_count = NULL;
5651 int firstbyte, reqbyte;
5652 int branchfirstbyte, branchreqbyte;
5653 int length;
5654 int orig_bracount;
5655 int max_bracount;
5656 branch_chain bc;
5657
5658 bc.outer = bcptr;
5659 bc.current = code;
5660
5661 firstbyte = reqbyte = REQ_UNSET;
5662
5663 /* Accumulate the length for use in the pre-compile phase. Start with the
5664 length of the BRA and KET and any extra bytes that are required at the
5665 beginning. We accumulate in a local variable to save frequent testing of
5666 lenthptr for NULL. We cannot do this by looking at the value of code at the
5667 start and end of each alternative, because compiled items are discarded during
5668 the pre-compile phase so that the work space is not exceeded. */
5669
5670 length = 2 + 2*LINK_SIZE + skipbytes;
5671
5672 /* WARNING: If the above line is changed for any reason, you must also change
5673 the code that abstracts option settings at the start of the pattern and makes
5674 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5675 pre-compile phase to find out whether anything has yet been compiled or not. */
5676
5677 /* Offset is set zero to mark that this bracket is still open */
5678
5679 PUT(code, 1, 0);
5680 code += 1 + LINK_SIZE + skipbytes;
5681
5682 /* Loop for each alternative branch */
5683
5684 orig_bracount = max_bracount = cd->bracount;
5685 for (;;)
5686 {
5687 /* For a (?| group, reset the capturing bracket count so that each branch
5688 uses the same numbers. */
5689
5690 if (reset_bracount) cd->bracount = orig_bracount;
5691
5692 /* Handle a change of ims options at the start of the branch */
5693
5694 if ((options & PCRE_IMS) != oldims)
5695 {
5696 *code++ = OP_OPT;
5697 *code++ = options & PCRE_IMS;
5698 length += 2;
5699 }
5700
5701 /* Set up dummy OP_REVERSE if lookbehind assertion */
5702
5703 if (lookbehind)
5704 {
5705 *code++ = OP_REVERSE;
5706 reverse_count = code;
5707 PUTINC(code, 0, 0);
5708 length += 1 + LINK_SIZE;
5709 }
5710
5711 /* Now compile the branch; in the pre-compile phase its length gets added
5712 into the length. */
5713
5714 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5715 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5716 {
5717 *ptrptr = ptr;
5718 return FALSE;
5719 }
5720
5721 /* Keep the highest bracket count in case (?| was used and some branch
5722 has fewer than the rest. */
5723
5724 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5725
5726 /* In the real compile phase, there is some post-processing to be done. */
5727
5728 if (lengthptr == NULL)
5729 {
5730 /* If this is the first branch, the firstbyte and reqbyte values for the
5731 branch become the values for the regex. */
5732
5733 if (*last_branch != OP_ALT)
5734 {
5735 firstbyte = branchfirstbyte;
5736 reqbyte = branchreqbyte;
5737 }
5738
5739 /* If this is not the first branch, the first char and reqbyte have to
5740 match the values from all the previous branches, except that if the
5741 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5742 and we set REQ_VARY for the regex. */
5743
5744 else
5745 {
5746 /* If we previously had a firstbyte, but it doesn't match the new branch,
5747 we have to abandon the firstbyte for the regex, but if there was
5748 previously no reqbyte, it takes on the value of the old firstbyte. */
5749
5750 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5751 {
5752 if (reqbyte < 0) reqbyte = firstbyte;
5753 firstbyte = REQ_NONE;
5754 }
5755
5756 /* If we (now or from before) have no firstbyte, a firstbyte from the
5757 branch becomes a reqbyte if there isn't a branch reqbyte. */
5758
5759 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5760 branchreqbyte = branchfirstbyte;
5761
5762 /* Now ensure that the reqbytes match */
5763
5764 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5765 reqbyte = REQ_NONE;
5766 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5767 }
5768
5769 /* If lookbehind, check that this branch matches a fixed-length string, and
5770 put the length into the OP_REVERSE item. Temporarily mark the end of the
5771 branch with OP_END. */
5772
5773 if (lookbehind)
5774 {
5775 int fixed_length;
5776 *code = OP_END;
5777 fixed_length = find_fixedlength(last_branch, options);
5778 DPRINTF(("fixed length = %d\n", fixed_length));
5779 if (fixed_length < 0)
5780 {
5781 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5782 *ptrptr = ptr;
5783 return FALSE;
5784 }
5785 PUT(reverse_count, 0, fixed_length);
5786 }
5787 }
5788
5789 /* Reached end of expression, either ')' or end of pattern. In the real
5790 compile phase, go back through the alternative branches and reverse the chain
5791 of offsets, with the field in the BRA item now becoming an offset to the
5792 first alternative. If there are no alternatives, it points to the end of the
5793 group. The length in the terminating ket is always the length of the whole
5794 bracketed item. If any of the ims options were changed inside the group,
5795 compile a resetting op-code following, except at the very end of the pattern.
5796 Return leaving the pointer at the terminating char. */
5797
5798 if (*ptr != CHAR_VERTICAL_LINE)
5799 {
5800 if (lengthptr == NULL)
5801 {
5802 int branch_length = code - last_branch;
5803 do
5804 {
5805 int prev_length = GET(last_branch, 1);
5806 PUT(last_branch, 1, branch_length);
5807 branch_length = prev_length;
5808 last_branch -= branch_length;
5809 }
5810 while (branch_length > 0);
5811 }
5812
5813 /* Fill in the ket */
5814
5815 *code = OP_KET;
5816 PUT(code, 1, code - start_bracket);
5817 code += 1 + LINK_SIZE;
5818
5819 /* Resetting option if needed */
5820
5821 if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
5822 {
5823 *code++ = OP_OPT;
5824 *code++ = oldims;
5825 length += 2;
5826 }
5827
5828 /* Retain the highest bracket number, in case resetting was used. */
5829
5830 cd->bracount = max_bracount;
5831
5832 /* Set values to pass back */
5833
5834 *codeptr = code;
5835 *ptrptr = ptr;
5836 *firstbyteptr = firstbyte;
5837 *reqbyteptr = reqbyte;
5838 if (lengthptr != NULL)
5839 {
5840 if (OFLOW_MAX - *lengthptr < length)
5841 {
5842 *errorcodeptr = ERR20;
5843 return FALSE;
5844 }
5845 *lengthptr += length;
5846 }
5847 return TRUE;
5848 }
5849
5850 /* Another branch follows. In the pre-compile phase, we can move the code
5851 pointer back to where it was for the start of the first branch. (That is,
5852 pretend that each branch is the only one.)
5853
5854 In the real compile phase, insert an ALT node. Its length field points back
5855 to the previous branch while the bracket remains open. At the end the chain
5856 is reversed. It's done like this so that the start of the bracket has a
5857 zero offset until it is closed, making it possible to detect recursion. */
5858
5859 if (lengthptr != NULL)
5860 {
5861 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5862 length += 1 + LINK_SIZE;
5863 }
5864 else
5865 {
5866 *code = OP_ALT;
5867 PUT(code, 1, code - last_branch);
5868 bc.current = last_branch = code;
5869 code += 1 + LINK_SIZE;
5870 }
5871
5872 ptr++;
5873 }
5874 /* Control never reaches here */
5875 }
5876
5877
5878
5879
5880 /*************************************************
5881 * Check for anchored expression *
5882 *************************************************/
5883
5884 /* Try to find out if this is an anchored regular expression. Consider each
5885 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5886 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5887 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5888 counts, since OP_CIRC can match in the middle.
5889
5890 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5891 This is the code for \G, which means "match at start of match position, taking
5892 into account the match offset".
5893
5894 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5895 because that will try the rest of the pattern at all possible matching points,
5896 so there is no point trying again.... er ....
5897
5898 .... except when the .* appears inside capturing parentheses, and there is a
5899 subsequent back reference to those parentheses. We haven't enough information
5900 to catch that case precisely.
5901
5902 At first, the best we could do was to detect when .* was in capturing brackets
5903 and the highest back reference was greater than or equal to that level.
5904 However, by keeping a bitmap of the first 31 back references, we can catch some
5905 of the more common cases more precisely.
5906
5907 Arguments:
5908 code points to start of expression (the bracket)
5909 options points to the options setting
5910 bracket_map a bitmap of which brackets we are inside while testing; this
5911 handles up to substring 31; after that we just have to take
5912 the less precise approach
5913 backref_map the back reference bitmap
5914
5915 Returns: TRUE or FALSE
5916 */
5917
5918 static BOOL
5919 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5920 unsigned int backref_map)
5921 {
5922 do {
5923 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5924 options, PCRE_MULTILINE, FALSE);
5925 register int op = *scode;
5926
5927 /* Non-capturing brackets */
5928
5929 if (op == OP_BRA)
5930 {
5931 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5932 }
5933
5934 /* Capturing brackets */
5935
5936 else if (op == OP_CBRA)
5937 {
5938 int n = GET2(scode, 1+LINK_SIZE);
5939 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5940 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5941 }
5942
5943 /* Other brackets */
5944
5945 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5946 {
5947 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5948 }
5949
5950 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
5951 it isn't in brackets that are or may be referenced. */
5952
5953 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5954 op == OP_TYPEPOSSTAR))
5955 {
5956 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
5957 return FALSE;
5958 }
5959
5960 /* Check for explicit anchoring */
5961
5962 else if (op != OP_SOD && op != OP_SOM &&
5963 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5964 return FALSE;
5965 code += GET(code, 1);
5966 }
5967 while (*code == OP_ALT); /* Loop for each alternative */
5968 return TRUE;
5969 }
5970
5971
5972
5973 /*************************************************
5974 * Check for starting with ^ or .* *
5975 *************************************************/
5976
5977 /* This is called to find out if every branch starts with ^ or .* so that
5978 "first char" processing can be done to speed things up in multiline
5979 matching and for non-DOTALL patterns that start with .* (which must start at
5980 the beginning or after \n). As in the case of is_anchored() (see above), we
5981 have to take account of back references to capturing brackets that contain .*
5982 because in that case we can't make the assumption.
5983
5984 Arguments:
5985 code points to start of expression (the bracket)
5986 bracket_map a bitmap of which brackets we are inside while testing; this
5987 handles up to substring 31; after that we just have to take
5988 the less precise approach
5989 backref_map the back reference bitmap
5990
5991 Returns: TRUE or FALSE
5992 */
5993
5994 static BOOL
5995 is_startline(const uschar *code, unsigned int bracket_map,
5996 unsigned int backref_map)
5997 {
5998 do {
5999 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6000 NULL, 0, FALSE);
6001 register int op = *scode;
6002
6003 /* If we are at the start of a conditional assertion group, *both* the
6004 conditional assertion *and* what follows the condition must satisfy the test
6005 for start of line. Other kinds of condition fail. Note that there may be an
6006 auto-callout at the start of a condition. */
6007
6008 if (op == OP_COND)
6009 {
6010 scode += 1 + LINK_SIZE;
6011 if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6012 switch (*scode)
6013 {
6014 case OP_CREF:
6015 case OP_RREF:
6016 case OP_DEF:
6017 return FALSE;
6018
6019 default: /* Assertion */
6020 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6021 do scode += GET(scode, 1); while (*scode == OP_ALT);
6022 scode += 1 + LINK_SIZE;
6023 break;
6024 }
6025 scode = first_significant_code(scode, NULL, 0, FALSE);
6026 op = *scode;
6027 }
6028
6029 /* Non-capturing brackets */
6030
6031 if (op == OP_BRA)
6032 {
6033 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6034 }
6035
6036 /* Capturing brackets */
6037
6038 else if (op == OP_CBRA)
6039 {
6040 int n = GET2(scode, 1+LINK_SIZE);
6041 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6042 if (!is_startline(scode, new_map, backref_map)) return FALSE;
6043 }
6044
6045 /* Other brackets */
6046
6047 else if (op == OP_ASSERT || op == OP_ONCE)
6048 {
6049 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6050 }
6051
6052 /* .* means "start at start or after \n" if it isn't in brackets that
6053 may be referenced. */
6054
6055 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6056 {
6057 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6058 }
6059
6060 /* Check for explicit circumflex */
6061
6062 else if (op != OP_CIRC) return FALSE;
6063
6064 /* Move on to the next alternative */
6065
6066 code += GET(code, 1);
6067 }
6068 while (*code == OP_ALT); /* Loop for each alternative */
6069 return TRUE;
6070 }
6071
6072
6073
6074 /*************************************************
6075 * Check for asserted fixed first char *
6076 *************************************************/
6077
6078 /* During compilation, the "first char" settings from forward assertions are
6079 discarded, because they can cause conflicts with actual literals that follow.
6080 However, if we end up without a first char setting for an unanchored pattern,
6081 it is worth scanning the regex to see if there is an initial asserted first
6082 char. If all branches start with the same asserted char, or with a bracket all
6083 of whose alternatives start with the same asserted char (recurse ad lib), then
6084 we return that char, otherwise -1.
6085
6086 Arguments:
6087 code points to start of expression (the bracket)
6088 options pointer to the options (used to check casing changes)
6089 inassert TRUE if in an assertion
6090
6091 Returns: -1 or the fixed first char
6092 */
6093
6094 static int
6095 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6096 {
6097 register int c = -1;
6098 do {
6099 int d;
6100 const uschar *scode =
6101 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6102 register int op = *scode;
6103
6104 switch(op)
6105 {
6106 default:
6107 return -1;
6108
6109 case OP_BRA:
6110 case OP_CBRA:
6111 case OP_ASSERT:
6112 case OP_ONCE:
6113 case OP_COND:
6114 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6115 return -1;
6116 if (c < 0) c = d; else if (c != d) return -1;
6117 break;
6118
6119 case OP_EXACT: /* Fall through */
6120 scode += 2;
6121
6122 case OP_CHAR:
6123 case OP_CHARNC:
6124 case OP_PLUS:
6125 case OP_MINPLUS:
6126 case OP_POSPLUS:
6127 if (!inassert) return -1;
6128 if (c < 0)
6129 {
6130 c = scode[1];
6131 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6132 }
6133 else if (c != scode[1]) return -1;
6134 break;
6135 }
6136
6137 code += GET(code, 1);
6138 }
6139 while (*code == OP_ALT);
6140 return c;
6141 }
6142
6143
6144
6145 /*************************************************
6146 * Compile a Regular Expression *
6147 *************************************************/
6148
6149 /* This function takes a string and returns a pointer to a block of store
6150 holding a compiled version of the expression. The original API for this
6151 function had no error code return variable; it is retained for backwards
6152 compatibility. The new function is given a new name.
6153
6154 Arguments:
6155 pattern the regular expression
6156 options various option bits
6157 errorcodeptr pointer to error code variable (pcre_compile2() only)
6158 can be NULL if you don't want a code value
6159 errorptr pointer to pointer to error text
6160 erroroffset ptr offset in pattern where error was detected
6161 tables pointer to character tables or NULL
6162
6163 Returns: pointer to compiled data block, or NULL on error,
6164 with errorptr and erroroffset set
6165 */
6166
6167 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6168 pcre_compile(const char *pattern, int options, const char **errorptr,
6169 int *erroroffset, const unsigned char *tables)
6170 {
6171 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6172 }
6173
6174
6175 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6176 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6177 const char **errorptr, int *erroroffset, const unsigned char *tables)
6178 {
6179 real_pcre *re;
6180 int length = 1; /* For final END opcode */
6181 int firstbyte, reqbyte, newline;
6182 int errorcode = 0;
6183 int skipatstart = 0;
6184 #ifdef SUPPORT_UTF8
6185 BOOL utf8;
6186 #endif
6187 size_t size;
6188 uschar *code;
6189 const uschar *codestart;
6190 const uschar *ptr;
6191 compile_data compile_block;
6192 compile_data *cd = &compile_block;
6193
6194 /* This space is used for "compiling" into during the first phase, when we are
6195 computing the amount of memory that is needed. Compiled items are thrown away
6196 as soon as possible, so that a fairly large buffer should be sufficient for
6197 this purpose. The same space is used in the second phase for remembering where
6198 to fill in forward references to subpatterns. */
6199
6200 uschar cworkspace[COMPILE_WORK_SIZE];
6201
6202 /* Set this early so that early errors get offset 0. */
6203
6204 ptr = (const uschar *)pattern;
6205
6206 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6207 can do is just return NULL, but we can set a code value if there is a code
6208 pointer. */
6209
6210 if (errorptr == NULL)
6211 {
6212 if (errorcodeptr != NULL) *errorcodeptr = 99;
6213 return NULL;
6214 }
6215
6216 *errorptr = NULL;
6217 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6218
6219 /* However, we can give a message for this error */
6220
6221 if (erroroffset == NULL)
6222 {
6223 errorcode = ERR16;
6224 goto PCRE_EARLY_ERROR_RETURN2;
6225 }
6226
6227 *erroroffset = 0;
6228
6229 /* Set up pointers to the individual character tables */
6230
6231 if (tables == NULL) tables = _pcre_default_tables;
6232 cd->lcc = tables + lcc_offset;
6233 cd->fcc = tables + fcc_offset;
6234 cd->cbits = tables + cbits_offset;
6235 cd->ctypes = tables + ctypes_offset;
6236
6237 /* Check that all undefined public option bits are zero */
6238
6239 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6240 {
6241 errorcode = ERR17;
6242 goto PCRE_EARLY_ERROR_RETURN;
6243 }
6244
6245 /* Check for global one-time settings at the start of the pattern, and remember
6246 the offset for later. */
6247
6248 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6249 ptr[skipatstart+1] == CHAR_ASTERISK)
6250 {
6251 int newnl = 0;
6252 int newbsr = 0;
6253
6254 if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6255 { skipatstart += 7; options |= PCRE_UTF8; continue; }
6256
6257 if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6258 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6259 else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0)
6260 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6261 else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0)
6262 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6263 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6264 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6265 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6266 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6267
6268 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6269 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6270 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6271 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6272
6273 if (newnl != 0)
6274 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6275 else if (newbsr != 0)
6276 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6277 else break;
6278 }
6279
6280 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6281
6282 #ifdef SUPPORT_UTF8
6283 utf8 = (options & PCRE_UTF8) != 0;
6284 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6285 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6286 {
6287 errorcode = ERR44;
6288 goto PCRE_EARLY_ERROR_RETURN2;
6289 }
6290 #else
6291 if ((options & PCRE_UTF8) != 0)
6292 {
6293 errorcode = ERR32;
6294 goto PCRE_EARLY_ERROR_RETURN;
6295 }
6296 #endif
6297
6298 /* Check validity of \R options. */
6299
6300 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6301 {
6302 case 0:
6303 case PCRE_BSR_ANYCRLF:
6304 case PCRE_BSR_UNICODE:
6305 break;
6306 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6307 }
6308
6309 /* Handle different types of newline. The three bits give seven cases. The
6310 current code allows for fixed one- or two-byte sequences, plus "any" and
6311 "anycrlf". */
6312
6313 switch (options & PCRE_NEWLINE_BITS)
6314 {
6315 case 0: newline = NEWLINE; break; /* Build-time default */
6316 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6317 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6318 case PCRE_NEWLINE_CR+
6319 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6320 case PCRE_NEWLINE_ANY: newline = -1; break;
6321 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6322 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6323 }
6324
6325 if (newline == -2)
6326 {
6327 cd->nltype = NLTYPE_ANYCRLF;
6328 }
6329 else if (newline < 0)
6330 {
6331 cd->nltype = NLTYPE_ANY;
6332 }
6333 else
6334 {
6335 cd->nltype = NLTYPE_FIXED;
6336 if (newline > 255)
6337 {
6338 cd->nllen = 2;
6339 cd->nl[0] = (newline >> 8) & 255;
6340 cd->nl[1] = newline & 255;
6341 }
6342 else
6343 {
6344 cd->nllen = 1;
6345 cd->nl[0] = newline;
6346 }
6347 }
6348
6349 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6350 references to help in deciding whether (.*) can be treated as anchored or not.
6351 */
6352
6353 cd->top_backref = 0;
6354 cd->backref_map = 0;
6355
6356 /* Reflect pattern for debugging output */
6357
6358 DPRINTF(("------------------------------------------------------------------\n"));
6359 DPRINTF(("%s\n", pattern));
6360
6361 /* Pretend to compile the pattern while actually just accumulating the length
6362 of memory required. This behaviour is triggered by passing a non-NULL final
6363 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6364 to compile parts of the pattern into; the compiled code is discarded when it is
6365 no longer needed, so hopefully this workspace will never overflow, though there
6366 is a test for its doing so. */
6367
6368 cd->bracount = cd->final_bracount = 0;
6369 cd->names_found = 0;
6370 cd->name_entry_size = 0;
6371 cd->name_table = NULL;
6372 cd->start_workspace = cworkspace;
6373 cd->start_code = cworkspace;
6374 cd->hwm = cworkspace;
6375 cd->start_pattern = (const uschar *)pattern;
6376 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6377 cd->req_varyopt = 0;
6378 cd->external_options = options;
6379 cd->external_flags = 0;
6380
6381 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6382 don't need to look at the result of the function here. The initial options have
6383 been put into the cd block so that they can be changed if an option setting is
6384 found within the regex right at the beginning. Bringing initial option settings
6385 outside can help speed up starting point checks. */
6386
6387 ptr += skipatstart;
6388 code = cworkspace;
6389 *code = OP_BRA;
6390 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6391 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6392 &length);
6393 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6394
6395 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6396 cd->hwm - cworkspace));
6397
6398 if (length > MAX_PATTERN_SIZE)
6399 {
6400 errorcode = ERR20;
6401 goto PCRE_EARLY_ERROR_RETURN;
6402 }
6403
6404 /* Compute the size of data block needed and get it, either from malloc or
6405 externally provided function. Integer overflow should no longer be possible
6406 because nowadays we limit the maximum value of cd->names_found and
6407 cd->name_entry_size. */
6408
6409 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6410 re = (real_pcre *)(pcre_malloc)(size);
6411
6412 if (re == NULL)
6413 {
6414 errorcode = ERR21;
6415 goto PCRE_EARLY_ERROR_RETURN;
6416 }
6417
6418 /* Put in the magic number, and save the sizes, initial options, internal
6419 flags, and character table pointer. NULL is used for the default character
6420 tables. The nullpad field is at the end; it's there to help in the case when a
6421 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6422 pointers. */
6423
6424 re->magic_number = MAGIC_NUMBER;
6425 re->size = size;
6426 re->options = cd->external_options;
6427 re->flags = cd->external_flags;
6428 re->dummy1 = 0;
6429 re->first_byte = 0;
6430 re->req_byte = 0;
6431 re->name_table_offset = sizeof(real_pcre);
6432 re->name_entry_size = cd->name_entry_size;
6433 re->name_count = cd->names_found;
6434 re->ref_count = 0;
6435 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6436 re->nullpad = NULL;
6437
6438 /* The starting points of the name/number translation table and of the code are
6439 passed around in the compile data block. The start/end pattern and initial
6440 options are already set from the pre-compile phase, as is the name_entry_size
6441 field. Reset the bracket count and the names_found field. Also reset the hwm
6442 field; this time it's used for remembering forward references to subpatterns.
6443 */
6444
6445 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6446 cd->bracount = 0;
6447 cd->names_found = 0;
6448 cd->name_table = (uschar *)re + re->name_table_offset;
6449 codestart = cd->name_table + re->name_entry_size * re->name_count;
6450 cd->start_code = codestart;
6451 cd->hwm = cworkspace;
6452 cd->req_varyopt = 0;
6453 cd->had_accept = FALSE;
6454
6455 /* Set up a starting, non-extracting bracket, then compile the expression. On
6456 error, errorcode will be set non-zero, so we don't need to look at the result
6457 of the function here. */
6458
6459 ptr = (const uschar *)pattern + skipatstart;
6460 code = (uschar *)codestart;
6461 *code = OP_BRA;
6462 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6463 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6464 re->top_bracket = cd->bracount;
6465 re->top_backref = cd->top_backref;
6466 re->flags = cd->external_flags;
6467
6468 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6469
6470 /* If not reached end of pattern on success, there's an excess bracket. */
6471
6472 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6473
6474 /* Fill in the terminating state and check for disastrous overflow, but
6475 if debugging, leave the test till after things are printed out. */
6476
6477 *code++ = OP_END;
6478
6479 #ifndef DEBUG
6480 if (code - codestart > length) errorcode = ERR23;
6481 #endif
6482
6483 /* Fill in any forward references that are required. */
6484
6485 while (errorcode == 0 && cd->hwm > cworkspace)
6486 {
6487 int offset, recno;
6488 const uschar *groupptr;
6489 cd->hwm -= LINK_SIZE;
6490 offset = GET(cd->hwm, 0);
6491 recno = GET(codestart, offset);
6492 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6493 if (groupptr == NULL) errorcode = ERR53;
6494 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6495 }
6496
6497 /* Give an error if there's back reference to a non-existent capturing
6498 subpattern. */
6499
6500 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6501
6502 /* Failed to compile, or error while post-processing */
6503
6504 if (errorcode != 0)
6505 {
6506 (pcre_free)(re);
6507 PCRE_EARLY_ERROR_RETURN:
6508 *erroroffset = ptr - (const uschar *)pattern;
6509 PCRE_EARLY_ERROR_RETURN2:
6510 *errorptr = find_error_text(errorcode);
6511 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6512 return NULL;
6513 }
6514
6515 /* If the anchored option was not passed, set the flag if we can determine that
6516 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6517 as starting with .* when DOTALL is set).
6518
6519 Otherwise, if we know what the first byte has to be, save it, because that
6520 speeds up unanchored matches no end. If not, see if we can set the
6521 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6522 start with ^. and also when all branches start with .* for non-DOTALL matches.
6523 */
6524
6525 if ((re->options & PCRE_ANCHORED) == 0)
6526 {
6527 int temp_options = re->options; /* May get changed during these scans */
6528 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6529 re->options |= PCRE_ANCHORED;
6530 else
6531 {
6532 if (firstbyte < 0)
6533 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6534 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6535 {
6536 int ch = firstbyte & 255;
6537 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6538 cd->fcc[ch] == ch)? ch : firstbyte;
6539 re->flags |= PCRE_FIRSTSET;
6540 }
6541 else if (is_startline(codestart, 0, cd->backref_map))
6542 re->flags |= PCRE_STARTLINE;
6543 }
6544 }
6545
6546 /* For an anchored pattern, we use the "required byte" only if it follows a
6547 variable length item in the regex. Remove the caseless flag for non-caseable
6548 bytes. */
6549
6550 if (reqbyte >= 0 &&
6551 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6552 {
6553 int ch = reqbyte & 255;
6554 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6555 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6556 re->flags |= PCRE_REQCHSET;
6557 }
6558
6559 /* Print out the compiled data if debugging is enabled. This is never the
6560 case when building a production library. */
6561
6562 #ifdef DEBUG
6563
6564 printf("Length = %d top_bracket = %d top_backref = %d\n",
6565 length, re->top_bracket, re->top_backref);
6566
6567 printf("Options=%08x\n", re->options);
6568
6569 if ((re->flags & PCRE_FIRSTSET) != 0)
6570 {
6571 int ch = re->first_byte & 255;
6572 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6573 "" : " (caseless)";
6574 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6575 else printf("First char = \\x%02x%s\n", ch, caseless);
6576 }
6577
6578 if ((re->flags & PCRE_REQCHSET) != 0)
6579 {
6580 int ch = re->req_byte & 255;
6581 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6582 "" : " (caseless)";
6583 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6584 else printf("Req char = \\x%02x%s\n", ch, caseless);
6585 }
6586
6587 pcre_printint(re, stdout, TRUE);
6588
6589 /* This check is done here in the debugging case so that the code that
6590 was compiled can be seen. */
6591
6592 if (code - codestart > length)
6593 {
6594 (pcre_free)(re);
6595 *errorptr = find_error_text(ERR23);
6596 *erroroffset = ptr - (uschar *)pattern;
6597 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6598 return NULL;
6599 }
6600 #endif /* DEBUG */
6601
6602 return (pcre *)re;
6603 }
6604
6605 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12