/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 206 - (show annotations) (download)
Fri Aug 3 14:53:04 2007 UTC (6 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 185600 byte(s)
Fix loop for null-matching condition nested in an outer unlimited repeat.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
64 /* Macro for setting individual bits in class bitmaps. */
65
/* Set bit number b in the byte-array bitmap a. Both arguments and the whole
expansion are parenthesized so that expression arguments such as (c + 1) and
surrounding operators cannot change the meaning. Each argument is evaluated
more than once, so do not pass expressions with side effects. */
#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
/* ASCII variant. check_escape() indexes this table with (c - '0') for
characters from '0' to 'z'; a non-zero entry is used directly as the result,
while a zero entry sends the character on for further processing. */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
/* EBCDIC variant. check_escape() indexes this table with (c - 0x48); the
leading comment on each row gives the hexadecimal code of its first entry. */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Tables of names of POSIX character classes and their lengths. The list is
144 terminated by a zero length entry. The first three must be alpha, lower, upper,
145 as this is assumed for handling case independence. */
146
/* POSIX class names. The order matters: entry i here pairs with entry i of
posix_name_lengths and with the i-th triple of posix_class_maps. */
147 static const char *const posix_names[] = {
148 "alpha", "lower", "upper",
149 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
150 "print", "punct", "space", "word", "xdigit" };
151
/* Length of each name in posix_names, in the same order (fourteen entries),
followed by a zero that terminates the list. */
152 static const uschar posix_name_lengths[] = {
153 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
154
155 /* Table of class bit maps for each POSIX class. Each class is formed from a
156 base map, with an optional addition or removal of another map. Then, for some
157 classes, there is some additional tweaking: for [:blank:] the vertical space
158 characters are removed, and for [:alpha:] and [:alnum:] the underscore
159 character is removed. The triples in the table consist of the base map offset,
160 second map offset or -1 if no second map, and a non-negative value for map
161 addition or a negative value for map subtraction (if there are two maps). The
162 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
163 remove vertical space characters, 2 => remove underscore. */
164
/* Three ints per class, in posix_names order: base map offset, second map
offset (or -1), and the signed add/subtract-plus-tweak code described above. */
165 static const int posix_class_maps[] = {
166 cbit_word, cbit_digit, -2, /* alpha */
167 cbit_lower, -1, 0, /* lower */
168 cbit_upper, -1, 0, /* upper */
169 cbit_word, -1, 2, /* alnum - word without underscore */
170 cbit_print, cbit_cntrl, 0, /* ascii */
171 cbit_space, -1, 1, /* blank - a GNU extension */
172 cbit_cntrl, -1, 0, /* cntrl */
173 cbit_digit, -1, 0, /* digit */
174 cbit_graph, -1, 0, /* graph */
175 cbit_print, -1, 0, /* print */
176 cbit_punct, -1, 0, /* punct */
177 cbit_space, -1, 0, /* space */
178 cbit_word, -1, 0, /* word - a Perl extension */
179 cbit_xdigit,-1, 0 /* xdigit */
180 };
181
182
/* STRING turns its argument into a string literal exactly as written. XSTRING
adds one level of expansion so that a macro argument is replaced by its value
before being stringified; error_texts uses it to embed MAX_NAME_SIZE and
MAX_NAME_COUNT in message text. */
183 #define STRING(a) # a
184 #define XSTRING(s) STRING(s)
185
186 /* The texts of compile-time error messages. These are "char *" because they
187 are passed to the outside world. Do not ever re-use any error number, because
188 they are documented. Always add a new error instead. Messages marked DEAD below
189 are no longer used. */
190
191 static const char *error_texts[] = {
192 "no error",
193 "\\ at end of pattern",
194 "\\c at end of pattern",
195 "unrecognized character follows \\",
196 "numbers out of order in {} quantifier",
197 /* 5 */
198 "number too big in {} quantifier",
199 "missing terminating ] for character class",
200 "invalid escape sequence in character class",
201 "range out of order in character class",
202 "nothing to repeat",
203 /* 10 */
204 "operand of unlimited repeat could match the empty string", /** DEAD **/
205 "internal error: unexpected repeat",
206 "unrecognized character after (?",
207 "POSIX named classes are supported only within a class",
208 "missing )",
209 /* 15 */
210 "reference to non-existent subpattern",
211 "erroffset passed as NULL",
212 "unknown option bit(s) set",
213 "missing ) after comment",
214 "parentheses nested too deeply", /** DEAD **/
215 /* 20 */
216 "regular expression is too large",
217 "failed to get memory",
218 "unmatched parentheses",
219 "internal error: code overflow",
220 "unrecognized character after (?<",
221 /* 25 */
222 "lookbehind assertion is not fixed length",
223 "malformed number or name after (?(",
224 "conditional group contains more than two branches",
225 "assertion expected after (?(",
226 "(?R or (?[+-]digits must be followed by )",
227 /* 30 */
228 "unknown POSIX class name",
229 "POSIX collating elements are not supported",
230 "this version of PCRE is not compiled with PCRE_UTF8 support",
231 "spare error", /** DEAD **/
232 "character value in \\x{...} sequence is too large",
233 /* 35 */
234 "invalid condition (?(0)",
235 "\\C not allowed in lookbehind assertion",
236 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
237 "number after (?C is > 255",
238 "closing ) for (?C expected",
239 /* 40 */
240 "recursive call could loop indefinitely",
241 "unrecognized character after (?P",
242 "syntax error in subpattern name (missing terminator)",
243 "two named subpatterns have the same name",
244 "invalid UTF-8 string",
245 /* 45 */
246 "support for \\P, \\p, and \\X has not been compiled",
247 "malformed \\P or \\p sequence",
248 "unknown property name after \\P or \\p",
249 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
250 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
251 /* 50 */
252 "repeated subpattern is too long", /** DEAD **/
253 "octal value is greater than \\377 (not in UTF-8 mode)",
254 "internal error: overran compiling workspace",
255 "internal error: previously-checked referenced subpattern not found",
256 "DEFINE group contains more than one branch",
257 /* 55 */
258 "repeating a DEFINE group is not allowed",
259 "inconsistent NEWLINE options",
260 "\\g is not followed by a braced name or an optionally braced non-zero number",
261 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
262 };
263
264
265 /* Table to identify digits and hex digits. This is used when compiling
266 patterns. Note that the tables in chartables are dependent on the locale, and
267 may mark arbitrary characters as digits - but the PCRE compiling code expects
268 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
269 a private table here. It costs 256 bytes, but it is a lot faster than doing
270 character value tests (at least in some simple cases I timed), and in some
271 applications one wants PCRE to compile efficiently as well as match
272 efficiently.
273
274 For convenience, we use the same bit definitions as in chartables:
275
276 0x04 decimal digit
277 0x08 hexadecimal digit
278
279 Then we can use ctype_digit and ctype_xdigit in the code. */
280
281 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* ASCII variant, one entry per byte value. The code in this file tests
entries against ctype_digit and ctype_xdigit: '0'-'9' carry both bits (0x0c),
'a'-'f' and 'A'-'F' carry only the hex-digit bit (0x08). */
282 static const unsigned char digitab[] =
283 {
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
290 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
291 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
292 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
296 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
302 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
315 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
316
317 #else /* This is the "abnormal" case, for EBCDIC systems */
/* EBCDIC variant, one entry per byte value, using the same bit meanings as
the ASCII table: 0x0c marks a decimal digit (which is also a hex digit) and
0x08 marks a hex-only digit. The trailing column in the row comments is the
hexadecimal code of the first entry of every second row. */
318 static const unsigned char digitab[] =
319 {
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
321 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
336 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
344 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
350 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
351 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
352
/* EBCDIC-only character-type table, one entry per byte value. In this file
check_escape() masks an entry with 0x0E to decide whether the character after
a backslash is alphameric; the meaning of the individual bits is not defined
in this part of the file (NOTE(review): presumably the ctype_xxx bits used by
the main character tables - confirm against pcre_internal.h). */
353 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
354 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
355 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
356 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
358 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
360 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
361 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
362 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
363 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
365 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
367 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
370 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
371 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
372 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
373 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
374 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
375 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
376 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
377 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
378 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
379 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
380 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
381 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
382 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
383 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
384 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
385 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
386 #endif
387
388
389 /* Definition to allow mutual recursion */
390
391 static BOOL
392 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
393 int *, int *, branch_chain *, compile_data *, int *);
394
395
396
397 /*************************************************
398 * Handle escapes *
399 *************************************************/
400
401 /* This function is called when a \ has been encountered. It either returns a
402 positive value for a simple escape such as \n, or a negative value which
403 encodes one of the more complicated things such as \d. A backreference to group
404 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
405 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
406 ptr is pointing at the \. On exit, it is on the final character of the escape
407 sequence.
408
409 Arguments:
410 ptrptr points to the pattern position pointer
411 errorcodeptr points to the errorcode variable
412 bracount number of previous extracting brackets
413 options the options bits
414 isclass TRUE if inside a character class
415
416 Returns: zero or positive => a data character
417 negative => a special escape sequence
418 on error, *errorcodeptr is set
419 */
420
421 static int
422 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
423 int options, BOOL isclass)
424 {
425 BOOL utf8 = (options & PCRE_UTF8) != 0;
426 const uschar *ptr = *ptrptr + 1;
427 int c, i;
428
429 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
430 ptr--; /* Set pointer back to the last byte */
431
432 /* If backslash is at the end of the pattern, it's an error. */
433
434 if (c == 0) *errorcodeptr = ERR1;
435
436 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
437 a table. A non-zero result is something that can be returned immediately.
438 Otherwise further processing may be required. */
439
440 #ifndef EBCDIC /* ASCII coding */
441 else if (c < '0' || c > 'z') {} /* Not alphameric */
442 else if ((i = escapes[c - '0']) != 0) c = i;
443
444 #else /* EBCDIC coding */
445 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
446 else if ((i = escapes[c - 0x48]) != 0) c = i;
447 #endif
448
449 /* Escapes that need further processing, or are illegal. */
450
451 else
452 {
453 const uschar *oldptr;
454 BOOL braced, negated;
455
456 switch (c)
457 {
458 /* A number of Perl escapes are not handled by PCRE. We give an explicit
459 error. */
460
461 case 'l':
462 case 'L':
463 case 'N':
464 case 'u':
465 case 'U':
466 *errorcodeptr = ERR37;
467 break;
468
469 /* \g must be followed by a number, either plain or braced. If positive, it
470 is an absolute backreference. If negative, it is a relative backreference.
471 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
472 reference to a named group. This is part of Perl's movement towards a
473 unified syntax for back references. As this is synonymous with \k{name}, we
474 fudge it up by pretending it really was \k. */
475
476 case 'g':
477 if (ptr[1] == '{')
478 {
479 const uschar *p;
480 for (p = ptr+2; *p != 0 && *p != '}'; p++)
481 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
482 if (*p != 0 && *p != '}')
483 {
484 c = -ESC_k;
485 break;
486 }
487 braced = TRUE;
488 ptr++;
489 }
490 else braced = FALSE;
491
492 if (ptr[1] == '-')
493 {
494 negated = TRUE;
495 ptr++;
496 }
497 else negated = FALSE;
498
499 c = 0;
500 while ((digitab[ptr[1]] & ctype_digit) != 0)
501 c = c * 10 + *(++ptr) - '0';
502
503 if (c == 0 || (braced && *(++ptr) != '}'))
504 {
505 *errorcodeptr = ERR57;
506 return 0;
507 }
508
509 if (negated)
510 {
511 if (c > bracount)
512 {
513 *errorcodeptr = ERR15;
514 return 0;
515 }
516 c = bracount - (c - 1);
517 }
518
519 c = -(ESC_REF + c);
520 break;
521
522 /* The handling of escape sequences consisting of a string of digits
523 starting with one that is not zero is not straightforward. By experiment,
524 the way Perl works seems to be as follows:
525
526 Outside a character class, the digits are read as a decimal number. If the
527 number is less than 10, or if there are that many previous extracting
528 left brackets, then it is a back reference. Otherwise, up to three octal
529 digits are read to form an escaped byte. Thus \123 is likely to be octal
530 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
531 value is greater than 377, the least significant 8 bits are taken. Inside a
532 character class, \ followed by a digit is always an octal number. */
533
534 case '1': case '2': case '3': case '4': case '5':
535 case '6': case '7': case '8': case '9':
536
537 if (!isclass)
538 {
539 oldptr = ptr;
540 c -= '0';
541 while ((digitab[ptr[1]] & ctype_digit) != 0)
542 c = c * 10 + *(++ptr) - '0';
543 if (c < 10 || c <= bracount)
544 {
545 c = -(ESC_REF + c);
546 break;
547 }
548 ptr = oldptr; /* Put the pointer back and fall through */
549 }
550
551 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
552 generates a binary zero byte and treats the digit as a following literal.
553 Thus we have to pull back the pointer by one. */
554
555 if ((c = *ptr) >= '8')
556 {
557 ptr--;
558 c = 0;
559 break;
560 }
561
562 /* \0 always starts an octal number, but we may drop through to here with a
563 larger first octal digit. The original code used just to take the least
564 significant 8 bits of octal numbers (I think this is what early Perls used
565 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
566 than 3 octal digits. */
567
568 case '0':
569 c -= '0';
570 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
571 c = c * 8 + *(++ptr) - '0';
572 if (!utf8 && c > 255) *errorcodeptr = ERR51;
573 break;
574
575 /* \x is complicated. \x{ddd} is a character number which can be greater
576 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
577 treated as a data character. */
578
579 case 'x':
580 if (ptr[1] == '{')
581 {
582 const uschar *pt = ptr + 2;
583 int count = 0;
584
585 c = 0;
586 while ((digitab[*pt] & ctype_xdigit) != 0)
587 {
588 register int cc = *pt++;
589 if (c == 0 && cc == '0') continue; /* Leading zeroes */
590 count++;
591
592 #ifndef EBCDIC /* ASCII coding */
593 if (cc >= 'a') cc -= 32; /* Convert to upper case */
594 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
595 #else /* EBCDIC coding */
596 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
597 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
598 #endif
599 }
600
601 if (*pt == '}')
602 {
603 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
604 ptr = pt;
605 break;
606 }
607
608 /* If the sequence of hex digits does not end with '}', then we don't
609 recognize this construct; fall through to the normal \x handling. */
610 }
611
612 /* Read just a single-byte hex-defined char */
613
614 c = 0;
615 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
616 {
617 int cc; /* Some compilers don't like ++ */
618 cc = *(++ptr); /* in initializers */
619 #ifndef EBCDIC /* ASCII coding */
620 if (cc >= 'a') cc -= 32; /* Convert to upper case */
621 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
622 #else /* EBCDIC coding */
623 if (cc <= 'z') cc += 64; /* Convert to upper case */
624 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
625 #endif
626 }
627 break;
628
629 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
630 This coding is ASCII-specific, but then the whole concept of \cx is
631 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
632
633 case 'c':
634 c = *(++ptr);
635 if (c == 0)
636 {
637 *errorcodeptr = ERR2;
638 return 0;
639 }
640
641 #ifndef EBCDIC /* ASCII coding */
642 if (c >= 'a' && c <= 'z') c -= 32;
643 c ^= 0x40;
644 #else /* EBCDIC coding */
645 if (c >= 'a' && c <= 'z') c += 64;
646 c ^= 0xC0;
647 #endif
648 break;
649
650 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
651 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
652 for Perl compatibility, it is a literal. This code looks a bit odd, but
653 there used to be some cases other than the default, and there may be again
654 in future, so I haven't "optimized" it. */
655
656 default:
657 if ((options & PCRE_EXTRA) != 0) switch(c)
658 {
659 default:
660 *errorcodeptr = ERR3;
661 break;
662 }
663 break;
664 }
665 }
666
667 *ptrptr = ptr;
668 return c;
669 }
670
671
672
673 #ifdef SUPPORT_UCP
674 /*************************************************
675 * Handle \P and \p *
676 *************************************************/
677
678 /* This function is called after \P or \p has been encountered, provided that
679 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
680 pointing at the P or p. On exit, it is pointing at the final character of the
681 escape sequence.
682
683 Argument:
684 ptrptr points to the pattern position pointer
685 negptr points to a boolean that is set TRUE for negation else FALSE
686 dptr points to an int that is set to the detailed property value
687 errorcodeptr points to the error code variable
688
689 Returns: type value from the _pcre_utt table, or -1 for an invalid type (with *errorcodeptr set)
690 */
691
692 static int
693 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
694 {
695 int c, i, bot, top;
696 const uschar *ptr = *ptrptr;
697 char name[32];
698
699 c = *(++ptr);
700 if (c == 0) goto ERROR_RETURN;
701
702 *negptr = FALSE;
703
704 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
705 negation. */
706
707 if (c == '{')
708 {
709 if (ptr[1] == '^')
710 {
711 *negptr = TRUE;
712 ptr++;
713 }
714 for (i = 0; i < (int)sizeof(name) - 1; i++)
715 {
716 c = *(++ptr);
717 if (c == 0) goto ERROR_RETURN;
718 if (c == '}') break;
719 name[i] = c;
720 }
721 if (c !='}') goto ERROR_RETURN;
722 name[i] = 0;
723 }
724
725 /* Otherwise there is just one following character */
726
727 else
728 {
729 name[0] = c;
730 name[1] = 0;
731 }
732
733 *ptrptr = ptr;
734
735 /* Search for a recognized property name using binary chop */
736
737 bot = 0;
738 top = _pcre_utt_size;
739
740 while (bot < top)
741 {
742 i = (bot + top) >> 1;
743 c = strcmp(name, _pcre_utt[i].name);
744 if (c == 0)
745 {
746 *dptr = _pcre_utt[i].value;
747 return _pcre_utt[i].type;
748 }
749 if (c > 0) bot = i + 1; else top = i;
750 }
751
752 *errorcodeptr = ERR47;
753 *ptrptr = ptr;
754 return -1;
755
756 ERROR_RETURN:
757 *errorcodeptr = ERR46;
758 *ptrptr = ptr;
759 return -1;
760 }
761 #endif
762
763
764
765
766 /*************************************************
767 * Check for counted repeat *
768 *************************************************/
769
770 /* This function is called when a '{' is encountered in a place where it might
771 start a quantifier. It looks ahead to see if it really is a quantifier or not.
772 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
773 where the ddds are digits.
774
775 Arguments:
776 p pointer to the first char after '{'
777
778 Returns: TRUE or FALSE
779 */
780
781 static BOOL
782 is_counted_repeat(const uschar *p)
783 {
784 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
785 while ((digitab[*p] & ctype_digit) != 0) p++;
786 if (*p == '}') return TRUE;
787
788 if (*p++ != ',') return FALSE;
789 if (*p == '}') return TRUE;
790
791 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
792 while ((digitab[*p] & ctype_digit) != 0) p++;
793
794 return (*p == '}');
795 }
796
797
798
799 /*************************************************
800 * Read repeat counts *
801 *************************************************/
802
803 /* Read an item of the form {n,m} and return the values. This is called only
804 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
805 so the syntax is guaranteed to be correct, but we need to check the values.
806
807 Arguments:
808 p pointer to first char after '{'
809 minp pointer to int for min
810 maxp pointer to int for max
811 returned as -1 if no max
812 errorcodeptr points to error code variable
813
814 Returns: pointer to '}' on success;
815 current ptr on error, with errorcodeptr set non-zero
816 */
817
818 static const uschar *
819 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
820 {
821 int min = 0;
822 int max = -1;
823
824 /* Read the minimum value and do a paranoid check: a negative value indicates
825 an integer overflow. */
826
827 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
828 if (min < 0 || min > 65535)
829 {
830 *errorcodeptr = ERR5;
831 return p;
832 }
833
834 /* Read the maximum value if there is one, and again do a paranoid on its size.
835 Also, max must not be less than min. */
836
837 if (*p == '}') max = min; else
838 {
839 if (*(++p) != '}')
840 {
841 max = 0;
842 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
843 if (max < 0 || max > 65535)
844 {
845 *errorcodeptr = ERR5;
846 return p;
847 }
848 if (max < min)
849 {
850 *errorcodeptr = ERR4;
851 return p;
852 }
853 }
854 }
855
856 /* Fill in the required variables, and pass back the pointer to the terminating
857 '}'. */
858
859 *minp = min;
860 *maxp = max;
861 return p;
862 }
863
864
865
866 /*************************************************
867 * Find forward referenced subpattern *
868 *************************************************/
869
870 /* This function scans along a pattern's text looking for capturing
871 subpatterns, and counting them. If it finds a named pattern that matches the
872 name it is given, it returns its number. Alternatively, if the name is NULL, it
873 returns when it reaches a given numbered subpattern. This is used for forward
874 references to subpatterns. We know that if (?P< is encountered, the name will
875 be terminated by '>' because that is checked in the first pass.
876
877 Arguments:
878 ptr current position in the pattern
879 count current count of capturing parens so far encountered
880 name name to seek, or NULL if seeking a numbered subpattern
881 lorn name length, or subpattern number if name is NULL
882 xmode TRUE if we are in /x mode
883
884 Returns: the number of the named subpattern, or -1 if not found
885 */
886
887 static int
888 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
889 BOOL xmode)
890 {
891 const uschar *thisname;
892
893 for (; *ptr != 0; ptr++)
894 {
895 int term;
896
897 /* Skip over backslashed characters and also entire \Q...\E */
898
899 if (*ptr == '\\')
900 {
901 if (*(++ptr) == 0) return -1;
902 if (*ptr == 'Q') for (;;)
903 {
904 while (*(++ptr) != 0 && *ptr != '\\');
905 if (*ptr == 0) return -1;
906 if (*(++ptr) == 'E') break;
907 }
908 continue;
909 }
910
911 /* Skip over character classes */
912
913 if (*ptr == '[')
914 {
915 while (*(++ptr) != ']')
916 {
917 if (*ptr == '\\')
918 {
919 if (*(++ptr) == 0) return -1;
920 if (*ptr == 'Q') for (;;)
921 {
922 while (*(++ptr) != 0 && *ptr != '\\');
923 if (*ptr == 0) return -1;
924 if (*(++ptr) == 'E') break;
925 }
926 continue;
927 }
928 }
929 continue;
930 }
931
932 /* Skip comments in /x mode */
933
934 if (xmode && *ptr == '#')
935 {
936 while (*(++ptr) != 0 && *ptr != '\n');
937 if (*ptr == 0) return -1;
938 continue;
939 }
940
941 /* An opening parens must now be a real metacharacter */
942
943 if (*ptr != '(') continue;
944 if (ptr[1] != '?')
945 {
946 count++;
947 if (name == NULL && count == lorn) return count;
948 continue;
949 }
950
951 ptr += 2;
952 if (*ptr == 'P') ptr++; /* Allow optional P */
953
954 /* We have to disambiguate (?<! and (?<= from (?<name> */
955
956 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
957 *ptr != '\'')
958 continue;
959
960 count++;
961
962 if (name == NULL && count == lorn) return count;
963 term = *ptr++;
964 if (term == '<') term = '>';
965 thisname = ptr;
966 while (*ptr != term) ptr++;
967 if (name != NULL && lorn == ptr - thisname &&
968 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
969 return count;
970 }
971
972 return -1;
973 }
974
975
976
977 /*************************************************
978 * Find first significant op code *
979 *************************************************/
980
981 /* This is called by several functions that scan a compiled expression looking
982 for a fixed first character, or an anchoring op code etc. It skips over things
983 that do not influence this. For some calls, a change of option is important.
984 For some calls, it makes sense to skip negative forward and all backward
985 assertions, and also the \b assertion; for others it does not.
986
987 Arguments:
988 code pointer to the start of the group
989 options pointer to external options
990 optbit the option bit whose changing is significant, or
991 zero if none are
992 skipassert TRUE if certain assertions are to be skipped
993
994 Returns: pointer to the first significant opcode
995 */
996
997 static const uschar*
998 first_significant_code(const uschar *code, int *options, int optbit,
999 BOOL skipassert)
1000 {
1001 for (;;)
1002 {
1003 switch ((int)*code)
1004 {
1005 case OP_OPT:
1006 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1007 *options = (int)code[1];
1008 code += 2;
1009 break;
1010
1011 case OP_ASSERT_NOT:
1012 case OP_ASSERTBACK:
1013 case OP_ASSERTBACK_NOT:
1014 if (!skipassert) return code;
1015 do code += GET(code, 1); while (*code == OP_ALT);
1016 code += _pcre_OP_lengths[*code];
1017 break;
1018
1019 case OP_WORD_BOUNDARY:
1020 case OP_NOT_WORD_BOUNDARY:
1021 if (!skipassert) return code;
1022 /* Fall through */
1023
1024 case OP_CALLOUT:
1025 case OP_CREF:
1026 case OP_RREF:
1027 case OP_DEF:
1028 code += _pcre_OP_lengths[*code];
1029 break;
1030
1031 default:
1032 return code;
1033 }
1034 }
1035 /* Control never reaches here */
1036 }
1037
1038
1039
1040
1041 /*************************************************
1042 * Find the fixed length of a pattern *
1043 *************************************************/
1044
1045 /* Scan a pattern and compute the fixed length of subject that will match it,
1046 if the length is fixed. This is needed for dealing with backward assertions.
1047 In UTF8 mode, the result is in characters rather than bytes.
1048
1049 Arguments:
1050 code points to the start of the pattern (the bracket)
1051 options the compiling options
1052
1053 Returns: the fixed length, or -1 if there is no fixed length,
1054 or -2 if \C was encountered
1055 */
1056
1057 static int
1058 find_fixedlength(uschar *code, int options)
1059 {
1060 int length = -1;
1061
1062 register int branchlength = 0;
1063 register uschar *cc = code + 1 + LINK_SIZE;
1064
1065 /* Scan along the opcodes for this branch. If we get to the end of the
1066 branch, check the length against that of the other branches. */
1067
1068 for (;;)
1069 {
1070 int d;
1071 register int op = *cc;
1072
1073 switch (op)
1074 {
1075 case OP_CBRA:
1076 case OP_BRA:
1077 case OP_ONCE:
1078 case OP_COND:
1079 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1080 if (d < 0) return d;
1081 branchlength += d;
1082 do cc += GET(cc, 1); while (*cc == OP_ALT);
1083 cc += 1 + LINK_SIZE;
1084 break;
1085
1086 /* Reached end of a branch; if it's a ket it is the end of a nested
1087 call. If it's ALT it is an alternation in a nested call. If it is
1088 END it's the end of the outer call. All can be handled by the same code. */
1089
1090 case OP_ALT:
1091 case OP_KET:
1092 case OP_KETRMAX:
1093 case OP_KETRMIN:
1094 case OP_END:
1095 if (length < 0) length = branchlength;
1096 else if (length != branchlength) return -1;
1097 if (*cc != OP_ALT) return length;
1098 cc += 1 + LINK_SIZE;
1099 branchlength = 0;
1100 break;
1101
1102 /* Skip over assertive subpatterns */
1103
1104 case OP_ASSERT:
1105 case OP_ASSERT_NOT:
1106 case OP_ASSERTBACK:
1107 case OP_ASSERTBACK_NOT:
1108 do cc += GET(cc, 1); while (*cc == OP_ALT);
1109 /* Fall through */
1110
1111 /* Skip over things that don't match chars */
1112
1113 case OP_REVERSE:
1114 case OP_CREF:
1115 case OP_RREF:
1116 case OP_DEF:
1117 case OP_OPT:
1118 case OP_CALLOUT:
1119 case OP_SOD:
1120 case OP_SOM:
1121 case OP_EOD:
1122 case OP_EODN:
1123 case OP_CIRC:
1124 case OP_DOLL:
1125 case OP_NOT_WORD_BOUNDARY:
1126 case OP_WORD_BOUNDARY:
1127 cc += _pcre_OP_lengths[*cc];
1128 break;
1129
1130 /* Handle literal characters */
1131
1132 case OP_CHAR:
1133 case OP_CHARNC:
1134 case OP_NOT:
1135 branchlength++;
1136 cc += 2;
1137 #ifdef SUPPORT_UTF8
1138 if ((options & PCRE_UTF8) != 0)
1139 {
1140 while ((*cc & 0xc0) == 0x80) cc++;
1141 }
1142 #endif
1143 break;
1144
1145 /* Handle exact repetitions. The count is already in characters, but we
1146 need to skip over a multibyte character in UTF8 mode. */
1147
1148 case OP_EXACT:
1149 branchlength += GET2(cc,1);
1150 cc += 4;
1151 #ifdef SUPPORT_UTF8
1152 if ((options & PCRE_UTF8) != 0)
1153 {
1154 while((*cc & 0x80) == 0x80) cc++;
1155 }
1156 #endif
1157 break;
1158
1159 case OP_TYPEEXACT:
1160 branchlength += GET2(cc,1);
1161 cc += 4;
1162 break;
1163
1164 /* Handle single-char matchers */
1165
1166 case OP_PROP:
1167 case OP_NOTPROP:
1168 cc += 2;
1169 /* Fall through */
1170
1171 case OP_NOT_DIGIT:
1172 case OP_DIGIT:
1173 case OP_NOT_WHITESPACE:
1174 case OP_WHITESPACE:
1175 case OP_NOT_WORDCHAR:
1176 case OP_WORDCHAR:
1177 case OP_ANY:
1178 branchlength++;
1179 cc++;
1180 break;
1181
1182 /* The single-byte matcher isn't allowed */
1183
1184 case OP_ANYBYTE:
1185 return -2;
1186
1187 /* Check a class for variable quantification */
1188
1189 #ifdef SUPPORT_UTF8
1190 case OP_XCLASS:
1191 cc += GET(cc, 1) - 33;
1192 /* Fall through */
1193 #endif
1194
1195 case OP_CLASS:
1196 case OP_NCLASS:
1197 cc += 33;
1198
1199 switch (*cc)
1200 {
1201 case OP_CRSTAR:
1202 case OP_CRMINSTAR:
1203 case OP_CRQUERY:
1204 case OP_CRMINQUERY:
1205 return -1;
1206
1207 case OP_CRRANGE:
1208 case OP_CRMINRANGE:
1209 if (GET2(cc,1) != GET2(cc,3)) return -1;
1210 branchlength += GET2(cc,1);
1211 cc += 5;
1212 break;
1213
1214 default:
1215 branchlength++;
1216 }
1217 break;
1218
1219 /* Anything else is variable length */
1220
1221 default:
1222 return -1;
1223 }
1224 }
1225 /* Control never gets here */
1226 }
1227
1228
1229
1230
1231 /*************************************************
1232 * Scan compiled regex for numbered bracket *
1233 *************************************************/
1234
1235 /* This little function scans through a compiled pattern until it finds a
1236 capturing bracket with the given number.
1237
1238 Arguments:
1239 code points to start of expression
1240 utf8 TRUE in UTF-8 mode
1241 number the required bracket number
1242
1243 Returns: pointer to the opcode for the bracket, or NULL if not found
1244 */
1245
1246 static const uschar *
1247 find_bracket(const uschar *code, BOOL utf8, int number)
1248 {
1249 for (;;)
1250 {
1251 register int c = *code;
1252 if (c == OP_END) return NULL;
1253
1254 /* XCLASS is used for classes that cannot be represented just by a bit
1255 map. This includes negated single high-valued characters. The length in
1256 the table is zero; the actual length is stored in the compiled code. */
1257
1258 if (c == OP_XCLASS) code += GET(code, 1);
1259
1260 /* Handle capturing bracket */
1261
1262 else if (c == OP_CBRA)
1263 {
1264 int n = GET2(code, 1+LINK_SIZE);
1265 if (n == number) return (uschar *)code;
1266 code += _pcre_OP_lengths[c];
1267 }
1268
1269 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1270 a multi-byte character. The length in the table is a minimum, so we have to
1271 arrange to skip the extra bytes. */
1272
1273 else
1274 {
1275 code += _pcre_OP_lengths[c];
1276 #ifdef SUPPORT_UTF8
1277 if (utf8) switch(c)
1278 {
1279 case OP_CHAR:
1280 case OP_CHARNC:
1281 case OP_EXACT:
1282 case OP_UPTO:
1283 case OP_MINUPTO:
1284 case OP_POSUPTO:
1285 case OP_STAR:
1286 case OP_MINSTAR:
1287 case OP_POSSTAR:
1288 case OP_PLUS:
1289 case OP_MINPLUS:
1290 case OP_POSPLUS:
1291 case OP_QUERY:
1292 case OP_MINQUERY:
1293 case OP_POSQUERY:
1294 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1295 break;
1296 }
1297 #endif
1298 }
1299 }
1300 }
1301
1302
1303
1304 /*************************************************
1305 * Scan compiled regex for recursion reference *
1306 *************************************************/
1307
1308 /* This little function scans through a compiled pattern until it finds an
1309 instance of OP_RECURSE.
1310
1311 Arguments:
1312 code points to start of expression
1313 utf8 TRUE in UTF-8 mode
1314
1315 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1316 */
1317
1318 static const uschar *
1319 find_recurse(const uschar *code, BOOL utf8)
1320 {
1321 for (;;)
1322 {
1323 register int c = *code;
1324 if (c == OP_END) return NULL;
1325 if (c == OP_RECURSE) return code;
1326
1327 /* XCLASS is used for classes that cannot be represented just by a bit
1328 map. This includes negated single high-valued characters. The length in
1329 the table is zero; the actual length is stored in the compiled code. */
1330
1331 if (c == OP_XCLASS) code += GET(code, 1);
1332
1333 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1334 that are followed by a character may be followed by a multi-byte character.
1335 The length in the table is a minimum, so we have to arrange to skip the extra
1336 bytes. */
1337
1338 else
1339 {
1340 code += _pcre_OP_lengths[c];
1341 #ifdef SUPPORT_UTF8
1342 if (utf8) switch(c)
1343 {
1344 case OP_CHAR:
1345 case OP_CHARNC:
1346 case OP_EXACT:
1347 case OP_UPTO:
1348 case OP_MINUPTO:
1349 case OP_POSUPTO:
1350 case OP_STAR:
1351 case OP_MINSTAR:
1352 case OP_POSSTAR:
1353 case OP_PLUS:
1354 case OP_MINPLUS:
1355 case OP_POSPLUS:
1356 case OP_QUERY:
1357 case OP_MINQUERY:
1358 case OP_POSQUERY:
1359 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1360 break;
1361 }
1362 #endif
1363 }
1364 }
1365 }
1366
1367
1368
1369 /*************************************************
1370 * Scan compiled branch for non-emptiness *
1371 *************************************************/
1372
1373 /* This function scans through a branch of a compiled pattern to see whether it
1374 can match the empty string or not. It is called from could_be_empty()
1375 below and from compile_branch() when checking for an unlimited repeat of a
1376 group that can match nothing. Note that first_significant_code() skips over
1377 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1378 struck an inner bracket whose current branch will already have been scanned.
1379
1380 Arguments:
1381 code points to start of search
1382 endcode points to where to stop
1383 utf8 TRUE if in UTF8 mode
1384
1385 Returns: TRUE if what is matched could be empty
1386 */
1387
1388 static BOOL
1389 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1390 {
1391 register int c;
1392 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1393 code < endcode;
1394 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1395 {
1396 const uschar *ccode;
1397
1398 c = *code;
1399
1400 /* Groups with zero repeats can of course be empty; skip them. */
1401
1402 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1403 {
1404 code += _pcre_OP_lengths[c];
1405 do code += GET(code, 1); while (*code == OP_ALT);
1406 c = *code;
1407 continue;
1408 }
1409
1410 /* For other groups, scan the branches. */
1411
1412 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1413 {
1414 BOOL empty_branch;
1415 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1416
1417 /* Scan a closed bracket */
1418
1419 empty_branch = FALSE;
1420 do
1421 {
1422 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1423 empty_branch = TRUE;
1424 code += GET(code, 1);
1425 }
1426 while (*code == OP_ALT);
1427 if (!empty_branch) return FALSE; /* All branches are non-empty */
1428 c = *code;
1429 continue;
1430 }
1431
1432 /* Handle the other opcodes */
1433
1434 switch (c)
1435 {
1436 /* Check for quantifiers after a class */
1437
1438 #ifdef SUPPORT_UTF8
1439 case OP_XCLASS:
1440 ccode = code + GET(code, 1);
1441 goto CHECK_CLASS_REPEAT;
1442 #endif
1443
1444 case OP_CLASS:
1445 case OP_NCLASS:
1446 ccode = code + 33;
1447
1448 #ifdef SUPPORT_UTF8
1449 CHECK_CLASS_REPEAT:
1450 #endif
1451
1452 switch (*ccode)
1453 {
1454 case OP_CRSTAR: /* These could be empty; continue */
1455 case OP_CRMINSTAR:
1456 case OP_CRQUERY:
1457 case OP_CRMINQUERY:
1458 break;
1459
1460 default: /* Non-repeat => class must match */
1461 case OP_CRPLUS: /* These repeats aren't empty */
1462 case OP_CRMINPLUS:
1463 return FALSE;
1464
1465 case OP_CRRANGE:
1466 case OP_CRMINRANGE:
1467 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1468 break;
1469 }
1470 break;
1471
1472 /* Opcodes that must match a character */
1473
1474 case OP_PROP:
1475 case OP_NOTPROP:
1476 case OP_EXTUNI:
1477 case OP_NOT_DIGIT:
1478 case OP_DIGIT:
1479 case OP_NOT_WHITESPACE:
1480 case OP_WHITESPACE:
1481 case OP_NOT_WORDCHAR:
1482 case OP_WORDCHAR:
1483 case OP_ANY:
1484 case OP_ANYBYTE:
1485 case OP_CHAR:
1486 case OP_CHARNC:
1487 case OP_NOT:
1488 case OP_PLUS:
1489 case OP_MINPLUS:
1490 case OP_POSPLUS:
1491 case OP_EXACT:
1492 case OP_NOTPLUS:
1493 case OP_NOTMINPLUS:
1494 case OP_NOTPOSPLUS:
1495 case OP_NOTEXACT:
1496 case OP_TYPEPLUS:
1497 case OP_TYPEMINPLUS:
1498 case OP_TYPEPOSPLUS:
1499 case OP_TYPEEXACT:
1500 return FALSE;
1501
1502 /* End of branch */
1503
1504 case OP_KET:
1505 case OP_KETRMAX:
1506 case OP_KETRMIN:
1507 case OP_ALT:
1508 return TRUE;
1509
1510 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1511 MINUPTO, and POSUPTO may be followed by a multibyte character */
1512
1513 #ifdef SUPPORT_UTF8
1514 case OP_STAR:
1515 case OP_MINSTAR:
1516 case OP_POSSTAR:
1517 case OP_QUERY:
1518 case OP_MINQUERY:
1519 case OP_POSQUERY:
1520 case OP_UPTO:
1521 case OP_MINUPTO:
1522 case OP_POSUPTO:
1523 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1524 break;
1525 #endif
1526 }
1527 }
1528
1529 return TRUE;
1530 }
1531
1532
1533
1534 /*************************************************
1535 * Scan compiled regex for non-emptiness *
1536 *************************************************/
1537
1538 /* This function is called to check for left recursive calls. We want to check
1539 the current branch of the current pattern to see if it could match the empty
1540 string. If it could, we must look outwards for branches at other levels,
1541 stopping when we pass beyond the bracket which is the subject of the recursion.
1542
1543 Arguments:
1544 code points to start of the recursion
1545 endcode points to where to stop (current RECURSE item)
1546 bcptr points to the chain of current (unclosed) branch starts
1547 utf8 TRUE if in UTF-8 mode
1548
1549 Returns: TRUE if what is matched could be empty
1550 */
1551
1552 static BOOL
1553 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1554 BOOL utf8)
1555 {
1556 while (bcptr != NULL && bcptr->current >= code)
1557 {
1558 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1559 bcptr = bcptr->outer;
1560 }
1561 return TRUE;
1562 }
1563
1564
1565
1566 /*************************************************
1567 * Check for POSIX class syntax *
1568 *************************************************/
1569
1570 /* This function is called when the sequence "[:" or "[." or "[=" is
1571 encountered in a character class. It checks whether this is followed by an
1572 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1573 ".]" or "=]".
1574
1575 Argument:
1576 ptr pointer to the initial [
1577 endptr where to return the end pointer
1578 cd pointer to compile data
1579
1580 Returns: TRUE or FALSE
1581 */
1582
1583 static BOOL
1584 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1585 {
1586 int terminator; /* Don't combine these lines; the Solaris cc */
1587 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1588 if (*(++ptr) == '^') ptr++;
1589 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1590 if (*ptr == terminator && ptr[1] == ']')
1591 {
1592 *endptr = ptr;
1593 return TRUE;
1594 }
1595 return FALSE;
1596 }
1597
1598
1599
1600
1601 /*************************************************
1602 * Check POSIX class name *
1603 *************************************************/
1604
1605 /* This function is called to check the name given in a POSIX-style class entry
1606 such as [:alnum:].
1607
1608 Arguments:
1609 ptr points to the first letter
1610 len the length of the name
1611
1612 Returns: a value representing the name, or -1 if unknown
1613 */
1614
1615 static int
1616 check_posix_name(const uschar *ptr, int len)
1617 {
1618 register int yield = 0;
1619 while (posix_name_lengths[yield] != 0)
1620 {
1621 if (len == posix_name_lengths[yield] &&
1622 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1623 yield++;
1624 }
1625 return -1;
1626 }
1627
1628
1629 /*************************************************
1630 * Adjust OP_RECURSE items in repeated group *
1631 *************************************************/
1632
1633 /* OP_RECURSE items contain an offset from the start of the regex to the group
1634 that is referenced. This means that groups can be replicated for fixed
1635 repetition simply by copying (because the recursion is allowed to refer to
1636 earlier groups that are outside the current group). However, when a group is
1637 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1638 it, after it has been compiled. This means that any OP_RECURSE items within it
1639 that refer to the group itself or any contained groups have to have their
1640 offsets adjusted. That one of the jobs of this function. Before it is called,
1641 the partially compiled regex must be temporarily terminated with OP_END.
1642
1643 This function has been extended with the possibility of forward references for
1644 recursions and subroutine calls. It must also check the list of such references
1645 for the group we are dealing with. If it finds that one of the recursions in
1646 the current group is on this list, it adjusts the offset in the list, not the
1647 value in the reference (which is a group number).
1648
1649 Arguments:
1650 group points to the start of the group
1651 adjust the amount by which the group is to be moved
1652 utf8 TRUE in UTF-8 mode
1653 cd contains pointers to tables etc.
1654 save_hwm the hwm forward reference pointer at the start of the group
1655
1656 Returns: nothing
1657 */
1658
1659 static void
1660 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1661 uschar *save_hwm)
1662 {
1663 uschar *ptr = group;
1664 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1665 {
1666 int offset;
1667 uschar *hc;
1668
1669 /* See if this recursion is on the forward reference list. If so, adjust the
1670 reference. */
1671
1672 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1673 {
1674 offset = GET(hc, 0);
1675 if (cd->start_code + offset == ptr + 1)
1676 {
1677 PUT(hc, 0, offset + adjust);
1678 break;
1679 }
1680 }
1681
1682 /* Otherwise, adjust the recursion offset if it's after the start of this
1683 group. */
1684
1685 if (hc >= cd->hwm)
1686 {
1687 offset = GET(ptr, 1);
1688 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1689 }
1690
1691 ptr += 1 + LINK_SIZE;
1692 }
1693 }
1694
1695
1696
1697 /*************************************************
1698 * Insert an automatic callout point *
1699 *************************************************/
1700
1701 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1702 callout points before each pattern item.
1703
1704 Arguments:
1705 code current code pointer
1706 ptr current pattern pointer
1707 cd pointers to tables etc
1708
1709 Returns: new code pointer
1710 */
1711
1712 static uschar *
1713 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1714 {
1715 *code++ = OP_CALLOUT;
1716 *code++ = 255;
1717 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1718 PUT(code, LINK_SIZE, 0); /* Default length */
1719 return code + 2*LINK_SIZE;
1720 }
1721
1722
1723
1724 /*************************************************
1725 * Complete a callout item *
1726 *************************************************/
1727
1728 /* A callout item contains the length of the next item in the pattern, which
1729 we can't fill in till after we have reached the relevant point. This is used
1730 for both automatic and manual callouts.
1731
1732 Arguments:
1733 previous_callout points to previous callout item
1734 ptr current pattern pointer
1735 cd pointers to tables etc
1736
1737 Returns: nothing
1738 */
1739
1740 static void
1741 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1742 {
1743 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1744 PUT(previous_callout, 2 + LINK_SIZE, length);
1745 }
1746
1747
1748
1749 #ifdef SUPPORT_UCP
1750 /*************************************************
1751 * Get othercase range *
1752 *************************************************/
1753
1754 /* This function is passed the start and end of a class range, in UTF-8 mode
1755 with UCP support. It searches up the characters, looking for internal ranges of
1756 characters in the "other" case. Each call returns the next one, updating the
1757 start address.
1758
1759 Arguments:
1760 cptr points to starting character value; updated
1761 d end value
1762 ocptr where to put start of othercase range
1763 odptr where to put end of othercase range
1764
1765 Yield: TRUE when range returned; FALSE when no more
1766 */
1767
1768 static BOOL
1769 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1770 unsigned int *odptr)
1771 {
1772 unsigned int c, othercase, next;
1773
1774 for (c = *cptr; c <= d; c++)
1775 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1776
1777 if (c > d) return FALSE;
1778
1779 *ocptr = othercase;
1780 next = othercase + 1;
1781
1782 for (++c; c <= d; c++)
1783 {
1784 if (_pcre_ucp_othercase(c) != next) break;
1785 next++;
1786 }
1787
1788 *odptr = next - 1;
1789 *cptr = c;
1790
1791 return TRUE;
1792 }
1793 #endif /* SUPPORT_UCP */
1794
1795
1796
1797 /*************************************************
1798 * Check if auto-possessifying is possible *
1799 *************************************************/
1800
1801 /* This function is called for unlimited repeats of certain items, to see
1802 whether the next thing could possibly match the repeated item. If not, it makes
1803 sense to automatically possessify the repeated item.
1804
1805 Arguments:
1806 op_code the repeated op code
1807 this data for this item, depends on the opcode
1808 utf8 TRUE in UTF-8 mode
1809 utf8_char used for utf8 character bytes, NULL if not relevant
1810 ptr next character in pattern
1811 options options bits
1812 cd contains pointers to tables etc.
1813
1814 Returns: TRUE if possessifying is wanted
1815 */
1816
1817 static BOOL
1818 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1819 const uschar *ptr, int options, compile_data *cd)
1820 {
1821 int next;
1822
1823 /* Skip whitespace and comments in extended mode */
1824
1825 if ((options & PCRE_EXTENDED) != 0)
1826 {
1827 for (;;)
1828 {
1829 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1830 if (*ptr == '#')
1831 {
1832 while (*(++ptr) != 0)
1833 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1834 }
1835 else break;
1836 }
1837 }
1838
1839 /* If the next item is one that we can handle, get its value. A non-negative
1840 value is a character, a negative value is an escape value. */
1841
1842 if (*ptr == '\\')
1843 {
1844 int temperrorcode = 0;
1845 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1846 if (temperrorcode != 0) return FALSE;
1847 ptr++; /* Point after the escape sequence */
1848 }
1849
1850 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1851 {
1852 #ifdef SUPPORT_UTF8
1853 if (utf8) { GETCHARINC(next, ptr); } else
1854 #endif
1855 next = *ptr++;
1856 }
1857
1858 else return FALSE;
1859
1860 /* Skip whitespace and comments in extended mode */
1861
1862 if ((options & PCRE_EXTENDED) != 0)
1863 {
1864 for (;;)
1865 {
1866 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1867 if (*ptr == '#')
1868 {
1869 while (*(++ptr) != 0)
1870 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1871 }
1872 else break;
1873 }
1874 }
1875
1876 /* If the next thing is itself optional, we have to give up. */
1877
1878 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1879 return FALSE;
1880
1881 /* Now compare the next item with the previous opcode. If the previous is a
1882 positive single character match, "item" either contains the character or, if
1883 "item" is greater than 127 in utf8 mode, the character's bytes are in
1884 utf8_char. */
1885
1886
1887 /* Handle cases when the next item is a character. */
1888
1889 if (next >= 0) switch(op_code)
1890 {
1891 case OP_CHAR:
1892 #ifdef SUPPORT_UTF8
1893 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894 #endif
1895 return item != next;
1896
1897 /* For CHARNC (caseless character) we must check the other case. If we have
1898 Unicode property support, we can use it to test the other case of
1899 high-valued characters. */
1900
1901 case OP_CHARNC:
1902 #ifdef SUPPORT_UTF8
1903 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1904 #endif
1905 if (item == next) return FALSE;
1906 #ifdef SUPPORT_UTF8
1907 if (utf8)
1908 {
1909 unsigned int othercase;
1910 if (next < 128) othercase = cd->fcc[next]; else
1911 #ifdef SUPPORT_UCP
1912 othercase = _pcre_ucp_othercase((unsigned int)next);
1913 #else
1914 othercase = NOTACHAR;
1915 #endif
1916 return (unsigned int)item != othercase;
1917 }
1918 else
1919 #endif /* SUPPORT_UTF8 */
1920 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1921
1922 /* For OP_NOT, "item" must be a single-byte character. */
1923
1924 case OP_NOT:
1925 if (next < 0) return FALSE; /* Not a character */
1926 if (item == next) return TRUE;
1927 if ((options & PCRE_CASELESS) == 0) return FALSE;
1928 #ifdef SUPPORT_UTF8
1929 if (utf8)
1930 {
1931 unsigned int othercase;
1932 if (next < 128) othercase = cd->fcc[next]; else
1933 #ifdef SUPPORT_UCP
1934 othercase = _pcre_ucp_othercase(next);
1935 #else
1936 othercase = NOTACHAR;
1937 #endif
1938 return (unsigned int)item == othercase;
1939 }
1940 else
1941 #endif /* SUPPORT_UTF8 */
1942 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1943
1944 case OP_DIGIT:
1945 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1946
1947 case OP_NOT_DIGIT:
1948 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1949
1950 case OP_WHITESPACE:
1951 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1952
1953 case OP_NOT_WHITESPACE:
1954 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1955
1956 case OP_WORDCHAR:
1957 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1958
1959 case OP_NOT_WORDCHAR:
1960 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1961
1962 case OP_HSPACE:
1963 case OP_NOT_HSPACE:
1964 switch(next)
1965 {
1966 case 0x09:
1967 case 0x20:
1968 case 0xa0:
1969 case 0x1680:
1970 case 0x180e:
1971 case 0x2000:
1972 case 0x2001:
1973 case 0x2002:
1974 case 0x2003:
1975 case 0x2004:
1976 case 0x2005:
1977 case 0x2006:
1978 case 0x2007:
1979 case 0x2008:
1980 case 0x2009:
1981 case 0x200A:
1982 case 0x202f:
1983 case 0x205f:
1984 case 0x3000:
1985 return op_code != OP_HSPACE;
1986 default:
1987 return op_code == OP_HSPACE;
1988 }
1989
1990 case OP_VSPACE:
1991 case OP_NOT_VSPACE:
1992 switch(next)
1993 {
1994 case 0x0a:
1995 case 0x0b:
1996 case 0x0c:
1997 case 0x0d:
1998 case 0x85:
1999 case 0x2028:
2000 case 0x2029:
2001 return op_code != OP_VSPACE;
2002 default:
2003 return op_code == OP_VSPACE;
2004 }
2005
2006 default:
2007 return FALSE;
2008 }
2009
2010
2011 /* Handle the case when the next item is \d, \s, etc. */
2012
2013 switch(op_code)
2014 {
2015 case OP_CHAR:
2016 case OP_CHARNC:
2017 #ifdef SUPPORT_UTF8
2018 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2019 #endif
2020 switch(-next)
2021 {
2022 case ESC_d:
2023 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2024
2025 case ESC_D:
2026 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2027
2028 case ESC_s:
2029 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2030
2031 case ESC_S:
2032 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2033
2034 case ESC_w:
2035 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2036
2037 case ESC_W:
2038 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2039
2040 case ESC_h:
2041 case ESC_H:
2042 switch(item)
2043 {
2044 case 0x09:
2045 case 0x20:
2046 case 0xa0:
2047 case 0x1680:
2048 case 0x180e:
2049 case 0x2000:
2050 case 0x2001:
2051 case 0x2002:
2052 case 0x2003:
2053 case 0x2004:
2054 case 0x2005:
2055 case 0x2006:
2056 case 0x2007:
2057 case 0x2008:
2058 case 0x2009:
2059 case 0x200A:
2060 case 0x202f:
2061 case 0x205f:
2062 case 0x3000:
2063 return -next != ESC_h;
2064 default:
2065 return -next == ESC_h;
2066 }
2067
2068 case ESC_v:
2069 case ESC_V:
2070 switch(item)
2071 {
2072 case 0x0a:
2073 case 0x0b:
2074 case 0x0c:
2075 case 0x0d:
2076 case 0x85:
2077 case 0x2028:
2078 case 0x2029:
2079 return -next != ESC_v;
2080 default:
2081 return -next == ESC_v;
2082 }
2083
2084 default:
2085 return FALSE;
2086 }
2087
2088 case OP_DIGIT:
2089 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2090 next == -ESC_h || next == -ESC_v;
2091
2092 case OP_NOT_DIGIT:
2093 return next == -ESC_d;
2094
2095 case OP_WHITESPACE:
2096 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2097
2098 case OP_NOT_WHITESPACE:
2099 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2100
2101 case OP_HSPACE:
2102 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2103
2104 case OP_NOT_HSPACE:
2105 return next == -ESC_h;
2106
2107 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2108 case OP_VSPACE:
2109 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2110
2111 case OP_NOT_VSPACE:
2112 return next == -ESC_v;
2113
2114 case OP_WORDCHAR:
2115 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2116
2117 case OP_NOT_WORDCHAR:
2118 return next == -ESC_w || next == -ESC_d;
2119
2120 default:
2121 return FALSE;
2122 }
2123
2124 /* Control does not reach here */
2125 }
2126
2127
2128
2129 /*************************************************
2130 * Compile one branch *
2131 *************************************************/
2132
2133 /* Scan the pattern, compiling it into a vector. If the options are
2134 changed during the branch, the pointer is used to change the external options
2135 bits. This function is used during the pre-compile phase when we are trying
2136 to find out the amount of memory needed, as well as during the real compile
2137 phase. The value of lengthptr distinguishes the two phases.
2138
2139 Arguments:
2140 optionsptr pointer to the option bits
2141 codeptr points to the pointer to the current code point
2142 ptrptr points to the current pattern pointer
2143 errorcodeptr points to error code variable
2144 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2145 reqbyteptr set to the last literal character required, else < 0
2146 bcptr points to current branch chain
2147 cd contains pointers to tables etc.
2148 lengthptr NULL during the real compile phase
2149 points to length accumulator during pre-compile phase
2150
2151 Returns: TRUE on success
2152 FALSE, with *errorcodeptr set non-zero on error
2153 */
2154
2155 static BOOL
2156 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2157 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2158 compile_data *cd, int *lengthptr)
2159 {
2160 int repeat_type, op_type;
2161 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2162 int bravalue = 0;
2163 int greedy_default, greedy_non_default;
2164 int firstbyte, reqbyte;
2165 int zeroreqbyte, zerofirstbyte;
2166 int req_caseopt, reqvary, tempreqvary;
2167 int options = *optionsptr;
2168 int after_manual_callout = 0;
2169 int length_prevgroup = 0;
2170 register int c;
2171 register uschar *code = *codeptr;
2172 uschar *last_code = code;
2173 uschar *orig_code = code;
2174 uschar *tempcode;
2175 BOOL inescq = FALSE;
2176 BOOL groupsetfirstbyte = FALSE;
2177 const uschar *ptr = *ptrptr;
2178 const uschar *tempptr;
2179 uschar *previous = NULL;
2180 uschar *previous_callout = NULL;
2181 uschar *save_hwm = NULL;
2182 uschar classbits[32];
2183
2184 #ifdef SUPPORT_UTF8
2185 BOOL class_utf8;
2186 BOOL utf8 = (options & PCRE_UTF8) != 0;
2187 uschar *class_utf8data;
2188 uschar utf8_char[6];
2189 #else
2190 BOOL utf8 = FALSE;
2191 uschar *utf8_char = NULL;
2192 #endif
2193
2194 #ifdef DEBUG
2195 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2196 #endif
2197
2198 /* Set up the default and non-default settings for greediness */
2199
2200 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2201 greedy_non_default = greedy_default ^ 1;
2202
2203 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2204 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2205 matches a non-fixed first char; reqbyte just remains unset if we never
2206 find one.
2207
2208 When we hit a repeat whose minimum is zero, we may have to adjust these values
2209 to take the zero repeat into account. This is implemented by setting them to
2210 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2211 item types that can be repeated set these backoff variables appropriately. */
2212
2213 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2214
2215 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2216 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2217 value > 255. It is added into the firstbyte or reqbyte variables to record the
2218 case status of the value. This is used only for ASCII characters. */
2219
2220 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2221
2222 /* Switch on next character until the end of the branch */
2223
2224 for (;; ptr++)
2225 {
2226 BOOL negate_class;
2227 BOOL possessive_quantifier;
2228 BOOL is_quantifier;
2229 BOOL is_recurse;
2230 BOOL reset_bracount;
2231 int class_charcount;
2232 int class_lastchar;
2233 int newoptions;
2234 int recno;
2235 int refsign;
2236 int skipbytes;
2237 int subreqbyte;
2238 int subfirstbyte;
2239 int terminator;
2240 int mclength;
2241 uschar mcbuffer[8];
2242
2243 /* Get next byte in the pattern */
2244
2245 c = *ptr;
2246
2247 /* If we are in the pre-compile phase, accumulate the length used for the
2248 previous cycle of this loop. */
2249
2250 if (lengthptr != NULL)
2251 {
2252 #ifdef DEBUG
2253 if (code > cd->hwm) cd->hwm = code; /* High water info */
2254 #endif
2255 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2256 {
2257 *errorcodeptr = ERR52;
2258 goto FAILED;
2259 }
2260
2261 /* There is at least one situation where code goes backwards: this is the
2262 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2263 the class is simply eliminated. However, it is created first, so we have to
2264 allow memory for it. Therefore, don't ever reduce the length at this point.
2265 */
2266
2267 if (code < last_code) code = last_code;
2268
2269 /* Paranoid check for integer overflow */
2270
2271 if (OFLOW_MAX - *lengthptr < code - last_code)
2272 {
2273 *errorcodeptr = ERR20;
2274 goto FAILED;
2275 }
2276
2277 *lengthptr += code - last_code;
2278 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2279
2280 /* If "previous" is set and it is not at the start of the work space, move
2281 it back to there, in order to avoid filling up the work space. Otherwise,
2282 if "previous" is NULL, reset the current code pointer to the start. */
2283
2284 if (previous != NULL)
2285 {
2286 if (previous > orig_code)
2287 {
2288 memmove(orig_code, previous, code - previous);
2289 code -= previous - orig_code;
2290 previous = orig_code;
2291 }
2292 }
2293 else code = orig_code;
2294
2295 /* Remember where this code item starts so we can pick up the length
2296 next time round. */
2297
2298 last_code = code;
2299 }
2300
2301 /* In the real compile phase, just check the workspace used by the forward
2302 reference list. */
2303
2304 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2305 {
2306 *errorcodeptr = ERR52;
2307 goto FAILED;
2308 }
2309
2310 /* If in \Q...\E, check for the end; if not, we have a literal */
2311
2312 if (inescq && c != 0)
2313 {
2314 if (c == '\\' && ptr[1] == 'E')
2315 {
2316 inescq = FALSE;
2317 ptr++;
2318 continue;
2319 }
2320 else
2321 {
2322 if (previous_callout != NULL)
2323 {
2324 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2325 complete_callout(previous_callout, ptr, cd);
2326 previous_callout = NULL;
2327 }
2328 if ((options & PCRE_AUTO_CALLOUT) != 0)
2329 {
2330 previous_callout = code;
2331 code = auto_callout(code, ptr, cd);
2332 }
2333 goto NORMAL_CHAR;
2334 }
2335 }
2336
2337 /* Fill in length of a previous callout, except when the next thing is
2338 a quantifier. */
2339
2340 is_quantifier = c == '*' || c == '+' || c == '?' ||
2341 (c == '{' && is_counted_repeat(ptr+1));
2342
2343 if (!is_quantifier && previous_callout != NULL &&
2344 after_manual_callout-- <= 0)
2345 {
2346 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2347 complete_callout(previous_callout, ptr, cd);
2348 previous_callout = NULL;
2349 }
2350
2351 /* In extended mode, skip white space and comments */
2352
2353 if ((options & PCRE_EXTENDED) != 0)
2354 {
2355 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2356 if (c == '#')
2357 {
2358 while (*(++ptr) != 0)
2359 {
2360 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2361 }
2362 if (*ptr != 0) continue;
2363
2364 /* Else fall through to handle end of string */
2365 c = 0;
2366 }
2367 }
2368
2369 /* No auto callout for quantifiers. */
2370
2371 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2372 {
2373 previous_callout = code;
2374 code = auto_callout(code, ptr, cd);
2375 }
2376
2377 switch(c)
2378 {
2379 /* ===================================================================*/
2380 case 0: /* The branch terminates at string end */
2381 case '|': /* or | or ) */
2382 case ')':
2383 *firstbyteptr = firstbyte;
2384 *reqbyteptr = reqbyte;
2385 *codeptr = code;
2386 *ptrptr = ptr;
2387 if (lengthptr != NULL)
2388 {
2389 if (OFLOW_MAX - *lengthptr < code - last_code)
2390 {
2391 *errorcodeptr = ERR20;
2392 goto FAILED;
2393 }
2394 *lengthptr += code - last_code; /* To include callout length */
2395 DPRINTF((">> end branch\n"));
2396 }
2397 return TRUE;
2398
2399
2400 /* ===================================================================*/
2401 /* Handle single-character metacharacters. In multiline mode, ^ disables
2402 the setting of any following char as a first character. */
2403
2404 case '^':
2405 if ((options & PCRE_MULTILINE) != 0)
2406 {
2407 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2408 }
2409 previous = NULL;
2410 *code++ = OP_CIRC;
2411 break;
2412
2413 case '$':
2414 previous = NULL;
2415 *code++ = OP_DOLL;
2416 break;
2417
2418 /* There can never be a first char if '.' is first, whatever happens about
2419 repeats. The value of reqbyte doesn't change either. */
2420
2421 case '.':
2422 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2423 zerofirstbyte = firstbyte;
2424 zeroreqbyte = reqbyte;
2425 previous = code;
2426 *code++ = OP_ANY;
2427 break;
2428
2429
2430 /* ===================================================================*/
2431 /* Character classes. If the included characters are all < 256, we build a
2432 32-byte bitmap of the permitted characters, except in the special case
2433 where there is only one such character. For negated classes, we build the
2434 map as usual, then invert it at the end. However, we use a different opcode
2435 so that data characters > 255 can be handled correctly.
2436
2437 If the class contains characters outside the 0-255 range, a different
2438 opcode is compiled. It may optionally have a bit map for characters < 256,
2439 but those above are explicitly listed afterwards. A flag byte tells
2440 whether the bitmap is present, and whether this is a negated class or not.
2441 */
2442
2443 case '[':
2444 previous = code;
2445
2446 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2447 they are encountered at the top level, so we'll do that too. */
2448
2449 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2450 check_posix_syntax(ptr, &tempptr, cd))
2451 {
2452 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2453 goto FAILED;
2454 }
2455
2456 /* If the first character is '^', set the negation flag and skip it. Also,
2457 if the first few characters (either before or after ^) are \Q\E or \E we
2458 skip them too. This makes for compatibility with Perl. */
2459
2460 negate_class = FALSE;
2461 for (;;)
2462 {
2463 c = *(++ptr);
2464 if (c == '\\')
2465 {
2466 if (ptr[1] == 'E') ptr++;
2467 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2468 else break;
2469 }
2470 else if (!negate_class && c == '^')
2471 negate_class = TRUE;
2472 else break;
2473 }
2474
2475 /* Keep a count of chars with values < 256 so that we can optimize the case
2476 of just a single character (as long as it's < 256). However, for higher
2477 valued UTF-8 characters, we don't yet do any optimization. */
2478
2479 class_charcount = 0;
2480 class_lastchar = -1;
2481
2482 /* Initialize the 32-char bit map to all zeros. We build the map in a
2483 temporary bit of memory, in case the class contains only 1 character (less
2484 than 256), because in that case the compiled code doesn't use the bit map.
2485 */
2486
2487 memset(classbits, 0, 32 * sizeof(uschar));
2488
2489 #ifdef SUPPORT_UTF8
2490 class_utf8 = FALSE; /* No chars >= 256 */
2491 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2492 #endif
2493
2494 /* Process characters until ] is reached. By writing this as a "do" it
2495 means that an initial ] is taken as a data character. At the start of the
2496 loop, c contains the first byte of the character. */
2497
2498 if (c != 0) do
2499 {
2500 const uschar *oldptr;
2501
2502 #ifdef SUPPORT_UTF8
2503 if (utf8 && c > 127)
2504 { /* Braces are required because the */
2505 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2506 }
2507 #endif
2508
2509 /* Inside \Q...\E everything is literal except \E */
2510
2511 if (inescq)
2512 {
2513 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2514 {
2515 inescq = FALSE; /* Reset literal state */
2516 ptr++; /* Skip the 'E' */
2517 continue; /* Carry on with next */
2518 }
2519 goto CHECK_RANGE; /* Could be range if \E follows */
2520 }
2521
2522 /* Handle POSIX class names. Perl allows a negation extension of the
2523 form [:^name:]. A square bracket that doesn't match the syntax is
2524 treated as a literal. We also recognize the POSIX constructions
2525 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2526 5.6 and 5.8 do. */
2527
2528 if (c == '[' &&
2529 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2530 check_posix_syntax(ptr, &tempptr, cd))
2531 {
2532 BOOL local_negate = FALSE;
2533 int posix_class, taboffset, tabopt;
2534 register const uschar *cbits = cd->cbits;
2535 uschar pbits[32];
2536
2537 if (ptr[1] != ':')
2538 {
2539 *errorcodeptr = ERR31;
2540 goto FAILED;
2541 }
2542
2543 ptr += 2;
2544 if (*ptr == '^')
2545 {
2546 local_negate = TRUE;
2547 ptr++;
2548 }
2549
2550 posix_class = check_posix_name(ptr, tempptr - ptr);
2551 if (posix_class < 0)
2552 {
2553 *errorcodeptr = ERR30;
2554 goto FAILED;
2555 }
2556
2557 /* If matching is caseless, upper and lower are converted to
2558 alpha. This relies on the fact that the class table starts with
2559 alpha, lower, upper as the first 3 entries. */
2560
2561 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2562 posix_class = 0;
2563
2564 /* We build the bit map for the POSIX class in a chunk of local store
2565 because we may be adding and subtracting from it, and we don't want to
2566 subtract bits that may be in the main map already. At the end we or the
2567 result into the bit map that is being built. */
2568
2569 posix_class *= 3;
2570
2571 /* Copy in the first table (always present) */
2572
2573 memcpy(pbits, cbits + posix_class_maps[posix_class],
2574 32 * sizeof(uschar));
2575
2576 /* If there is a second table, add or remove it as required. */
2577
2578 taboffset = posix_class_maps[posix_class + 1];
2579 tabopt = posix_class_maps[posix_class + 2];
2580
2581 if (taboffset >= 0)
2582 {
2583 if (tabopt >= 0)
2584 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2585 else
2586 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2587 }
2588
2589 /* Now see if we need to remove any special characters. An option
2590 value of 1 removes vertical space and 2 removes underscore. */
2591
2592 if (tabopt < 0) tabopt = -tabopt;
2593 if (tabopt == 1) pbits[1] &= ~0x3c;
2594 else if (tabopt == 2) pbits[11] &= 0x7f;
2595
2596 /* Add the POSIX table or its complement into the main table that is
2597 being built and we are done. */
2598
2599 if (local_negate)
2600 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2601 else
2602 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2603
2604 ptr = tempptr + 1;
2605 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2606 continue; /* End of POSIX syntax handling */
2607 }
2608
2609 /* Backslash may introduce a single character, or it may introduce one
2610 of the specials, which just set a flag. The sequence \b is a special
2611 case. Inside a class (and only there) it is treated as backspace.
2612 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2613 to 'or' into the one we are building. We assume they have more than one
2614 character in them, so set class_charcount bigger than one. */
2615
2616 if (c == '\\')
2617 {
2618 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2619 if (*errorcodeptr != 0) goto FAILED;
2620
2621 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2622 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2623 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2624 else if (-c == ESC_Q) /* Handle start of quoted string */
2625 {
2626 if (ptr[1] == '\\' && ptr[2] == 'E')
2627 {
2628 ptr += 2; /* avoid empty string */
2629 }
2630 else inescq = TRUE;
2631 continue;
2632 }
2633
2634 if (c < 0)
2635 {
2636 register const uschar *cbits = cd->cbits;
2637 class_charcount += 2; /* Greater than 1 is what matters */
2638
2639 /* Save time by not doing this in the pre-compile phase. */
2640
2641 if (lengthptr == NULL) switch (-c)
2642 {
2643 case ESC_d:
2644 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2645 continue;
2646
2647 case ESC_D:
2648 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2649 continue;
2650
2651 case ESC_w:
2652 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2653 continue;
2654
2655 case ESC_W:
2656 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2657 continue;
2658
2659 case ESC_s:
2660 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2661 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2662 continue;
2663
2664 case ESC_S:
2665 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2666 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2667 continue;
2668
2669 case ESC_E: /* Perl ignores an orphan \E */
2670 continue;
2671
2672 default: /* Not recognized; fall through */
2673 break; /* Need "default" setting to stop compiler warning. */
2674 }
2675
2676 /* In the pre-compile phase, just do the recognition. */
2677
2678 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2679 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2680
2681 /* We need to deal with \H, \h, \V, and \v in both phases because
2682 they use extra memory. */
2683
2684 if (-c == ESC_h)
2685 {
2686 SETBIT(classbits, 0x09); /* VT */
2687 SETBIT(classbits, 0x20); /* SPACE */
2688 SETBIT(classbits, 0xa0); /* NSBP */
2689 #ifdef SUPPORT_UTF8
2690 if (utf8)
2691 {
2692 class_utf8 = TRUE;
2693 *class_utf8data++ = XCL_SINGLE;
2694 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2695 *class_utf8data++ = XCL_SINGLE;
2696 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2697 *class_utf8data++ = XCL_RANGE;
2698 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2699 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2700 *class_utf8data++ = XCL_SINGLE;
2701 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2702 *class_utf8data++ = XCL_SINGLE;
2703 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2704 *class_utf8data++ = XCL_SINGLE;
2705 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2706 }
2707 #endif
2708 continue;
2709 }
2710
2711 if (-c == ESC_H)
2712 {
2713 for (c = 0; c < 32; c++)
2714 {
2715 int x = 0xff;
2716 switch (c)
2717 {
2718 case 0x09/8: x ^= 1 << (0x09%8); break;
2719 case 0x20/8: x ^= 1 << (0x20%8); break;
2720 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2721 default: break;
2722 }
2723 classbits[c] |= x;
2724 }
2725
2726 #ifdef SUPPORT_UTF8
2727 if (utf8)
2728 {
2729 class_utf8 = TRUE;
2730 *class_utf8data++ = XCL_RANGE;
2731 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2732 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2733 *class_utf8data++ = XCL_RANGE;
2734 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2735 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2736 *class_utf8data++ = XCL_RANGE;
2737 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2738 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2739 *class_utf8data++ = XCL_RANGE;
2740 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2741 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2742 *class_utf8data++ = XCL_RANGE;
2743 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2744 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2745 *class_utf8data++ = XCL_RANGE;
2746 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2747 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2748 *class_utf8data++ = XCL_RANGE;
2749 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2750 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2751 }
2752 #endif
2753 continue;
2754 }
2755
2756 if (-c == ESC_v)
2757 {
2758 SETBIT(classbits, 0x0a); /* LF */
2759 SETBIT(classbits, 0x0b); /* VT */
2760 SETBIT(classbits, 0x0c); /* FF */
2761 SETBIT(classbits, 0x0d); /* CR */
2762 SETBIT(classbits, 0x85); /* NEL */
2763 #ifdef SUPPORT_UTF8
2764 if (utf8)
2765 {
2766 class_utf8 = TRUE;
2767 *class_utf8data++ = XCL_RANGE;
2768 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2769 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2770 }
2771 #endif
2772 continue;
2773 }
2774
2775 if (-c == ESC_V)
2776 {
2777 for (c = 0; c < 32; c++)
2778 {
2779 int x = 0xff;
2780 switch (c)
2781 {
2782 case 0x0a/8: x ^= 1 << (0x0a%8);
2783 x ^= 1 << (0x0b%8);
2784 x ^= 1 << (0x0c%8);
2785 x ^= 1 << (0x0d%8);
2786 break;
2787 case 0x85/8: x ^= 1 << (0x85%8); break;
2788 default: break;
2789 }
2790 classbits[c] |= x;
2791 }
2792
2793 #ifdef SUPPORT_UTF8
2794 if (utf8)
2795 {
2796 class_utf8 = TRUE;
2797 *class_utf8data++ = XCL_RANGE;
2798 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2799 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2800 *class_utf8data++ = XCL_RANGE;
2801 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2802 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2803 }
2804 #endif
2805 continue;
2806 }
2807
2808 /* We need to deal with \P and \p in both phases. */
2809
2810 #ifdef SUPPORT_UCP
2811 if (-c == ESC_p || -c == ESC_P)
2812 {
2813 BOOL negated;
2814 int pdata;
2815 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2816 if (ptype < 0) goto FAILED;
2817 class_utf8 = TRUE;
2818 *class_utf8data++ = ((-c == ESC_p) != negated)?
2819 XCL_PROP : XCL_NOTPROP;
2820 *class_utf8data++ = ptype;
2821 *class_utf8data++ = pdata;
2822 class_charcount -= 2; /* Not a < 256 character */
2823 continue;
2824 }
2825 #endif
2826 /* Unrecognized escapes are faulted if PCRE is running in its
2827 strict mode. By default, for compatibility with Perl, they are
2828 treated as literals. */
2829
2830 if ((options & PCRE_EXTRA) != 0)
2831 {
2832 *errorcodeptr = ERR7;
2833 goto FAILED;
2834 }
2835
2836 class_charcount -= 2; /* Undo the default count from above */
2837 c = *ptr; /* Get the final character and fall through */
2838 }
2839
2840 /* Fall through if we have a single character (c >= 0). This may be
2841 greater than 256 in UTF-8 mode. */
2842
2843 } /* End of backslash handling */
2844
2845 /* A single character may be followed by '-' to form a range. However,
2846 Perl does not permit ']' to be the end of the range. A '-' character
2847 at the end is treated as a literal. Perl ignores orphaned \E sequences
2848 entirely. The code for handling \Q and \E is messy. */
2849
2850 CHECK_RANGE:
2851 while (ptr[1] == '\\' && ptr[2] == 'E')
2852 {
2853 inescq = FALSE;
2854 ptr += 2;
2855 }
2856
2857 oldptr = ptr;
2858
2859 if (!inescq && ptr[1] == '-')
2860 {
2861 int d;
2862 ptr += 2;
2863 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2864
2865 /* If we hit \Q (not followed by \E) at this point, go into escaped
2866 mode. */
2867
2868 while (*ptr == '\\' && ptr[1] == 'Q')
2869 {
2870 ptr += 2;
2871 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2872 inescq = TRUE;
2873 break;
2874 }
2875
2876 if (*ptr == 0 || (!inescq && *ptr == ']'))
2877 {
2878 ptr = oldptr;
2879 goto LONE_SINGLE_CHARACTER;
2880 }
2881
2882 #ifdef SUPPORT_UTF8
2883 if (utf8)
2884 { /* Braces are required because the */
2885 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2886 }
2887 else
2888 #endif
2889 d = *ptr; /* Not UTF-8 mode */
2890
2891 /* The second part of a range can be a single-character escape, but
2892 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2893 in such circumstances. */
2894
2895 if (!inescq && d == '\\')
2896 {
2897 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2898 if (*errorcodeptr != 0) goto FAILED;
2899
2900 /* \b is backspace; \X is literal X; \R is literal R; any other
2901 special means the '-' was literal */
2902
2903 if (d < 0)
2904 {
2905 if (d == -ESC_b) d = '\b';
2906 else if (d == -ESC_X) d = 'X';
2907 else if (d == -ESC_R) d = 'R'; else
2908 {
2909 ptr = oldptr;
2910 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2911 }
2912 }
2913 }
2914
2915 /* Check that the two values are in the correct order. Optimize
2916 one-character ranges */
2917
2918 if (d < c)
2919 {
2920 *errorcodeptr = ERR8;
2921 goto FAILED;
2922 }
2923
2924 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2925
2926 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2927 matching, we have to use an XCLASS with extra data items. Caseless
2928 matching for characters > 127 is available only if UCP support is
2929 available. */
2930
2931 #ifdef SUPPORT_UTF8
2932 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2933 {
2934 class_utf8 = TRUE;
2935
2936 /* With UCP support, we can find the other case equivalents of
2937 the relevant characters. There may be several ranges. Optimize how
2938 they fit with the basic range. */
2939
2940 #ifdef SUPPORT_UCP
2941 if ((options & PCRE_CASELESS) != 0)
2942 {
2943 unsigned int occ, ocd;
2944 unsigned int cc = c;
2945 unsigned int origd = d;
2946 while (get_othercase_range(&cc, origd, &occ, &ocd))
2947 {
2948 if (occ >= (unsigned int)c &&
2949 ocd <= (unsigned int)d)
2950 continue; /* Skip embedded ranges */
2951
2952 if (occ < (unsigned int)c &&
2953 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2954 { /* if there is overlap, */
2955 c = occ; /* noting that if occ < c */
2956 continue; /* we can't have ocd > d */
2957 } /* because a subrange is */
2958 if (ocd > (unsigned int)d &&
2959 occ <= (unsigned int)d + 1) /* always shorter than */
2960 { /* the basic range. */
2961 d = ocd;
2962 continue;
2963 }
2964
2965 if (occ == ocd)
2966 {
2967 *class_utf8data++ = XCL_SINGLE;
2968 }
2969 else
2970 {
2971 *class_utf8data++ = XCL_RANGE;
2972 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2973 }
2974 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2975 }
2976 }
2977 #endif /* SUPPORT_UCP */
2978
2979 /* Now record the original range, possibly modified for UCP caseless
2980 overlapping ranges. */
2981
2982 *class_utf8data++ = XCL_RANGE;
2983 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2984 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2985
2986 /* With UCP support, we are done. Without UCP support, there is no
2987 caseless matching for UTF-8 characters > 127; we can use the bit map
2988 for the smaller ones. */
2989
2990 #ifdef SUPPORT_UCP
2991 continue; /* With next character in the class */
2992 #else
2993 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2994
2995 /* Adjust upper limit and fall through to set up the map */
2996
2997 d = 127;
2998
2999 #endif /* SUPPORT_UCP */
3000 }
3001 #endif /* SUPPORT_UTF8 */
3002
3003 /* We use the bit map for all cases when not in UTF-8 mode; else
3004 ranges that lie entirely within 0-127 when there is UCP support; else
3005 for partial ranges without UCP support. */
3006
3007 class_charcount += d - c + 1;
3008 class_lastchar = d;
3009
3010 /* We can save a bit of time by skipping this in the pre-compile. */
3011
3012 if (lengthptr == NULL) for (; c <= d; c++)
3013 {
3014 classbits[c/8] |= (1 << (c&7));
3015 if ((options & PCRE_CASELESS) != 0)
3016 {
3017 int uc = cd->fcc[c]; /* flip case */
3018 classbits[uc/8] |= (1 << (uc&7));
3019 }
3020 }
3021
3022 continue; /* Go get the next char in the class */
3023 }
3024
3025 /* Handle a lone single character - we can get here for a normal
3026 non-escape char, or after \ that introduces a single character or for an
3027 apparent range that isn't. */
3028
3029 LONE_SINGLE_CHARACTER:
3030
3031 /* Handle a character that cannot go in the bit map */
3032
3033 #ifdef SUPPORT_UTF8
3034 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3035 {
3036 class_utf8 = TRUE;
3037 *class_utf8data++ = XCL_SINGLE;
3038 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3039
3040 #ifdef SUPPORT_UCP
3041 if ((options & PCRE_CASELESS) != 0)
3042 {
3043 unsigned int othercase;
3044 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3045 {
3046 *class_utf8data++ = XCL_SINGLE;
3047 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3048 }
3049 }
3050 #endif /* SUPPORT_UCP */
3051
3052 }
3053 else
3054 #endif /* SUPPORT_UTF8 */
3055
3056 /* Handle a single-byte character */
3057 {
3058 classbits[c/8] |= (1 << (c&7));
3059 if ((options & PCRE_CASELESS) != 0)
3060 {
3061 c = cd->fcc[c]; /* flip case */
3062 classbits[c/8] |= (1 << (c&7));
3063 }
3064 class_charcount++;
3065 class_lastchar = c;
3066 }
3067 }
3068
3069 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3070
3071 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3072
3073 if (c == 0) /* Missing terminating ']' */
3074 {
3075 *errorcodeptr = ERR6;
3076 goto FAILED;
3077 }
3078
3079 /* If class_charcount is 1, we saw precisely one character whose value is
3080 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3081 can optimize the negative case only if there were no characters >= 128
3082 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3083 single-bytes only. This is an historical hangover. Maybe one day we can
3084 tidy these opcodes to handle multi-byte characters.
3085
3086 The optimization throws away the bit map. We turn the item into a
3087 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3088 that OP_NOT does not support multibyte characters. In the positive case, it
3089 can cause firstbyte to be set. Otherwise, there can be no first char if
3090 this item is first, whatever repeat count may follow. In the case of
3091 reqbyte, save the previous value for reinstating. */
3092
3093 #ifdef SUPPORT_UTF8
3094 if (class_charcount == 1 &&
3095 (!utf8 ||
3096 (!class_utf8 && (!negate_class || class_lastchar < 128))))
3097
3098 #else
3099 if (class_charcount == 1)
3100 #endif
3101 {
3102 zeroreqbyte = reqbyte;
3103
3104 /* The OP_NOT opcode works on one-byte characters only. */
3105
3106 if (negate_class)
3107 {
3108 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3109 zerofirstbyte = firstbyte;
3110 *code++ = OP_NOT;
3111 *code++ = class_lastchar;
3112 break;
3113 }
3114
3115 /* For a single, positive character, get the value into mcbuffer, and
3116 then we can handle this with the normal one-character code. */
3117
3118 #ifdef SUPPORT_UTF8
3119 if (utf8 && class_lastchar > 127)
3120 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3121 else
3122 #endif
3123 {
3124 mcbuffer[0] = class_lastchar;
3125 mclength = 1;
3126 }
3127 goto ONE_CHAR;
3128 } /* End of 1-char optimization */
3129
3130 /* The general case - not the one-char optimization. If this is the first
3131 thing in the branch, there can be no first char setting, whatever the
3132 repeat count. Any reqbyte setting must remain unchanged after any kind of
3133 repeat. */
3134
3135 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3136 zerofirstbyte = firstbyte;
3137 zeroreqbyte = reqbyte;
3138
3139 /* If there are characters with values > 255, we have to compile an
3140 extended class, with its own opcode. If there are no characters < 256,
3141 we can omit the bitmap in the actual compiled code. */
3142
3143 #ifdef SUPPORT_UTF8
3144 if (class_utf8)
3145 {
3146 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3147 *code++ = OP_XCLASS;
3148 code += LINK_SIZE;
3149 *code = negate_class? XCL_NOT : 0;
3150
3151 /* If the map is required, move up the extra data to make room for it;
3152 otherwise just move the code pointer to the end of the extra data. */
3153
3154 if (class_charcount > 0)
3155 {
3156 *code++ |= XCL_MAP;
3157 memmove(code + 32, code, class_utf8data - code);
3158 memcpy(code, classbits, 32);
3159 code = class_utf8data + 32;
3160 }
3161 else code = class_utf8data;
3162
3163 /* Now fill in the complete length of the item */
3164
3165 PUT(previous, 1, code - previous);
3166 break; /* End of class handling */
3167 }
3168 #endif
3169
3170 /* If there are no characters > 255, negate the 32-byte map if necessary,
3171 and copy it into the code vector. If this is the first thing in the branch,
3172 there can be no first char setting, whatever the repeat count. Any reqbyte
3173 setting must remain unchanged after any kind of repeat. */
3174
3175 if (negate_class)
3176 {
3177 *code++ = OP_NCLASS;
3178 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3179 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3180 }
3181 else
3182 {
3183 *code++ = OP_CLASS;
3184 memcpy(code, classbits, 32);
3185 }
3186 code += 32;
3187 break;
3188
3189
3190 /* ===================================================================*/
3191 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3192 has been tested above. */
3193
3194 case '{':
3195 if (!is_quantifier) goto NORMAL_CHAR;
3196 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3197 if (*errorcodeptr != 0) goto FAILED;
3198 goto REPEAT;
3199
3200 case '*':
3201 repeat_min = 0;
3202 repeat_max = -1;
3203 goto REPEAT;
3204
3205 case '+':
3206 repeat_min = 1;
3207 repeat_max = -1;
3208 goto REPEAT;
3209
3210 case '?':
3211 repeat_min = 0;
3212 repeat_max = 1;
3213
3214 REPEAT:
3215 if (previous == NULL)
3216 {
3217 *errorcodeptr = ERR9;
3218 goto FAILED;
3219 }
3220
3221 if (repeat_min == 0)
3222 {
3223 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3224 reqbyte = zeroreqbyte; /* Ditto */
3225 }
3226
3227 /* Remember whether this is a variable length repeat */
3228
3229 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3230
3231 op_type = 0; /* Default single-char op codes */
3232 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3233
3234 /* Save start of previous item, in case we have to move it up to make space
3235 for an inserted OP_ONCE for the additional '+' extension. */
3236
3237 tempcode = previous;
3238
3239 /* If the next character is '+', we have a possessive quantifier. This
3240 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3241 If the next character is '?' this is a minimizing repeat, by default,
3242 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3243 repeat type to the non-default. */
3244
3245 if (ptr[1] == '+')
3246 {
3247 repeat_type = 0; /* Force greedy */
3248 possessive_quantifier = TRUE;
3249 ptr++;
3250 }
3251 else if (ptr[1] == '?')
3252 {
3253 repeat_type = greedy_non_default;
3254 ptr++;
3255 }
3256 else repeat_type = greedy_default;
3257
3258 /* If previous was a character match, abolish the item and generate a
3259 repeat item instead. If a char item has a minimum of more than one, ensure
3260 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3261 the first thing in a branch because the x will have gone into firstbyte
3262 instead. */
3263
3264 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3265 {
3266 /* Deal with UTF-8 characters that take up more than one byte. It's
3267 easier to write this out separately than try to macrify it. Use c to
3268 hold the length of the character in bytes, plus 0x80 to flag that it's a
3269 length rather than a small character. */
3270
3271 #ifdef SUPPORT_UTF8
3272 if (utf8 && (code[-1] & 0x80) != 0)
3273 {
3274 uschar *lastchar = code - 1;
3275 while((*lastchar & 0xc0) == 0x80) lastchar--;
3276 c = code - lastchar; /* Length of UTF-8 character */
3277 memcpy(utf8_char, lastchar, c); /* Save the char */
3278 c |= 0x80; /* Flag c as a length */
3279 }
3280 else
3281 #endif
3282
3283 /* Handle the case of a single byte - either with no UTF8 support, or
3284 with UTF-8 disabled, or for a UTF-8 character < 128. */
3285
3286 {
3287 c = code[-1];
3288 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3289 }
3290
3291 /* If the repetition is unlimited, it pays to see if the next thing on
3292 the line is something that cannot possibly match this character. If so,
3293 automatically possessifying this item gains some performance in the case
3294 where the match fails. */
3295
3296 if (!possessive_quantifier &&
3297 repeat_max < 0 &&
3298 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3299 options, cd))
3300 {
3301 repeat_type = 0; /* Force greedy */
3302 possessive_quantifier = TRUE;
3303 }
3304
3305 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3306 }
3307
3308 /* If previous was a single negated character ([^a] or similar), we use
3309 one of the special opcodes, replacing it. The code is shared with single-
3310 character repeats by setting op_type to add a suitable offset into
3311 repeat_type. We can also test for auto-possessification. OP_NOT is
3312 currently used only for single-byte chars. */
3313
3314 else if (*previous == OP_NOT)
3315 {
3316 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3317 c = previous[1];
3318 if (!possessive_quantifier &&
3319 repeat_max < 0 &&
3320 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3321 {
3322 repeat_type = 0; /* Force greedy */
3323 possessive_quantifier = TRUE;
3324 }
3325 goto OUTPUT_SINGLE_REPEAT;
3326 }
3327
3328 /* If previous was a character type match (\d or similar), abolish it and
3329 create a suitable repeat item. The code is shared with single-character
3330 repeats by setting op_type to add a suitable offset into repeat_type. Note
3331 that the Unicode property types will be present only when SUPPORT_UCP is
3332 defined, but we don't wrap the little bits of code here because it just
3333 makes it horribly messy. */
3334
3335 else if (*previous < OP_EODN)
3336 {
3337 uschar *oldcode;
3338 int prop_type, prop_value;
3339 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3340 c = *previous;
3341
3342 if (!possessive_quantifier &&
3343 repeat_max < 0 &&
3344 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3345 {
3346 repeat_type = 0; /* Force greedy */
3347 possessive_quantifier = TRUE;
3348 }
3349
3350 OUTPUT_SINGLE_REPEAT:
3351 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3352 {
3353 prop_type = previous[1];
3354 prop_value = previous[2];
3355 }
3356 else prop_type = prop_value = -1;
3357
3358 oldcode = code;
3359 code = previous; /* Usually overwrite previous item */
3360
3361 /* If the maximum is zero then the minimum must also be zero; Perl allows
3362 this case, so we do too - by simply omitting the item altogether. */
3363
3364 if (repeat_max == 0) goto END_REPEAT;
3365
3366 /* All real repeats make it impossible to handle partial matching (maybe
3367 one day we will be able to remove this restriction). */
3368
3369 if (repeat_max != 1) cd->nopartial = TRUE;
3370
3371 /* Combine the op_type with the repeat_type */
3372
3373 repeat_type += op_type;
3374
3375 /* A minimum of zero is handled either as the special case * or ?, or as
3376 an UPTO, with the maximum given. */
3377
3378 if (repeat_min == 0)
3379 {
3380 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3381 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3382 else
3383 {
3384 *code++ = OP_UPTO + repeat_type;
3385 PUT2INC(code, 0, repeat_max);
3386 }
3387 }
3388
3389 /* A repeat minimum of 1 is optimized into some special cases. If the
3390 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3391 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3392 one less than the maximum. */
3393
3394 else if (repeat_min == 1)
3395 {
3396 if (repeat_max == -1)
3397 *code++ = OP_PLUS + repeat_type;
3398 else
3399 {
3400 code = oldcode; /* leave previous item in place */
3401 if (repeat_max == 1) goto END_REPEAT;
3402 *code++ = OP_UPTO + repeat_type;
3403 PUT2INC(code, 0, repeat_max - 1);
3404 }
3405 }
3406
3407 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3408 handled as an EXACT followed by an UPTO. */
3409
3410 else
3411 {
3412 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3413 PUT2INC(code, 0, repeat_min);
3414
3415 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3416 we have to insert the character for the previous code. For a repeated
3417 Unicode property match, there are two extra bytes that define the
3418 required property. In UTF-8 mode, long characters have their length in
3419 c, with the 0x80 bit as a flag. */
3420
3421 if (repeat_max < 0)
3422 {
3423 #ifdef SUPPORT_UTF8
3424 if (utf8 && c >= 128)
3425 {
3426 memcpy(code, utf8_char, c & 7);
3427 code += c & 7;
3428 }
3429 else
3430 #endif
3431 {
3432 *code++ = c;
3433 if (prop_type >= 0)
3434 {
3435 *code++ = prop_type;
3436 *code++ = prop_value;
3437 }
3438 }
3439 *code++ = OP_STAR + repeat_type;
3440 }
3441
3442 /* Else insert an UPTO if the max is greater than the min, again
3443 preceded by the character, for the previously inserted code. If the
3444 UPTO is just for 1 instance, we can use QUERY instead. */
3445
3446 else if (repeat_max != repeat_min)
3447 {
3448 #ifdef SUPPORT_UTF8
3449 if (utf8 && c >= 128)
3450 {
3451 memcpy(code, utf8_char, c & 7);
3452 code += c & 7;
3453 }
3454 else
3455 #endif
3456 *code++ = c;
3457 if (prop_type >= 0)
3458 {
3459 *code++ = prop_type;
3460 *code++ = prop_value;
3461 }
3462 repeat_max -= repeat_min;
3463
3464 if (repeat_max == 1)
3465 {
3466 *code++ = OP_QUERY + repeat_type;
3467 }
3468 else
3469 {
3470 *code++ = OP_UPTO + repeat_type;
3471 PUT2INC(code, 0, repeat_max);
3472 }
3473 }
3474 }
3475
3476 /* The character or character type itself comes last in all cases. */
3477
3478 #ifdef SUPPORT_UTF8
3479 if (utf8 && c >= 128)
3480 {
3481 memcpy(code, utf8_char, c & 7);
3482 code += c & 7;
3483 }
3484 else
3485 #endif
3486 *code++ = c;
3487
3488 /* For a repeated Unicode property match, there are two extra bytes that
3489 define the required property. */
3490
3491 #ifdef SUPPORT_UCP
3492 if (prop_type >= 0)
3493 {
3494 *code++ = prop_type;
3495 *code++ = prop_value;
3496 }
3497 #endif
3498 }
3499
3500 /* If previous was a character class or a back reference, we put the repeat
3501 stuff after it, but just skip the item if the repeat was {0,0}. */
3502
3503 else if (*previous == OP_CLASS ||
3504 *previous == OP_NCLASS ||
3505 #ifdef SUPPORT_UTF8
3506 *previous == OP_XCLASS ||
3507 #endif
3508 *previous == OP_REF)
3509 {
3510 if (repeat_max == 0)
3511 {
3512 code = previous;
3513 goto END_REPEAT;
3514 }
3515
3516 /* All real repeats make it impossible to handle partial matching (maybe
3517 one day we will be able to remove this restriction). */
3518
3519 if (repeat_max != 1) cd->nopartial = TRUE;
3520
3521 if (repeat_min == 0 && repeat_max == -1)
3522 *code++ = OP_CRSTAR + repeat_type;
3523 else if (repeat_min == 1 && repeat_max == -1)
3524 *code++ = OP_CRPLUS + repeat_type;
3525 else if (repeat_min == 0 && repeat_max == 1)
3526 *code++ = OP_CRQUERY + repeat_type;
3527 else
3528 {
3529 *code++ = OP_CRRANGE + repeat_type;
3530 PUT2INC(code, 0, repeat_min);
3531 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3532 PUT2INC(code, 0, repeat_max);
3533 }
3534 }
3535
3536 /* If previous was a bracket group, we may have to replicate it in certain
3537 cases. */
3538
3539 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3540 *previous == OP_ONCE || *previous == OP_COND)
3541 {
3542 register int i;
3543 int ketoffset = 0;
3544 int len = code - previous;
3545 uschar *bralink = NULL;
3546
3547 /* Repeating a DEFINE group is pointless */
3548
3549 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3550 {
3551 *errorcodeptr = ERR55;
3552 goto FAILED;
3553 }
3554
3555 /* If the maximum repeat count is unlimited, find the end of the bracket
3556 by scanning through from the start, and compute the offset back to it
3557 from the current code pointer. There may be an OP_OPT setting following
3558 the final KET, so we can't find the end just by going back from the code
3559 pointer. */
3560
3561 if (repeat_max == -1)
3562 {
3563 register uschar *ket = previous;
3564 do ket += GET(ket, 1); while (*ket != OP_KET);
3565 ketoffset = code - ket;
3566 }
3567
3568 /* The case of a zero minimum is special because of the need to stick
3569 OP_BRAZERO in front of it, and because the group appears once in the
3570 data, whereas in other cases it appears the minimum number of times. For
3571 this reason, it is simplest to treat this case separately, as otherwise
3572 the code gets far too messy. There are several special subcases when the
3573 minimum is zero. */
3574
3575 if (repeat_min == 0)
3576 {
3577 /* If the maximum is also zero, we just omit the group from the output
3578 altogether. */
3579
3580 if (repeat_max == 0)
3581 {
3582 code = previous;
3583 goto END_REPEAT;
3584 }
3585
3586 /* If the maximum is 1 or unlimited, we just have to stick in the
3587 BRAZERO and do no more at this point. However, we do need to adjust
3588 any OP_RECURSE calls inside the group that refer to the group itself or
3589 any internal or forward referenced group, because the offset is from
3590 the start of the whole regex. Temporarily terminate the pattern while
3591 doing this. */
3592
3593 if (repeat_max <= 1)
3594 {
3595 *code = OP_END;
3596 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3597 memmove(previous+1, previous, len);
3598 code++;
3599 *previous++ = OP_BRAZERO + repeat_type;
3600 }
3601
3602 /* If the maximum is greater than 1 and limited, we have to replicate
3603 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3604 The first one has to be handled carefully because it's the original
3605 copy, which has to be moved up. The remainder can be handled by code
3606 that is common with the non-zero minimum case below. We have to
3607 adjust the value of repeat_max, since one less copy is required. Once
3608 again, we may have to adjust any OP_RECURSE calls inside the group. */
3609
3610 else
3611 {
3612 int offset;
3613 *code = OP_END;
3614 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3615 memmove(previous + 2 + LINK_SIZE, previous, len);
3616 code += 2 + LINK_SIZE;
3617 *previous++ = OP_BRAZERO + repeat_type;
3618 *previous++ = OP_BRA;
3619
3620 /* We chain together the bracket offset fields that have to be
3621 filled in later when the ends of the brackets are reached. */
3622
3623 offset = (bralink == NULL)? 0 : previous - bralink;
3624 bralink = previous;
3625 PUTINC(previous, 0, offset);
3626 }
3627
3628 repeat_max--;
3629 }
3630
3631 /* If the minimum is greater than zero, replicate the group as many
3632 times as necessary, and adjust the maximum to the number of subsequent
3633 copies that we need. If we set a first char from the group, and didn't
3634 set a required char, copy the latter from the former. If there are any
3635 forward reference subroutine calls in the group, there will be entries on
3636 the workspace list; replicate these with an appropriate increment. */
3637
3638 else
3639 {
3640 if (repeat_min > 1)
3641 {
3642 /* In the pre-compile phase, we don't actually do the replication. We
3643 just adjust the length as if we had. Do some paranoid checks for
3644 potential integer overflow. */
3645
3646 if (lengthptr != NULL)
3647 {
3648 int delta = (repeat_min - 1)*length_prevgroup;
3649 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3650 (double)INT_MAX ||
3651 OFLOW_MAX - *lengthptr < delta)
3652 {
3653 *errorcodeptr = ERR20;
3654 goto FAILED;
3655 }
3656 *lengthptr += delta;
3657 }
3658
3659 /* This is compiling for real */
3660
3661 else
3662 {
3663 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3664 for (i = 1; i < repeat_min; i++)
3665 {
3666 uschar *hc;
3667 uschar *this_hwm = cd->hwm;
3668 memcpy(code, previous, len);
3669 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3670 {
3671 PUT(cd->hwm, 0, GET(hc, 0) + len);
3672 cd->hwm += LINK_SIZE;
3673 }
3674 save_hwm = this_hwm;
3675 code += len;
3676 }
3677 }
3678 }
3679
3680 if (repeat_max > 0) repeat_max -= repeat_min;
3681 }
3682
3683 /* This code is common to both the zero and non-zero minimum cases. If
3684 the maximum is limited, it replicates the group in a nested fashion,
3685 remembering the bracket starts on a stack. In the case of a zero minimum,
3686 the first one was set up above. In all cases the repeat_max now specifies
3687 the number of additional copies needed. Again, we must remember to
3688 replicate entries on the forward reference list. */
3689
3690 if (repeat_max >= 0)
3691 {
3692 /* In the pre-compile phase, we don't actually do the replication. We
3693 just adjust the length as if we had. For each repetition we must add 1
3694 to the length for BRAZERO and for all but the last repetition we must
3695 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3696 paranoid checks to avoid integer overflow. */
3697
3698 if (lengthptr != NULL && repeat_max > 0)
3699 {
3700 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3701 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3702 if ((double)repeat_max *
3703 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3704 > (double)INT_MAX ||
3705 OFLOW_MAX - *lengthptr < delta)
3706 {
3707 *errorcodeptr = ERR20;
3708 goto FAILED;
3709 }
3710 *lengthptr += delta;
3711 }
3712
3713 /* This is compiling for real */
3714
3715 else for (i = repeat_max - 1; i >= 0; i--)
3716 {
3717 uschar *hc;
3718 uschar *this_hwm = cd->hwm;
3719
3720 *code++ = OP_BRAZERO + repeat_type;
3721
3722 /* All but the final copy start a new nesting, maintaining the
3723 chain of brackets outstanding. */
3724
3725 if (i != 0)
3726 {
3727 int offset;
3728 *code++ = OP_BRA;
3729 offset = (bralink == NULL)? 0 : code - bralink;
3730 bralink = code;
3731 PUTINC(code, 0, offset);
3732 }
3733
3734 memcpy(code, previous, len);
3735 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3736 {
3737 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3738 cd->hwm += LINK_SIZE;
3739 }
3740 save_hwm = this_hwm;
3741 code += len;
3742 }
3743
3744 /* Now chain through the pending brackets, and fill in their length
3745 fields (which are holding the chain links pro tem). */
3746
3747 while (bralink != NULL)
3748 {
3749 int oldlinkoffset;
3750 int offset = code - bralink + 1;
3751 uschar *bra = code - offset;
3752 oldlinkoffset = GET(bra, 1);
3753 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3754 *code++ = OP_KET;
3755 PUTINC(code, 0, offset);
3756 PUT(bra, 1, offset);
3757 }
3758 }
3759
3760 /* If the maximum is unlimited, set a repeater in the final copy. We
3761 can't just offset backwards from the current code point, because we
3762 don't know if there's been an options resetting after the ket. The
3763 correct offset was computed above.
3764
3765 Then, when we are doing the actual compile phase, check to see whether
3766 this group is a non-atomic one that could match an empty string. If so,
3767 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3768 that runtime checking can be done. [This check is also applied to
3769 atomic groups at runtime, but in a different way.] */
3770
3771 else
3772 {
3773 uschar *ketcode = code - ketoffset;
3774 uschar *bracode = ketcode - GET(ketcode, 1);
3775 *ketcode = OP_KETRMAX + repeat_type;
3776 if (lengthptr == NULL && *bracode != OP_ONCE)
3777 {
3778 uschar *scode = bracode;
3779 do
3780 {
3781 if (could_be_empty_branch(scode, ketcode, utf8))
3782 {
3783 *bracode += OP_SBRA - OP_BRA;
3784 break;
3785 }
3786 scode += GET(scode, 1);
3787 }
3788 while (*scode == OP_ALT);
3789 }
3790 }
3791 }
3792
3793 /* Else there's some kind of shambles */
3794
3795 else
3796 {
3797 *errorcodeptr = ERR11;
3798 goto FAILED;
3799 }
3800
3801 /* If the character following a repeat is '+', or if certain optimization
3802 tests above succeeded, possessive_quantifier is TRUE. For some of the
3803 simpler opcodes, there is a special alternative opcode for this. For
3804 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3805 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3806 but the special opcodes can optimize it a bit. The repeated item starts at
3807 tempcode, not at previous, which might be the first part of a string whose
3808 (former) last char we repeated.
3809
3810 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3811 an 'upto' may follow. We skip over an 'exact' item, and then test the
3812 length of what remains before proceeding. */
3813
3814 if (possessive_quantifier)
3815 {
3816 int len;
3817 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3818 *tempcode == OP_NOTEXACT)
3819 tempcode += _pcre_OP_lengths[*tempcode];
3820 len = code - tempcode;
3821 if (len > 0) switch (*tempcode)
3822 {
3823 case OP_STAR: *tempcode = OP_POSSTAR; break;
3824 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3825 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3826 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3827
3828 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3829 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3830 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3831 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3832
3833 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3834 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3835 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3836 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3837
3838 default:
3839 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3840 code += 1 + LINK_SIZE;
3841 len += 1 + LINK_SIZE;
3842 tempcode[0] = OP_ONCE;
3843 *code++ = OP_KET;
3844 PUTINC(code, 0, len);
3845 PUT(tempcode, 1, len);
3846 break;
3847 }
3848 }
3849
3850 /* In all cases we no longer have a previous item. We also set the
3851 "follows varying string" flag for subsequently encountered reqbytes if
3852 it isn't already set and we have just passed a varying length item. */
3853
3854 END_REPEAT:
3855 previous = NULL;
3856 cd->req_varyopt |= reqvary;
3857 break;
3858
3859
3860 /* ===================================================================*/
3861 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3862 lookbehind or option setting or condition or all the other extended
3863 parenthesis forms. First deal with the specials; all are introduced by ?,
3864 and the appearance of any of them means that this is not a capturing
3865 group. */
3866
3867 case '(':
3868 newoptions = options;
3869 skipbytes = 0;
3870 bravalue = OP_CBRA;
3871 save_hwm = cd->hwm;
3872 reset_bracount = FALSE;
3873
3874 if (*(++ptr) == '?')
3875 {
3876 int i, set, unset, namelen;
3877 int *optset;
3878 const uschar *name;
3879 uschar *slot;
3880
3881 switch (*(++ptr))
3882 {
3883 case '#': /* Comment; skip to ket */
3884 ptr++;
3885 while (*ptr != 0 && *ptr != ')') ptr++;
3886 if (*ptr == 0)
3887 {
3888 *errorcodeptr = ERR18;
3889 goto FAILED;
3890 }
3891 continue;
3892
3893
3894 /* ------------------------------------------------------------ */
3895 case '|': /* Reset capture count for each branch */
3896 reset_bracount = TRUE;
3897 /* Fall through */
3898
3899 /* ------------------------------------------------------------ */
3900 case ':': /* Non-capturing bracket */
3901 bravalue = OP_BRA;
3902 ptr++;
3903 break;
3904
3905
3906 /* ------------------------------------------------------------ */
3907 case '(':
3908 bravalue = OP_COND; /* Conditional group */
3909
3910 /* A condition can be an assertion, a number (referring to a numbered
3911 group), a name (referring to a named group), or 'R', referring to
3912 recursion. R<digits> and R&name are also permitted for recursion tests.
3913
3914 There are several syntaxes for testing a named group: (?(name)) is used
3915 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3916
3917 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3918 be the recursive thing or the name 'R' (and similarly for 'R' followed
3919 by digits), and (b) a number could be a name that consists of digits.
3920 In both cases, we look for a name first; if not found, we try the other
3921 cases. */
3922
3923 /* For conditions that are assertions, check the syntax, and then exit
3924 the switch. This will take control down to where bracketed groups,
3925 including assertions, are processed. */
3926
3927 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3928 break;
3929
3930 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3931 below), and all need to skip 3 bytes at the start of the group. */
3932
3933 code[1+LINK_SIZE] = OP_CREF;
3934 skipbytes = 3;
3935 refsign = -1;
3936
3937 /* Check for a test for recursion in a named group. */
3938
3939 if (ptr[1] == 'R' && ptr[2] == '&')
3940 {
3941 terminator = -1;
3942 ptr += 2;
3943 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3944 }
3945
3946 /* Check for a test for a named group's having been set, using the Perl
3947 syntax (?(<name>) or (?('name') */
3948
3949 else if (ptr[1] == '<')
3950 {
3951 terminator = '>';
3952 ptr++;
3953 }
3954 else if (ptr[1] == '\'')
3955 {
3956 terminator = '\'';
3957 ptr++;
3958 }
3959 else
3960 {
3961 terminator = 0;
3962 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3963 }
3964
3965 /* We now expect to read a name; anything else is an error */
3966
3967 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3968 {
3969 ptr += 1; /* To get the right offset */
3970 *errorcodeptr = ERR28;
3971 goto FAILED;
3972 }
3973
3974 /* Read the name, but also get it as a number if it's all digits */
3975
3976 recno = 0;
3977 name = ++ptr;
3978 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3979 {
3980 if (recno >= 0)
3981 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3982 recno * 10 + *ptr - '0' : -1;
3983 ptr++;
3984 }
3985 namelen = ptr - name;
3986
3987 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3988 {
3989 ptr--; /* Error offset */
3990 *errorcodeptr = ERR26;
3991 goto FAILED;
3992 }
3993
3994 /* Do no further checking in the pre-compile phase. */
3995
3996 if (lengthptr != NULL) break;
3997
3998 /* In the real compile we do the work of looking for the actual
3999 reference. If the string started with "+" or "-" we require the rest to
4000 be digits, in which case recno will be set. */
4001
4002 if (refsign > 0)
4003 {
4004 if (recno <= 0)
4005 {
4006 *errorcodeptr = ERR58;
4007 goto FAILED;
4008 }
4009 if (refsign == '-')
4010 {
4011 recno = cd->bracount - recno + 1;
4012 if (recno <= 0)
4013 {
4014 *errorcodeptr = ERR15;
4015 goto FAILED;
4016 }
4017 }
4018 else recno += cd->bracount;
4019 PUT2(code, 2+LINK_SIZE, recno);
4020 break;
4021 }
4022
4023 /* Otherwise (did not start with "+" or "-"), start by looking for the
4024 name. */
4025
4026 slot = cd->name_table;
4027 for (i = 0; i < cd->names_found; i++)
4028 {
4029 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4030 slot += cd->name_entry_size;
4031 }
4032
4033 /* Found a previous named subpattern */
4034
4035 if (i < cd->names_found)
4036 {
4037 recno = GET2(slot, 0);
4038 PUT2(code, 2+LINK_SIZE, recno);
4039 }
4040
4041 /* Search the pattern for a forward reference */
4042
4043 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4044 (options & PCRE_EXTENDED) != 0)) > 0)
4045 {
4046 PUT2(code, 2+LINK_SIZE, i);
4047 }
4048
4049 /* If terminator == 0 it means that the name followed directly after
4050 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4051 some further alternatives to try. For the cases where terminator != 0
4052 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4053 now checked all the possibilities, so give an error. */
4054
4055 else if (terminator != 0)
4056 {
4057 *errorcodeptr = ERR15;
4058 goto FAILED;
4059 }
4060
4061 /* Check for (?(R) for recursion. Allow digits after R to specify a
4062 specific group number. */
4063
4064 else if (*name == 'R')
4065 {
4066 recno = 0;
4067 for (i = 1; i < namelen; i++)
4068 {
4069 if ((digitab[name[i]] & ctype_digit) == 0)
4070 {
4071 *errorcodeptr = ERR15;
4072 goto FAILED;
4073 }
4074 recno = recno * 10 + name[i] - '0';
4075 }
4076 if (recno == 0) recno = RREF_ANY;
4077 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4078 PUT2(code, 2+LINK_SIZE, recno);
4079 }
4080
4081 /* Similarly, check for the (?(DEFINE) "condition", which is always
4082 false. */
4083
4084 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4085 {
4086 code[1+LINK_SIZE] = OP_DEF;
4087 skipbytes = 1;
4088 }
4089
4090 /* Check for the "name" actually being a subpattern number. */
4091
4092 else if (recno > 0)
4093 {
4094 PUT2(code, 2+LINK_SIZE, recno);
4095 }
4096
4097 /* Either an unidentified subpattern, or a reference to (?(0) */
4098
4099 else
4100 {
4101 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4102 goto FAILED;
4103 }
4104 break;
4105
4106
4107 /* ------------------------------------------------------------ */
4108 case '=': /* Positive lookahead */
4109 bravalue = OP_ASSERT;
4110 ptr++;
4111 break;
4112
4113
4114 /* ------------------------------------------------------------ */
4115 case '!': /* Negative lookahead */
4116 bravalue = OP_ASSERT_NOT;
4117 ptr++;
4118 break;
4119
4120
4121 /* ------------------------------------------------------------ */
4122 case '<': /* Lookbehind or named define */
4123 switch (ptr[1])
4124 {
4125 case '=': /* Positive lookbehind */
4126 bravalue = OP_ASSERTBACK;
4127 ptr += 2;
4128 break;
4129
4130 case '!': /* Negative lookbehind */
4131 bravalue = OP_ASSERTBACK_NOT;
4132 ptr += 2;
4133 break;
4134
4135 default: /* Could be name define, else bad */
4136 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4137 ptr++; /* Correct offset for error */
4138 *errorcodeptr = ERR24;
4139 goto FAILED;
4140 }
4141 break;
4142
4143
4144 /* ------------------------------------------------------------ */
4145 case '>': /* One-time brackets */
4146 bravalue = OP_ONCE;
4147 ptr++;
4148 break;
4149
4150
4151 /* ------------------------------------------------------------ */
4152 case 'C': /* Callout - may be followed by digits; */
4153 previous_callout = code; /* Save for later completion */
4154 after_manual_callout = 1; /* Skip one item before completing */
4155 *code++ = OP_CALLOUT;
4156 {
4157 int n = 0;
4158 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4159 n = n * 10 + *ptr - '0';
4160 if (*ptr != ')')
4161 {
4162 *errorcodeptr = ERR39;
4163 goto FAILED;
4164 }
4165 if (n > 255)
4166 {
4167 *errorcodeptr = ERR38;
4168 goto FAILED;
4169 }
4170 *code++ = n;
4171 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4172 PUT(code, LINK_SIZE, 0); /* Default length */
4173 code += 2 * LINK_SIZE;
4174 }
4175 previous = NULL;
4176 continue;
4177
4178
4179 /* ------------------------------------------------------------ */
4180 case 'P': /* Python-style named subpattern handling */
4181 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4182 {
4183 is_recurse = *ptr == '>';
4184 terminator = ')';
4185 goto NAMED_REF_OR_RECURSE;
4186 }
4187 else if (*ptr != '<') /* Test for Python-style definition */
4188 {
4189 *errorcodeptr = ERR41;
4190 goto FAILED;
4191 }
4192 /* Fall through to handle (?P< as (?< is handled */
4193
4194
4195 /* ------------------------------------------------------------ */
4196 DEFINE_NAME: /* Come here from (?< handling */
4197 case '\'':
4198 {
4199 terminator = (*ptr == '<')? '>' : '\'';
4200 name = ++ptr;
4201
4202 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4203 namelen = ptr - name;
4204
4205 /* In the pre-compile phase, just do a syntax check. */
4206
4207 if (lengthptr != NULL)
4208 {
4209 if (*ptr != terminator)
4210 {
4211 *errorcodeptr = ERR42;
4212 goto FAILED;
4213 }
4214 if (cd->names_found >= MAX_NAME_COUNT)
4215 {
4216 *errorcodeptr = ERR49;
4217 goto FAILED;
4218 }
4219 if (namelen + 3 > cd->name_entry_size)
4220 {
4221 cd->name_entry_size = namelen + 3;
4222 if (namelen > MAX_NAME_SIZE)
4223 {
4224 *errorcodeptr = ERR48;
4225 goto FAILED;
4226 }
4227 }
4228 }
4229
4230 /* In the real compile, create the entry in the table */
4231
4232 else
4233 {
4234 slot = cd->name_table;
4235 for (i = 0; i < cd->names_found; i++)
4236 {
4237 int crc = memcmp(name, slot+2, namelen);
4238 if (crc == 0)
4239 {
4240 if (slot[2+namelen] == 0)
4241 {
4242 if ((options & PCRE_DUPNAMES) == 0)
4243 {
4244 *errorcodeptr = ERR43;
4245 goto FAILED;
4246 }
4247 }
4248 else crc = -1; /* Current name is substring */
4249 }
4250 if (crc < 0)
4251 {
4252 memmove(slot + cd->name_entry_size, slot,
4253 (cd->names_found - i) * cd->name_entry_size);
4254 break;
4255 }
4256 slot += cd->name_entry_size;
4257 }
4258
4259 PUT2(slot, 0, cd->bracount + 1);
4260 memcpy(slot + 2, name, namelen);
4261 slot[2+namelen] = 0;
4262 }
4263 }
4264
4265 /* In both cases, count the number of names we've encountered. */
4266
4267 ptr++; /* Move past > or ' */
4268 cd->names_found++;
4269 goto NUMBERED_GROUP;
4270
4271
4272 /* ------------------------------------------------------------ */
4273 case '&': /* Perl recursion/subroutine syntax */
4274 terminator = ')';
4275 is_recurse = TRUE;
4276 /* Fall through */
4277
4278 /* We come here from the Python syntax above that handles both
4279 references (?P=name) and recursion (?P>name), as well as falling
4280 through from the Perl recursion syntax (?&name). */
4281
4282 NAMED_REF_OR_RECURSE:
4283 name = ++ptr;
4284 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4285 namelen = ptr - name;
4286
4287 /* In the pre-compile phase, do a syntax check and set a dummy
4288 reference number. */
4289
4290 if (lengthptr != NULL)
4291 {
4292 if (*ptr != terminator)
4293 {
4294 *errorcodeptr = ERR42;
4295 goto FAILED;
4296 }
4297 if (namelen > MAX_NAME_SIZE)
4298 {
4299 *errorcodeptr = ERR48;
4300 goto FAILED;
4301 }
4302 recno = 0;
4303 }
4304
4305 /* In the real compile, seek the name in the table */
4306
4307 else
4308 {
4309 slot = cd->name_table;
4310 for (i = 0; i < cd->names_found; i++)
4311 {
4312 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4313 slot += cd->name_entry_size;
4314 }
4315
4316 if (i < cd->names_found) /* Back reference */
4317 {
4318 recno = GET2(slot, 0);
4319 }
4320 else if ((recno = /* Forward back reference */
4321 find_parens(ptr, cd->bracount, name, namelen,
4322 (options & PCRE_EXTENDED) != 0)) <= 0)
4323 {
4324 *errorcodeptr = ERR15;
4325 goto FAILED;
4326 }
4327 }
4328
4329 /* In both phases, we can now go to the code that handles numerical
4330 recursion or backreferences. */
4331
4332 if (is_recurse) goto HANDLE_RECURSION;
4333 else goto HANDLE_REFERENCE;
4334
4335
4336 /* ------------------------------------------------------------ */
4337 case 'R': /* Recursion */
4338 ptr++; /* Same as (?0) */
4339 /* Fall through */
4340
4341
4342 /* ------------------------------------------------------------ */
4343 case '-': case '+':
4344 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4345 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4346 {
4347 const uschar *called;
4348
4349 if ((refsign = *ptr) == '+') ptr++;
4350 else if (refsign == '-')
4351 {
4352 if ((digitab[ptr[1]] & ctype_digit) == 0)
4353 goto OTHER_CHAR_AFTER_QUERY;
4354 ptr++;
4355 }
4356
4357 recno = 0;
4358 while((digitab[*ptr] & ctype_digit) != 0)
4359 recno = recno * 10 + *ptr++ - '0';
4360
4361 if (*ptr != ')')
4362 {
4363 *errorcodeptr = ERR29;
4364 goto FAILED;
4365 }
4366
4367 if (refsign == '-')
4368 {
4369 if (recno == 0)
4370 {
4371 *errorcodeptr = ERR58;
4372 goto FAILED;
4373 }
4374 recno = cd->bracount - recno + 1;
4375 if (recno <= 0)
4376 {
4377 *errorcodeptr = ERR15;
4378 goto FAILED;
4379 }
4380 }
4381 else if (refsign == '+')
4382 {
4383 if (recno == 0)
4384 {
4385 *errorcodeptr = ERR58;
4386 goto FAILED;
4387 }
4388 recno += cd->bracount;
4389 }
4390
4391 /* Come here from code above that handles a named recursion */
4392
4393 HANDLE_RECURSION:
4394
4395 previous = code;
4396 called = cd->start_code;
4397
4398 /* When we are actually compiling, find the bracket that is being
4399 referenced. Temporarily end the regex in case it doesn't exist before
4400 this point. If we end up with a forward reference, first check that
4401 the bracket does occur later so we can give the error (and position)
4402 now. Then remember this forward reference in the workspace so it can
4403 be filled in at the end. */
4404
4405 if (lengthptr == NULL)
4406 {
4407 *code = OP_END;
4408 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4409
4410 /* Forward reference */
4411
4412 if (called == NULL)
4413 {
4414 if (find_parens(ptr, cd->bracount, NULL, recno,
4415 (options & PCRE_EXTENDED) != 0) < 0)
4416 {
4417 *errorcodeptr = ERR15;
4418 goto FAILED;
4419 }
4420 called = cd->start_code + recno;
4421 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4422 }
4423
4424 /* If not a forward reference, and the subpattern is still open,
4425 this is a recursive call. We check to see if this is a left
4426 recursion that could loop for ever, and diagnose that case. */
4427
4428 else if (GET(called, 1) == 0 &&
4429 could_be_empty(called, code, bcptr, utf8))
4430 {
4431 *errorcodeptr = ERR40;
4432 goto FAILED;
4433 }
4434 }
4435
4436 /* Insert the recursion/subroutine item, automatically wrapped inside
4437 "once" brackets. Set up a "previous group" length so that a
4438 subsequent quantifier will work. */
4439
4440 *code = OP_ONCE;
4441 PUT(code, 1, 2 + 2*LINK_SIZE);
4442 code += 1 + LINK_SIZE;
4443
4444 *code = OP_RECURSE;
4445 PUT(code, 1, called - cd->start_code);
4446 code += 1 + LINK_SIZE;
4447
4448 *code = OP_KET;
4449 PUT(code, 1, 2 + 2*LINK_SIZE);
4450 code += 1 + LINK_SIZE;
4451
4452 length_prevgroup = 3 + 3*LINK_SIZE;
4453 }
4454
4455 /* Can't determine a first byte now */
4456
4457 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4458 continue;
4459
4460
4461 /* ------------------------------------------------------------ */
4462 default: /* Other characters: check option setting */
4463 OTHER_CHAR_AFTER_QUERY:
4464 set = unset = 0;
4465 optset = &set;
4466
4467 while (*ptr != ')' && *ptr != ':')
4468 {
4469 switch (*ptr++)
4470 {
4471 case '-': optset = &unset; break;
4472
4473 case 'J': /* Record that it changed in the external options */
4474 *optset |= PCRE_DUPNAMES;
4475 cd->external_options |= PCRE_JCHANGED;
4476 break;
4477
4478 case 'i': *optset |= PCRE_CASELESS; break;
4479 case 'm': *optset |= PCRE_MULTILINE; break;
4480 case 's': *optset |= PCRE_DOTALL; break;
4481 case 'x': *optset |= PCRE_EXTENDED; break;
4482 case 'U': *optset |= PCRE_UNGREEDY; break;
4483 case 'X': *optset |= PCRE_EXTRA; break;
4484
4485 default: *errorcodeptr = ERR12;
4486 ptr--; /* Correct the offset */
4487 goto FAILED;
4488 }
4489 }
4490
4491 /* Set up the changed option bits, but don't change anything yet. */
4492
4493 newoptions = (options | set) & (~unset);
4494
4495 /* If the options ended with ')' this is not the start of a nested
4496 group with option changes, so the options change at this level. If this
4497 item is right at the start of the pattern, the options can be
4498 abstracted and made external in the pre-compile phase, and ignored in
4499 the compile phase. This can be helpful when matching -- for instance in
4500 caseless checking of required bytes.
4501
4502 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4503 definitely *not* at the start of the pattern because something has been
4504 compiled. In the pre-compile phase, however, the code pointer can have
4505 that value after the start, because it gets reset as code is discarded
4506 during the pre-compile. However, this can happen only at top level - if
4507 we are within parentheses, the starting BRA will still be present. At
4508 any parenthesis level, the length value can be used to test if anything
4509 has been compiled at that level. Thus, a test for both these conditions
4510 is necessary to ensure we correctly detect the start of the pattern in
4511 both phases.
4512
4513 If we are not at the pattern start, compile code to change the ims
4514 options if this setting actually changes any of them. We also pass the
4515 new setting back so that it can be put at the start of any following
4516 branches, and when this group ends (if we are in a group), a resetting
4517 item can be compiled. */
4518
4519 if (*ptr == ')')
4520 {
4521 if (code == cd->start_code + 1 + LINK_SIZE &&
4522 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4523 {
4524 cd->external_options = newoptions;
4525 options = newoptions;
4526 }
4527 else
4528 {
4529 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4530 {
4531 *code++ = OP_OPT;
4532 *code++ = newoptions & PCRE_IMS;
4533 }
4534
4535 /* Change options at this level, and pass them back for use
4536 in subsequent branches. Reset the greedy defaults and the case
4537 value for firstbyte and reqbyte. */
4538
4539 *optionsptr = options = newoptions;
4540 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4541 greedy_non_default = greedy_default ^ 1;
4542 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4543 }
4544
4545 previous = NULL; /* This item can't be repeated */
4546 continue; /* It is complete */
4547 }
4548
4549 /* If the options ended with ':' we are heading into a nested group
4550 with possible change of options. Such groups are non-capturing and are
4551 not assertions of any kind. All we need to do is skip over the ':';
4552 the newoptions value is handled below. */
4553
4554 bravalue = OP_BRA;
4555 ptr++;
4556 } /* End of switch for character following (? */
4557 } /* End of (? handling */
4558
4559 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4560 all unadorned brackets become non-capturing and behave like (?:...)
4561 brackets. */
4562
4563 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4564 {
4565 bravalue = OP_BRA;
4566 }
4567
4568 /* Else we have a capturing group. */
4569
4570 else
4571 {
4572 NUMBERED_GROUP:
4573 cd->bracount += 1;
4574 PUT2(code, 1+LINK_SIZE, cd->bracount);
4575 skipbytes = 2;
4576 }
4577
4578 /* Process nested bracketed regex. Assertions may not be repeated, but
4579 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4580 non-register variable in order to be able to pass its address because some
4581 compilers complain otherwise. Pass in a new setting for the ims options if
4582 they have changed. */
4583
4584 previous = (bravalue >= OP_ONCE)? code : NULL;
4585 *code = bravalue;
4586 tempcode = code;
4587 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4588 length_prevgroup = 0; /* Initialize for pre-compile phase */
4589
4590 if (!compile_regex(
4591 newoptions, /* The complete new option state */
4592 options & PCRE_IMS, /* The previous ims option state */
4593 &tempcode, /* Where to put code (updated) */
4594 &ptr, /* Input pointer (updated) */
4595 errorcodeptr, /* Where to put an error message */
4596 (bravalue == OP_ASSERTBACK ||
4597 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4598 reset_bracount, /* True if (?| group */
4599 skipbytes, /* Skip over bracket number */
4600 &subfirstbyte, /* For possible first char */
4601 &subreqbyte, /* For possible last char */
4602 bcptr, /* Current branch chain */
4603 cd, /* Tables block */
4604 (lengthptr == NULL)? NULL : /* Actual compile phase */
4605 &length_prevgroup /* Pre-compile phase */
4606 ))
4607 goto FAILED;
4608
4609 /* At the end of compiling, code is still pointing to the start of the
4610 group, while tempcode has been updated to point past the end of the group
4611 and any option resetting that may follow it. The pattern pointer (ptr)
4612 is on the bracket. */
4613
4614 /* If this is a conditional bracket, check that there are no more than
4615 two branches in the group, or just one if it's a DEFINE group. We do this
4616 in the real compile phase, not in the pre-pass, where the whole group may
4617 not be available. */
4618
4619 if (bravalue == OP_COND && lengthptr == NULL)
4620 {
4621 uschar *tc = code;
4622 int condcount = 0;
4623
4624 do {
4625 condcount++;
4626 tc += GET(tc,1);
4627 }
4628 while (*tc != OP_KET);
4629
4630 /* A DEFINE group is never obeyed inline (the "condition" is always
4631 false). It must have only one branch. */
4632
4633 if (code[LINK_SIZE+1] == OP_DEF)
4634 {
4635 if (condcount > 1)
4636 {
4637 *errorcodeptr = ERR54;
4638 goto FAILED;
4639 }
4640 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4641 }
4642
4643 /* A "normal" conditional group. If there is just one branch, we must not
4644 make use of its firstbyte or reqbyte, because this is equivalent to an
4645 empty second branch. */
4646
4647 else
4648 {
4649 if (condcount > 2)
4650 {
4651 *errorcodeptr = ERR27;
4652 goto FAILED;
4653 }
4654 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4655 }
4656 }
4657
4658 /* Error if hit end of pattern */
4659
4660 if (*ptr != ')')
4661 {
4662 *errorcodeptr = ERR14;
4663 goto FAILED;
4664 }
4665
4666 /* In the pre-compile phase, update the length by the length of the nested
4667 group, less the brackets at either end. Then reduce the compiled code to
4668 just the brackets so that it doesn't use much memory if it is duplicated by
4669 a quantifier. */
4670
4671 if (lengthptr != NULL)
4672 {
4673 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4674 {
4675 *errorcodeptr = ERR20;
4676 goto FAILED;
4677 }
4678 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4679 code++;
4680 PUTINC(code, 0, 1 + LINK_SIZE);
4681 *code++ = OP_KET;
4682 PUTINC(code, 0, 1 + LINK_SIZE);
4683 }
4684
4685 /* Otherwise update the main code pointer to the end of the group. */
4686
4687 else code = tempcode;
4688
4689 /* For a DEFINE group, required and first character settings are not
4690 relevant. */
4691
4692 if (bravalue == OP_DEF) break;
4693
4694 /* Handle updating of the required and first characters for other types of
4695 group. Update for normal brackets of all kinds, and conditions with two
4696 branches (see code above). If the bracket is followed by a quantifier with
4697 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4698 zerofirstbyte outside the main loop so that they can be accessed for the
4699 back off. */
4700
4701 zeroreqbyte = reqbyte;
4702 zerofirstbyte = firstbyte;
4703 groupsetfirstbyte = FALSE;
4704
4705 if (bravalue >= OP_ONCE)
4706 {
4707 /* If we have not yet set a firstbyte in this branch, take it from the
4708 subpattern, remembering that it was set here so that a repeat of more
4709 than one can replicate it as reqbyte if necessary. If the subpattern has
4710 no firstbyte, set "none" for the whole branch. In both cases, a zero
4711 repeat forces firstbyte to "none". */
4712
4713 if (firstbyte == REQ_UNSET)
4714 {
4715 if (subfirstbyte >= 0)
4716 {
4717 firstbyte = subfirstbyte;
4718 groupsetfirstbyte = TRUE;
4719 }
4720 else firstbyte = REQ_NONE;
4721 zerofirstbyte = REQ_NONE;
4722 }
4723
4724 /* If firstbyte was previously set, convert the subpattern's firstbyte
4725 into reqbyte if there wasn't one, using the vary flag that was in
4726 existence beforehand. */
4727
4728 else if (subfirstbyte >= 0 && subreqbyte < 0)
4729 subreqbyte = subfirstbyte | tempreqvary;
4730
4731 /* If the subpattern set a required byte (or set a first byte that isn't
4732 really the first byte - see above), set it. */
4733
4734 if (subreqbyte >= 0) reqbyte = subreqbyte;
4735 }
4736
4737 /* For a forward assertion, we take the reqbyte, if set. This can be
4738 helpful if the pattern that follows the assertion doesn't set a different
4739 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4740 for an assertion, however, because it leads to an incorrect effect for patterns
4741 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4742 of a firstbyte. This is overcome by a scan at the end if there's no
4743 firstbyte, looking for an asserted first char. */
4744
4745 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4746 break; /* End of processing '(' */
4747
4748
4749 /* ===================================================================*/
4750 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4751 are arranged to be the negation of the corresponding OP_values. For the
4752 back references, the values are ESC_REF plus the reference number. Only
4753 back references and those types that consume a character may be repeated.
4754 We can test for values between ESC_b and ESC_Z for the latter; this may
4755 have to change if any new ones are ever created. */
4756
4757 case '\\':
4758 tempptr = ptr;
4759 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4760 if (*errorcodeptr != 0) goto FAILED;
4761
4762 if (c < 0)
4763 {
4764 if (-c == ESC_Q) /* Handle start of quoted string */
4765 {
4766 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4767 else inescq = TRUE;
4768 continue;
4769 }
4770
4771 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4772
4773 /* For metasequences that actually match a character, we disable the
4774 setting of a first character if it hasn't already been set. */
4775
4776 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4777 firstbyte = REQ_NONE;
4778
4779 /* Set values to reset to if this is followed by a zero repeat. */
4780
4781 zerofirstbyte = firstbyte;
4782 zeroreqbyte = reqbyte;
4783
4784 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4785 We also support \k{name} (.NET syntax) */
4786
4787 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4788 {
4789 is_recurse = FALSE;
4790 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4791 goto NAMED_REF_OR_RECURSE;
4792 }
4793
4794 /* Back references are handled specially; must disable firstbyte if
4795 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4796 ':' later. */
4797
4798 if (-c >= ESC_REF)
4799 {
4800 recno = -c - ESC_REF;
4801
4802 HANDLE_REFERENCE: /* Come here from named backref handling */
4803 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4804 previous = code;
4805 *code++ = OP_REF;
4806 PUT2INC(code, 0, recno);
4807 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4808 if (recno > cd->top_backref) cd->top_backref = recno;
4809 }
4810
4811 /* So are Unicode property matches, if supported. */
4812
4813 #ifdef SUPPORT_UCP
4814 else if (-c == ESC_P || -c == ESC_p)
4815 {
4816 BOOL negated;
4817 int pdata;
4818 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4819 if (ptype < 0) goto FAILED;
4820 previous = code;
4821 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4822 *code++ = ptype;
4823 *code++ = pdata;
4824 }
4825 #else
4826
4827 /* If Unicode properties are not supported, \X, \P, and \p are not
4828 allowed. */
4829
4830 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4831 {
4832 *errorcodeptr = ERR45;
4833 goto FAILED;
4834 }
4835 #endif
4836
4837 /* For the rest (including \X when Unicode properties are supported), we
4838 can obtain the OP value by negating the escape value. */
4839
4840 else
4841 {
4842 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4843 *code++ = -c;
4844 }
4845 continue;
4846 }
4847
4848 /* We have a data character whose value is in c. In UTF-8 mode it may have
4849 a value > 127. We set its representation in the length/buffer, and then
4850 handle it as a data character. */
4851
4852 #ifdef SUPPORT_UTF8
4853 if (utf8 && c > 127)
4854 mclength = _pcre_ord2utf8(c, mcbuffer);
4855 else
4856 #endif
4857
4858 {
4859 mcbuffer[0] = c;
4860 mclength = 1;
4861 }
4862 goto ONE_CHAR;
4863
4864
4865 /* ===================================================================*/
4866 /* Handle a literal character. It is guaranteed not to be whitespace or #
4867 when the extended flag is set. If we are in UTF-8 mode, it may be a
4868 multi-byte literal character. */
4869
4870 default:
4871 NORMAL_CHAR:
4872 mclength = 1;
4873 mcbuffer[0] = c;
4874
4875 #ifdef SUPPORT_UTF8
4876 if (utf8 && c >= 0xc0)
4877 {
4878 while ((ptr[1] & 0xc0) == 0x80)
4879 mcbuffer[mclength++] = *(++ptr);
4880 }
4881 #endif
4882
4883 /* At this point we have the character's bytes in mcbuffer, and the length
4884 in mclength. When not in UTF-8 mode, the length is always 1. */
4885
4886 ONE_CHAR:
4887 previous = code;
4888 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4889 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4890
4891 /* Set the first and required bytes appropriately. If no previous first
4892 byte, set it from this character, but revert to none on a zero repeat.
4893 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4894 repeat. */
4895
4896 if (firstbyte == REQ_UNSET)
4897 {
4898 zerofirstbyte = REQ_NONE;
4899 zeroreqbyte = reqbyte;
4900
4901 /* If the character is more than one byte long, we can set firstbyte
4902 only if it is not to be matched caselessly. */
4903
4904 if (mclength == 1 || req_caseopt == 0)
4905 {
4906 firstbyte = mcbuffer[0] | req_caseopt;
4907 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4908 }
4909 else firstbyte = reqbyte = REQ_NONE;
4910 }
4911
4912 /* firstbyte was previously set; we can set reqbyte only if the length is
4913 1 or the matching is caseful. */
4914
4915 else
4916 {
4917 zerofirstbyte = firstbyte;
4918 zeroreqbyte = reqbyte;
4919 if (mclength == 1 || req_caseopt == 0)
4920 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4921 }
4922
4923 break; /* End of literal character handling */
4924 }
4925 } /* end of big loop */
4926
4927
4928 /* Control never reaches here by falling through, only by a goto for all the
4929 error states. Pass back the position in the pattern so that it can be displayed
4930 to the user for diagnosing the error. */
4931
4932 FAILED:
4933 *ptrptr = ptr;
4934 return FALSE;
4935 }
4936
4937
4938
4939
4940 /*************************************************
4941 * Compile sequence of alternatives *
4942 *************************************************/
4943
4944 /* On entry, ptr is pointing past the bracket character, but on return it
4945 points to the closing bracket, or vertical bar, or end of string. The code
4946 variable is pointing at the byte into which the BRA operator has been stored.
4947 If the ims options are changed at the start (for a (?ims: group) or during any
4948 branch, we need to insert an OP_OPT item at the start of every following branch
4949 to ensure they get set correctly at run time, and also pass the new options
4950 into every subsequent branch compile.
4951
4952 This function is used during the pre-compile phase when we are trying to find
4953 out the amount of memory needed, as well as during the real compile phase. The
4954 value of lengthptr distinguishes the two phases.
4955
4956 Arguments:
4957 options option bits, including any changes for this subpattern
4958 oldims previous settings of ims option bits
4959 codeptr -> the address of the current code pointer
4960 ptrptr -> the address of the current pattern pointer
4961 errorcodeptr -> pointer to error code variable
4962 lookbehind TRUE if this is a lookbehind assertion
4963 reset_bracount TRUE to reset the bracket count for each branch (a (?| group)
4964 skipbytes skip this many bytes at start (for brackets and OP_COND)
4965 firstbyteptr place to put the first required character, or a negative number
4966 reqbyteptr place to put the last required character, or a negative number
4967 bcptr pointer to the chain of currently open branches
4968 cd points to the data block with tables pointers etc.
4969 lengthptr NULL during the real compile phase
4970 points to length accumulator during pre-compile phase
4971
4972 Returns: TRUE on success; FALSE on error (*errorcodeptr set, *ptrptr updated)
4973 */
4974
4975 static BOOL
4976 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4977 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4978 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4979 int *lengthptr)
4980 {
4981 const uschar *ptr = *ptrptr; /* Working copy of the pattern pointer */
4982 uschar *code = *codeptr; /* Working copy of the code pointer */
4983 uschar *last_branch = code; /* Start of the most recent branch (BRA or ALT) */
4984 uschar *start_bracket = code; /* Start of the group, for the KET offset */
4985 uschar *reverse_count = NULL; /* Operand of the latest OP_REVERSE, if any */
4986 int firstbyte, reqbyte; /* Values accumulated for the whole group */
4987 int branchfirstbyte, branchreqbyte; /* Values handed back by each branch */
4988 int length; /* Length accumulated in the pre-compile phase */
4989 int orig_bracount; /* Bracket count on entry to the group */
4990 int max_bracount; /* Highest bracket count seen in any branch */
4991 branch_chain bc; /* This group's link in the open-branch chain */
4992
4993 bc.outer = bcptr;
4994 bc.current = code;
4995
4996 firstbyte = reqbyte = REQ_UNSET;
4997
4998 /* Accumulate the length for use in the pre-compile phase. Start with the
4999 length of the BRA and KET and any extra bytes that are required at the
5000 beginning. We accumulate in a local variable to save frequent testing of
5001 lengthptr for NULL. We cannot do this by looking at the value of code at the
5002 start and end of each alternative, because compiled items are discarded during
5003 the pre-compile phase so that the work space is not exceeded. */
5004
5005 length = 2 + 2*LINK_SIZE + skipbytes;
5006
5007 /* WARNING: If the above line is changed for any reason, you must also change
5008 the code that abstracts option settings at the start of the pattern and makes
5009 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5010 pre-compile phase to find out whether anything has yet been compiled or not. */
5011
5012 /* Offset is set zero to mark that this bracket is still open */
5013
5014 PUT(code, 1, 0);
5015 code += 1 + LINK_SIZE + skipbytes; /* Step over the BRA, its link, and any extra bytes */
5016
5017 /* Loop for each alternative branch */
5018
5019 orig_bracount = max_bracount = cd->bracount;
5020 for (;;)
5021 {
5022 /* For a (?| group, reset the capturing bracket count so that each branch
5023 uses the same numbers. */
5024
5025 if (reset_bracount) cd->bracount = orig_bracount;
5026
5027 /* Handle a change of ims options at the start of the branch */
5028
5029 if ((options & PCRE_IMS) != oldims)
5030 {
5031 *code++ = OP_OPT;
5032 *code++ = options & PCRE_IMS;
5033 length += 2;
5034 }
5035
5036 /* Set up dummy OP_REVERSE if lookbehind assertion */
5037
5038 if (lookbehind)
5039 {
5040 *code++ = OP_REVERSE;
5041 reverse_count = code; /* Remember where the length goes; it is filled in below */
5042 PUTINC(code, 0, 0);
5043 length += 1 + LINK_SIZE;
5044 }
5045
5046 /* Now compile the branch; in the pre-compile phase its length gets added
5047 into the length. */
5048
5049 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5050 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5051 {
5052 *ptrptr = ptr; /* Pass back the pattern position that was reached */
5053 return FALSE;
5054 }
5055
5056 /* Keep the highest bracket count in case (?| was used and some branch
5057 has fewer than the rest. */
5058
5059 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5060
5061 /* In the real compile phase, there is some post-processing to be done. */
5062
5063 if (lengthptr == NULL)
5064 {
5065 /* If this is the first branch, the firstbyte and reqbyte values for the
5066 branch become the values for the regex. */
5067
5068 if (*last_branch != OP_ALT) /* No ALT has been inserted yet */
5069 {
5070 firstbyte = branchfirstbyte;
5071 reqbyte = branchreqbyte;
5072 }
5073
5074 /* If this is not the first branch, the first char and reqbyte have to
5075 match the values from all the previous branches, except that if the
5076 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5077 and we set REQ_VARY for the regex. */
5078
5079 else
5080 {
5081 /* If we previously had a firstbyte, but it doesn't match the new branch,
5082 we have to abandon the firstbyte for the regex, but if there was
5083 previously no reqbyte, it takes on the value of the old firstbyte. */
5084
5085 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5086 {
5087 if (reqbyte < 0) reqbyte = firstbyte;
5088 firstbyte = REQ_NONE;
5089 }
5090
5091 /* If we (now or from before) have no firstbyte, a firstbyte from the
5092 branch becomes a reqbyte if there isn't a branch reqbyte. */
5093
5094 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5095 branchreqbyte = branchfirstbyte;
5096
5097 /* Now ensure that the reqbytes match */
5098
5099 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5100 reqbyte = REQ_NONE;
5101 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5102 }
5103
5104 /* If lookbehind, check that this branch matches a fixed-length string, and
5105 put the length into the OP_REVERSE item. Temporarily mark the end of the
5106 branch with OP_END. */
5107
5108 if (lookbehind)
5109 {
5110 int fixed_length;
5111 *code = OP_END; /* Overwritten below by either OP_KET or OP_ALT */
5112 fixed_length = find_fixedlength(last_branch, options);
5113 DPRINTF(("fixed length = %d\n", fixed_length));
5114 if (fixed_length < 0)
5115 {
5116 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5117 *ptrptr = ptr;
5118 return FALSE;
5119 }
5120 PUT(reverse_count, 0, fixed_length);
5121 }
5122 }
5123
5124 /* Reached end of expression, either ')' or end of pattern. In the real
5125 compile phase, go back through the alternative branches and reverse the chain
5126 of offsets, with the field in the BRA item now becoming an offset to the
5127 first alternative. If there are no alternatives, it points to the end of the
5128 group. The length in the terminating ket is always the length of the whole
5129 bracketed item. If any of the ims options were changed inside the group,
5130 compile a resetting op-code following, except at the very end of the pattern.
5131 Return leaving the pointer at the terminating char. */
5132
5133 if (*ptr != '|')
5134 {
5135 if (lengthptr == NULL)
5136 {
5137 int branch_length = code - last_branch; /* Length of the final branch */
5138 do
5139 {
5140 int prev_length = GET(last_branch, 1); /* Backward offset stored while open */
5141 PUT(last_branch, 1, branch_length); /* Replace it with a forward offset */
5142 branch_length = prev_length;
5143 last_branch -= branch_length; /* Step back to the previous branch */
5144 }
5145 while (branch_length > 0); /* The BRA's link was zero, which ends the walk */
5146 }
5147
5148 /* Fill in the ket */
5149
5150 *code = OP_KET;
5151 PUT(code, 1, code - start_bracket);
5152 code += 1 + LINK_SIZE;
5153
5154 /* Resetting option if needed */
5155
5156 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5157 {
5158 *code++ = OP_OPT;
5159 *code++ = oldims;
5160 length += 2;
5161 }
5162
5163 /* Retain the highest bracket number, in case resetting was used. */
5164
5165 cd->bracount = max_bracount;
5166
5167 /* Set values to pass back */
5168
5169 *codeptr = code;
5170 *ptrptr = ptr;
5171 *firstbyteptr = firstbyte;
5172 *reqbyteptr = reqbyte;
5173 if (lengthptr != NULL)
5174 {
5175 if (OFLOW_MAX - *lengthptr < length) /* Adding length would exceed OFLOW_MAX */
5176 {
5177 *errorcodeptr = ERR20;
5178 return FALSE;
5179 }
5180 *lengthptr += length;
5181 }
5182 return TRUE;
5183 }
5184
5185 /* Another branch follows. In the pre-compile phase, we can move the code
5186 pointer back to where it was for the start of the first branch. (That is,
5187 pretend that each branch is the only one.)
5188
5189 In the real compile phase, insert an ALT node. Its length field points back
5190 to the previous branch while the bracket remains open. At the end the chain
5191 is reversed. It's done like this so that the start of the bracket has a
5192 zero offset until it is closed, making it possible to detect recursion. */
5193
5194 if (lengthptr != NULL)
5195 {
5196 code = *codeptr + 1 + LINK_SIZE + skipbytes; /* Back to the first branch's start */
5197 length += 1 + LINK_SIZE; /* Count the ALT item that is not emitted here */
5198 }
5199 else
5200 {
5201 *code = OP_ALT;
5202 PUT(code, 1, code - last_branch); /* Backward offset for now; reversed at the end */
5203 bc.current = last_branch = code;
5204 code += 1 + LINK_SIZE;
5205 }
5206
5207 ptr++; /* Move past the '|' */
5208 }
5209 /* Control never reaches here */
5210 }
5211
5212
5213
5214
5215 /*************************************************
5216 * Check for anchored expression *
5217 *************************************************/
5218
5219 /* Try to find out if this is an anchored regular expression. Consider each
5220 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5221 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5222 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5223 counts, since OP_CIRC can match in the middle.
5224
5225 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5226 This is the code for \G, which means "match at start of match position, taking
5227 into account the match offset".
5228
5229 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5230 because that will try the rest of the pattern at all possible matching points,
5231 so there is no point trying again.... er ....
5232
5233 .... except when the .* appears inside capturing parentheses, and there is a
5234 subsequent back reference to those parentheses. We haven't enough information
5235 to catch that case precisely.
5236
5237 At first, the best we could do was to detect when .* was in capturing brackets
5238 and the highest back reference was greater than or equal to that level.
5239 However, by keeping a bitmap of the first 31 back references, we can catch some
5240 of the more common cases more precisely.
5241
5242 Arguments:
5243 code points to start of expression (the bracket)
5244 options points to the options setting
5245 bracket_map a bitmap of which brackets we are inside while testing; this
5246 handles up to substring 31; after that we just have to take
5247 the less precise approach
5248 backref_map the back reference bitmap
5249
5250 Returns: TRUE or FALSE
5251 */
5252
5253 static BOOL
5254 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5255 unsigned int backref_map)
5256 {
5257 do {
5258 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5259 options, PCRE_MULTILINE, FALSE);
5260 register int op = *scode;
5261
5262 /* Non-capturing brackets */
5263
5264 if (op == OP_BRA)
5265 {
5266 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5267 }
5268
5269 /* Capturing brackets */
5270
5271 else if (op == OP_CBRA)
5272 {
5273 int n = GET2(scode, 1+LINK_SIZE);
5274 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5275 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5276 }
5277
5278 /* Other brackets */
5279
5280 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5281 {
5282 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5283 }
5284
5285 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5286 are or may be referenced. */
5287
5288 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5289 op == OP_TYPEPOSSTAR) &&
5290 (*options & PCRE_DOTALL) != 0)
5291 {
5292 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5293 }
5294
5295 /* Check for explicit anchoring */
5296
5297 else if (op != OP_SOD && op != OP_SOM &&
5298 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5299 return FALSE;
5300 code += GET(code, 1);
5301 }
5302 while (*code == OP_ALT); /* Loop for each alternative */
5303 return TRUE;
5304 }
5305
5306
5307
5308 /*************************************************
5309 * Check for starting with ^ or .* *
5310 *************************************************/
5311
5312 /* This is called to find out if every branch starts with ^ or .* so that
5313 "first char" processing can be done to speed things up in multiline
5314 matching and for non-DOTALL patterns that start with .* (which must start at
5315 the beginning or after \n). As in the case of is_anchored() (see above), we
5316 have to take account of back references to capturing brackets that contain .*
5317 because in that case we can't make the assumption.
5318
5319 Arguments:
5320 code points to start of expression (the bracket)
5321 bracket_map a bitmap of which brackets we are inside while testing; this
5322 handles up to substring 31; after that we just have to take
5323 the less precise approach
5324 backref_map the back reference bitmap
5325
5326 Returns: TRUE or FALSE
5327 */
5328
5329 static BOOL
5330 is_startline(const uschar *code, unsigned int bracket_map,
5331 unsigned int backref_map)
5332 {
5333 do {
5334 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5335 NULL, 0, FALSE);
5336 register int op = *scode;
5337
5338 /* Non-capturing brackets */
5339
5340 if (op == OP_BRA)
5341 {
5342 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5343 }
5344
5345 /* Capturing brackets */
5346
5347 else if (op == OP_CBRA)
5348 {
5349 int n = GET2(scode, 1+LINK_SIZE);
5350 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5351 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5352 }
5353
5354 /* Other brackets */
5355
5356 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5357 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5358
5359 /* .* means "start at start or after \n" if it isn't in brackets that
5360 may be referenced. */
5361
5362 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5363 {
5364 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5365 }
5366
5367 /* Check for explicit circumflex */
5368
5369 else if (op != OP_CIRC) return FALSE;
5370
5371 /* Move on to the next alternative */
5372
5373 code += GET(code, 1);
5374 }
5375 while (*code == OP_ALT); /* Loop for each alternative */
5376 return TRUE;
5377 }
5378
5379
5380
5381 /*************************************************
5382 * Check for asserted fixed first char *
5383 *************************************************/
5384
5385 /* During compilation, the "first char" settings from forward assertions are
5386 discarded, because they can cause conflicts with actual literals that follow.
5387 However, if we end up without a first char setting for an unanchored pattern,
5388 it is worth scanning the regex to see if there is an initial asserted first
5389 char. If all branches start with the same asserted char, or with a bracket all
5390 of whose alternatives start with the same asserted char (recurse ad lib), then
5391 we return that char, otherwise -1.
5392
5393 Arguments:
5394 code points to start of expression (the bracket)
5395 options pointer to the options (used to check casing changes)
5396 inassert TRUE if in an assertion
5397
5398 Returns: -1 or the fixed first char
5399 */
5400
5401 static int
5402 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5403 {
5404 register int c = -1;
5405 do {
5406 int d;
5407 const uschar *scode =
5408 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5409 register int op = *scode;
5410
5411 switch(op)
5412 {
5413 default:
5414 return -1;
5415
5416 case OP_BRA:
5417 case OP_CBRA:
5418 case OP_ASSERT:
5419 case OP_ONCE:
5420 case OP_COND:
5421 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5422 return -1;
5423 if (c < 0) c = d; else if (c != d) return -1;
5424 break;
5425
5426 case OP_EXACT: /* Fall through */
5427 scode += 2;
5428
5429 case OP_CHAR:
5430 case OP_CHARNC:
5431 case OP_PLUS:
5432 case OP_MINPLUS:
5433 case OP_POSPLUS:
5434 if (!inassert) return -1;
5435 if (c < 0)
5436 {
5437 c = scode[1];
5438 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5439 }
5440 else if (c != scode[1]) return -1;
5441 break;
5442 }
5443
5444 code += GET(code, 1);
5445 }
5446 while (*code == OP_ALT);
5447 return c;
5448 }
5449
5450
5451
5452 /*************************************************
5453 * Compile a Regular Expression *
5454 *************************************************/
5455
5456 /* This function takes a string and returns a pointer to a block of store
5457 holding a compiled version of the expression. The original API for this
5458 function had no error code return variable; it is retained for backwards
5459 compatibility. The new function is given a new name.
5460
5461 Arguments:
5462 pattern the regular expression
5463 options various option bits
5464 errorcodeptr pointer to error code variable (pcre_compile2() only)
5465 can be NULL if you don't want a code value
5466 errorptr pointer to pointer to error text
5467 erroroffset ptr offset in pattern where error was detected
5468 tables pointer to character tables or NULL
5469
5470 Returns: pointer to compiled data block, or NULL on error,
5471 with errorptr and erroroffset set
5472 */
5473
5474 PCRE_EXP_DEFN pcre *
5475 pcre_compile(const char *pattern, int options, const char **errorptr,
5476 int *erroroffset, const unsigned char *tables)
5477 {
5478 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5479 }
5480
5481
/* Compile a pattern in two passes. The first pass calls compile_regex() with
a pointer to "length" and compiles into the local cworkspace buffer, only to
work out how much memory is needed. A block of that size is then obtained via
pcre_malloc, and the second pass calls compile_regex() with a NULL length
pointer to generate the real code after the name table inside that block.
Post-processing fills in forward references, checks back references, and sets
the anchored / first byte / startline / required byte information.

On success the block is returned cast to pcre *. On failure NULL is returned,
*errorptr is set from error_texts[], and *errorcodeptr (when non-NULL) receives
the error code. */
5482 PCRE_EXP_DEFN pcre *
5483 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5484 const char **errorptr, int *erroroffset, const unsigned char *tables)
5485 {
5486 real_pcre *re;
5487 int length = 1; /* For final END opcode */
5488 int firstbyte, reqbyte, newline;
5489 int errorcode = 0;
5490 #ifdef SUPPORT_UTF8
5491 BOOL utf8;
5492 #endif
5493 size_t size;
5494 uschar *code;
5495 const uschar *codestart;
5496 const uschar *ptr;
5497 compile_data compile_block;
5498 compile_data *cd = &compile_block;
5499
5500 /* This space is used for "compiling" into during the first phase, when we are
5501 computing the amount of memory that is needed. Compiled items are thrown away
5502 as soon as possible, so that a fairly large buffer should be sufficient for
5503 this purpose. The same space is used in the second phase for remembering where
5504 to fill in forward references to subpatterns. */
5505
5506 uschar cworkspace[COMPILE_WORK_SIZE];
5507
5508
5509 /* Set this early so that early errors get offset 0. */
5510
5511 ptr = (const uschar *)pattern;
5512
5513 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5514 can do is just return NULL, but we can set a code value if there is a code
5515 pointer. */
5516
5517 if (errorptr == NULL)
5518 {
/* The code stored here is the literal 99, not one of the ERRnn names that
the rest of this function uses. */
5519 if (errorcodeptr != NULL) *errorcodeptr = 99;
5520 return NULL;
5521 }
5522
5523 *errorptr = NULL;
5524 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5525
5526 /* However, we can give a message for this error */
5527
5528 if (erroroffset == NULL)
5529 {
5530 errorcode = ERR16;
/* Jump to the second label: it lies after the store through erroroffset,
which is NULL on this path. */
5531 goto PCRE_EARLY_ERROR_RETURN2;
5532 }
5533
5534 *erroroffset = 0;
5535
5536 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5537
5538 #ifdef SUPPORT_UTF8
5539 utf8 = (options & PCRE_UTF8) != 0;
/* The result of _pcre_valid_utf8() is stored directly in *erroroffset, and a
value >= 0 is treated as an error (ERR44). The second label is used so that
the stored value is not overwritten by the pattern-pointer offset. When the
check is run and passes, *erroroffset is left holding the negative result;
nothing on the success path below resets it. */
5540 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5541 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5542 {
5543 errorcode = ERR44;
5544 goto PCRE_EARLY_ERROR_RETURN2;
5545 }
5546 #else
5547 if ((options & PCRE_UTF8) != 0)
5548 {
5549 errorcode = ERR32;
5550 goto PCRE_EARLY_ERROR_RETURN;
5551 }
5552 #endif
5553
5554 if ((options & ~PUBLIC_OPTIONS) != 0)
5555 {
5556 errorcode = ERR17;
5557 goto PCRE_EARLY_ERROR_RETURN;
5558 }
5559
5560 /* Set up pointers to the individual character tables */
5561
5562 if (tables == NULL) tables = _pcre_default_tables;
5563 cd->lcc = tables + lcc_offset;
5564 cd->fcc = tables + fcc_offset;
5565 cd->cbits = tables + cbits_offset;
5566 cd->ctypes = tables + ctypes_offset;
5567
5568 /* Handle different types of newline. The three bits give seven cases. The
5569 current code allows for fixed one- or two-byte sequences, plus "any" and
5570 "anycrlf". */
5571
5572 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5573 {
5574 case 0: newline = NEWLINE; break; /* Compile-time default */
5575 case PCRE_NEWLINE_CR: newline = '\r'; break;
5576 case PCRE_NEWLINE_LF: newline = '\n'; break;
5577 case PCRE_NEWLINE_CR+
5578 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5579 case PCRE_NEWLINE_ANY: newline = -1; break;
5580 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5581 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5582 }
5583
/* Decode the value chosen above: -2 is "anycrlf", any other negative value is
"any", and a non-negative value is a fixed sequence. */
5584 if (newline == -2)
5585 {
5586 cd->nltype = NLTYPE_ANYCRLF;
5587 }
5588 else if (newline < 0)
5589 {
5590 cd->nltype = NLTYPE_ANY;
5591 }
5592 else
5593 {
5594 cd->nltype = NLTYPE_FIXED;
/* A value above 255 is a two-byte sequence packed as (first << 8) | second,
as done for CR+LF in the switch above. */
5595 if (newline > 255)
5596 {
5597 cd->nllen = 2;
5598 cd->nl[0] = (newline >> 8) & 255;
5599 cd->nl[1] = newline & 255;
5600 }
5601 else
5602 {
5603 cd->nllen = 1;
5604 cd->nl[0] = newline;
5605 }
5606 }
5607
5608 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5609 references to help in deciding whether (.*) can be treated as anchored or not.
5610 */
5611
5612 cd->top_backref = 0;
5613 cd->backref_map = 0;
5614
5615 /* Reflect pattern for debugging output */
5616
5617 DPRINTF(("------------------------------------------------------------------\n"));
5618 DPRINTF(("%s\n", pattern));
5619
5620 /* Pretend to compile the pattern while actually just accumulating the length
5621 of memory required. This behaviour is triggered by passing a non-NULL final
5622 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5623 to compile parts of the pattern into; the compiled code is discarded when it is
5624 no longer needed, so hopefully this workspace will never overflow, though there
5625 is a test for its doing so. */
5626
5627 cd->bracount = 0;
5628 cd->names_found = 0;
5629 cd->name_entry_size = 0;
5630 cd->name_table = NULL;
5631 cd->start_workspace = cworkspace;
5632 cd->start_code = cworkspace;
5633 cd->hwm = cworkspace;
5634 cd->start_pattern = (const uschar *)pattern;
5635 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5636 cd->req_varyopt = 0;
5637 cd->nopartial = FALSE;
5638 cd->external_options = options;
5639
5640 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5641 don't need to look at the result of the function here. The initial options have
5642 been put into the cd block so that they can be changed if an option setting is
5643 found within the regex right at the beginning. Bringing initial option settings
5644 outside can help speed up starting point checks. */
5645
5646 code = cworkspace;
5647 *code = OP_BRA;
5648 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5649 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5650 &length);
5651 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5652
5653 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5654 cd->hwm - cworkspace));
5655
5656 if (length > MAX_PATTERN_SIZE)
5657 {
5658 errorcode = ERR20;
5659 goto PCRE_EARLY_ERROR_RETURN;
5660 }
5661
5662 /* Compute the size of data block needed and get it, either from malloc or
5663 externally provided function. Integer overflow should no longer be possible
5664 because nowadays we limit the maximum value of cd->names_found and
5665 cd->name_entry_size. */
5666
5667 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5668 re = (real_pcre *)(pcre_malloc)(size);
5669
5670 if (re == NULL)
5671 {
5672 errorcode = ERR21;
5673 goto PCRE_EARLY_ERROR_RETURN;
5674 }
5675
5676 /* Put in the magic number, and save the sizes, initial options, and character
5677 table pointer. NULL is used for the default character tables. The nullpad field
5678 is at the end; it's there to help in the case when a regex compiled on a system
5679 with 4-byte pointers is run on another with 8-byte pointers. */
5680
5681 re->magic_number = MAGIC_NUMBER;
5682 re->size = size;
5683 re->options = cd->external_options;
5684 re->dummy1 = 0;
5685 re->first_byte = 0;
5686 re->req_byte = 0;
5687 re->name_table_offset = sizeof(real_pcre);
5688 re->name_entry_size = cd->name_entry_size;
5689 re->name_count = cd->names_found;
5690 re->ref_count = 0;
5691 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5692 re->nullpad = NULL;
5693
5694 /* The starting points of the name/number translation table and of the code are
5695 passed around in the compile data block. The start/end pattern and initial
5696 options are already set from the pre-compile phase, as is the name_entry_size
5697 field. Reset the bracket count and the names_found field. Also reset the hwm
5698 field; this time it's used for remembering forward references to subpatterns.
5699 */
5700
5701 cd->bracount = 0;
5702 cd->names_found = 0;
5703 cd->name_table = (uschar *)re + re->name_table_offset;
5704 codestart = cd->name_table + re->name_entry_size * re->name_count;
5705 cd->start_code = codestart;
5706 cd->hwm = cworkspace;
5707 cd->req_varyopt = 0;
5708 cd->nopartial = FALSE;
5709
5710 /* Set up a starting, non-extracting bracket, then compile the expression. On
5711 error, errorcode will be set non-zero, so we don't need to look at the result
5712 of the function here. */
5713
5714 ptr = (const uschar *)pattern;
5715 code = (uschar *)codestart;
5716 *code = OP_BRA;
5717 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5718 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5719 re->top_bracket = cd->bracount;
5720 re->top_backref = cd->top_backref;
5721
5722 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5723
5724 /* If not reached end of pattern on success, there's an excess bracket. */
5725
5726 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5727
5728 /* Fill in the terminating state and check for disastrous overflow, but
5729 if debugging, leave the test till after things are printed out. */
5730
5731 *code++ = OP_END;
5732
5733 #ifndef DEBUG
5734 if (code - codestart > length) errorcode = ERR23;
5735 #endif
5736
5737 /* Fill in any forward references that are required. */
5738
/* The workspace is popped LINK_SIZE bytes at a time. Each entry is an offset
into the compiled code; the value currently stored at that offset is read as a
group number, find_bracket() locates that group, and the stored value is then
overwritten with the group's offset from codestart. ERR53 is given if no such
group is found. */
5739 while (errorcode == 0 && cd->hwm > cworkspace)
5740 {
5741 int offset, recno;
5742 const uschar *groupptr;
5743 cd->hwm -= LINK_SIZE;
5744 offset = GET(cd->hwm, 0);
5745 recno = GET(codestart, offset);
5746 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5747 if (groupptr == NULL) errorcode = ERR53;
5748 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5749 }
5750
5751 /* Give an error if there's back reference to a non-existent capturing
5752 subpattern. */
5753
5754 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5755
5756 /* Failed to compile, or error while post-processing */
5757
5758 if (errorcode != 0)
5759 {
5760 (pcre_free)(re);
/* Both early-error labels sit inside this block, after the pcre_free() call,
so the gotos above do not free re. The second label additionally skips the
store through erroroffset. */
5761 PCRE_EARLY_ERROR_RETURN:
5762 *erroroffset = ptr - (const uschar *)pattern;
5763 PCRE_EARLY_ERROR_RETURN2:
5764 *errorptr = error_texts[errorcode];
5765 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5766 return NULL;
5767 }
5768
5769 /* If the anchored option was not passed, set the flag if we can determine that
5770 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5771 as starting with .* when DOTALL is set).
5772
5773 Otherwise, if we know what the first byte has to be, save it, because that
5774 speeds up unanchored matches no end. If not, see if we can set the
5775 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5776 start with ^. and also when all branches start with .* for non-DOTALL matches.
5777 */
5778
5779 if ((re->options & PCRE_ANCHORED) == 0)
5780 {
5781 int temp_options = re->options; /* May get changed during these scans */
5782 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5783 re->options |= PCRE_ANCHORED;
5784 else
5785 {
5786 if (firstbyte < 0)
5787 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5788 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5789 {
5790 int ch = firstbyte & 255;
5791 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5792 cd->fcc[ch] == ch)? ch : firstbyte;
5793 re->options |= PCRE_FIRSTSET;
5794 }
5795 else if (is_startline(codestart, 0, cd->backref_map))
5796 re->options |= PCRE_STARTLINE;
5797 }
5798 }
5799
5800 /* For an anchored pattern, we use the "required byte" only if it follows a
5801 variable length item in the regex. Remove the caseless flag for non-caseable
5802 bytes. */
5803
5804 if (reqbyte >= 0 &&
5805 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5806 {
5807 int ch = reqbyte & 255;
5808 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5809 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5810 re->options |= PCRE_REQCHSET;
5811 }
5812
5813 /* Print out the compiled data if debugging is enabled. This is never the
5814 case when building a production library. */
5815
5816 #ifdef DEBUG
5817
5818 printf("Length = %d top_bracket = %d top_backref = %d\n",
5819 length, re->top_bracket, re->top_backref);
5820
5821 if (re->options != 0)
5822 {
5823 printf("%s%s%s%s%s%s%s%s%s\n",
5824 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5825 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5826 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5827 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5828 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5829 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5830 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5831 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5832 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5833 }
5834
5835 if ((re->options & PCRE_FIRSTSET) != 0)
5836 {
5837 int ch = re->first_byte & 255;
5838 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5839 "" : " (caseless)";
5840 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5841 else printf("First char = \\x%02x%s\n", ch, caseless);
5842 }
5843
5844 if ((re->options & PCRE_REQCHSET) != 0)
5845 {
5846 int ch = re->req_byte & 255;
5847 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5848 "" : " (caseless)";
5849 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5850 else printf("Req char = \\x%02x%s\n", ch, caseless);
5851 }
5852
5853 pcre_printint(re, stdout, TRUE);
5854
5855 /* This check is done here in the debugging case so that the code that
5856 was compiled can be seen. */
5857
5858 if (code - codestart > length)
5859 {
5860 (pcre_free)(re);
5861 *errorptr = error_texts[ERR23];
5862 *erroroffset = ptr - (uschar *)pattern;
5863 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5864 return NULL;
5865 }
5866 #endif /* DEBUG */
5867
5868 return (pcre *)re;
5869 }
5870
5871 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12