/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 202 - (show annotations) (download)
Fri Aug 3 09:44:26 2007 UTC (6 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 185235 byte(s)
Refactor the integer overflow testing so as to avoid imposing an artificial 
limit on the size of subpatterns.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: it sets bit number (b)
in the byte-array bitmap (a). Both arguments and the whole expansion are
parenthesized so that an expression argument such as SETBIT(map, base + n)
selects the intended byte and bit. Note that (b) is evaluated twice, so it must
not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
/* check_escape() indexes this table with (c - '0'), so the first entry is for
'0' and the last for 'z'. Entries written as -ESC_xxx are returned as negative
"special escape" codes; entries written as ESC_e, ESC_n, etc. and character
literals are returned as positive data values. */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
/* check_escape() indexes this table with (c - 0x48); the hex value in the
comment at the start of each row is the character code of that row's first
entry. */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Tables of names of POSIX character classes and their lengths. The list is
144 terminated by a zero length entry. The first three must be alpha, lower, upper,
145 as this is assumed for handling case independence. */
146
/* The 14 class names. This array itself has no terminating entry; the end of
the list is marked only in posix_name_lengths below. */
147 static const char *const posix_names[] = {
148 "alpha", "lower", "upper",
149 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
150 "print", "punct", "space", "word", "xdigit" };
151
/* Length of each name above, in the same order, followed by the terminating
zero entry (15 values in all). */
152 static const uschar posix_name_lengths[] = {
153 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
154
155 /* Table of class bit maps for each POSIX class. Each class is formed from a
156 base map, with an optional addition or removal of another map. Then, for some
157 classes, there is some additional tweaking: for [:blank:] the vertical space
158 characters are removed, and for [:alpha:] and [:alnum:] the underscore
159 character is removed. The triples in the table consist of the base map offset,
160 second map offset or -1 if no second map, and a non-negative value for map
161 addition or a negative value for map subtraction (if there are two maps). The
162 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
163 remove vertical space characters, 2 => remove underscore. */
164
/* One triple (three consecutive ints) per class. The per-row comments list the
classes in the same order as posix_names, so the triple for the class at index
n presumably starts at element 3*n - confirm against the code that reads this
table. */
165 static const int posix_class_maps[] = {
166 cbit_word, cbit_digit, -2, /* alpha */
167 cbit_lower, -1, 0, /* lower */
168 cbit_upper, -1, 0, /* upper */
169 cbit_word, -1, 2, /* alnum - word without underscore */
170 cbit_print, cbit_cntrl, 0, /* ascii */
171 cbit_space, -1, 1, /* blank - a GNU extension */
172 cbit_cntrl, -1, 0, /* cntrl */
173 cbit_digit, -1, 0, /* digit */
174 cbit_graph, -1, 0, /* graph */
175 cbit_print, -1, 0, /* print */
176 cbit_punct, -1, 0, /* punct */
177 cbit_space, -1, 0, /* space */
178 cbit_word, -1, 0, /* word - a Perl extension */
179 cbit_xdigit,-1, 0 /* xdigit */
180 };
181
182
/* Two-level stringizing: XSTRING(x) macro-expands x first and then turns the
result into a string literal, so that the numeric values of MAX_NAME_SIZE and
MAX_NAME_COUNT can be embedded in the messages below. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used.

The table is indexed by error number (the comments mark every fifth entry). The
array of pointers is itself const, like posix_names, because it is never
modified after initialization. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",  /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
};
263
264
265 /* Table to identify digits and hex digits. This is used when compiling
266 patterns. Note that the tables in chartables are dependent on the locale, and
267 may mark arbitrary characters as digits - but the PCRE compiling code expects
268 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
269 a private table here. It costs 256 bytes, but it is a lot faster than doing
270 character value tests (at least in some simple cases I timed), and in some
271 applications one wants PCRE to compile efficiently as well as match
272 efficiently.
273
274 For convenience, we use the same bit definitions as in chartables:
275
276 0x04 decimal digit
277 0x08 hexadecimal digit
278
279 Then we can use ctype_digit and ctype_xdigit in the code. */
280
281 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* Indexed directly by character value (256 entries). 0x0c (0x04 | 0x08) marks
the characters 0-9 as both decimal and hexadecimal digits; 0x08 marks a-f and
A-F as hexadecimal digits only. */
282 static const unsigned char digitab[] =
283 {
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
290 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
291 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
292 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
296 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
302 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
315 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
316
317 #else /* This is the "abnormal" case, for EBCDIC systems */
/* Same meaning as the ASCII table, laid out for EBCDIC character codes. The
hex value at the right of every second row is the code of that row's first
entry. */
318 static const unsigned char digitab[] =
319 {
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
321 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
336 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
344 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
350 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
351 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
352
/* Indexed by EBCDIC character value. check_escape() tests entries against the
mask 0x0E to decide whether the character after a backslash is alphameric. */
353 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
354 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
355 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
356 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
358 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
360 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
361 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
362 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
363 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
365 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
367 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
370 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
371 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
372 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
373 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
374 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
375 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
376 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
377 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
378 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
379 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
380 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
381 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
382 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
383 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
384 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
385 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
386 #endif
387
388
389 /* Forward declaration to allow mutual recursion */
390
391 static BOOL
392 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
393 int *, int *, branch_chain *, compile_data *, int *);
394
395
396
397 /*************************************************
398 * Handle escapes *
399 *************************************************/
400
401 /* This function is called when a \ has been encountered. It either returns a
402 positive value for a simple escape such as \n, or a negative value which
403 encodes one of the more complicated things such as \d. A backreference to group
404 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
405 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
406 ptr is pointing at the \. On exit, it is on the final character of the escape
407 sequence.
408
409 Arguments:
410 ptrptr points to the pattern position pointer
411 errorcodeptr points to the errorcode variable
412 bracount number of previous extracting brackets
413 options the options bits
414 isclass TRUE if inside a character class
415
416 Returns: zero or positive => a data character
417 negative => a special escape sequence
418 on error, errorptr is set
419 */
420
421 static int
422 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
423 int options, BOOL isclass)
424 {
425 BOOL utf8 = (options & PCRE_UTF8) != 0;
426 const uschar *ptr = *ptrptr + 1;
427 int c, i;
428
429 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
430 ptr--; /* Set pointer back to the last byte */
431
432 /* If backslash is at the end of the pattern, it's an error. */
433
434 if (c == 0) *errorcodeptr = ERR1;
435
436 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
437 a table. A non-zero result is something that can be returned immediately.
438 Otherwise further processing may be required. */
439
440 #ifndef EBCDIC /* ASCII coding */
441 else if (c < '0' || c > 'z') {} /* Not alphameric */
442 else if ((i = escapes[c - '0']) != 0) c = i;
443
444 #else /* EBCDIC coding */
445 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
446 else if ((i = escapes[c - 0x48]) != 0) c = i;
447 #endif
448
449 /* Escapes that need further processing, or are illegal. */
450
451 else
452 {
453 const uschar *oldptr;
454 BOOL braced, negated;
455
456 switch (c)
457 {
458 /* A number of Perl escapes are not handled by PCRE. We give an explicit
459 error. */
460
461 case 'l':
462 case 'L':
463 case 'N':
464 case 'u':
465 case 'U':
466 *errorcodeptr = ERR37;
467 break;
468
469 /* \g must be followed by a number, either plain or braced. If positive, it
470 is an absolute backreference. If negative, it is a relative backreference.
471 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
472 reference to a named group. This is part of Perl's movement towards a
473 unified syntax for back references. As this is synonymous with \k{name}, we
474 fudge it up by pretending it really was \k. */
475
476 case 'g':
477 if (ptr[1] == '{')
478 {
479 const uschar *p;
480 for (p = ptr+2; *p != 0 && *p != '}'; p++)
481 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
482 if (*p != 0 && *p != '}')
483 {
484 c = -ESC_k;
485 break;
486 }
487 braced = TRUE;
488 ptr++;
489 }
490 else braced = FALSE;
491
492 if (ptr[1] == '-')
493 {
494 negated = TRUE;
495 ptr++;
496 }
497 else negated = FALSE;
498
499 c = 0;
500 while ((digitab[ptr[1]] & ctype_digit) != 0)
501 c = c * 10 + *(++ptr) - '0';
502
503 if (c == 0 || (braced && *(++ptr) != '}'))
504 {
505 *errorcodeptr = ERR57;
506 return 0;
507 }
508
509 if (negated)
510 {
511 if (c > bracount)
512 {
513 *errorcodeptr = ERR15;
514 return 0;
515 }
516 c = bracount - (c - 1);
517 }
518
519 c = -(ESC_REF + c);
520 break;
521
522 /* The handling of escape sequences consisting of a string of digits
523 starting with one that is not zero is not straightforward. By experiment,
524 the way Perl works seems to be as follows:
525
526 Outside a character class, the digits are read as a decimal number. If the
527 number is less than 10, or if there are that many previous extracting
528 left brackets, then it is a back reference. Otherwise, up to three octal
529 digits are read to form an escaped byte. Thus \123 is likely to be octal
530 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
531 value is greater than 377, the least significant 8 bits are taken. Inside a
532 character class, \ followed by a digit is always an octal number. */
533
534 case '1': case '2': case '3': case '4': case '5':
535 case '6': case '7': case '8': case '9':
536
537 if (!isclass)
538 {
539 oldptr = ptr;
540 c -= '0';
541 while ((digitab[ptr[1]] & ctype_digit) != 0)
542 c = c * 10 + *(++ptr) - '0';
543 if (c < 10 || c <= bracount)
544 {
545 c = -(ESC_REF + c);
546 break;
547 }
548 ptr = oldptr; /* Put the pointer back and fall through */
549 }
550
551 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
552 generates a binary zero byte and treats the digit as a following literal.
553 Thus we have to pull back the pointer by one. */
554
555 if ((c = *ptr) >= '8')
556 {
557 ptr--;
558 c = 0;
559 break;
560 }
561
562 /* \0 always starts an octal number, but we may drop through to here with a
563 larger first octal digit. The original code used just to take the least
564 significant 8 bits of octal numbers (I think this is what early Perls used
565 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
566 than 3 octal digits. */
567
568 case '0':
569 c -= '0';
570 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
571 c = c * 8 + *(++ptr) - '0';
572 if (!utf8 && c > 255) *errorcodeptr = ERR51;
573 break;
574
575 /* \x is complicated. \x{ddd} is a character number which can be greater
576 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
577 treated as a data character. */
578
579 case 'x':
580 if (ptr[1] == '{')
581 {
582 const uschar *pt = ptr + 2;
583 int count = 0;
584
585 c = 0;
586 while ((digitab[*pt] & ctype_xdigit) != 0)
587 {
588 register int cc = *pt++;
589 if (c == 0 && cc == '0') continue; /* Leading zeroes */
590 count++;
591
592 #ifndef EBCDIC /* ASCII coding */
593 if (cc >= 'a') cc -= 32; /* Convert to upper case */
594 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
595 #else /* EBCDIC coding */
596 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
597 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
598 #endif
599 }
600
601 if (*pt == '}')
602 {
603 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
604 ptr = pt;
605 break;
606 }
607
608 /* If the sequence of hex digits does not end with '}', then we don't
609 recognize this construct; fall through to the normal \x handling. */
610 }
611
612 /* Read just a single-byte hex-defined char */
613
614 c = 0;
615 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
616 {
617 int cc; /* Some compilers don't like ++ */
618 cc = *(++ptr); /* in initializers */
619 #ifndef EBCDIC /* ASCII coding */
620 if (cc >= 'a') cc -= 32; /* Convert to upper case */
621 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
622 #else /* EBCDIC coding */
623 if (cc <= 'z') cc += 64; /* Convert to upper case */
624 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
625 #endif
626 }
627 break;
628
629 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
630 This coding is ASCII-specific, but then the whole concept of \cx is
631 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
632
633 case 'c':
634 c = *(++ptr);
635 if (c == 0)
636 {
637 *errorcodeptr = ERR2;
638 return 0;
639 }
640
641 #ifndef EBCDIC /* ASCII coding */
642 if (c >= 'a' && c <= 'z') c -= 32;
643 c ^= 0x40;
644 #else /* EBCDIC coding */
645 if (c >= 'a' && c <= 'z') c += 64;
646 c ^= 0xC0;
647 #endif
648 break;
649
650 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
651 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
652 for Perl compatibility, it is a literal. This code looks a bit odd, but
653 there used to be some cases other than the default, and there may be again
654 in future, so I haven't "optimized" it. */
655
656 default:
657 if ((options & PCRE_EXTRA) != 0) switch(c)
658 {
659 default:
660 *errorcodeptr = ERR3;
661 break;
662 }
663 break;
664 }
665 }
666
667 *ptrptr = ptr;
668 return c;
669 }
670
671
672
673 #ifdef SUPPORT_UCP
674 /*************************************************
675 * Handle \P and \p *
676 *************************************************/
677
678 /* This function is called after \P or \p has been encountered, provided that
679 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
680 pointing at the P or p. On exit, it is pointing at the final character of the
681 escape sequence.
682
683 Argument:
684 ptrptr points to the pattern position pointer
685 negptr points to a boolean that is set TRUE for negation else FALSE
686 dptr points to an int that is set to the detailed property value
687 errorcodeptr points to the error code variable
688
689 Returns: type value from ucp_type_table, or -1 for an invalid type
690 */
691
692 static int
693 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
694 {
695 int c, i, bot, top;
696 const uschar *ptr = *ptrptr;
697 char name[32];
698
699 c = *(++ptr);
700 if (c == 0) goto ERROR_RETURN;
701
702 *negptr = FALSE;
703
704 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
705 negation. */
706
707 if (c == '{')
708 {
709 if (ptr[1] == '^')
710 {
711 *negptr = TRUE;
712 ptr++;
713 }
714 for (i = 0; i < (int)sizeof(name) - 1; i++)
715 {
716 c = *(++ptr);
717 if (c == 0) goto ERROR_RETURN;
718 if (c == '}') break;
719 name[i] = c;
720 }
721 if (c !='}') goto ERROR_RETURN;
722 name[i] = 0;
723 }
724
725 /* Otherwise there is just one following character */
726
727 else
728 {
729 name[0] = c;
730 name[1] = 0;
731 }
732
733 *ptrptr = ptr;
734
735 /* Search for a recognized property name using binary chop */
736
737 bot = 0;
738 top = _pcre_utt_size;
739
740 while (bot < top)
741 {
742 i = (bot + top) >> 1;
743 c = strcmp(name, _pcre_utt[i].name);
744 if (c == 0)
745 {
746 *dptr = _pcre_utt[i].value;
747 return _pcre_utt[i].type;
748 }
749 if (c > 0) bot = i + 1; else top = i;
750 }
751
752 *errorcodeptr = ERR47;
753 *ptrptr = ptr;
754 return -1;
755
756 ERROR_RETURN:
757 *errorcodeptr = ERR46;
758 *ptrptr = ptr;
759 return -1;
760 }
761 #endif
762
763
764
765
766 /*************************************************
767 * Check for counted repeat *
768 *************************************************/
769
770 /* This function is called when a '{' is encountered in a place where it might
771 start a quantifier. It looks ahead to see if it really is a quantifier or not.
772 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
773 where the ddds are digits.
774
775 Arguments:
776 p pointer to the first char after '{'
777
778 Returns: TRUE or FALSE
779 */
780
781 static BOOL
782 is_counted_repeat(const uschar *p)
783 {
784 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
785 while ((digitab[*p] & ctype_digit) != 0) p++;
786 if (*p == '}') return TRUE;
787
788 if (*p++ != ',') return FALSE;
789 if (*p == '}') return TRUE;
790
791 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
792 while ((digitab[*p] & ctype_digit) != 0) p++;
793
794 return (*p == '}');
795 }
796
797
798
799 /*************************************************
800 * Read repeat counts *
801 *************************************************/
802
803 /* Read an item of the form {n,m} and return the values. This is called only
804 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
805 so the syntax is guaranteed to be correct, but we need to check the values.
806
807 Arguments:
808 p pointer to first char after '{'
809 minp pointer to int for min
810 maxp pointer to int for max
811 returned as -1 if no max
812 errorcodeptr points to error code variable
813
814 Returns: pointer to '}' on success;
815 current ptr on error, with errorcodeptr set non-zero
816 */
817
818 static const uschar *
819 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
820 {
821 int min = 0;
822 int max = -1;
823
824 /* Read the minimum value and do a paranoid check: a negative value indicates
825 an integer overflow. */
826
827 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
828 if (min < 0 || min > 65535)
829 {
830 *errorcodeptr = ERR5;
831 return p;
832 }
833
834 /* Read the maximum value if there is one, and again do a paranoid on its size.
835 Also, max must not be less than min. */
836
837 if (*p == '}') max = min; else
838 {
839 if (*(++p) != '}')
840 {
841 max = 0;
842 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
843 if (max < 0 || max > 65535)
844 {
845 *errorcodeptr = ERR5;
846 return p;
847 }
848 if (max < min)
849 {
850 *errorcodeptr = ERR4;
851 return p;
852 }
853 }
854 }
855
856 /* Fill in the required variables, and pass back the pointer to the terminating
857 '}'. */
858
859 *minp = min;
860 *maxp = max;
861 return p;
862 }
863
864
865
866 /*************************************************
867 * Find forward referenced subpattern *
868 *************************************************/
869
870 /* This function scans along a pattern's text looking for capturing
871 subpatterns, and counting them. If it finds a named pattern that matches the
872 name it is given, it returns its number. Alternatively, if the name is NULL, it
873 returns when it reaches a given numbered subpattern. This is used for forward
874 references to subpatterns. We know that if (?P< is encountered, the name will
875 be terminated by '>' because that is checked in the first pass.
876
877 Arguments:
878 ptr current position in the pattern
879 count current count of capturing parens so far encountered
880 name name to seek, or NULL if seeking a numbered subpattern
881 lorn name length, or subpattern number if name is NULL
882 xmode TRUE if we are in /x mode
883
884 Returns: the number of the named subpattern, or -1 if not found
885 */
886
887 static int
888 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
889 BOOL xmode)
890 {
891 const uschar *thisname;
892
893 for (; *ptr != 0; ptr++)
894 {
895 int term;
896
897 /* Skip over backslashed characters and also entire \Q...\E */
898
899 if (*ptr == '\\')
900 {
901 if (*(++ptr) == 0) return -1;
902 if (*ptr == 'Q') for (;;)
903 {
904 while (*(++ptr) != 0 && *ptr != '\\');
905 if (*ptr == 0) return -1;
906 if (*(++ptr) == 'E') break;
907 }
908 continue;
909 }
910
911 /* Skip over character classes */
912
913 if (*ptr == '[')
914 {
915 while (*(++ptr) != ']')
916 {
917 if (*ptr == '\\')
918 {
919 if (*(++ptr) == 0) return -1;
920 if (*ptr == 'Q') for (;;)
921 {
922 while (*(++ptr) != 0 && *ptr != '\\');
923 if (*ptr == 0) return -1;
924 if (*(++ptr) == 'E') break;
925 }
926 continue;
927 }
928 }
929 continue;
930 }
931
932 /* Skip comments in /x mode */
933
934 if (xmode && *ptr == '#')
935 {
936 while (*(++ptr) != 0 && *ptr != '\n');
937 if (*ptr == 0) return -1;
938 continue;
939 }
940
941 /* An opening parens must now be a real metacharacter */
942
943 if (*ptr != '(') continue;
944 if (ptr[1] != '?')
945 {
946 count++;
947 if (name == NULL && count == lorn) return count;
948 continue;
949 }
950
951 ptr += 2;
952 if (*ptr == 'P') ptr++; /* Allow optional P */
953
954 /* We have to disambiguate (?<! and (?<= from (?<name> */
955
956 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
957 *ptr != '\'')
958 continue;
959
960 count++;
961
962 if (name == NULL && count == lorn) return count;
963 term = *ptr++;
964 if (term == '<') term = '>';
965 thisname = ptr;
966 while (*ptr != term) ptr++;
967 if (name != NULL && lorn == ptr - thisname &&
968 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
969 return count;
970 }
971
972 return -1;
973 }
974
975
976
977 /*************************************************
978 * Find first significant op code *
979 *************************************************/
980
981 /* This is called by several functions that scan a compiled expression looking
982 for a fixed first character, or an anchoring op code etc. It skips over things
983 that do not influence this. For some calls, a change of option is important.
984 For some calls, it makes sense to skip negative forward and all backward
985 assertions, and also the \b assertion; for others it does not.
986
987 Arguments:
988 code pointer to the start of the group
989 options pointer to external options
990 optbit the option bit whose changing is significant, or
991 zero if none are
992 skipassert TRUE if certain assertions are to be skipped
993
994 Returns: pointer to the first significant opcode
995 */
996
997 static const uschar*
998 first_significant_code(const uschar *code, int *options, int optbit,
999 BOOL skipassert)
1000 {
1001 for (;;)
1002 {
1003 switch ((int)*code)
1004 {
1005 case OP_OPT:
1006 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1007 *options = (int)code[1];
1008 code += 2;
1009 break;
1010
1011 case OP_ASSERT_NOT:
1012 case OP_ASSERTBACK:
1013 case OP_ASSERTBACK_NOT:
1014 if (!skipassert) return code;
1015 do code += GET(code, 1); while (*code == OP_ALT);
1016 code += _pcre_OP_lengths[*code];
1017 break;
1018
1019 case OP_WORD_BOUNDARY:
1020 case OP_NOT_WORD_BOUNDARY:
1021 if (!skipassert) return code;
1022 /* Fall through */
1023
1024 case OP_CALLOUT:
1025 case OP_CREF:
1026 case OP_RREF:
1027 case OP_DEF:
1028 code += _pcre_OP_lengths[*code];
1029 break;
1030
1031 default:
1032 return code;
1033 }
1034 }
1035 /* Control never reaches here */
1036 }
1037
1038
1039
1040
1041 /*************************************************
1042 * Find the fixed length of a pattern *
1043 *************************************************/
1044
1045 /* Scan a pattern and compute the fixed length of subject that will match it,
1046 if the length is fixed. This is needed for dealing with backward assertions.
1047 In UTF8 mode, the result is in characters rather than bytes.
1048
1049 Arguments:
1050 code points to the start of the pattern (the bracket)
1051 options the compiling options
1052
1053 Returns: the fixed length, or -1 if there is no fixed length,
1054 or -2 if \C was encountered
1055 */
1056
1057 static int
1058 find_fixedlength(uschar *code, int options)
1059 {
1060 int length = -1;
1061
1062 register int branchlength = 0;
1063 register uschar *cc = code + 1 + LINK_SIZE;
1064
1065 /* Scan along the opcodes for this branch. If we get to the end of the
1066 branch, check the length against that of the other branches. */
1067
1068 for (;;)
1069 {
1070 int d;
1071 register int op = *cc;
1072
1073 switch (op)
1074 {
1075 case OP_CBRA:
1076 case OP_BRA:
1077 case OP_ONCE:
1078 case OP_COND:
1079 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1080 if (d < 0) return d;
1081 branchlength += d;
1082 do cc += GET(cc, 1); while (*cc == OP_ALT);
1083 cc += 1 + LINK_SIZE;
1084 break;
1085
1086 /* Reached end of a branch; if it's a ket it is the end of a nested
1087 call. If it's ALT it is an alternation in a nested call. If it is
1088 END it's the end of the outer call. All can be handled by the same code. */
1089
1090 case OP_ALT:
1091 case OP_KET:
1092 case OP_KETRMAX:
1093 case OP_KETRMIN:
1094 case OP_END:
1095 if (length < 0) length = branchlength;
1096 else if (length != branchlength) return -1;
1097 if (*cc != OP_ALT) return length;
1098 cc += 1 + LINK_SIZE;
1099 branchlength = 0;
1100 break;
1101
1102 /* Skip over assertive subpatterns */
1103
1104 case OP_ASSERT:
1105 case OP_ASSERT_NOT:
1106 case OP_ASSERTBACK:
1107 case OP_ASSERTBACK_NOT:
1108 do cc += GET(cc, 1); while (*cc == OP_ALT);
1109 /* Fall through */
1110
1111 /* Skip over things that don't match chars */
1112
1113 case OP_REVERSE:
1114 case OP_CREF:
1115 case OP_RREF:
1116 case OP_DEF:
1117 case OP_OPT:
1118 case OP_CALLOUT:
1119 case OP_SOD:
1120 case OP_SOM:
1121 case OP_EOD:
1122 case OP_EODN:
1123 case OP_CIRC:
1124 case OP_DOLL:
1125 case OP_NOT_WORD_BOUNDARY:
1126 case OP_WORD_BOUNDARY:
1127 cc += _pcre_OP_lengths[*cc];
1128 break;
1129
1130 /* Handle literal characters */
1131
1132 case OP_CHAR:
1133 case OP_CHARNC:
1134 case OP_NOT:
1135 branchlength++;
1136 cc += 2;
1137 #ifdef SUPPORT_UTF8
1138 if ((options & PCRE_UTF8) != 0)
1139 {
1140 while ((*cc & 0xc0) == 0x80) cc++;
1141 }
1142 #endif
1143 break;
1144
1145 /* Handle exact repetitions. The count is already in characters, but we
1146 need to skip over a multibyte character in UTF8 mode. */
1147
1148 case OP_EXACT:
1149 branchlength += GET2(cc,1);
1150 cc += 4;
1151 #ifdef SUPPORT_UTF8
1152 if ((options & PCRE_UTF8) != 0)
1153 {
1154 while((*cc & 0x80) == 0x80) cc++;
1155 }
1156 #endif
1157 break;
1158
1159 case OP_TYPEEXACT:
1160 branchlength += GET2(cc,1);
1161 cc += 4;
1162 break;
1163
1164 /* Handle single-char matchers */
1165
1166 case OP_PROP:
1167 case OP_NOTPROP:
1168 cc += 2;
1169 /* Fall through */
1170
1171 case OP_NOT_DIGIT:
1172 case OP_DIGIT:
1173 case OP_NOT_WHITESPACE:
1174 case OP_WHITESPACE:
1175 case OP_NOT_WORDCHAR:
1176 case OP_WORDCHAR:
1177 case OP_ANY:
1178 branchlength++;
1179 cc++;
1180 break;
1181
1182 /* The single-byte matcher isn't allowed */
1183
1184 case OP_ANYBYTE:
1185 return -2;
1186
1187 /* Check a class for variable quantification */
1188
1189 #ifdef SUPPORT_UTF8
1190 case OP_XCLASS:
1191 cc += GET(cc, 1) - 33;
1192 /* Fall through */
1193 #endif
1194
1195 case OP_CLASS:
1196 case OP_NCLASS:
1197 cc += 33;
1198
1199 switch (*cc)
1200 {
1201 case OP_CRSTAR:
1202 case OP_CRMINSTAR:
1203 case OP_CRQUERY:
1204 case OP_CRMINQUERY:
1205 return -1;
1206
1207 case OP_CRRANGE:
1208 case OP_CRMINRANGE:
1209 if (GET2(cc,1) != GET2(cc,3)) return -1;
1210 branchlength += GET2(cc,1);
1211 cc += 5;
1212 break;
1213
1214 default:
1215 branchlength++;
1216 }
1217 break;
1218
1219 /* Anything else is variable length */
1220
1221 default:
1222 return -1;
1223 }
1224 }
1225 /* Control never gets here */
1226 }
1227
1228
1229
1230
1231 /*************************************************
1232 * Scan compiled regex for numbered bracket *
1233 *************************************************/
1234
1235 /* This little function scans through a compiled pattern until it finds a
1236 capturing bracket with the given number.
1237
1238 Arguments:
1239 code points to start of expression
1240 utf8 TRUE in UTF-8 mode
1241 number the required bracket number
1242
1243 Returns: pointer to the opcode for the bracket, or NULL if not found
1244 */
1245
1246 static const uschar *
1247 find_bracket(const uschar *code, BOOL utf8, int number)
1248 {
1249 for (;;)
1250 {
1251 register int c = *code;
1252 if (c == OP_END) return NULL;
1253
1254 /* XCLASS is used for classes that cannot be represented just by a bit
1255 map. This includes negated single high-valued characters. The length in
1256 the table is zero; the actual length is stored in the compiled code. */
1257
1258 if (c == OP_XCLASS) code += GET(code, 1);
1259
1260 /* Handle capturing bracket */
1261
1262 else if (c == OP_CBRA)
1263 {
1264 int n = GET2(code, 1+LINK_SIZE);
1265 if (n == number) return (uschar *)code;
1266 code += _pcre_OP_lengths[c];
1267 }
1268
1269 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1270 a multi-byte character. The length in the table is a minimum, so we have to
1271 arrange to skip the extra bytes. */
1272
1273 else
1274 {
1275 code += _pcre_OP_lengths[c];
1276 #ifdef SUPPORT_UTF8
1277 if (utf8) switch(c)
1278 {
1279 case OP_CHAR:
1280 case OP_CHARNC:
1281 case OP_EXACT:
1282 case OP_UPTO:
1283 case OP_MINUPTO:
1284 case OP_POSUPTO:
1285 case OP_STAR:
1286 case OP_MINSTAR:
1287 case OP_POSSTAR:
1288 case OP_PLUS:
1289 case OP_MINPLUS:
1290 case OP_POSPLUS:
1291 case OP_QUERY:
1292 case OP_MINQUERY:
1293 case OP_POSQUERY:
1294 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1295 break;
1296 }
1297 #endif
1298 }
1299 }
1300 }
1301
1302
1303
1304 /*************************************************
1305 * Scan compiled regex for recursion reference *
1306 *************************************************/
1307
1308 /* This little function scans through a compiled pattern until it finds an
1309 instance of OP_RECURSE.
1310
1311 Arguments:
1312 code points to start of expression
1313 utf8 TRUE in UTF-8 mode
1314
1315 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1316 */
1317
1318 static const uschar *
1319 find_recurse(const uschar *code, BOOL utf8)
1320 {
1321 for (;;)
1322 {
1323 register int c = *code;
1324 if (c == OP_END) return NULL;
1325 if (c == OP_RECURSE) return code;
1326
1327 /* XCLASS is used for classes that cannot be represented just by a bit
1328 map. This includes negated single high-valued characters. The length in
1329 the table is zero; the actual length is stored in the compiled code. */
1330
1331 if (c == OP_XCLASS) code += GET(code, 1);
1332
1333 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1334 that are followed by a character may be followed by a multi-byte character.
1335 The length in the table is a minimum, so we have to arrange to skip the extra
1336 bytes. */
1337
1338 else
1339 {
1340 code += _pcre_OP_lengths[c];
1341 #ifdef SUPPORT_UTF8
1342 if (utf8) switch(c)
1343 {
1344 case OP_CHAR:
1345 case OP_CHARNC:
1346 case OP_EXACT:
1347 case OP_UPTO:
1348 case OP_MINUPTO:
1349 case OP_POSUPTO:
1350 case OP_STAR:
1351 case OP_MINSTAR:
1352 case OP_POSSTAR:
1353 case OP_PLUS:
1354 case OP_MINPLUS:
1355 case OP_POSPLUS:
1356 case OP_QUERY:
1357 case OP_MINQUERY:
1358 case OP_POSQUERY:
1359 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1360 break;
1361 }
1362 #endif
1363 }
1364 }
1365 }
1366
1367
1368
1369 /*************************************************
1370 * Scan compiled branch for non-emptiness *
1371 *************************************************/
1372
1373 /* This function scans through a branch of a compiled pattern to see whether it
1374 can match the empty string or not. It is called from could_be_empty()
1375 below and from compile_branch() when checking for an unlimited repeat of a
1376 group that can match nothing. Note that first_significant_code() skips over
1377 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1378 struck an inner bracket whose current branch will already have been scanned.
1379
1380 Arguments:
1381 code points to start of search
1382 endcode points to where to stop
1383 utf8 TRUE if in UTF8 mode
1384
1385 Returns: TRUE if what is matched could be empty
1386 */
1387
1388 static BOOL
1389 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1390 {
1391 register int c;
1392 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1393 code < endcode;
1394 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1395 {
1396 const uschar *ccode;
1397
1398 c = *code;
1399
1400 /* Groups with zero repeats can of course be empty; skip them. */
1401
1402 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1403 {
1404 code += _pcre_OP_lengths[c];
1405 do code += GET(code, 1); while (*code == OP_ALT);
1406 c = *code;
1407 continue;
1408 }
1409
1410 /* For other groups, scan the branches. */
1411
1412 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1413 {
1414 BOOL empty_branch;
1415 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1416
1417 /* Scan a closed bracket */
1418
1419 empty_branch = FALSE;
1420 do
1421 {
1422 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1423 empty_branch = TRUE;
1424 code += GET(code, 1);
1425 }
1426 while (*code == OP_ALT);
1427 if (!empty_branch) return FALSE; /* All branches are non-empty */
1428 c = *code;
1429 continue;
1430 }
1431
1432 /* Handle the other opcodes */
1433
1434 switch (c)
1435 {
1436 /* Check for quantifiers after a class */
1437
1438 #ifdef SUPPORT_UTF8
1439 case OP_XCLASS:
1440 ccode = code + GET(code, 1);
1441 goto CHECK_CLASS_REPEAT;
1442 #endif
1443
1444 case OP_CLASS:
1445 case OP_NCLASS:
1446 ccode = code + 33;
1447
1448 #ifdef SUPPORT_UTF8
1449 CHECK_CLASS_REPEAT:
1450 #endif
1451
1452 switch (*ccode)
1453 {
1454 case OP_CRSTAR: /* These could be empty; continue */
1455 case OP_CRMINSTAR:
1456 case OP_CRQUERY:
1457 case OP_CRMINQUERY:
1458 break;
1459
1460 default: /* Non-repeat => class must match */
1461 case OP_CRPLUS: /* These repeats aren't empty */
1462 case OP_CRMINPLUS:
1463 return FALSE;
1464
1465 case OP_CRRANGE:
1466 case OP_CRMINRANGE:
1467 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1468 break;
1469 }
1470 break;
1471
1472 /* Opcodes that must match a character */
1473
1474 case OP_PROP:
1475 case OP_NOTPROP:
1476 case OP_EXTUNI:
1477 case OP_NOT_DIGIT:
1478 case OP_DIGIT:
1479 case OP_NOT_WHITESPACE:
1480 case OP_WHITESPACE:
1481 case OP_NOT_WORDCHAR:
1482 case OP_WORDCHAR:
1483 case OP_ANY:
1484 case OP_ANYBYTE:
1485 case OP_CHAR:
1486 case OP_CHARNC:
1487 case OP_NOT:
1488 case OP_PLUS:
1489 case OP_MINPLUS:
1490 case OP_POSPLUS:
1491 case OP_EXACT:
1492 case OP_NOTPLUS:
1493 case OP_NOTMINPLUS:
1494 case OP_NOTPOSPLUS:
1495 case OP_NOTEXACT:
1496 case OP_TYPEPLUS:
1497 case OP_TYPEMINPLUS:
1498 case OP_TYPEPOSPLUS:
1499 case OP_TYPEEXACT:
1500 return FALSE;
1501
1502 /* End of branch */
1503
1504 case OP_KET:
1505 case OP_KETRMAX:
1506 case OP_KETRMIN:
1507 case OP_ALT:
1508 return TRUE;
1509
1510 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1511 MINUPTO, and POSUPTO may be followed by a multibyte character */
1512
1513 #ifdef SUPPORT_UTF8
1514 case OP_STAR:
1515 case OP_MINSTAR:
1516 case OP_POSSTAR:
1517 case OP_QUERY:
1518 case OP_MINQUERY:
1519 case OP_POSQUERY:
1520 case OP_UPTO:
1521 case OP_MINUPTO:
1522 case OP_POSUPTO:
1523 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1524 break;
1525 #endif
1526 }
1527 }
1528
1529 return TRUE;
1530 }
1531
1532
1533
1534 /*************************************************
1535 * Scan compiled regex for non-emptiness *
1536 *************************************************/
1537
1538 /* This function is called to check for left recursive calls. We want to check
1539 the current branch of the current pattern to see if it could match the empty
1540 string. If it could, we must look outwards for branches at other levels,
1541 stopping when we pass beyond the bracket which is the subject of the recursion.
1542
1543 Arguments:
1544 code points to start of the recursion
1545 endcode points to where to stop (current RECURSE item)
1546 bcptr points to the chain of current (unclosed) branch starts
1547 utf8 TRUE if in UTF-8 mode
1548
1549 Returns: TRUE if what is matched could be empty
1550 */
1551
1552 static BOOL
1553 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1554 BOOL utf8)
1555 {
1556 while (bcptr != NULL && bcptr->current >= code)
1557 {
1558 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1559 bcptr = bcptr->outer;
1560 }
1561 return TRUE;
1562 }
1563
1564
1565
1566 /*************************************************
1567 * Check for POSIX class syntax *
1568 *************************************************/
1569
1570 /* This function is called when the sequence "[:" or "[." or "[=" is
1571 encountered in a character class. It checks whether this is followed by an
1572 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1573 ".]" or "=]".
1574
1575 Argument:
1576 ptr pointer to the initial [
1577 endptr where to return the end pointer
1578 cd pointer to compile data
1579
1580 Returns: TRUE or FALSE
1581 */
1582
1583 static BOOL
1584 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1585 {
1586 int terminator; /* Don't combine these lines; the Solaris cc */
1587 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1588 if (*(++ptr) == '^') ptr++;
1589 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1590 if (*ptr == terminator && ptr[1] == ']')
1591 {
1592 *endptr = ptr;
1593 return TRUE;
1594 }
1595 return FALSE;
1596 }
1597
1598
1599
1600
1601 /*************************************************
1602 * Check POSIX class name *
1603 *************************************************/
1604
1605 /* This function is called to check the name given in a POSIX-style class entry
1606 such as [:alnum:].
1607
1608 Arguments:
1609 ptr points to the first letter
1610 len the length of the name
1611
1612 Returns: a value representing the name, or -1 if unknown
1613 */
1614
1615 static int
1616 check_posix_name(const uschar *ptr, int len)
1617 {
1618 register int yield = 0;
1619 while (posix_name_lengths[yield] != 0)
1620 {
1621 if (len == posix_name_lengths[yield] &&
1622 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1623 yield++;
1624 }
1625 return -1;
1626 }
1627
1628
1629 /*************************************************
1630 * Adjust OP_RECURSE items in repeated group *
1631 *************************************************/
1632
1633 /* OP_RECURSE items contain an offset from the start of the regex to the group
1634 that is referenced. This means that groups can be replicated for fixed
1635 repetition simply by copying (because the recursion is allowed to refer to
1636 earlier groups that are outside the current group). However, when a group is
1637 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1638 it, after it has been compiled. This means that any OP_RECURSE items within it
1639 that refer to the group itself or any contained groups have to have their
1640 offsets adjusted. That one of the jobs of this function. Before it is called,
1641 the partially compiled regex must be temporarily terminated with OP_END.
1642
1643 This function has been extended with the possibility of forward references for
1644 recursions and subroutine calls. It must also check the list of such references
1645 for the group we are dealing with. If it finds that one of the recursions in
1646 the current group is on this list, it adjusts the offset in the list, not the
1647 value in the reference (which is a group number).
1648
1649 Arguments:
1650 group points to the start of the group
1651 adjust the amount by which the group is to be moved
1652 utf8 TRUE in UTF-8 mode
1653 cd contains pointers to tables etc.
1654 save_hwm the hwm forward reference pointer at the start of the group
1655
1656 Returns: nothing
1657 */
1658
1659 static void
1660 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1661 uschar *save_hwm)
1662 {
1663 uschar *ptr = group;
1664 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1665 {
1666 int offset;
1667 uschar *hc;
1668
1669 /* See if this recursion is on the forward reference list. If so, adjust the
1670 reference. */
1671
1672 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1673 {
1674 offset = GET(hc, 0);
1675 if (cd->start_code + offset == ptr + 1)
1676 {
1677 PUT(hc, 0, offset + adjust);
1678 break;
1679 }
1680 }
1681
1682 /* Otherwise, adjust the recursion offset if it's after the start of this
1683 group. */
1684
1685 if (hc >= cd->hwm)
1686 {
1687 offset = GET(ptr, 1);
1688 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1689 }
1690
1691 ptr += 1 + LINK_SIZE;
1692 }
1693 }
1694
1695
1696
1697 /*************************************************
1698 * Insert an automatic callout point *
1699 *************************************************/
1700
1701 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1702 callout points before each pattern item.
1703
1704 Arguments:
1705 code current code pointer
1706 ptr current pattern pointer
1707 cd pointers to tables etc
1708
1709 Returns: new code pointer
1710 */
1711
1712 static uschar *
1713 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1714 {
1715 *code++ = OP_CALLOUT;
1716 *code++ = 255;
1717 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1718 PUT(code, LINK_SIZE, 0); /* Default length */
1719 return code + 2*LINK_SIZE;
1720 }
1721
1722
1723
1724 /*************************************************
1725 * Complete a callout item *
1726 *************************************************/
1727
1728 /* A callout item contains the length of the next item in the pattern, which
1729 we can't fill in till after we have reached the relevant point. This is used
1730 for both automatic and manual callouts.
1731
1732 Arguments:
1733 previous_callout points to previous callout item
1734 ptr current pattern pointer
1735 cd pointers to tables etc
1736
1737 Returns: nothing
1738 */
1739
1740 static void
1741 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1742 {
1743 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1744 PUT(previous_callout, 2 + LINK_SIZE, length);
1745 }
1746
1747
1748
1749 #ifdef SUPPORT_UCP
1750 /*************************************************
1751 * Get othercase range *
1752 *************************************************/
1753
1754 /* This function is passed the start and end of a class range, in UTF-8 mode
1755 with UCP support. It searches up the characters, looking for internal ranges of
1756 characters in the "other" case. Each call returns the next one, updating the
1757 start address.
1758
1759 Arguments:
1760 cptr points to starting character value; updated
1761 d end value
1762 ocptr where to put start of othercase range
1763 odptr where to put end of othercase range
1764
1765 Yield: TRUE when range returned; FALSE when no more
1766 */
1767
1768 static BOOL
1769 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1770 unsigned int *odptr)
1771 {
1772 unsigned int c, othercase, next;
1773
1774 for (c = *cptr; c <= d; c++)
1775 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1776
1777 if (c > d) return FALSE;
1778
1779 *ocptr = othercase;
1780 next = othercase + 1;
1781
1782 for (++c; c <= d; c++)
1783 {
1784 if (_pcre_ucp_othercase(c) != next) break;
1785 next++;
1786 }
1787
1788 *odptr = next - 1;
1789 *cptr = c;
1790
1791 return TRUE;
1792 }
1793 #endif /* SUPPORT_UCP */
1794
1795
1796
1797 /*************************************************
1798 * Check if auto-possessifying is possible *
1799 *************************************************/
1800
1801 /* This function is called for unlimited repeats of certain items, to see
1802 whether the next thing could possibly match the repeated item. If not, it makes
1803 sense to automatically possessify the repeated item.
1804
1805 Arguments:
1806 op_code the repeated op code
1807 this data for this item, depends on the opcode
1808 utf8 TRUE in UTF-8 mode
1809 utf8_char used for utf8 character bytes, NULL if not relevant
1810 ptr next character in pattern
1811 options options bits
1812 cd contains pointers to tables etc.
1813
1814 Returns: TRUE if possessifying is wanted
1815 */
1816
1817 static BOOL
1818 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1819 const uschar *ptr, int options, compile_data *cd)
1820 {
1821 int next;
1822
1823 /* Skip whitespace and comments in extended mode */
1824
1825 if ((options & PCRE_EXTENDED) != 0)
1826 {
1827 for (;;)
1828 {
1829 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1830 if (*ptr == '#')
1831 {
1832 while (*(++ptr) != 0)
1833 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1834 }
1835 else break;
1836 }
1837 }
1838
1839 /* If the next item is one that we can handle, get its value. A non-negative
1840 value is a character, a negative value is an escape value. */
1841
1842 if (*ptr == '\\')
1843 {
1844 int temperrorcode = 0;
1845 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1846 if (temperrorcode != 0) return FALSE;
1847 ptr++; /* Point after the escape sequence */
1848 }
1849
1850 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1851 {
1852 #ifdef SUPPORT_UTF8
1853 if (utf8) { GETCHARINC(next, ptr); } else
1854 #endif
1855 next = *ptr++;
1856 }
1857
1858 else return FALSE;
1859
1860 /* Skip whitespace and comments in extended mode */
1861
1862 if ((options & PCRE_EXTENDED) != 0)
1863 {
1864 for (;;)
1865 {
1866 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1867 if (*ptr == '#')
1868 {
1869 while (*(++ptr) != 0)
1870 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1871 }
1872 else break;
1873 }
1874 }
1875
1876 /* If the next thing is itself optional, we have to give up. */
1877
1878 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1879 return FALSE;
1880
1881 /* Now compare the next item with the previous opcode. If the previous is a
1882 positive single character match, "item" either contains the character or, if
1883 "item" is greater than 127 in utf8 mode, the character's bytes are in
1884 utf8_char. */
1885
1886
1887 /* Handle cases when the next item is a character. */
1888
1889 if (next >= 0) switch(op_code)
1890 {
1891 case OP_CHAR:
1892 #ifdef SUPPORT_UTF8
1893 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894 #endif
1895 return item != next;
1896
1897 /* For CHARNC (caseless character) we must check the other case. If we have
1898 Unicode property support, we can use it to test the other case of
1899 high-valued characters. */
1900
1901 case OP_CHARNC:
1902 #ifdef SUPPORT_UTF8
1903 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1904 #endif
1905 if (item == next) return FALSE;
1906 #ifdef SUPPORT_UTF8
1907 if (utf8)
1908 {
1909 unsigned int othercase;
1910 if (next < 128) othercase = cd->fcc[next]; else
1911 #ifdef SUPPORT_UCP
1912 othercase = _pcre_ucp_othercase((unsigned int)next);
1913 #else
1914 othercase = NOTACHAR;
1915 #endif
1916 return (unsigned int)item != othercase;
1917 }
1918 else
1919 #endif /* SUPPORT_UTF8 */
1920 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1921
1922 /* For OP_NOT, "item" must be a single-byte character. */
1923
1924 case OP_NOT:
1925 if (next < 0) return FALSE; /* Not a character */
1926 if (item == next) return TRUE;
1927 if ((options & PCRE_CASELESS) == 0) return FALSE;
1928 #ifdef SUPPORT_UTF8
1929 if (utf8)
1930 {
1931 unsigned int othercase;
1932 if (next < 128) othercase = cd->fcc[next]; else
1933 #ifdef SUPPORT_UCP
1934 othercase = _pcre_ucp_othercase(next);
1935 #else
1936 othercase = NOTACHAR;
1937 #endif
1938 return (unsigned int)item == othercase;
1939 }
1940 else
1941 #endif /* SUPPORT_UTF8 */
1942 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1943
1944 case OP_DIGIT:
1945 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1946
1947 case OP_NOT_DIGIT:
1948 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1949
1950 case OP_WHITESPACE:
1951 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1952
1953 case OP_NOT_WHITESPACE:
1954 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1955
1956 case OP_WORDCHAR:
1957 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1958
1959 case OP_NOT_WORDCHAR:
1960 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1961
1962 case OP_HSPACE:
1963 case OP_NOT_HSPACE:
1964 switch(next)
1965 {
1966 case 0x09:
1967 case 0x20:
1968 case 0xa0:
1969 case 0x1680:
1970 case 0x180e:
1971 case 0x2000:
1972 case 0x2001:
1973 case 0x2002:
1974 case 0x2003:
1975 case 0x2004:
1976 case 0x2005:
1977 case 0x2006:
1978 case 0x2007:
1979 case 0x2008:
1980 case 0x2009:
1981 case 0x200A:
1982 case 0x202f:
1983 case 0x205f:
1984 case 0x3000:
1985 return op_code != OP_HSPACE;
1986 default:
1987 return op_code == OP_HSPACE;
1988 }
1989
1990 case OP_VSPACE:
1991 case OP_NOT_VSPACE:
1992 switch(next)
1993 {
1994 case 0x0a:
1995 case 0x0b:
1996 case 0x0c:
1997 case 0x0d:
1998 case 0x85:
1999 case 0x2028:
2000 case 0x2029:
2001 return op_code != OP_VSPACE;
2002 default:
2003 return op_code == OP_VSPACE;
2004 }
2005
2006 default:
2007 return FALSE;
2008 }
2009
2010
2011 /* Handle the case when the next item is \d, \s, etc. */
2012
2013 switch(op_code)
2014 {
2015 case OP_CHAR:
2016 case OP_CHARNC:
2017 #ifdef SUPPORT_UTF8
2018 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2019 #endif
2020 switch(-next)
2021 {
2022 case ESC_d:
2023 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2024
2025 case ESC_D:
2026 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2027
2028 case ESC_s:
2029 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2030
2031 case ESC_S:
2032 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2033
2034 case ESC_w:
2035 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2036
2037 case ESC_W:
2038 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2039
2040 case ESC_h:
2041 case ESC_H:
2042 switch(item)
2043 {
2044 case 0x09:
2045 case 0x20:
2046 case 0xa0:
2047 case 0x1680:
2048 case 0x180e:
2049 case 0x2000:
2050 case 0x2001:
2051 case 0x2002:
2052 case 0x2003:
2053 case 0x2004:
2054 case 0x2005:
2055 case 0x2006:
2056 case 0x2007:
2057 case 0x2008:
2058 case 0x2009:
2059 case 0x200A:
2060 case 0x202f:
2061 case 0x205f:
2062 case 0x3000:
2063 return -next != ESC_h;
2064 default:
2065 return -next == ESC_h;
2066 }
2067
2068 case ESC_v:
2069 case ESC_V:
2070 switch(item)
2071 {
2072 case 0x0a:
2073 case 0x0b:
2074 case 0x0c:
2075 case 0x0d:
2076 case 0x85:
2077 case 0x2028:
2078 case 0x2029:
2079 return -next != ESC_v;
2080 default:
2081 return -next == ESC_v;
2082 }
2083
2084 default:
2085 return FALSE;
2086 }
2087
2088 case OP_DIGIT:
2089 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2090 next == -ESC_h || next == -ESC_v;
2091
2092 case OP_NOT_DIGIT:
2093 return next == -ESC_d;
2094
2095 case OP_WHITESPACE:
2096 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2097
2098 case OP_NOT_WHITESPACE:
2099 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2100
2101 case OP_HSPACE:
2102 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2103
2104 case OP_NOT_HSPACE:
2105 return next == -ESC_h;
2106
2107 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2108 case OP_VSPACE:
2109 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2110
2111 case OP_NOT_VSPACE:
2112 return next == -ESC_v;
2113
2114 case OP_WORDCHAR:
2115 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2116
2117 case OP_NOT_WORDCHAR:
2118 return next == -ESC_w || next == -ESC_d;
2119
2120 default:
2121 return FALSE;
2122 }
2123
2124 /* Control does not reach here */
2125 }
2126
2127
2128
2129 /*************************************************
2130 * Compile one branch *
2131 *************************************************/
2132
2133 /* Scan the pattern, compiling it into a vector. If the options are
2134 changed during the branch, the pointer is used to change the external options
2135 bits. This function is used during the pre-compile phase when we are trying
2136 to find out the amount of memory needed, as well as during the real compile
2137 phase. The value of lengthptr distinguishes the two phases.
2138
2139 Arguments:
2140 optionsptr pointer to the option bits
2141 codeptr points to the pointer to the current code point
2142 ptrptr points to the current pattern pointer
2143 errorcodeptr points to error code variable
2144 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2145 reqbyteptr set to the last literal character required, else < 0
2146 bcptr points to current branch chain
2147 cd contains pointers to tables etc.
2148 lengthptr NULL during the real compile phase
2149 points to length accumulator during pre-compile phase
2150
2151 Returns: TRUE on success
2152 FALSE, with *errorcodeptr set non-zero on error
2153 */
2154
2155 static BOOL
2156 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2157 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2158 compile_data *cd, int *lengthptr)
2159 {
2160 int repeat_type, op_type;
2161 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2162 int bravalue = 0;
2163 int greedy_default, greedy_non_default;
2164 int firstbyte, reqbyte;
2165 int zeroreqbyte, zerofirstbyte;
2166 int req_caseopt, reqvary, tempreqvary;
2167 int options = *optionsptr;
2168 int after_manual_callout = 0;
2169 int length_prevgroup = 0;
2170 register int c;
2171 register uschar *code = *codeptr;
2172 uschar *last_code = code;
2173 uschar *orig_code = code;
2174 uschar *tempcode;
2175 BOOL inescq = FALSE;
2176 BOOL groupsetfirstbyte = FALSE;
2177 const uschar *ptr = *ptrptr;
2178 const uschar *tempptr;
2179 uschar *previous = NULL;
2180 uschar *previous_callout = NULL;
2181 uschar *save_hwm = NULL;
2182 uschar classbits[32];
2183
2184 #ifdef SUPPORT_UTF8
2185 BOOL class_utf8;
2186 BOOL utf8 = (options & PCRE_UTF8) != 0;
2187 uschar *class_utf8data;
2188 uschar utf8_char[6];
2189 #else
2190 BOOL utf8 = FALSE;
2191 uschar *utf8_char = NULL;
2192 #endif
2193
2194 #ifdef DEBUG
2195 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2196 #endif
2197
2198 /* Set up the default and non-default settings for greediness */
2199
2200 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2201 greedy_non_default = greedy_default ^ 1;
2202
2203 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2204 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2205 matches a non-fixed first char; reqbyte just remains unset if we never
2206 find one.
2207
2208 When we hit a repeat whose minimum is zero, we may have to adjust these values
2209 to take the zero repeat into account. This is implemented by setting them to
2210 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2211 item types that can be repeated set these backoff variables appropriately. */
2212
2213 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2214
2215 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2216 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2217 value > 255. It is added into the firstbyte or reqbyte variables to record the
2218 case status of the value. This is used only for ASCII characters. */
2219
2220 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2221
2222 /* Switch on next character until the end of the branch */
2223
2224 for (;; ptr++)
2225 {
2226 BOOL negate_class;
2227 BOOL possessive_quantifier;
2228 BOOL is_quantifier;
2229 BOOL is_recurse;
2230 BOOL reset_bracount;
2231 int class_charcount;
2232 int class_lastchar;
2233 int newoptions;
2234 int recno;
2235 int refsign;
2236 int skipbytes;
2237 int subreqbyte;
2238 int subfirstbyte;
2239 int terminator;
2240 int mclength;
2241 uschar mcbuffer[8];
2242
2243 /* Get next byte in the pattern */
2244
2245 c = *ptr;
2246
2247 /* If we are in the pre-compile phase, accumulate the length used for the
2248 previous cycle of this loop. */
2249
2250 if (lengthptr != NULL)
2251 {
2252 #ifdef DEBUG
2253 if (code > cd->hwm) cd->hwm = code; /* High water info */
2254 #endif
2255 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2256 {
2257 *errorcodeptr = ERR52;
2258 goto FAILED;
2259 }
2260
2261 /* There is at least one situation where code goes backwards: this is the
2262 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2263 the class is simply eliminated. However, it is created first, so we have to
2264 allow memory for it. Therefore, don't ever reduce the length at this point.
2265 */
2266
2267 if (code < last_code) code = last_code;
2268
2269 /* Paranoid check for integer overflow */
2270
2271 if (OFLOW_MAX - *lengthptr < code - last_code)
2272 {
2273 *errorcodeptr = ERR20;
2274 goto FAILED;
2275 }
2276
2277 *lengthptr += code - last_code;
2278 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2279
2280 /* If "previous" is set and it is not at the start of the work space, move
2281 it back to there, in order to avoid filling up the work space. Otherwise,
2282 if "previous" is NULL, reset the current code pointer to the start. */
2283
2284 if (previous != NULL)
2285 {
2286 if (previous > orig_code)
2287 {
2288 memmove(orig_code, previous, code - previous);
2289 code -= previous - orig_code;
2290 previous = orig_code;
2291 }
2292 }
2293 else code = orig_code;
2294
2295 /* Remember where this code item starts so we can pick up the length
2296 next time round. */
2297
2298 last_code = code;
2299 }
2300
2301 /* In the real compile phase, just check the workspace used by the forward
2302 reference list. */
2303
2304 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2305 {
2306 *errorcodeptr = ERR52;
2307 goto FAILED;
2308 }
2309
2310 /* If in \Q...\E, check for the end; if not, we have a literal */
2311
2312 if (inescq && c != 0)
2313 {
2314 if (c == '\\' && ptr[1] == 'E')
2315 {
2316 inescq = FALSE;
2317 ptr++;
2318 continue;
2319 }
2320 else
2321 {
2322 if (previous_callout != NULL)
2323 {
2324 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2325 complete_callout(previous_callout, ptr, cd);
2326 previous_callout = NULL;
2327 }
2328 if ((options & PCRE_AUTO_CALLOUT) != 0)
2329 {
2330 previous_callout = code;
2331 code = auto_callout(code, ptr, cd);
2332 }
2333 goto NORMAL_CHAR;
2334 }
2335 }
2336
2337 /* Fill in length of a previous callout, except when the next thing is
2338 a quantifier. */
2339
2340 is_quantifier = c == '*' || c == '+' || c == '?' ||
2341 (c == '{' && is_counted_repeat(ptr+1));
2342
2343 if (!is_quantifier && previous_callout != NULL &&
2344 after_manual_callout-- <= 0)
2345 {
2346 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2347 complete_callout(previous_callout, ptr, cd);
2348 previous_callout = NULL;
2349 }
2350
2351 /* In extended mode, skip white space and comments */
2352
2353 if ((options & PCRE_EXTENDED) != 0)
2354 {
2355 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2356 if (c == '#')
2357 {
2358 while (*(++ptr) != 0)
2359 {
2360 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2361 }
2362 if (*ptr != 0) continue;
2363
2364 /* Else fall through to handle end of string */
2365 c = 0;
2366 }
2367 }
2368
2369 /* No auto callout for quantifiers. */
2370
2371 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2372 {
2373 previous_callout = code;
2374 code = auto_callout(code, ptr, cd);
2375 }
2376
2377 switch(c)
2378 {
2379 /* ===================================================================*/
2380 case 0: /* The branch terminates at string end */
2381 case '|': /* or | or ) */
2382 case ')':
2383 *firstbyteptr = firstbyte;
2384 *reqbyteptr = reqbyte;
2385 *codeptr = code;
2386 *ptrptr = ptr;
2387 if (lengthptr != NULL)
2388 {
2389 if (OFLOW_MAX - *lengthptr < code - last_code)
2390 {
2391 *errorcodeptr = ERR20;
2392 goto FAILED;
2393 }
2394 *lengthptr += code - last_code; /* To include callout length */
2395 DPRINTF((">> end branch\n"));
2396 }
2397 return TRUE;
2398
2399
2400 /* ===================================================================*/
2401 /* Handle single-character metacharacters. In multiline mode, ^ disables
2402 the setting of any following char as a first character. */
2403
2404 case '^':
2405 if ((options & PCRE_MULTILINE) != 0)
2406 {
2407 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2408 }
2409 previous = NULL;
2410 *code++ = OP_CIRC;
2411 break;
2412
2413 case '$':
2414 previous = NULL;
2415 *code++ = OP_DOLL;
2416 break;
2417
2418 /* There can never be a first char if '.' is first, whatever happens about
2419 repeats. The value of reqbyte doesn't change either. */
2420
2421 case '.':
2422 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2423 zerofirstbyte = firstbyte;
2424 zeroreqbyte = reqbyte;
2425 previous = code;
2426 *code++ = OP_ANY;
2427 break;
2428
2429
2430 /* ===================================================================*/
2431 /* Character classes. If the included characters are all < 256, we build a
2432 32-byte bitmap of the permitted characters, except in the special case
2433 where there is only one such character. For negated classes, we build the
2434 map as usual, then invert it at the end. However, we use a different opcode
2435 so that data characters > 255 can be handled correctly.
2436
2437 If the class contains characters outside the 0-255 range, a different
2438 opcode is compiled. It may optionally have a bit map for characters < 256,
2439 but those above are explicitly listed afterwards. A flag byte tells
2440 whether the bitmap is present, and whether this is a negated class or not.
2441 */
2442
2443 case '[':
2444 previous = code;
2445
2446 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2447 they are encountered at the top level, so we'll do that too. */
2448
2449 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2450 check_posix_syntax(ptr, &tempptr, cd))
2451 {
2452 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2453 goto FAILED;
2454 }
2455
2456 /* If the first character is '^', set the negation flag and skip it. */
2457
2458 if ((c = *(++ptr)) == '^')
2459 {
2460 negate_class = TRUE;
2461 c = *(++ptr);
2462 }
2463 else
2464 {
2465 negate_class = FALSE;
2466 }
2467
2468 /* Keep a count of chars with values < 256 so that we can optimize the case
2469 of just a single character (as long as it's < 256). However, for higher
2470 valued UTF-8 characters, we don't yet do any optimization. */
2471
2472 class_charcount = 0;
2473 class_lastchar = -1;
2474
2475 /* Initialize the 32-char bit map to all zeros. We build the map in a
2476 temporary bit of memory, in case the class contains only 1 character (less
2477 than 256), because in that case the compiled code doesn't use the bit map.
2478 */
2479
2480 memset(classbits, 0, 32 * sizeof(uschar));
2481
2482 #ifdef SUPPORT_UTF8
2483 class_utf8 = FALSE; /* No chars >= 256 */
2484 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2485 #endif
2486
2487 /* Process characters until ] is reached. By writing this as a "do" it
2488 means that an initial ] is taken as a data character. At the start of the
2489 loop, c contains the first byte of the character. */
2490
2491 if (c != 0) do
2492 {
2493 const uschar *oldptr;
2494
2495 #ifdef SUPPORT_UTF8
2496 if (utf8 && c > 127)
2497 { /* Braces are required because the */
2498 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2499 }
2500 #endif
2501
2502 /* Inside \Q...\E everything is literal except \E */
2503
2504 if (inescq)
2505 {
2506 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2507 {
2508 inescq = FALSE; /* Reset literal state */
2509 ptr++; /* Skip the 'E' */
2510 continue; /* Carry on with next */
2511 }
2512 goto CHECK_RANGE; /* Could be range if \E follows */
2513 }
2514
2515 /* Handle POSIX class names. Perl allows a negation extension of the
2516 form [:^name:]. A square bracket that doesn't match the syntax is
2517 treated as a literal. We also recognize the POSIX constructions
2518 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2519 5.6 and 5.8 do. */
2520
2521 if (c == '[' &&
2522 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2523 check_posix_syntax(ptr, &tempptr, cd))
2524 {
2525 BOOL local_negate = FALSE;
2526 int posix_class, taboffset, tabopt;
2527 register const uschar *cbits = cd->cbits;
2528 uschar pbits[32];
2529
2530 if (ptr[1] != ':')
2531 {
2532 *errorcodeptr = ERR31;
2533 goto FAILED;
2534 }
2535
2536 ptr += 2;
2537 if (*ptr == '^')
2538 {
2539 local_negate = TRUE;
2540 ptr++;
2541 }
2542
2543 posix_class = check_posix_name(ptr, tempptr - ptr);
2544 if (posix_class < 0)
2545 {
2546 *errorcodeptr = ERR30;
2547 goto FAILED;
2548 }
2549
2550 /* If matching is caseless, upper and lower are converted to
2551 alpha. This relies on the fact that the class table starts with
2552 alpha, lower, upper as the first 3 entries. */
2553
2554 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2555 posix_class = 0;
2556
2557 /* We build the bit map for the POSIX class in a chunk of local store
2558 because we may be adding and subtracting from it, and we don't want to
2559 subtract bits that may be in the main map already. At the end we or the
2560 result into the bit map that is being built. */
2561
2562 posix_class *= 3;
2563
2564 /* Copy in the first table (always present) */
2565
2566 memcpy(pbits, cbits + posix_class_maps[posix_class],
2567 32 * sizeof(uschar));
2568
2569 /* If there is a second table, add or remove it as required. */
2570
2571 taboffset = posix_class_maps[posix_class + 1];
2572 tabopt = posix_class_maps[posix_class + 2];
2573
2574 if (taboffset >= 0)
2575 {
2576 if (tabopt >= 0)
2577 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2578 else
2579 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2580 }
2581
2582 /* Now see if we need to remove any special characters. An option
2583 value of 1 removes vertical space and 2 removes underscore. */
2584
2585 if (tabopt < 0) tabopt = -tabopt;
2586 if (tabopt == 1) pbits[1] &= ~0x3c;
2587 else if (tabopt == 2) pbits[11] &= 0x7f;
2588
2589 /* Add the POSIX table or its complement into the main table that is
2590 being built and we are done. */
2591
2592 if (local_negate)
2593 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2594 else
2595 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2596
2597 ptr = tempptr + 1;
2598 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2599 continue; /* End of POSIX syntax handling */
2600 }
2601
2602 /* Backslash may introduce a single character, or it may introduce one
2603 of the specials, which just set a flag. The sequence \b is a special
2604 case. Inside a class (and only there) it is treated as backspace.
2605 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2606 to or into the one we are building. We assume they have more than one
2607 character in them, so set class_charcount bigger than one. */
2608
2609 if (c == '\\')
2610 {
2611 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2612 if (*errorcodeptr != 0) goto FAILED;
2613
2614 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2615 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2616 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2617 else if (-c == ESC_Q) /* Handle start of quoted string */
2618 {
2619 if (ptr[1] == '\\' && ptr[2] == 'E')
2620 {
2621 ptr += 2; /* avoid empty string */
2622 }
2623 else inescq = TRUE;
2624 continue;
2625 }
2626
2627 if (c < 0)
2628 {
2629 register const uschar *cbits = cd->cbits;
2630 class_charcount += 2; /* Greater than 1 is what matters */
2631
2632 /* Save time by not doing this in the pre-compile phase. */
2633
2634 if (lengthptr == NULL) switch (-c)
2635 {
2636 case ESC_d:
2637 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2638 continue;
2639
2640 case ESC_D:
2641 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2642 continue;
2643
2644 case ESC_w:
2645 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2646 continue;
2647
2648 case ESC_W:
2649 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2650 continue;
2651
2652 case ESC_s:
2653 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2654 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2655 continue;
2656
2657 case ESC_S:
2658 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2659 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2660 continue;
2661
2662 case ESC_E: /* Perl ignores an orphan \E */
2663 continue;
2664
2665 default: /* Not recognized; fall through */
2666 break; /* Need "default" setting to stop compiler warning. */
2667 }
2668
2669 /* In the pre-compile phase, just do the recognition. */
2670
2671 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2672 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2673
2674 /* We need to deal with \H, \h, \V, and \v in both phases because
2675 they use extra memory. */
2676
2677 if (-c == ESC_h)
2678 {
2679 SETBIT(classbits, 0x09); /* VT */
2680 SETBIT(classbits, 0x20); /* SPACE */
2681 SETBIT(classbits, 0xa0); /* NSBP */
2682 #ifdef SUPPORT_UTF8
2683 if (utf8)
2684 {
2685 class_utf8 = TRUE;
2686 *class_utf8data++ = XCL_SINGLE;
2687 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2688 *class_utf8data++ = XCL_SINGLE;
2689 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2690 *class_utf8data++ = XCL_RANGE;
2691 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2692 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2693 *class_utf8data++ = XCL_SINGLE;
2694 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2695 *class_utf8data++ = XCL_SINGLE;
2696 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2697 *class_utf8data++ = XCL_SINGLE;
2698 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2699 }
2700 #endif
2701 continue;
2702 }
2703
2704 if (-c == ESC_H)
2705 {
2706 for (c = 0; c < 32; c++)
2707 {
2708 int x = 0xff;
2709 switch (c)
2710 {
2711 case 0x09/8: x ^= 1 << (0x09%8); break;
2712 case 0x20/8: x ^= 1 << (0x20%8); break;
2713 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2714 default: break;
2715 }
2716 classbits[c] |= x;
2717 }
2718
2719 #ifdef SUPPORT_UTF8
2720 if (utf8)
2721 {
2722 class_utf8 = TRUE;
2723 *class_utf8data++ = XCL_RANGE;
2724 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2725 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2726 *class_utf8data++ = XCL_RANGE;
2727 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2728 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2729 *class_utf8data++ = XCL_RANGE;
2730 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2731 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2732 *class_utf8data++ = XCL_RANGE;
2733 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2734 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2735 *class_utf8data++ = XCL_RANGE;
2736 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2737 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2738 *class_utf8data++ = XCL_RANGE;
2739 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2740 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2741 *class_utf8data++ = XCL_RANGE;
2742 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2743 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2744 }
2745 #endif
2746 continue;
2747 }
2748
2749 if (-c == ESC_v)
2750 {
2751 SETBIT(classbits, 0x0a); /* LF */
2752 SETBIT(classbits, 0x0b); /* VT */
2753 SETBIT(classbits, 0x0c); /* FF */
2754 SETBIT(classbits, 0x0d); /* CR */
2755 SETBIT(classbits, 0x85); /* NEL */
2756 #ifdef SUPPORT_UTF8
2757 if (utf8)
2758 {
2759 class_utf8 = TRUE;
2760 *class_utf8data++ = XCL_RANGE;
2761 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2762 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2763 }
2764 #endif
2765 continue;
2766 }
2767
2768 if (-c == ESC_V)
2769 {
2770 for (c = 0; c < 32; c++)
2771 {
2772 int x = 0xff;
2773 switch (c)
2774 {
2775 case 0x0a/8: x ^= 1 << (0x0a%8);
2776 x ^= 1 << (0x0b%8);
2777 x ^= 1 << (0x0c%8);
2778 x ^= 1 << (0x0d%8);
2779 break;
2780 case 0x85/8: x ^= 1 << (0x85%8); break;
2781 default: break;
2782 }
2783 classbits[c] |= x;
2784 }
2785
2786 #ifdef SUPPORT_UTF8
2787 if (utf8)
2788 {
2789 class_utf8 = TRUE;
2790 *class_utf8data++ = XCL_RANGE;
2791 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2792 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2793 *class_utf8data++ = XCL_RANGE;
2794 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2795 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2796 }
2797 #endif
2798 continue;
2799 }
2800
2801 /* We need to deal with \P and \p in both phases. */
2802
2803 #ifdef SUPPORT_UCP
2804 if (-c == ESC_p || -c == ESC_P)
2805 {
2806 BOOL negated;
2807 int pdata;
2808 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2809 if (ptype < 0) goto FAILED;
2810 class_utf8 = TRUE;
2811 *class_utf8data++ = ((-c == ESC_p) != negated)?
2812 XCL_PROP : XCL_NOTPROP;
2813 *class_utf8data++ = ptype;
2814 *class_utf8data++ = pdata;
2815 class_charcount -= 2; /* Not a < 256 character */
2816 continue;
2817 }
2818 #endif
2819 /* Unrecognized escapes are faulted if PCRE is running in its
2820 strict mode. By default, for compatibility with Perl, they are
2821 treated as literals. */
2822
2823 if ((options & PCRE_EXTRA) != 0)
2824 {
2825 *errorcodeptr = ERR7;
2826 goto FAILED;
2827 }
2828
2829 class_charcount -= 2; /* Undo the default count from above */
2830 c = *ptr; /* Get the final character and fall through */
2831 }
2832
2833 /* Fall through if we have a single character (c >= 0). This may be
2834 greater than 256 in UTF-8 mode. */
2835
2836 } /* End of backslash handling */
2837
2838 /* A single character may be followed by '-' to form a range. However,
2839 Perl does not permit ']' to be the end of the range. A '-' character
2840 at the end is treated as a literal. Perl ignores orphaned \E sequences
2841 entirely. The code for handling \Q and \E is messy. */
2842
2843 CHECK_RANGE:
2844 while (ptr[1] == '\\' && ptr[2] == 'E')
2845 {
2846 inescq = FALSE;
2847 ptr += 2;
2848 }
2849
2850 oldptr = ptr;
2851
2852 if (!inescq && ptr[1] == '-')
2853 {
2854 int d;
2855 ptr += 2;
2856 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2857
2858 /* If we hit \Q (not followed by \E) at this point, go into escaped
2859 mode. */
2860
2861 while (*ptr == '\\' && ptr[1] == 'Q')
2862 {
2863 ptr += 2;
2864 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2865 inescq = TRUE;
2866 break;
2867 }
2868
2869 if (*ptr == 0 || (!inescq && *ptr == ']'))
2870 {
2871 ptr = oldptr;
2872 goto LONE_SINGLE_CHARACTER;
2873 }
2874
2875 #ifdef SUPPORT_UTF8
2876 if (utf8)
2877 { /* Braces are required because the */
2878 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2879 }
2880 else
2881 #endif
2882 d = *ptr; /* Not UTF-8 mode */
2883
2884 /* The second part of a range can be a single-character escape, but
2885 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2886 in such circumstances. */
2887
2888 if (!inescq && d == '\\')
2889 {
2890 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2891 if (*errorcodeptr != 0) goto FAILED;
2892
2893 /* \b is backspace; \X is literal X; \R is literal R; any other
2894 special means the '-' was literal */
2895
2896 if (d < 0)
2897 {
2898 if (d == -ESC_b) d = '\b';
2899 else if (d == -ESC_X) d = 'X';
2900 else if (d == -ESC_R) d = 'R'; else
2901 {
2902 ptr = oldptr;
2903 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2904 }
2905 }
2906 }
2907
2908 /* Check that the two values are in the correct order. Optimize
2909 one-character ranges */
2910
2911 if (d < c)
2912 {
2913 *errorcodeptr = ERR8;
2914 goto FAILED;
2915 }
2916
2917 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2918
2919 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2920 matching, we have to use an XCLASS with extra data items. Caseless
2921 matching for characters > 127 is available only if UCP support is
2922 available. */
2923
2924 #ifdef SUPPORT_UTF8
2925 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2926 {
2927 class_utf8 = TRUE;
2928
2929 /* With UCP support, we can find the other case equivalents of
2930 the relevant characters. There may be several ranges. Optimize how
2931 they fit with the basic range. */
2932
2933 #ifdef SUPPORT_UCP
2934 if ((options & PCRE_CASELESS) != 0)
2935 {
2936 unsigned int occ, ocd;
2937 unsigned int cc = c;
2938 unsigned int origd = d;
2939 while (get_othercase_range(&cc, origd, &occ, &ocd))
2940 {
2941 if (occ >= (unsigned int)c &&
2942 ocd <= (unsigned int)d)
2943 continue; /* Skip embedded ranges */
2944
2945 if (occ < (unsigned int)c &&
2946 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2947 { /* if there is overlap, */
2948 c = occ; /* noting that if occ < c */
2949 continue; /* we can't have ocd > d */
2950 } /* because a subrange is */
2951 if (ocd > (unsigned int)d &&
2952 occ <= (unsigned int)d + 1) /* always shorter than */
2953 { /* the basic range. */
2954 d = ocd;
2955 continue;
2956 }
2957
2958 if (occ == ocd)
2959 {
2960 *class_utf8data++ = XCL_SINGLE;
2961 }
2962 else
2963 {
2964 *class_utf8data++ = XCL_RANGE;
2965 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2966 }
2967 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2968 }
2969 }
2970 #endif /* SUPPORT_UCP */
2971
2972 /* Now record the original range, possibly modified for UCP caseless
2973 overlapping ranges. */
2974
2975 *class_utf8data++ = XCL_RANGE;
2976 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2977 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2978
2979 /* With UCP support, we are done. Without UCP support, there is no
2980 caseless matching for UTF-8 characters > 127; we can use the bit map
2981 for the smaller ones. */
2982
2983 #ifdef SUPPORT_UCP
2984 continue; /* With next character in the class */
2985 #else
2986 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2987
2988 /* Adjust upper limit and fall through to set up the map */
2989
2990 d = 127;
2991
2992 #endif /* SUPPORT_UCP */
2993 }
2994 #endif /* SUPPORT_UTF8 */
2995
2996 /* We use the bit map for all cases when not in UTF-8 mode; else
2997 ranges that lie entirely within 0-127 when there is UCP support; else
2998 for partial ranges without UCP support. */
2999
3000 class_charcount += d - c + 1;
3001 class_lastchar = d;
3002
3003 /* We can save a bit of time by skipping this in the pre-compile. */
3004
3005 if (lengthptr == NULL) for (; c <= d; c++)
3006 {
3007 classbits[c/8] |= (1 << (c&7));
3008 if ((options & PCRE_CASELESS) != 0)
3009 {
3010 int uc = cd->fcc[c]; /* flip case */
3011 classbits[uc/8] |= (1 << (uc&7));
3012 }
3013 }
3014
3015 continue; /* Go get the next char in the class */
3016 }
3017
3018 /* Handle a lone single character - we can get here for a normal
3019 non-escape char, or after \ that introduces a single character or for an
3020 apparent range that isn't. */
3021
3022 LONE_SINGLE_CHARACTER:
3023
3024 /* Handle a character that cannot go in the bit map */
3025
3026 #ifdef SUPPORT_UTF8
3027 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3028 {
3029 class_utf8 = TRUE;
3030 *class_utf8data++ = XCL_SINGLE;
3031 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3032
3033 #ifdef SUPPORT_UCP
3034 if ((options & PCRE_CASELESS) != 0)
3035 {
3036 unsigned int othercase;
3037 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3038 {
3039 *class_utf8data++ = XCL_SINGLE;
3040 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3041 }
3042 }
3043 #endif /* SUPPORT_UCP */
3044
3045 }
3046 else
3047 #endif /* SUPPORT_UTF8 */
3048
3049 /* Handle a single-byte character */
3050 {
3051 classbits[c/8] |= (1 << (c&7));
3052 if ((options & PCRE_CASELESS) != 0)
3053 {
3054 c = cd->fcc[c]; /* flip case */
3055 classbits[c/8] |= (1 << (c&7));
3056 }
3057 class_charcount++;
3058 class_lastchar = c;
3059 }
3060 }
3061
3062 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3063
3064 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3065
3066 if (c == 0) /* Missing terminating ']' */
3067 {
3068 *errorcodeptr = ERR6;
3069 goto FAILED;
3070 }
3071
3072 /* If class_charcount is 1, we saw precisely one character whose value is
3073 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3074 can optimize the negative case only if there were no characters >= 128
3075 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3076 single-bytes only. This is an historical hangover. Maybe one day we can
3077 tidy these opcodes to handle multi-byte characters.
3078
3079 The optimization throws away the bit map. We turn the item into a
3080 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3081 that OP_NOT does not support multibyte characters. In the positive case, it
3082 can cause firstbyte to be set. Otherwise, there can be no first char if
3083 this item is first, whatever repeat count may follow. In the case of
3084 reqbyte, save the previous value for reinstating. */
3085
3086 #ifdef SUPPORT_UTF8
3087 if (class_charcount == 1 &&
3088 (!utf8 ||
3089 (!class_utf8 && (!negate_class || class_lastchar < 128))))
3090
3091 #else
3092 if (class_charcount == 1)
3093 #endif
3094 {
3095 zeroreqbyte = reqbyte;
3096
3097 /* The OP_NOT opcode works on one-byte characters only. */
3098
3099 if (negate_class)
3100 {
3101 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3102 zerofirstbyte = firstbyte;
3103 *code++ = OP_NOT;
3104 *code++ = class_lastchar;
3105 break;
3106 }
3107
3108 /* For a single, positive character, get the value into mcbuffer, and
3109 then we can handle this with the normal one-character code. */
3110
3111 #ifdef SUPPORT_UTF8
3112 if (utf8 && class_lastchar > 127)
3113 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3114 else
3115 #endif
3116 {
3117 mcbuffer[0] = class_lastchar;
3118 mclength = 1;
3119 }
3120 goto ONE_CHAR;
3121 } /* End of 1-char optimization */
3122
3123 /* The general case - not the one-char optimization. If this is the first
3124 thing in the branch, there can be no first char setting, whatever the
3125 repeat count. Any reqbyte setting must remain unchanged after any kind of
3126 repeat. */
3127
3128 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3129 zerofirstbyte = firstbyte;
3130 zeroreqbyte = reqbyte;
3131
3132 /* If there are characters with values > 255, we have to compile an
3133 extended class, with its own opcode. If there are no characters < 256,
3134 we can omit the bitmap in the actual compiled code. */
3135
3136 #ifdef SUPPORT_UTF8
3137 if (class_utf8)
3138 {
3139 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3140 *code++ = OP_XCLASS;
3141 code += LINK_SIZE;
3142 *code = negate_class? XCL_NOT : 0;
3143
3144 /* If the map is required, move up the extra data to make room for it;
3145 otherwise just move the code pointer to the end of the extra data. */
3146
3147 if (class_charcount > 0)
3148 {
3149 *code++ |= XCL_MAP;
3150 memmove(code + 32, code, class_utf8data - code);
3151 memcpy(code, classbits, 32);
3152 code = class_utf8data + 32;
3153 }
3154 else code = class_utf8data;
3155
3156 /* Now fill in the complete length of the item */
3157
3158 PUT(previous, 1, code - previous);
3159 break; /* End of class handling */
3160 }
3161 #endif
3162
3163 /* If there are no characters > 255, negate the 32-byte map if necessary,
3164 and copy it into the code vector. If this is the first thing in the branch,
3165 there can be no first char setting, whatever the repeat count. Any reqbyte
3166 setting must remain unchanged after any kind of repeat. */
3167
3168 if (negate_class)
3169 {
3170 *code++ = OP_NCLASS;
3171 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3172 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3173 }
3174 else
3175 {
3176 *code++ = OP_CLASS;
3177 memcpy(code, classbits, 32);
3178 }
3179 code += 32;
3180 break;
3181
3182
3183 /* ===================================================================*/
3184 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3185 has been tested above. */
3186
3187 case '{':
3188 if (!is_quantifier) goto NORMAL_CHAR;
3189 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3190 if (*errorcodeptr != 0) goto FAILED;
3191 goto REPEAT;
3192
3193 case '*':
3194 repeat_min = 0;
3195 repeat_max = -1;
3196 goto REPEAT;
3197
3198 case '+':
3199 repeat_min = 1;
3200 repeat_max = -1;
3201 goto REPEAT;
3202
3203 case '?':
3204 repeat_min = 0;
3205 repeat_max = 1;
3206
3207 REPEAT:
3208 if (previous == NULL)
3209 {
3210 *errorcodeptr = ERR9;
3211 goto FAILED;
3212 }
3213
3214 if (repeat_min == 0)
3215 {
3216 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3217 reqbyte = zeroreqbyte; /* Ditto */
3218 }
3219
3220 /* Remember whether this is a variable length repeat */
3221
3222 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3223
3224 op_type = 0; /* Default single-char op codes */
3225 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3226
3227 /* Save start of previous item, in case we have to move it up to make space
3228 for an inserted OP_ONCE for the additional '+' extension. */
3229
3230 tempcode = previous;
3231
3232 /* If the next character is '+', we have a possessive quantifier. This
3233 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3234 If the next character is '?' this is a minimizing repeat, by default,
3235 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3236 repeat type to the non-default. */
3237
3238 if (ptr[1] == '+')
3239 {
3240 repeat_type = 0; /* Force greedy */
3241 possessive_quantifier = TRUE;
3242 ptr++;
3243 }
3244 else if (ptr[1] == '?')
3245 {
3246 repeat_type = greedy_non_default;
3247 ptr++;
3248 }
3249 else repeat_type = greedy_default;
3250
3251 /* If previous was a character match, abolish the item and generate a
3252 repeat item instead. If a char item has a minimum of more than one, ensure
3253 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3254 the first thing in a branch because the x will have gone into firstbyte
3255 instead. */
3256
3257 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3258 {
3259 /* Deal with UTF-8 characters that take up more than one byte. It's
3260 easier to write this out separately than try to macrify it. Use c to
3261 hold the length of the character in bytes, plus 0x80 to flag that it's a
3262 length rather than a small character. */
3263
3264 #ifdef SUPPORT_UTF8
3265 if (utf8 && (code[-1] & 0x80) != 0)
3266 {
3267 uschar *lastchar = code - 1;
3268 while((*lastchar & 0xc0) == 0x80) lastchar--;
3269 c = code - lastchar; /* Length of UTF-8 character */
3270 memcpy(utf8_char, lastchar, c); /* Save the char */
3271 c |= 0x80; /* Flag c as a length */
3272 }
3273 else
3274 #endif
3275
3276 /* Handle the case of a single byte - either with no UTF8 support, or
3277 with UTF-8 disabled, or for a UTF-8 character < 128. */
3278
3279 {
3280 c = code[-1];
3281 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3282 }
3283
3284 /* If the repetition is unlimited, it pays to see if the next thing on
3285 the line is something that cannot possibly match this character. If so,
3286 automatically possessifying this item gains some performance in the case
3287 where the match fails. */
3288
3289 if (!possessive_quantifier &&
3290 repeat_max < 0 &&
3291 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3292 options, cd))
3293 {
3294 repeat_type = 0; /* Force greedy */
3295 possessive_quantifier = TRUE;
3296 }
3297
3298 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3299 }
3300
3301 /* If previous was a single negated character ([^a] or similar), we use
3302 one of the special opcodes, replacing it. The code is shared with single-
3303 character repeats by setting op_type to add a suitable offset into
3304 repeat_type. We can also test for auto-possessification. OP_NOT is
3305 currently used only for single-byte chars. */
3306
3307 else if (*previous == OP_NOT)
3308 {
3309 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3310 c = previous[1];
3311 if (!possessive_quantifier &&
3312 repeat_max < 0 &&
3313 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3314 {
3315 repeat_type = 0; /* Force greedy */
3316 possessive_quantifier = TRUE;
3317 }
3318 goto OUTPUT_SINGLE_REPEAT;
3319 }
3320
3321 /* If previous was a character type match (\d or similar), abolish it and
3322 create a suitable repeat item. The code is shared with single-character
3323 repeats by setting op_type to add a suitable offset into repeat_type. Note
3324 that the Unicode property types will be present only when SUPPORT_UCP is
3325 defined, but we don't wrap the little bits of code here because it just
3326 makes it horribly messy. */
3327
3328 else if (*previous < OP_EODN)
3329 {
3330 uschar *oldcode;
3331 int prop_type, prop_value;
3332 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3333 c = *previous;
3334
3335 if (!possessive_quantifier &&
3336 repeat_max < 0 &&
3337 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3338 {
3339 repeat_type = 0; /* Force greedy */
3340 possessive_quantifier = TRUE;
3341 }
3342
3343 OUTPUT_SINGLE_REPEAT:
3344 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3345 {
3346 prop_type = previous[1];
3347 prop_value = previous[2];
3348 }
3349 else prop_type = prop_value = -1;
3350
3351 oldcode = code;
3352 code = previous; /* Usually overwrite previous item */
3353
3354 /* If the maximum is zero then the minimum must also be zero; Perl allows
3355 this case, so we do too - by simply omitting the item altogether. */
3356
3357 if (repeat_max == 0) goto END_REPEAT;
3358
3359 /* All real repeats make it impossible to handle partial matching (maybe
3360 one day we will be able to remove this restriction). */
3361
3362 if (repeat_max != 1) cd->nopartial = TRUE;
3363
3364 /* Combine the op_type with the repeat_type */
3365
3366 repeat_type += op_type;
3367
3368 /* A minimum of zero is handled either as the special case * or ?, or as
3369 an UPTO, with the maximum given. */
3370
3371 if (repeat_min == 0)
3372 {
3373 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3374 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3375 else
3376 {
3377 *code++ = OP_UPTO + repeat_type;
3378 PUT2INC(code, 0, repeat_max);
3379 }
3380 }
3381
3382 /* A repeat minimum of 1 is optimized into some special cases. If the
3383 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3384 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3385 one less than the maximum. */
3386
3387 else if (repeat_min == 1)
3388 {
3389 if (repeat_max == -1)
3390 *code++ = OP_PLUS + repeat_type;
3391 else
3392 {
3393 code = oldcode; /* leave previous item in place */
3394 if (repeat_max == 1) goto END_REPEAT;
3395 *code++ = OP_UPTO + repeat_type;
3396 PUT2INC(code, 0, repeat_max - 1);
3397 }
3398 }
3399
3400 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3401 handled as an EXACT followed by an UPTO. */
3402
3403 else
3404 {
3405 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3406 PUT2INC(code, 0, repeat_min);
3407
3408 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3409 we have to insert the character for the previous code. For a repeated
3410 Unicode property match, there are two extra bytes that define the
3411 required property. In UTF-8 mode, long characters have their length in
3412 c, with the 0x80 bit as a flag. */
3413
3414 if (repeat_max < 0)
3415 {
3416 #ifdef SUPPORT_UTF8
3417 if (utf8 && c >= 128)
3418 {
3419 memcpy(code, utf8_char, c & 7);
3420 code += c & 7;
3421 }
3422 else
3423 #endif
3424 {
3425 *code++ = c;
3426 if (prop_type >= 0)
3427 {
3428 *code++ = prop_type;
3429 *code++ = prop_value;
3430 }
3431 }
3432 *code++ = OP_STAR + repeat_type;
3433 }
3434
3435 /* Else insert an UPTO if the max is greater than the min, again
3436 preceded by the character, for the previously inserted code. If the
3437 UPTO is just for 1 instance, we can use QUERY instead. */
3438
3439 else if (repeat_max != repeat_min)
3440 {
3441 #ifdef SUPPORT_UTF8
3442 if (utf8 && c >= 128)
3443 {
3444 memcpy(code, utf8_char, c & 7);
3445 code += c & 7;
3446 }
3447 else
3448 #endif
3449 *code++ = c;
3450 if (prop_type >= 0)
3451 {
3452 *code++ = prop_type;
3453 *code++ = prop_value;
3454 }
3455 repeat_max -= repeat_min;
3456
3457 if (repeat_max == 1)
3458 {
3459 *code++ = OP_QUERY + repeat_type;
3460 }
3461 else
3462 {
3463 *code++ = OP_UPTO + repeat_type;
3464 PUT2INC(code, 0, repeat_max);
3465 }
3466 }
3467 }
3468
3469 /* The character or character type itself comes last in all cases. */
3470
3471 #ifdef SUPPORT_UTF8
3472 if (utf8 && c >= 128)
3473 {
3474 memcpy(code, utf8_char, c & 7);
3475 code += c & 7;
3476 }
3477 else
3478 #endif
3479 *code++ = c;
3480
3481 /* For a repeated Unicode property match, there are two extra bytes that
3482 define the required property. */
3483
3484 #ifdef SUPPORT_UCP
3485 if (prop_type >= 0)
3486 {
3487 *code++ = prop_type;
3488 *code++ = prop_value;
3489 }
3490 #endif
3491 }
3492
3493 /* If previous was a character class or a back reference, we put the repeat
3494 stuff after it, but just skip the item if the repeat was {0,0}. */
3495
3496 else if (*previous == OP_CLASS ||
3497 *previous == OP_NCLASS ||
3498 #ifdef SUPPORT_UTF8
3499 *previous == OP_XCLASS ||
3500 #endif
3501 *previous == OP_REF)
3502 {
3503 if (repeat_max == 0)
3504 {
3505 code = previous;
3506 goto END_REPEAT;
3507 }
3508
3509 /* All real repeats make it impossible to handle partial matching (maybe
3510 one day we will be able to remove this restriction). */
3511
3512 if (repeat_max != 1) cd->nopartial = TRUE;
3513
3514 if (repeat_min == 0 && repeat_max == -1)
3515 *code++ = OP_CRSTAR + repeat_type;
3516 else if (repeat_min == 1 && repeat_max == -1)
3517 *code++ = OP_CRPLUS + repeat_type;
3518 else if (repeat_min == 0 && repeat_max == 1)
3519 *code++ = OP_CRQUERY + repeat_type;
3520 else
3521 {
3522 *code++ = OP_CRRANGE + repeat_type;
3523 PUT2INC(code, 0, repeat_min);
3524 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3525 PUT2INC(code, 0, repeat_max);
3526 }
3527 }
3528
3529 /* If previous was a bracket group, we may have to replicate it in certain
3530 cases. */
3531
3532 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3533 *previous == OP_ONCE || *previous == OP_COND)
3534 {
3535 register int i;
3536 int ketoffset = 0;
3537 int len = code - previous;
3538 uschar *bralink = NULL;
3539
3540 /* Repeating a DEFINE group is pointless */
3541
3542 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3543 {
3544 *errorcodeptr = ERR55;
3545 goto FAILED;
3546 }
3547
3548 /* If the maximum repeat count is unlimited, find the end of the bracket
3549 by scanning through from the start, and compute the offset back to it
3550 from the current code pointer. There may be an OP_OPT setting following
3551 the final KET, so we can't find the end just by going back from the code
3552 pointer. */
3553
3554 if (repeat_max == -1)
3555 {
3556 register uschar *ket = previous;
3557 do ket += GET(ket, 1); while (*ket != OP_KET);
3558 ketoffset = code - ket;
3559 }
3560
3561 /* The case of a zero minimum is special because of the need to stick
3562 OP_BRAZERO in front of it, and because the group appears once in the
3563 data, whereas in other cases it appears the minimum number of times. For
3564 this reason, it is simplest to treat this case separately, as otherwise
3565 the code gets far too messy. There are several special subcases when the
3566 minimum is zero. */
3567
3568 if (repeat_min == 0)
3569 {
3570 /* If the maximum is also zero, we just omit the group from the output
3571 altogether. */
3572
3573 if (repeat_max == 0)
3574 {
3575 code = previous;
3576 goto END_REPEAT;
3577 }
3578
3579 /* If the maximum is 1 or unlimited, we just have to stick in the
3580 BRAZERO and do no more at this point. However, we do need to adjust
3581 any OP_RECURSE calls inside the group that refer to the group itself or
3582 any internal or forward referenced group, because the offset is from
3583 the start of the whole regex. Temporarily terminate the pattern while
3584 doing this. */
3585
3586 if (repeat_max <= 1)
3587 {
3588 *code = OP_END;
3589 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3590 memmove(previous+1, previous, len);
3591 code++;
3592 *previous++ = OP_BRAZERO + repeat_type;
3593 }
3594
3595 /* If the maximum is greater than 1 and limited, we have to replicate
3596 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3597 The first one has to be handled carefully because it's the original
3598 copy, which has to be moved up. The remainder can be handled by code
3599 that is common with the non-zero minimum case below. We have to
3600 adjust the value of repeat_max, since one less copy is required. Once
3601 again, we may have to adjust any OP_RECURSE calls inside the group. */
3602
3603 else
3604 {
3605 int offset;
3606 *code = OP_END;
3607 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3608 memmove(previous + 2 + LINK_SIZE, previous, len);
3609 code += 2 + LINK_SIZE;
3610 *previous++ = OP_BRAZERO + repeat_type;
3611 *previous++ = OP_BRA;
3612
3613 /* We chain together the bracket offset fields that have to be
3614 filled in later when the ends of the brackets are reached. */
3615
3616 offset = (bralink == NULL)? 0 : previous - bralink;
3617 bralink = previous;
3618 PUTINC(previous, 0, offset);
3619 }
3620
3621 repeat_max--;
3622 }
3623
3624 /* If the minimum is greater than zero, replicate the group as many
3625 times as necessary, and adjust the maximum to the number of subsequent
3626 copies that we need. If we set a first char from the group, and didn't
3627 set a required char, copy the latter from the former. If there are any
3628 forward reference subroutine calls in the group, there will be entries on
3629 the workspace list; replicate these with an appropriate increment. */
3630
3631 else
3632 {
3633 if (repeat_min > 1)
3634 {
3635 /* In the pre-compile phase, we don't actually do the replication. We
3636 just adjust the length as if we had. Do some paranoid checks for
3637 potential integer overflow. */
3638
3639 if (lengthptr != NULL)
3640 {
3641 int delta = (repeat_min - 1)*length_prevgroup;
3642 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3643 (double)INT_MAX ||
3644 OFLOW_MAX - *lengthptr < delta)
3645 {
3646 *errorcodeptr = ERR20;
3647 goto FAILED;
3648 }
3649 *lengthptr += delta;
3650 }
3651
3652 /* This is compiling for real */
3653
3654 else
3655 {
3656 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3657 for (i = 1; i < repeat_min; i++)
3658 {
3659 uschar *hc;
3660 uschar *this_hwm = cd->hwm;
3661 memcpy(code, previous, len);
3662 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3663 {
3664 PUT(cd->hwm, 0, GET(hc, 0) + len);
3665 cd->hwm += LINK_SIZE;
3666 }
3667 save_hwm = this_hwm;
3668 code += len;
3669 }
3670 }
3671 }
3672
3673 if (repeat_max > 0) repeat_max -= repeat_min;
3674 }
3675
3676 /* This code is common to both the zero and non-zero minimum cases. If
3677 the maximum is limited, it replicates the group in a nested fashion,
3678 remembering the bracket starts on a stack. In the case of a zero minimum,
3679 the first one was set up above. In all cases the repeat_max now specifies
3680 the number of additional copies needed. Again, we must remember to
3681 replicate entries on the forward reference list. */
3682
3683 if (repeat_max >= 0)
3684 {
3685 /* In the pre-compile phase, we don't actually do the replication. We
3686 just adjust the length as if we had. For each repetition we must add 1
3687 to the length for BRAZERO and for all but the last repetition we must
3688 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3689 paranoid checks to avoid integer overflow. */
3690
3691 if (lengthptr != NULL && repeat_max > 0)
3692 {
3693 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3694 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3695 if ((double)repeat_max *
3696 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3697 > (double)INT_MAX ||
3698 OFLOW_MAX - *lengthptr < delta)
3699 {
3700 *errorcodeptr = ERR20;
3701 goto FAILED;
3702 }
3703 *lengthptr += delta;
3704 }
3705
3706 /* This is compiling for real */
3707
3708 else for (i = repeat_max - 1; i >= 0; i--)
3709 {
3710 uschar *hc;
3711 uschar *this_hwm = cd->hwm;
3712
3713 *code++ = OP_BRAZERO + repeat_type;
3714
3715 /* All but the final copy start a new nesting, maintaining the
3716 chain of brackets outstanding. */
3717
3718 if (i != 0)
3719 {
3720 int offset;
3721 *code++ = OP_BRA;
3722 offset = (bralink == NULL)? 0 : code - bralink;
3723 bralink = code;
3724 PUTINC(code, 0, offset);
3725 }
3726
3727 memcpy(code, previous, len);
3728 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3729 {
3730 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3731 cd->hwm += LINK_SIZE;
3732 }
3733 save_hwm = this_hwm;
3734 code += len;
3735 }
3736
3737 /* Now chain through the pending brackets, and fill in their length
3738 fields (which are holding the chain links pro tem). */
3739
3740 while (bralink != NULL)
3741 {
3742 int oldlinkoffset;
3743 int offset = code - bralink + 1;
3744 uschar *bra = code - offset;
3745 oldlinkoffset = GET(bra, 1);
3746 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3747 *code++ = OP_KET;
3748 PUTINC(code, 0, offset);
3749 PUT(bra, 1, offset);
3750 }
3751 }
3752
3753 /* If the maximum is unlimited, set a repeater in the final copy. We
3754 can't just offset backwards from the current code point, because we
3755 don't know if there's been an options resetting after the ket. The
3756 correct offset was computed above.
3757
3758 Then, when we are doing the actual compile phase, check to see whether
3759 this group is a non-atomic one that could match an empty string. If so,
3760 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3761 that runtime checking can be done. [This check is also applied to
3762 atomic groups at runtime, but in a different way.] */
3763
3764 else
3765 {
3766 uschar *ketcode = code - ketoffset;
3767 uschar *bracode = ketcode - GET(ketcode, 1);
3768 *ketcode = OP_KETRMAX + repeat_type;
3769 if (lengthptr == NULL && *bracode != OP_ONCE)
3770 {
3771 uschar *scode = bracode;
3772 do
3773 {
3774 if (could_be_empty_branch(scode, ketcode, utf8))
3775 {
3776 *bracode += OP_SBRA - OP_BRA;
3777 break;
3778 }
3779 scode += GET(scode, 1);
3780 }
3781 while (*scode == OP_ALT);
3782 }
3783 }
3784 }
3785
3786 /* Else there's some kind of shambles */
3787
3788 else
3789 {
3790 *errorcodeptr = ERR11;
3791 goto FAILED;
3792 }
3793
3794 /* If the character following a repeat is '+', or if certain optimization
3795 tests above succeeded, possessive_quantifier is TRUE. For some of the
3796 simpler opcodes, there is a special alternative opcode for this. For
3797 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3798 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3799 but the special opcodes can optimize it a bit. The repeated item starts at
3800 tempcode, not at previous, which might be the first part of a string whose
3801 (former) last char we repeated.
3802
3803 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3804 an 'upto' may follow. We skip over an 'exact' item, and then test the
3805 length of what remains before proceeding. */
3806
3807 if (possessive_quantifier)
3808 {
3809 int len;
3810 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3811 *tempcode == OP_NOTEXACT)
3812 tempcode += _pcre_OP_lengths[*tempcode];
3813 len = code - tempcode;
3814 if (len > 0) switch (*tempcode)
3815 {
3816 case OP_STAR: *tempcode = OP_POSSTAR; break;
3817 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3818 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3819 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3820
3821 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3822 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3823 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3824 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3825
3826 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3827 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3828 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3829 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3830
3831 default:
3832 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3833 code += 1 + LINK_SIZE;
3834 len += 1 + LINK_SIZE;
3835 tempcode[0] = OP_ONCE;
3836 *code++ = OP_KET;
3837 PUTINC(code, 0, len);
3838 PUT(tempcode, 1, len);
3839 break;
3840 }
3841 }
3842
3843 /* In all cases we no longer have a previous item. We also set the
3844 "follows varying string" flag for subsequently encountered reqbytes if
3845 it isn't already set and we have just passed a varying length item. */
3846
3847 END_REPEAT:
3848 previous = NULL;
3849 cd->req_varyopt |= reqvary;
3850 break;
3851
3852
3853 /* ===================================================================*/
3854 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3855 lookbehind or option setting or condition or all the other extended
3856 parenthesis forms. First deal with the specials; all are introduced by ?,
3857 and the appearance of any of them means that this is not a capturing
3858 group. */
3859
3860 case '(':
3861 newoptions = options;
3862 skipbytes = 0;
3863 bravalue = OP_CBRA;
3864 save_hwm = cd->hwm;
3865 reset_bracount = FALSE;
3866
3867 if (*(++ptr) == '?')
3868 {
3869 int i, set, unset, namelen;
3870 int *optset;
3871 const uschar *name;
3872 uschar *slot;
3873
3874 switch (*(++ptr))
3875 {
3876 case '#': /* Comment; skip to ket */
3877 ptr++;
3878 while (*ptr != 0 && *ptr != ')') ptr++;
3879 if (*ptr == 0)
3880 {
3881 *errorcodeptr = ERR18;
3882 goto FAILED;
3883 }
3884 continue;
3885
3886
3887 /* ------------------------------------------------------------ */
3888 case '|': /* Reset capture count for each branch */
3889 reset_bracount = TRUE;
3890 /* Fall through */
3891
3892 /* ------------------------------------------------------------ */
3893 case ':': /* Non-capturing bracket */
3894 bravalue = OP_BRA;
3895 ptr++;
3896 break;
3897
3898
3899 /* ------------------------------------------------------------ */
3900 case '(':
3901 bravalue = OP_COND; /* Conditional group */
3902
3903 /* A condition can be an assertion, a number (referring to a numbered
3904 group), a name (referring to a named group), or 'R', referring to
3905 recursion. R<digits> and R&name are also permitted for recursion tests.
3906
3907 There are several syntaxes for testing a named group: (?(name)) is used
3908 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3909
3910 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3911 be the recursive thing or the name 'R' (and similarly for 'R' followed
3912 by digits), and (b) a number could be a name that consists of digits.
3913 In both cases, we look for a name first; if not found, we try the other
3914 cases. */
3915
3916 /* For conditions that are assertions, check the syntax, and then exit
3917 the switch. This will take control down to where bracketed groups,
3918 including assertions, are processed. */
3919
3920 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3921 break;
3922
3923 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3924 below), and all need to skip 3 bytes at the start of the group. */
3925
3926 code[1+LINK_SIZE] = OP_CREF;
3927 skipbytes = 3;
3928 refsign = -1;
3929
3930 /* Check for a test for recursion in a named group. */
3931
3932 if (ptr[1] == 'R' && ptr[2] == '&')
3933 {
3934 terminator = -1;
3935 ptr += 2;
3936 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3937 }
3938
3939 /* Check for a test for a named group's having been set, using the Perl
3940 syntax (?(<name>) or (?('name') */
3941
3942 else if (ptr[1] == '<')
3943 {
3944 terminator = '>';
3945 ptr++;
3946 }
3947 else if (ptr[1] == '\'')
3948 {
3949 terminator = '\'';
3950 ptr++;
3951 }
3952 else
3953 {
3954 terminator = 0;
3955 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3956 }
3957
3958 /* We now expect to read a name; anything else is an error */
3959
3960 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3961 {
3962 ptr += 1; /* To get the right offset */
3963 *errorcodeptr = ERR28;
3964 goto FAILED;
3965 }
3966
3967 /* Read the name, but also get it as a number if it's all digits */
3968
3969 recno = 0;
3970 name = ++ptr;
3971 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3972 {
3973 if (recno >= 0)
3974 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3975 recno * 10 + *ptr - '0' : -1;
3976 ptr++;
3977 }
3978 namelen = ptr - name;
3979
3980 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3981 {
3982 ptr--; /* Error offset */
3983 *errorcodeptr = ERR26;
3984 goto FAILED;
3985 }
3986
3987 /* Do no further checking in the pre-compile phase. */
3988
3989 if (lengthptr != NULL) break;
3990
3991 /* In the real compile we do the work of looking for the actual
3992 reference. If the string started with "+" or "-" we require the rest to
3993 be digits, in which case recno will be set. */
3994
3995 if (refsign > 0)
3996 {
3997 if (recno <= 0)
3998 {
3999 *errorcodeptr = ERR58;
4000 goto FAILED;
4001 }
4002 if (refsign == '-')
4003 {
4004 recno = cd->bracount - recno + 1;
4005 if (recno <= 0)
4006 {
4007 *errorcodeptr = ERR15;
4008 goto FAILED;
4009 }
4010 }
4011 else recno += cd->bracount;
4012 PUT2(code, 2+LINK_SIZE, recno);
4013 break;
4014 }
4015
4016 /* Otherwise (did not start with "+" or "-"), start by looking for the
4017 name. */
4018
4019 slot = cd->name_table;
4020 for (i = 0; i < cd->names_found; i++)
4021 {
4022 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4023 slot += cd->name_entry_size;
4024 }
4025
4026 /* Found a previous named subpattern */
4027
4028 if (i < cd->names_found)
4029 {
4030 recno = GET2(slot, 0);
4031 PUT2(code, 2+LINK_SIZE, recno);
4032 }
4033
4034 /* Search the pattern for a forward reference */
4035
4036 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4037 (options & PCRE_EXTENDED) != 0)) > 0)
4038 {
4039 PUT2(code, 2+LINK_SIZE, i);
4040 }
4041
4042 /* If terminator == 0 it means that the name followed directly after
4043 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4044 some further alternatives to try. For the cases where terminator != 0
4045 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4046 now checked all the possibilities, so give an error. */
4047
4048 else if (terminator != 0)
4049 {
4050 *errorcodeptr = ERR15;
4051 goto FAILED;
4052 }
4053
4054 /* Check for (?(R) for recursion. Allow digits after R to specify a
4055 specific group number. */
4056
4057 else if (*name == 'R')
4058 {
4059 recno = 0;
4060 for (i = 1; i < namelen; i++)
4061 {
4062 if ((digitab[name[i]] & ctype_digit) == 0)
4063 {
4064 *errorcodeptr = ERR15;
4065 goto FAILED;
4066 }
4067 recno = recno * 10 + name[i] - '0';
4068 }
4069 if (recno == 0) recno = RREF_ANY;
4070 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4071 PUT2(code, 2+LINK_SIZE, recno);
4072 }
4073
4074 /* Similarly, check for the (?(DEFINE) "condition", which is always
4075 false. */
4076
4077 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4078 {
4079 code[1+LINK_SIZE] = OP_DEF;
4080 skipbytes = 1;
4081 }
4082
4083 /* Check for the "name" actually being a subpattern number. */
4084
4085 else if (recno > 0)
4086 {
4087 PUT2(code, 2+LINK_SIZE, recno);
4088 }
4089
4090 /* Either an unidentified subpattern, or a reference to (?(0) */
4091
4092 else
4093 {
4094 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4095 goto FAILED;
4096 }
4097 break;
4098
4099
4100 /* ------------------------------------------------------------ */
4101 case '=': /* Positive lookahead */
4102 bravalue = OP_ASSERT;
4103 ptr++;
4104 break;
4105
4106
4107 /* ------------------------------------------------------------ */
4108 case '!': /* Negative lookahead */
4109 bravalue = OP_ASSERT_NOT;
4110 ptr++;
4111 break;
4112
4113
4114 /* ------------------------------------------------------------ */
4115 case '<': /* Lookbehind or named define */
4116 switch (ptr[1])
4117 {
4118 case '=': /* Positive lookbehind */
4119 bravalue = OP_ASSERTBACK;
4120 ptr += 2;
4121 break;
4122
4123 case '!': /* Negative lookbehind */
4124 bravalue = OP_ASSERTBACK_NOT;
4125 ptr += 2;
4126 break;
4127
4128 default: /* Could be name define, else bad */
4129 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4130 ptr++; /* Correct offset for error */
4131 *errorcodeptr = ERR24;
4132 goto FAILED;
4133 }
4134 break;
4135
4136
4137 /* ------------------------------------------------------------ */
4138 case '>': /* One-time brackets */
4139 bravalue = OP_ONCE;
4140 ptr++;
4141 break;
4142
4143
4144 /* ------------------------------------------------------------ */
4145 case 'C': /* Callout - may be followed by digits; */
4146 previous_callout = code; /* Save for later completion */
4147 after_manual_callout = 1; /* Skip one item before completing */
4148 *code++ = OP_CALLOUT;
4149 {
4150 int n = 0;
4151 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4152 n = n * 10 + *ptr - '0';
4153 if (*ptr != ')')
4154 {
4155 *errorcodeptr = ERR39;
4156 goto FAILED;
4157 }
4158 if (n > 255)
4159 {
4160 *errorcodeptr = ERR38;
4161 goto FAILED;
4162 }
4163 *code++ = n;
4164 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4165 PUT(code, LINK_SIZE, 0); /* Default length */
4166 code += 2 * LINK_SIZE;
4167 }
4168 previous = NULL;
4169 continue;
4170
4171
4172 /* ------------------------------------------------------------ */
4173 case 'P': /* Python-style named subpattern handling */
4174 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4175 {
4176 is_recurse = *ptr == '>';
4177 terminator = ')';
4178 goto NAMED_REF_OR_RECURSE;
4179 }
4180 else if (*ptr != '<') /* Test for Python-style definition */
4181 {
4182 *errorcodeptr = ERR41;
4183 goto FAILED;
4184 }
4185 /* Fall through to handle (?P< as (?< is handled */
4186
4187
4188 /* ------------------------------------------------------------ */
4189 DEFINE_NAME: /* Come here from (?< handling */
4190 case '\'':
4191 {
4192 terminator = (*ptr == '<')? '>' : '\'';
4193 name = ++ptr;
4194
4195 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4196 namelen = ptr - name;
4197
4198 /* In the pre-compile phase, just do a syntax check. */
4199
4200 if (lengthptr != NULL)
4201 {
4202 if (*ptr != terminator)
4203 {
4204 *errorcodeptr = ERR42;
4205 goto FAILED;
4206 }
4207 if (cd->names_found >= MAX_NAME_COUNT)
4208 {
4209 *errorcodeptr = ERR49;
4210 goto FAILED;
4211 }
4212 if (namelen + 3 > cd->name_entry_size)
4213 {
4214 cd->name_entry_size = namelen + 3;
4215 if (namelen > MAX_NAME_SIZE)
4216 {
4217 *errorcodeptr = ERR48;
4218 goto FAILED;
4219 }
4220 }
4221 }
4222
4223 /* In the real compile, create the entry in the table */
4224
4225 else
4226 {
4227 slot = cd->name_table;
4228 for (i = 0; i < cd->names_found; i++)
4229 {
4230 int crc = memcmp(name, slot+2, namelen);
4231 if (crc == 0)
4232 {
4233 if (slot[2+namelen] == 0)
4234 {
4235 if ((options & PCRE_DUPNAMES) == 0)
4236 {
4237 *errorcodeptr = ERR43;
4238 goto FAILED;
4239 }
4240 }
4241 else crc = -1; /* Current name is substring */
4242 }
4243 if (crc < 0)
4244 {
4245 memmove(slot + cd->name_entry_size, slot,
4246 (cd->names_found - i) * cd->name_entry_size);
4247 break;
4248 }
4249 slot += cd->name_entry_size;
4250 }
4251
4252 PUT2(slot, 0, cd->bracount + 1);
4253 memcpy(slot + 2, name, namelen);
4254 slot[2+namelen] = 0;
4255 }
4256 }
4257
4258 /* In both cases, count the number of names we've encountered. */
4259
4260 ptr++; /* Move past > or ' */
4261 cd->names_found++;
4262 goto NUMBERED_GROUP;
4263
4264
4265 /* ------------------------------------------------------------ */
4266 case '&': /* Perl recursion/subroutine syntax */
4267 terminator = ')';
4268 is_recurse = TRUE;
4269 /* Fall through */
4270
4271 /* We come here from the Python syntax above that handles both
4272 references (?P=name) and recursion (?P>name), as well as falling
4273 through from the Perl recursion syntax (?&name). */
4274
4275 NAMED_REF_OR_RECURSE:
4276 name = ++ptr;
4277 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4278 namelen = ptr - name;
4279
4280 /* In the pre-compile phase, do a syntax check and set a dummy
4281 reference number. */
4282
4283 if (lengthptr != NULL)
4284 {
4285 if (*ptr != terminator)
4286 {
4287 *errorcodeptr = ERR42;
4288 goto FAILED;
4289 }
4290 if (namelen > MAX_NAME_SIZE)
4291 {
4292 *errorcodeptr = ERR48;
4293 goto FAILED;
4294 }
4295 recno = 0;
4296 }
4297
4298 /* In the real compile, seek the name in the table */
4299
4300 else
4301 {
4302 slot = cd->name_table;
4303 for (i = 0; i < cd->names_found; i++)
4304 {
4305 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4306 slot += cd->name_entry_size;
4307 }
4308
4309 if (i < cd->names_found) /* Back reference */
4310 {
4311 recno = GET2(slot, 0);
4312 }
4313 else if ((recno = /* Forward back reference */
4314 find_parens(ptr, cd->bracount, name, namelen,
4315 (options & PCRE_EXTENDED) != 0)) <= 0)
4316 {
4317 *errorcodeptr = ERR15;
4318 goto FAILED;
4319 }
4320 }
4321
4322 /* In both phases, we can now go to the code that handles numerical
4323 recursion or backreferences. */
4324
4325 if (is_recurse) goto HANDLE_RECURSION;
4326 else goto HANDLE_REFERENCE;
4327
4328
4329 /* ------------------------------------------------------------ */
4330 case 'R': /* Recursion */
4331 ptr++; /* Same as (?0) */
4332 /* Fall through */
4333
4334
4335 /* ------------------------------------------------------------ */
4336 case '-': case '+':
4337 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4338 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4339 {
4340 const uschar *called;
4341
4342 if ((refsign = *ptr) == '+') ptr++;
4343 else if (refsign == '-')
4344 {
4345 if ((digitab[ptr[1]] & ctype_digit) == 0)
4346 goto OTHER_CHAR_AFTER_QUERY;
4347 ptr++;
4348 }
4349
4350 recno = 0;
4351 while((digitab[*ptr] & ctype_digit) != 0)
4352 recno = recno * 10 + *ptr++ - '0';
4353
4354 if (*ptr != ')')
4355 {
4356 *errorcodeptr = ERR29;
4357 goto FAILED;
4358 }
4359
4360 if (refsign == '-')
4361 {
4362 if (recno == 0)
4363 {
4364 *errorcodeptr = ERR58;
4365 goto FAILED;
4366 }
4367 recno = cd->bracount - recno + 1;
4368 if (recno <= 0)
4369 {
4370 *errorcodeptr = ERR15;
4371 goto FAILED;
4372 }
4373 }
4374 else if (refsign == '+')
4375 {
4376 if (recno == 0)
4377 {
4378 *errorcodeptr = ERR58;
4379 goto FAILED;
4380 }
4381 recno += cd->bracount;
4382 }
4383
4384 /* Come here from code above that handles a named recursion */
4385
4386 HANDLE_RECURSION:
4387
4388 previous = code;
4389 called = cd->start_code;
4390
4391 /* When we are actually compiling, find the bracket that is being
4392 referenced. Temporarily end the regex in case it doesn't exist before
4393 this point. If we end up with a forward reference, first check that
4394 the bracket does occur later so we can give the error (and position)
4395 now. Then remember this forward reference in the workspace so it can
4396 be filled in at the end. */
4397
4398 if (lengthptr == NULL)
4399 {
4400 *code = OP_END;
4401 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4402
4403 /* Forward reference */
4404
4405 if (called == NULL)
4406 {
4407 if (find_parens(ptr, cd->bracount, NULL, recno,
4408 (options & PCRE_EXTENDED) != 0) < 0)
4409 {
4410 *errorcodeptr = ERR15;
4411 goto FAILED;
4412 }
4413 called = cd->start_code + recno;
4414 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4415 }
4416
4417 /* If not a forward reference, and the subpattern is still open,
4418 this is a recursive call. We check to see if this is a left
4419 recursion that could loop for ever, and diagnose that case. */
4420
4421 else if (GET(called, 1) == 0 &&
4422 could_be_empty(called, code, bcptr, utf8))
4423 {
4424 *errorcodeptr = ERR40;
4425 goto FAILED;
4426 }
4427 }
4428
4429 /* Insert the recursion/subroutine item, automatically wrapped inside
4430 "once" brackets. Set up a "previous group" length so that a
4431 subsequent quantifier will work. */
4432
4433 *code = OP_ONCE;
4434 PUT(code, 1, 2 + 2*LINK_SIZE);
4435 code += 1 + LINK_SIZE;
4436
4437 *code = OP_RECURSE;
4438 PUT(code, 1, called - cd->start_code);
4439 code += 1 + LINK_SIZE;
4440
4441 *code = OP_KET;
4442 PUT(code, 1, 2 + 2*LINK_SIZE);
4443 code += 1 + LINK_SIZE;
4444
4445 length_prevgroup = 3 + 3*LINK_SIZE;
4446 }
4447
4448 /* Can't determine a first byte now */
4449
4450 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4451 continue;
4452
4453
4454 /* ------------------------------------------------------------ */
4455 default: /* Other characters: check option setting */
4456 OTHER_CHAR_AFTER_QUERY:
4457 set = unset = 0;
4458 optset = &set;
4459
4460 while (*ptr != ')' && *ptr != ':')
4461 {
4462 switch (*ptr++)
4463 {
4464 case '-': optset = &unset; break;
4465
4466 case 'J': /* Record that it changed in the external options */
4467 *optset |= PCRE_DUPNAMES;
4468 cd->external_options |= PCRE_JCHANGED;
4469 break;
4470
4471 case 'i': *optset |= PCRE_CASELESS; break;
4472 case 'm': *optset |= PCRE_MULTILINE; break;
4473 case 's': *optset |= PCRE_DOTALL; break;
4474 case 'x': *optset |= PCRE_EXTENDED; break;
4475 case 'U': *optset |= PCRE_UNGREEDY; break;
4476 case 'X': *optset |= PCRE_EXTRA; break;
4477
4478 default: *errorcodeptr = ERR12;
4479 ptr--; /* Correct the offset */
4480 goto FAILED;
4481 }
4482 }
4483
4484 /* Set up the changed option bits, but don't change anything yet. */
4485
4486 newoptions = (options | set) & (~unset);
4487
4488 /* If the options ended with ')' this is not the start of a nested
4489 group with option changes, so the options change at this level. If this
4490 item is right at the start of the pattern, the options can be
4491 abstracted and made external in the pre-compile phase, and ignored in
4492 the compile phase. This can be helpful when matching -- for instance in
4493 caseless checking of required bytes.
4494
4495 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4496 definitely *not* at the start of the pattern because something has been
4497 compiled. In the pre-compile phase, however, the code pointer can have
4498 that value after the start, because it gets reset as code is discarded
4499 during the pre-compile. However, this can happen only at top level - if
4500 we are within parentheses, the starting BRA will still be present. At
4501 any parenthesis level, the length value can be used to test if anything
4502 has been compiled at that level. Thus, a test for both these conditions
4503 is necessary to ensure we correctly detect the start of the pattern in
4504 both phases.
4505
4506 If we are not at the pattern start, compile code to change the ims
4507 options if this setting actually changes any of them. We also pass the
4508 new setting back so that it can be put at the start of any following
4509 branches, and when this group ends (if we are in a group), a resetting
4510 item can be compiled. */
4511
4512 if (*ptr == ')')
4513 {
4514 if (code == cd->start_code + 1 + LINK_SIZE &&
4515 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4516 {
4517 cd->external_options = newoptions;
4518 options = newoptions;
4519 }
4520 else
4521 {
4522 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4523 {
4524 *code++ = OP_OPT;
4525 *code++ = newoptions & PCRE_IMS;
4526 }
4527
4528 /* Change options at this level, and pass them back for use
4529 in subsequent branches. Reset the greedy defaults and the case
4530 value for firstbyte and reqbyte. */
4531
4532 *optionsptr = options = newoptions;
4533 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4534 greedy_non_default = greedy_default ^ 1;
4535 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4536 }
4537
4538 previous = NULL; /* This item can't be repeated */
4539 continue; /* It is complete */
4540 }
4541
4542 /* If the options ended with ':' we are heading into a nested group
4543 with possible change of options. Such groups are non-capturing and are
4544 not assertions of any kind. All we need to do is skip over the ':';
4545 the newoptions value is handled below. */
4546
4547 bravalue = OP_BRA;
4548 ptr++;
4549 } /* End of switch for character following (? */
4550 } /* End of (? handling */
4551
4552 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4553 all unadorned brackets become non-capturing and behave like (?:...)
4554 brackets. */
4555
4556 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4557 {
4558 bravalue = OP_BRA;
4559 }
4560
4561 /* Else we have a capturing group. */
4562
4563 else
4564 {
4565 NUMBERED_GROUP:
4566 cd->bracount += 1;
4567 PUT2(code, 1+LINK_SIZE, cd->bracount);
4568 skipbytes = 2;
4569 }
4570
4571 /* Process nested bracketed regex. Assertions may not be repeated, but
4572 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4573 non-register variable in order to be able to pass its address because some
4574 compilers complain otherwise. Pass in a new setting for the ims options if
4575 they have changed. */
4576
4577 previous = (bravalue >= OP_ONCE)? code : NULL;
4578 *code = bravalue;
4579 tempcode = code;
4580 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4581 length_prevgroup = 0; /* Initialize for pre-compile phase */
4582
4583 if (!compile_regex(
4584 newoptions, /* The complete new option state */
4585 options & PCRE_IMS, /* The previous ims option state */
4586 &tempcode, /* Where to put code (updated) */
4587 &ptr, /* Input pointer (updated) */
4588 errorcodeptr, /* Where to put an error message */
4589 (bravalue == OP_ASSERTBACK ||
4590 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4591 reset_bracount, /* True if (?| group */
4592 skipbytes, /* Skip over bracket number */
4593 &subfirstbyte, /* For possible first char */
4594 &subreqbyte, /* For possible last char */
4595 bcptr, /* Current branch chain */
4596 cd, /* Tables block */
4597 (lengthptr == NULL)? NULL : /* Actual compile phase */
4598 &length_prevgroup /* Pre-compile phase */
4599 ))
4600 goto FAILED;
4601
4602 /* At the end of compiling, code is still pointing to the start of the
4603 group, while tempcode has been updated to point past the end of the group
4604 and any option resetting that may follow it. The pattern pointer (ptr)
4605 is on the bracket. */
4606
4607 /* If this is a conditional bracket, check that there are no more than
4608 two branches in the group, or just one if it's a DEFINE group. We do this
4609 in the real compile phase, not in the pre-pass, where the whole group may
4610 not be available. */
4611
4612 if (bravalue == OP_COND && lengthptr == NULL)
4613 {
4614 uschar *tc = code;
4615 int condcount = 0;
4616
4617 do {
4618 condcount++;
4619 tc += GET(tc,1);
4620 }
4621 while (*tc != OP_KET);
4622
4623 /* A DEFINE group is never obeyed inline (the "condition" is always
4624 false). It must have only one branch. */
4625
4626 if (code[LINK_SIZE+1] == OP_DEF)
4627 {
4628 if (condcount > 1)
4629 {
4630 *errorcodeptr = ERR54;
4631 goto FAILED;
4632 }
4633 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4634 }
4635
4636 /* A "normal" conditional group. If there is just one branch, we must not
4637 make use of its firstbyte or reqbyte, because this is equivalent to an
4638 empty second branch. */
4639
4640 else
4641 {
4642 if (condcount > 2)
4643 {
4644 *errorcodeptr = ERR27;
4645 goto FAILED;
4646 }
4647 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4648 }
4649 }
4650
4651 /* Error if hit end of pattern */
4652
4653 if (*ptr != ')')
4654 {
4655 *errorcodeptr = ERR14;
4656 goto FAILED;
4657 }
4658
4659 /* In the pre-compile phase, update the length by the length of the nested
4660 group, less the brackets at either end. Then reduce the compiled code to
4661 just the brackets so that it doesn't use much memory if it is duplicated by
4662 a quantifier. */
4663
4664 if (lengthptr != NULL)
4665 {
4666 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4667 {
4668 *errorcodeptr = ERR20;
4669 goto FAILED;
4670 }
4671 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4672 code++;
4673 PUTINC(code, 0, 1 + LINK_SIZE);
4674 *code++ = OP_KET;
4675 PUTINC(code, 0, 1 + LINK_SIZE);
4676 }
4677
4678 /* Otherwise update the main code pointer to the end of the group. */
4679
4680 else code = tempcode;
4681
4682 /* For a DEFINE group, required and first character settings are not
4683 relevant. */
4684
4685 if (bravalue == OP_DEF) break;
4686
4687 /* Handle updating of the required and first characters for other types of
4688 group. Update for normal brackets of all kinds, and conditions with two
4689 branches (see code above). If the bracket is followed by a quantifier with
4690 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4691 zerofirstbyte outside the main loop so that they can be accessed for the
4692 back off. */
4693
4694 zeroreqbyte = reqbyte;
4695 zerofirstbyte = firstbyte;
4696 groupsetfirstbyte = FALSE;
4697
4698 if (bravalue >= OP_ONCE)
4699 {
4700 /* If we have not yet set a firstbyte in this branch, take it from the
4701 subpattern, remembering that it was set here so that a repeat of more
4702 than one can replicate it as reqbyte if necessary. If the subpattern has
4703 no firstbyte, set "none" for the whole branch. In both cases, a zero
4704 repeat forces firstbyte to "none". */
4705
4706 if (firstbyte == REQ_UNSET)
4707 {
4708 if (subfirstbyte >= 0)
4709 {
4710 firstbyte = subfirstbyte;
4711 groupsetfirstbyte = TRUE;
4712 }
4713 else firstbyte = REQ_NONE;
4714 zerofirstbyte = REQ_NONE;
4715 }
4716
4717 /* If firstbyte was previously set, convert the subpattern's firstbyte
4718 into reqbyte if there wasn't one, using the vary flag that was in
4719 existence beforehand. */
4720
4721 else if (subfirstbyte >= 0 && subreqbyte < 0)
4722 subreqbyte = subfirstbyte | tempreqvary;
4723
4724 /* If the subpattern set a required byte (or set a first byte that isn't
4725 really the first byte - see above), set it. */
4726
4727 if (subreqbyte >= 0) reqbyte = subreqbyte;
4728 }
4729
4730 /* For a forward assertion, we take the reqbyte, if set. This can be
4731 helpful if the pattern that follows the assertion doesn't set a different
4732 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4733 for an assertion, however because it leads to incorrect effect for patterns
4734 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4735 of a firstbyte. This is overcome by a scan at the end if there's no
4736 firstbyte, looking for an asserted first char. */
4737
4738 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4739 break; /* End of processing '(' */
4740
4741
4742 /* ===================================================================*/
4743 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4744 are arranged to be the negation of the corresponding OP_values. For the
4745 back references, the values are ESC_REF plus the reference number. Only
4746 back references and those types that consume a character may be repeated.
4747 We can test for values between ESC_b and ESC_Z for the latter; this may
4748 have to change if any new ones are ever created. */
4749
4750 case '\\':
4751 tempptr = ptr;
4752 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4753 if (*errorcodeptr != 0) goto FAILED;
4754
4755 if (c < 0)
4756 {
4757 if (-c == ESC_Q) /* Handle start of quoted string */
4758 {
4759 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4760 else inescq = TRUE;
4761 continue;
4762 }
4763
4764 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4765
4766 /* For metasequences that actually match a character, we disable the
4767 setting of a first character if it hasn't already been set. */
4768
4769 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4770 firstbyte = REQ_NONE;
4771
4772 /* Set values to reset to if this is followed by a zero repeat. */
4773
4774 zerofirstbyte = firstbyte;
4775 zeroreqbyte = reqbyte;
4776
4777 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4778 We also support \k{name} (.NET syntax) */
4779
4780 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4781 {
4782 is_recurse = FALSE;
4783 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4784 goto NAMED_REF_OR_RECURSE;
4785 }
4786
4787 /* Back references are handled specially; must disable firstbyte if
4788 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4789 ':' later. */
4790
4791 if (-c >= ESC_REF)
4792 {
4793 recno = -c - ESC_REF;
4794
4795 HANDLE_REFERENCE: /* Come here from named backref handling */
4796 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4797 previous = code;
4798 *code++ = OP_REF;
4799 PUT2INC(code, 0, recno);
4800 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4801 if (recno > cd->top_backref) cd->top_backref = recno;
4802 }
4803
4804 /* So are Unicode property matches, if supported. */
4805
4806 #ifdef SUPPORT_UCP
4807 else if (-c == ESC_P || -c == ESC_p)
4808 {
4809 BOOL negated;
4810 int pdata;
4811 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4812 if (ptype < 0) goto FAILED;
4813 previous = code;
4814 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4815 *code++ = ptype;
4816 *code++ = pdata;
4817 }
4818 #else
4819
4820 /* If Unicode properties are not supported, \X, \P, and \p are not
4821 allowed. */
4822
4823 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4824 {
4825 *errorcodeptr = ERR45;
4826 goto FAILED;
4827 }
4828 #endif
4829
4830 /* For the rest (including \X when Unicode properties are supported), we
4831 can obtain the OP value by negating the escape value. */
4832
4833 else
4834 {
4835 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4836 *code++ = -c;
4837 }
4838 continue;
4839 }
4840
4841 /* We have a data character whose value is in c. In UTF-8 mode it may have
4842 a value > 127. We set its representation in the length/buffer, and then
4843 handle it as a data character. */
4844
4845 #ifdef SUPPORT_UTF8
4846 if (utf8 && c > 127)
4847 mclength = _pcre_ord2utf8(c, mcbuffer);
4848 else
4849 #endif
4850
4851 {
4852 mcbuffer[0] = c;
4853 mclength = 1;
4854 }
4855 goto ONE_CHAR;
4856
4857
4858 /* ===================================================================*/
4859 /* Handle a literal character. It is guaranteed not to be whitespace or #
4860 when the extended flag is set. If we are in UTF-8 mode, it may be a
4861 multi-byte literal character. */
4862
4863 default:
4864 NORMAL_CHAR:
4865 mclength = 1;
4866 mcbuffer[0] = c;
4867
4868 #ifdef SUPPORT_UTF8
4869 if (utf8 && c >= 0xc0)
4870 {
4871 while ((ptr[1] & 0xc0) == 0x80)
4872 mcbuffer[mclength++] = *(++ptr);
4873 }
4874 #endif
4875
4876 /* At this point we have the character's bytes in mcbuffer, and the length
4877 in mclength. When not in UTF-8 mode, the length is always 1. */
4878
4879 ONE_CHAR:
4880 previous = code;
4881 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4882 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4883
4884 /* Set the first and required bytes appropriately. If no previous first
4885 byte, set it from this character, but revert to none on a zero repeat.
4886 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4887 repeat. */
4888
4889 if (firstbyte == REQ_UNSET)
4890 {
4891 zerofirstbyte = REQ_NONE;
4892 zeroreqbyte = reqbyte;
4893
4894 /* If the character is more than one byte long, we can set firstbyte
4895 only if it is not to be matched caselessly. */
4896
4897 if (mclength == 1 || req_caseopt == 0)
4898 {
4899 firstbyte = mcbuffer[0] | req_caseopt;
4900 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4901 }
4902 else firstbyte = reqbyte = REQ_NONE;
4903 }
4904
4905 /* firstbyte was previously set; we can set reqbyte only if the length is
4906 1 or the matching is caseful. */
4907
4908 else
4909 {
4910 zerofirstbyte = firstbyte;
4911 zeroreqbyte = reqbyte;
4912 if (mclength == 1 || req_caseopt == 0)
4913 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4914 }
4915
4916 break; /* End of literal character handling */
4917 }
4918 } /* end of big loop */
4919
4920
4921 /* Control never reaches here by falling through, only by a goto for all the
4922 error states. Pass back the position in the pattern so that it can be displayed
4923 to the user for diagnosing the error. */
4924
4925 FAILED:
4926 *ptrptr = ptr;
4927 return FALSE;
4928 }
4929
4930
4931
4932
4933 /*************************************************
4934 * Compile sequence of alternatives *
4935 *************************************************/
4936
4937 /* On entry, ptr is pointing past the bracket character, but on return it
4938 points to the closing bracket, or vertical bar, or end of string. The code
4939 variable is pointing at the byte into which the BRA operator has been stored.
4940 If the ims options are changed at the start (for a (?ims: group) or during any
4941 branch, we need to insert an OP_OPT item at the start of every following branch
4942 to ensure they get set correctly at run time, and also pass the new options
4943 into every subsequent branch compile.
4944
4945 This function is used during the pre-compile phase when we are trying to find
4946 out the amount of memory needed, as well as during the real compile phase. The
4947 value of lengthptr distinguishes the two phases.
4948
4949 Arguments:
4950 options option bits, including any changes for this subpattern
4951 oldims previous settings of ims option bits
4952 codeptr -> the address of the current code pointer
4953 ptrptr -> the address of the current pattern pointer
4954 errorcodeptr -> pointer to error code variable
4955 lookbehind TRUE if this is a lookbehind assertion
4956 reset_bracount TRUE to reset the count for each branch
4957 skipbytes skip this many bytes at start (for brackets and OP_COND)
4958 firstbyteptr place to put the first required character, or a negative number
4959 reqbyteptr place to put the last required character, or a negative number
4960 bcptr pointer to the chain of currently open branches
4961 cd points to the data block with tables pointers etc.
4962 lengthptr NULL during the real compile phase
4963 points to length accumulator during pre-compile phase
4964
4965 Returns: TRUE on success; FALSE on failure (error code in *errorcodeptr, *ptrptr updated)
4966 */
4967
4968 static BOOL /* Compiles a sequence of '|'-separated branches as one bracketed group */
4969 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4970 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4971 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4972 int *lengthptr)
4973 {
4974 const uschar *ptr = *ptrptr; /* Working copy of the pattern pointer */
4975 uschar *code = *codeptr; /* Working copy of the code pointer */
4976 uschar *last_branch = code; /* Start of the most recent branch (the BRA, then each ALT) */
4977 uschar *start_bracket = code; /* Start of the whole group, used for the KET offset */
4978 uschar *reverse_count = NULL; /* Where the OP_REVERSE length is stored, if lookbehind */
4979 int firstbyte, reqbyte; /* Values accumulated for the whole group */
4980 int branchfirstbyte, branchreqbyte; /* Values passed back for the current branch */
4981 int length; /* Length accumulator for the pre-compile phase */
4982 int orig_bracount; /* Bracket count on entry, restored per branch for (?| */
4983 int max_bracount; /* Highest bracket count seen over all branches */
4984 branch_chain bc; /* This group's link in the chain of open branches */
4985
4986 bc.outer = bcptr;
4987 bc.current = code;
4988
4989 firstbyte = reqbyte = REQ_UNSET;
4990
4991 /* Accumulate the length for use in the pre-compile phase. Start with the
4992 length of the BRA and KET and any extra bytes that are required at the
4993 beginning. We accumulate in a local variable to save frequent testing of
4994 lengthptr for NULL. We cannot do this by looking at the value of code at the
4995 start and end of each alternative, because compiled items are discarded during
4996 the pre-compile phase so that the work space is not exceeded. */
4997
4998 length = 2 + 2*LINK_SIZE + skipbytes;
4999
5000 /* WARNING: If the above line is changed for any reason, you must also change
5001 the code that abstracts option settings at the start of the pattern and makes
5002 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5003 pre-compile phase to find out whether anything has yet been compiled or not. */
5004
5005 /* Offset is set zero to mark that this bracket is still open */
5006
5007 PUT(code, 1, 0);
5008 code += 1 + LINK_SIZE + skipbytes; /* Skip the opcode, the link field, and any extra bytes */
5009
5010 /* Loop for each alternative branch */
5011
5012 orig_bracount = max_bracount = cd->bracount;
5013 for (;;)
5014 {
5015 /* For a (?| group, reset the capturing bracket count so that each branch
5016 uses the same numbers. */
5017
5018 if (reset_bracount) cd->bracount = orig_bracount;
5019
5020 /* Handle a change of ims options at the start of the branch */
5021
5022 if ((options & PCRE_IMS) != oldims)
5023 {
5024 *code++ = OP_OPT;
5025 *code++ = options & PCRE_IMS;
5026 length += 2; /* Two bytes: OP_OPT and its data byte */
5027 }
5028
5029 /* Set up dummy OP_REVERSE if lookbehind assertion */
5030
5031 if (lookbehind)
5032 {
5033 *code++ = OP_REVERSE;
5034 reverse_count = code;
5035 PUTINC(code, 0, 0); /* Placeholder; the real length is stored in the real compile phase below */
5036 length += 1 + LINK_SIZE;
5037 }
5038
5039 /* Now compile the branch; in the pre-compile phase its length gets added
5040 into the length. */
5041
5042 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5043 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5044 {
5045 *ptrptr = ptr; /* Pass back the pattern position reached by compile_branch */
5046 return FALSE;
5047 }
5048
5049 /* Keep the highest bracket count in case (?| was used and some branch
5050 has fewer than the rest. */
5051
5052 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5053
5054 /* In the real compile phase, there is some post-processing to be done. */
5055
5056 if (lengthptr == NULL)
5057 {
5058 /* If this is the first branch, the firstbyte and reqbyte values for the
5059 branch become the values for the regex. */
5060
5061 if (*last_branch != OP_ALT) /* last_branch still points at the opening bracket */
5062 {
5063 firstbyte = branchfirstbyte;
5064 reqbyte = branchreqbyte;
5065 }
5066
5067 /* If this is not the first branch, the first char and reqbyte have to
5068 match the values from all the previous branches, except that if the
5069 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5070 and we set REQ_VARY for the regex. */
5071
5072 else
5073 {
5074 /* If we previously had a firstbyte, but it doesn't match the new branch,
5075 we have to abandon the firstbyte for the regex, but if there was
5076 previously no reqbyte, it takes on the value of the old firstbyte. */
5077
5078 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5079 {
5080 if (reqbyte < 0) reqbyte = firstbyte;
5081 firstbyte = REQ_NONE;
5082 }
5083
5084 /* If we (now or from before) have no firstbyte, a firstbyte from the
5085 branch becomes a reqbyte if there isn't a branch reqbyte. */
5086
5087 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5088 branchreqbyte = branchfirstbyte;
5089
5090 /* Now ensure that the reqbytes match */
5091
5092 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5093 reqbyte = REQ_NONE;
5094 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5095 }
5096
5097 /* If lookbehind, check that this branch matches a fixed-length string, and
5098 put the length into the OP_REVERSE item. Temporarily mark the end of the
5099 branch with OP_END. */
5100
5101 if (lookbehind)
5102 {
5103 int fixed_length;
5104 *code = OP_END;
5105 fixed_length = find_fixedlength(last_branch, options);
5106 DPRINTF(("fixed length = %d\n", fixed_length));
5107 if (fixed_length < 0) /* A negative result is treated as an error */
5108 {
5109 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5110 *ptrptr = ptr;
5111 return FALSE;
5112 }
5113 PUT(reverse_count, 0, fixed_length); /* Overwrite the dummy value set up before the branch */
5114 }
5115 }
5116
5117 /* Reached end of expression, either ')' or end of pattern. In the real
5118 compile phase, go back through the alternative branches and reverse the chain
5119 of offsets, with the field in the BRA item now becoming an offset to the
5120 first alternative. If there are no alternatives, it points to the end of the
5121 group. The length in the terminating ket is always the length of the whole
5122 bracketed item. If any of the ims options were changed inside the group,
5123 compile a resetting op-code following, except at the very end of the pattern.
5124 Return leaving the pointer at the terminating char. */
5125
5126 if (*ptr != '|')
5127 {
5128 if (lengthptr == NULL)
5129 {
5130 int branch_length = code - last_branch; /* Length of the final branch */
5131 do
5132 {
5133 int prev_length = GET(last_branch, 1); /* Backward offset to the previous branch (0 at the BRA) */
5134 PUT(last_branch, 1, branch_length); /* Replace it with the forward offset to what follows */
5135 branch_length = prev_length;
5136 last_branch -= branch_length; /* Step back to the previous branch */
5137 }
5138 while (branch_length > 0); /* A zero offset means the opening bracket has been processed */
5139 }
5140
5141 /* Fill in the ket */
5142
5143 *code = OP_KET;
5144 PUT(code, 1, code - start_bracket); /* Offset from the start of the group to the KET */
5145 code += 1 + LINK_SIZE;
5146
5147 /* Resetting option if needed */
5148
5149 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5150 {
5151 *code++ = OP_OPT;
5152 *code++ = oldims;
5153 length += 2;
5154 }
5155
5156 /* Retain the highest bracket number, in case resetting was used. */
5157
5158 cd->bracount = max_bracount;
5159
5160 /* Set values to pass back */
5161
5162 *codeptr = code;
5163 *ptrptr = ptr;
5164 *firstbyteptr = firstbyte;
5165 *reqbyteptr = reqbyte;
5166 if (lengthptr != NULL)
5167 {
5168 if (OFLOW_MAX - *lengthptr < length) /* Adding length would take the total above OFLOW_MAX */
5169 {
5170 *errorcodeptr = ERR20;
5171 return FALSE;
5172 }
5173 *lengthptr += length;
5174 }
5175 return TRUE;
5176 }
5177
5178 /* Another branch follows. In the pre-compile phase, we can move the code
5179 pointer back to where it was for the start of the first branch. (That is,
5180 pretend that each branch is the only one.)
5181
5182 In the real compile phase, insert an ALT node. Its length field points back
5183 to the previous branch while the bracket remains open. At the end the chain
5184 is reversed. It's done like this so that the start of the bracket has a
5185 zero offset until it is closed, making it possible to detect recursion. */
5186
5187 if (lengthptr != NULL)
5188 {
5189 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5190 length += 1 + LINK_SIZE; /* Count the ALT item that the real compile phase inserts */
5191 }
5192 else
5193 {
5194 *code = OP_ALT;
5195 PUT(code, 1, code - last_branch);
5196 bc.current = last_branch = code; /* The new ALT is now the current branch */
5197 code += 1 + LINK_SIZE;
5198 }
5199
5200 ptr++; /* Move past the '|' */
5201 }
5202 /* Control never reaches here */
5203 }
5204
5205
5206
5207
5208 /*************************************************
5209 * Check for anchored expression *
5210 *************************************************/
5211
5212 /* Try to find out if this is an anchored regular expression. Consider each
5213 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5214 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5215 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5216 counts, since OP_CIRC can match in the middle.
5217
5218 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5219 This is the code for \G, which means "match at start of match position, taking
5220 into account the match offset".
5221
5222 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5223 because that will try the rest of the pattern at all possible matching points,
5224 so there is no point trying again.... er ....
5225
5226 .... except when the .* appears inside capturing parentheses, and there is a
5227 subsequent back reference to those parentheses. We haven't enough information
5228 to catch that case precisely.
5229
5230 At first, the best we could do was to detect when .* was in capturing brackets
5231 and the highest back reference was greater than or equal to that level.
5232 However, by keeping a bitmap of the first 31 back references, we can catch some
5233 of the more common cases more precisely.
5234
5235 Arguments:
5236 code points to start of expression (the bracket)
5237 options points to the options setting
5238 bracket_map a bitmap of which brackets we are inside while testing; this
5239 handles up to substring 31; after that we just have to take
5240 the less precise approach
5241 backref_map the back reference bitmap
5242
5243 Returns: TRUE or FALSE
5244 */
5245
5246 static BOOL
5247 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5248 unsigned int backref_map)
5249 {
5250 do {
5251 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5252 options, PCRE_MULTILINE, FALSE);
5253 register int op = *scode;
5254
5255 /* Non-capturing brackets */
5256
5257 if (op == OP_BRA)
5258 {
5259 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5260 }
5261
5262 /* Capturing brackets */
5263
5264 else if (op == OP_CBRA)
5265 {
5266 int n = GET2(scode, 1+LINK_SIZE);
5267 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5268 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5269 }
5270
5271 /* Other brackets */
5272
5273 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5274 {
5275 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5276 }
5277
5278 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5279 are or may be referenced. */
5280
5281 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5282 op == OP_TYPEPOSSTAR) &&
5283 (*options & PCRE_DOTALL) != 0)
5284 {
5285 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5286 }
5287
5288 /* Check for explicit anchoring */
5289
5290 else if (op != OP_SOD && op != OP_SOM &&
5291 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5292 return FALSE;
5293 code += GET(code, 1);
5294 }
5295 while (*code == OP_ALT); /* Loop for each alternative */
5296 return TRUE;
5297 }
5298
5299
5300
5301 /*************************************************
5302 * Check for starting with ^ or .* *
5303 *************************************************/
5304
5305 /* This is called to find out if every branch starts with ^ or .* so that
5306 "first char" processing can be done to speed things up in multiline
5307 matching and for non-DOTALL patterns that start with .* (which must start at
5308 the beginning or after \n). As in the case of is_anchored() (see above), we
5309 have to take account of back references to capturing brackets that contain .*
5310 because in that case we can't make the assumption.
5311
5312 Arguments:
5313 code points to start of expression (the bracket)
5314 bracket_map a bitmap of which brackets we are inside while testing; this
5315 handles up to substring 31; after that we just have to take
5316 the less precise approach
5317 backref_map the back reference bitmap
5318
5319 Returns: TRUE or FALSE
5320 */
5321
5322 static BOOL
5323 is_startline(const uschar *code, unsigned int bracket_map,
5324 unsigned int backref_map)
5325 {
5326 do {
5327 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5328 NULL, 0, FALSE);
5329 register int op = *scode;
5330
5331 /* Non-capturing brackets */
5332
5333 if (op == OP_BRA)
5334 {
5335 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5336 }
5337
5338 /* Capturing brackets */
5339
5340 else if (op == OP_CBRA)
5341 {
5342 int n = GET2(scode, 1+LINK_SIZE);
5343 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5344 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5345 }
5346
5347 /* Other brackets */
5348
5349 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5350 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5351
5352 /* .* means "start at start or after \n" if it isn't in brackets that
5353 may be referenced. */
5354
5355 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5356 {
5357 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5358 }
5359
5360 /* Check for explicit circumflex */
5361
5362 else if (op != OP_CIRC) return FALSE;
5363
5364 /* Move on to the next alternative */
5365
5366 code += GET(code, 1);
5367 }
5368 while (*code == OP_ALT); /* Loop for each alternative */
5369 return TRUE;
5370 }
5371
5372
5373
5374 /*************************************************
5375 * Check for asserted fixed first char *
5376 *************************************************/
5377
5378 /* During compilation, the "first char" settings from forward assertions are
5379 discarded, because they can cause conflicts with actual literals that follow.
5380 However, if we end up without a first char setting for an unanchored pattern,
5381 it is worth scanning the regex to see if there is an initial asserted first
5382 char. If all branches start with the same asserted char, or with a bracket all
5383 of whose alternatives start with the same asserted char (recurse ad lib), then
5384 we return that char, otherwise -1.
5385
5386 Arguments:
5387 code points to start of expression (the bracket)
5388 options pointer to the options (used to check casing changes)
5389 inassert TRUE if in an assertion
5390
5391 Returns: -1 or the fixed first char
5392 */
5393
5394 static int
5395 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5396 {
5397 register int c = -1;
5398 do {
5399 int d;
5400 const uschar *scode =
5401 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5402 register int op = *scode;
5403
5404 switch(op)
5405 {
5406 default:
5407 return -1;
5408
5409 case OP_BRA:
5410 case OP_CBRA:
5411 case OP_ASSERT:
5412 case OP_ONCE:
5413 case OP_COND:
5414 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5415 return -1;
5416 if (c < 0) c = d; else if (c != d) return -1;
5417 break;
5418
5419 case OP_EXACT: /* Fall through */
5420 scode += 2;
5421
5422 case OP_CHAR:
5423 case OP_CHARNC:
5424 case OP_PLUS:
5425 case OP_MINPLUS:
5426 case OP_POSPLUS:
5427 if (!inassert) return -1;
5428 if (c < 0)
5429 {
5430 c = scode[1];
5431 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5432 }
5433 else if (c != scode[1]) return -1;
5434 break;
5435 }
5436
5437 code += GET(code, 1);
5438 }
5439 while (*code == OP_ALT);
5440 return c;
5441 }
5442
5443
5444
5445 /*************************************************
5446 * Compile a Regular Expression *
5447 *************************************************/
5448
5449 /* This function takes a string and returns a pointer to a block of store
5450 holding a compiled version of the expression. The original API for this
5451 function had no error code return variable; it is retained for backwards
5452 compatibility. The new function is given a new name.
5453
5454 Arguments:
5455 pattern the regular expression
5456 options various option bits
5457 errorcodeptr pointer to error code variable (pcre_compile2() only)
5458 can be NULL if you don't want a code value
5459 errorptr pointer to pointer to error text
5460 erroroffset ptr offset in pattern where error was detected
5461 tables pointer to character tables or NULL
5462
5463 Returns: pointer to compiled data block, or NULL on error,
5464 with errorptr and erroroffset set
5465 */
5466
5467 PCRE_EXP_DEFN pcre *
5468 pcre_compile(const char *pattern, int options, const char **errorptr,
5469 int *erroroffset, const unsigned char *tables)
5470 {
5471 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5472 }
5473
5474
5475 PCRE_EXP_DEFN pcre *
5476 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5477 const char **errorptr, int *erroroffset, const unsigned char *tables)
5478 {
5479 real_pcre *re;
5480 int length = 1; /* For final END opcode */
5481 int firstbyte, reqbyte, newline;
5482 int errorcode = 0;
5483 #ifdef SUPPORT_UTF8
5484 BOOL utf8;
5485 #endif
5486 size_t size;
5487 uschar *code;
5488 const uschar *codestart;
5489 const uschar *ptr;
5490 compile_data compile_block;
5491 compile_data *cd = &compile_block;
5492
5493 /* This space is used for "compiling" into during the first phase, when we are
5494 computing the amount of memory that is needed. Compiled items are thrown away
5495 as soon as possible, so that a fairly large buffer should be sufficient for
5496 this purpose. The same space is used in the second phase for remembering where
5497 to fill in forward references to subpatterns. */
5498
5499 uschar cworkspace[COMPILE_WORK_SIZE];
5500
5501
5502 /* Set this early so that early errors get offset 0. */
5503
5504 ptr = (const uschar *)pattern;
5505
5506 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5507 can do is just return NULL, but we can set a code value if there is a code
5508 pointer. */
5509
5510 if (errorptr == NULL)
5511 {
5512 if (errorcodeptr != NULL) *errorcodeptr = 99;
5513 return NULL;
5514 }
5515
5516 *errorptr = NULL;
5517 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5518
5519 /* However, we can give a message for this error */
5520
5521 if (erroroffset == NULL)
5522 {
5523 errorcode = ERR16;
5524 goto PCRE_EARLY_ERROR_RETURN2;
5525 }
5526
5527 *erroroffset = 0;
5528
5529 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5530
5531 #ifdef SUPPORT_UTF8
5532 utf8 = (options & PCRE_UTF8) != 0;
5533 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5534 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5535 {
5536 errorcode = ERR44;
5537 goto PCRE_EARLY_ERROR_RETURN2;
5538 }
5539 #else
5540 if ((options & PCRE_UTF8) != 0)
5541 {
5542 errorcode = ERR32;
5543 goto PCRE_EARLY_ERROR_RETURN;
5544 }
5545 #endif
5546
5547 if ((options & ~PUBLIC_OPTIONS) != 0)
5548 {
5549 errorcode = ERR17;
5550 goto PCRE_EARLY_ERROR_RETURN;
5551 }
5552
5553 /* Set up pointers to the individual character tables */
5554
5555 if (tables == NULL) tables = _pcre_default_tables;
5556 cd->lcc = tables + lcc_offset;
5557 cd->fcc = tables + fcc_offset;
5558 cd->cbits = tables + cbits_offset;
5559 cd->ctypes = tables + ctypes_offset;
5560
5561 /* Handle different types of newline. The three bits give seven cases. The
5562 current code allows for fixed one- or two-byte sequences, plus "any" and
5563 "anycrlf". */
5564
5565 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5566 {
5567 case 0: newline = NEWLINE; break; /* Compile-time default */
5568 case PCRE_NEWLINE_CR: newline = '\r'; break;
5569 case PCRE_NEWLINE_LF: newline = '\n'; break;
5570 case PCRE_NEWLINE_CR+
5571 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5572 case PCRE_NEWLINE_ANY: newline = -1; break;
5573 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5574 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5575 }
5576
5577 if (newline == -2)
5578 {
5579 cd->nltype = NLTYPE_ANYCRLF;
5580 }
5581 else if (newline < 0)
5582 {
5583 cd->nltype = NLTYPE_ANY;
5584 }
5585 else
5586 {
5587 cd->nltype = NLTYPE_FIXED;
5588 if (newline > 255)
5589 {
5590 cd->nllen = 2;
5591 cd->nl[0] = (newline >> 8) & 255;
5592 cd->nl[1] = newline & 255;
5593 }
5594 else
5595 {
5596 cd->nllen = 1;
5597 cd->nl[0] = newline;
5598 }
5599 }
5600
5601 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5602 references to help in deciding whether (.*) can be treated as anchored or not.
5603 */
5604
5605 cd->top_backref = 0;
5606 cd->backref_map = 0;
5607
5608 /* Reflect pattern for debugging output */
5609
5610 DPRINTF(("------------------------------------------------------------------\n"));
5611 DPRINTF(("%s\n", pattern));
5612
5613 /* Pretend to compile the pattern while actually just accumulating the length
5614 of memory required. This behaviour is triggered by passing a non-NULL final
5615 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5616 to compile parts of the pattern into; the compiled code is discarded when it is
5617 no longer needed, so hopefully this workspace will never overflow, though there
5618 is a test for its doing so. */
5619
5620 cd->bracount = 0;
5621 cd->names_found = 0;
5622 cd->name_entry_size = 0;
5623 cd->name_table = NULL;
5624 cd->start_workspace = cworkspace;
5625 cd->start_code = cworkspace;
5626 cd->hwm = cworkspace;
5627 cd->start_pattern = (const uschar *)pattern;
5628 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5629 cd->req_varyopt = 0;
5630 cd->nopartial = FALSE;
5631 cd->external_options = options;
5632
5633 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5634 don't need to look at the result of the function here. The initial options have
5635 been put into the cd block so that they can be changed if an option setting is
5636 found within the regex right at the beginning. Bringing initial option settings
5637 outside can help speed up starting point checks. */
5638
5639 code = cworkspace;
5640 *code = OP_BRA;
5641 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5642 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5643 &length);
5644 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5645
5646 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5647 cd->hwm - cworkspace));
5648
5649 if (length > MAX_PATTERN_SIZE)
5650 {
5651 errorcode = ERR20;
5652 goto PCRE_EARLY_ERROR_RETURN;
5653 }
5654
5655 /* Compute the size of data block needed and get it, either from malloc or
5656 externally provided function. Integer overflow should no longer be possible
5657 because nowadays we limit the maximum value of cd->names_found and
5658 cd->name_entry_size. */
5659
5660 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5661 re = (real_pcre *)(pcre_malloc)(size);
5662
5663 if (re == NULL)
5664 {
5665 errorcode = ERR21;
5666 goto PCRE_EARLY_ERROR_RETURN;
5667 }
5668
5669 /* Put in the magic number, and save the sizes, initial options, and character
5670 table pointer. NULL is used for the default character tables. The nullpad field
5671 is at the end; it's there to help in the case when a regex compiled on a system
5672 with 4-byte pointers is run on another with 8-byte pointers. */
5673
5674 re->magic_number = MAGIC_NUMBER;
5675 re->size = size;
5676 re->options = cd->external_options;
5677 re->dummy1 = 0;
5678 re->first_byte = 0;
5679 re->req_byte = 0;
5680 re->name_table_offset = sizeof(real_pcre);
5681 re->name_entry_size = cd->name_entry_size;
5682 re->name_count = cd->names_found;
5683 re->ref_count = 0;
5684 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5685 re->nullpad = NULL;
5686
5687 /* The starting points of the name/number translation table and of the code are
5688 passed around in the compile data block. The start/end pattern and initial
5689 options are already set from the pre-compile phase, as is the name_entry_size
5690 field. Reset the bracket count and the names_found field. Also reset the hwm
5691 field; this time it's used for remembering forward references to subpatterns.
5692 */
5693
5694 cd->bracount = 0;
5695 cd->names_found = 0;
5696 cd->name_table = (uschar *)re + re->name_table_offset;
5697 codestart = cd->name_table + re->name_entry_size * re->name_count;
5698 cd->start_code = codestart;
5699 cd->hwm = cworkspace;
5700 cd->req_varyopt = 0;
5701 cd->nopartial = FALSE;
5702
5703 /* Set up a starting, non-extracting bracket, then compile the expression. On
5704 error, errorcode will be set non-zero, so we don't need to look at the result
5705 of the function here. */
5706
5707 ptr = (const uschar *)pattern;
5708 code = (uschar *)codestart;
5709 *code = OP_BRA;
5710 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5711 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5712 re->top_bracket = cd->bracount;
5713 re->top_backref = cd->top_backref;
5714
5715 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5716
5717 /* If not reached end of pattern on success, there's an excess bracket. */
5718
5719 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5720
5721 /* Fill in the terminating state and check for disastrous overflow, but
5722 if debugging, leave the test till after things are printed out. */
5723
5724 *code++ = OP_END;
5725
5726 #ifndef DEBUG
5727 if (code - codestart > length) errorcode = ERR23;
5728 #endif
5729
5730 /* Fill in any forward references that are required. */
5731
5732 while (errorcode == 0 && cd->hwm > cworkspace)
5733 {
5734 int offset, recno;
5735 const uschar *groupptr;
5736 cd->hwm -= LINK_SIZE;
5737 offset = GET(cd->hwm, 0);
5738 recno = GET(codestart, offset);
5739 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5740 if (groupptr == NULL) errorcode = ERR53;
5741 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5742 }
5743
5744 /* Give an error if there's back reference to a non-existent capturing
5745 subpattern. */
5746
5747 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5748
5749 /* Failed to compile, or error while post-processing */
5750
5751 if (errorcode != 0)
5752 {
5753 (pcre_free)(re);
5754 PCRE_EARLY_ERROR_RETURN:
5755 *erroroffset = ptr - (const uschar *)pattern;
5756 PCRE_EARLY_ERROR_RETURN2:
5757 *errorptr = error_texts[errorcode];
5758 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5759 return NULL;
5760 }
5761
5762 /* If the anchored option was not passed, set the flag if we can determine that
5763 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5764 as starting with .* when DOTALL is set).
5765
5766 Otherwise, if we know what the first byte has to be, save it, because that
5767 speeds up unanchored matches no end. If not, see if we can set the
5768 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5769 start with ^. and also when all branches start with .* for non-DOTALL matches.
5770 */
5771
5772 if ((re->options & PCRE_ANCHORED) == 0)
5773 {
5774 int temp_options = re->options; /* May get changed during these scans */
5775 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5776 re->options |= PCRE_ANCHORED;
5777 else
5778 {
5779 if (firstbyte < 0)
5780 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5781 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5782 {
5783 int ch = firstbyte & 255;
5784 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5785 cd->fcc[ch] == ch)? ch : firstbyte;
5786 re->options |= PCRE_FIRSTSET;
5787 }
5788 else if (is_startline(codestart, 0, cd->backref_map))
5789 re->options |= PCRE_STARTLINE;
5790 }
5791 }
5792
5793 /* For an anchored pattern, we use the "required byte" only if it follows a
5794 variable length item in the regex. Remove the caseless flag for non-caseable
5795 bytes. */
5796
5797 if (reqbyte >= 0 &&
5798 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5799 {
5800 int ch = reqbyte & 255;
5801 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5802 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5803 re->options |= PCRE_REQCHSET;
5804 }
5805
5806 /* Print out the compiled data if debugging is enabled. This is never the
5807 case when building a production library. */
5808
5809 #ifdef DEBUG
5810
5811 printf("Length = %d top_bracket = %d top_backref = %d\n",
5812 length, re->top_bracket, re->top_backref);
5813
5814 if (re->options != 0)
5815 {
5816 printf("%s%s%s%s%s%s%s%s%s\n",
5817 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5818 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5819 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5820 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5821 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5822 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5823 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5824 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5825 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5826 }
5827
5828 if ((re->options & PCRE_FIRSTSET) != 0)
5829 {
5830 int ch = re->first_byte & 255;
5831 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5832 "" : " (caseless)";
5833 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5834 else printf("First char = \\x%02x%s\n", ch, caseless);
5835 }
5836
5837 if ((re->options & PCRE_REQCHSET) != 0)
5838 {
5839 int ch = re->req_byte & 255;
5840 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5841 "" : " (caseless)";
5842 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5843 else printf("Req char = \\x%02x%s\n", ch, caseless);
5844 }
5845
5846 pcre_printint(re, stdout, TRUE);
5847
5848 /* This check is done here in the debugging case so that the code that
5849 was compiled can be seen. */
5850
5851 if (code - codestart > length)
5852 {
5853 (pcre_free)(re);
5854 *errorptr = error_texts[ERR23];
5855 *erroroffset = ptr - (uschar *)pattern;
5856 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5857 return NULL;
5858 }
5859 #endif /* DEBUG */
5860
5861 return (pcre *)re;
5862 }
5863
5864 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12