/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 200 - (show annotations) (download)
Wed Aug 1 09:10:40 2007 UTC (7 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 183708 byte(s)
Correct errors in previous patch; tidy for test release.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of 8-bit units) and "b" is the number of the bit to set. Each argument
and the whole expansion are parenthesized so that an expression argument such
as SETBIT(map, c + 8) addresses the intended bit, and so that the macro can be
used safely inside a larger expression. Note that "b" is evaluated twice, so it
must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68
69 /*************************************************
70 * Code parameters and static tables *
71 *************************************************/
72
73 /* This value specifies the size of stack workspace that is used during the
74 first pre-compile phase that determines how much memory is required. The regex
75 is partly compiled into this space, but the compiled parts are discarded as
76 soon as they can be, so that hopefully there will never be an overrun. The code
77 does, however, check for an overrun. The largest amount I've seen used is 218,
78 so this number is very generous.
79
80 The same workspace is used during the second, actual compile phase for
81 remembering forward references to groups so that they can be filled in at the
82 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
83 is 4 there is plenty of room. */
84
85 #define COMPILE_WORK_SIZE (4096)
86
87
88 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
89 are simple data values; negative values are for special things like \d and so
90 on. Zero means further processing is needed (for things like \x), or the escape
91 is invalid. */
92
93 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
94 static const short int escapes[] = {
95 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
96 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
97 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
98 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
99 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
100 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
101 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
102 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
103 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
104 0, 0, -ESC_z /* x - z */
105 };
106
107 #else /* This is the "abnormal" table for EBCDIC systems */
108 static const short int escapes[] = {
109 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
110 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
111 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
112 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
113 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
114 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
115 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
116 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
117 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
118 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
119 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
120 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
121 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
122 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
124 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
125 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
126 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
127 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
128 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
129 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
130 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
131 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
132 };
133 #endif
134
135
136 /* Tables of names of POSIX character classes and their lengths. The list is
137 terminated by a zero length entry. The first three must be alpha, lower, upper,
138 as this is assumed for handling case independence. */
139
140 static const char *const posix_names[] = {
141 "alpha", "lower", "upper",
142 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
143 "print", "punct", "space", "word", "xdigit" };
144
145 static const uschar posix_name_lengths[] = {
146 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
147
148 /* Table of class bit maps for each POSIX class. Each class is formed from a
149 base map, with an optional addition or removal of another map. Then, for some
150 classes, there is some additional tweaking: for [:blank:] the vertical space
151 characters are removed, and for [:alpha:] and [:alnum:] the underscore
152 character is removed. The triples in the table consist of the base map offset,
153 second map offset or -1 if no second map, and a non-negative value for map
154 addition or a negative value for map subtraction (if there are two maps). The
155 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
156 remove vertical space characters, 2 => remove underscore. */
157
158 static const int posix_class_maps[] = {
159 cbit_word, cbit_digit, -2, /* alpha */
160 cbit_lower, -1, 0, /* lower */
161 cbit_upper, -1, 0, /* upper */
162 cbit_word, -1, 2, /* alnum - word without underscore */
163 cbit_print, cbit_cntrl, 0, /* ascii */
164 cbit_space, -1, 1, /* blank - a GNU extension */
165 cbit_cntrl, -1, 0, /* cntrl */
166 cbit_digit, -1, 0, /* digit */
167 cbit_graph, -1, 0, /* graph */
168 cbit_print, -1, 0, /* print */
169 cbit_punct, -1, 0, /* punct */
170 cbit_space, -1, 0, /* space */
171 cbit_word, -1, 0, /* word - a Perl extension */
172 cbit_xdigit,-1, 0 /* xdigit */
173 };
174
175
/* Two-level stringification: XSTRING(x) expands the macro x first and then
turns the result into a string literal, so that numeric limits can be embedded
in the message texts below. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used. The index of each message is its error number; the
numbered comments mark every fifth entry. The table itself is never modified,
so both the strings and the array of pointers are declared const. */

static const char * const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
};
256
257
258 /* Table to identify digits and hex digits. This is used when compiling
259 patterns. Note that the tables in chartables are dependent on the locale, and
260 may mark arbitrary characters as digits - but the PCRE compiling code expects
261 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
262 a private table here. It costs 256 bytes, but it is a lot faster than doing
263 character value tests (at least in some simple cases I timed), and in some
264 applications one wants PCRE to compile efficiently as well as match
265 efficiently.
266
267 For convenience, we use the same bit definitions as in chartables:
268
269 0x04 decimal digit
270 0x08 hexadecimal digit
271
272 Then we can use ctype_digit and ctype_xdigit in the code. */
273
274 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
275 static const unsigned char digitab[] =
276 {
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
283 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
284 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
285 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
289 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
302 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
309
310 #else /* This is the "abnormal" case, for EBCDIC systems */
311 static const unsigned char digitab[] =
312 {
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
315 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
316 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
317 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
321 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
329 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
337 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
343 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
344 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
345
346 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
347 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
348 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
349 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
351 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
355 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
356 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
358 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
360 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
361 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
362 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
363 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
364 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
365 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
366 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
367 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
368 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
369 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
370 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
371 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
372 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
373 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
374 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
375 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
376 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
377 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
378 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
379 #endif
380
381
382 /* Definition to allow mutual recursion */
383
384 static BOOL
385 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
386 int *, int *, branch_chain *, compile_data *, int *);
387
388
389
390 /*************************************************
391 * Handle escapes *
392 *************************************************/
393
394 /* This function is called when a \ has been encountered. It either returns a
395 positive value for a simple escape such as \n, or a negative value which
396 encodes one of the more complicated things such as \d. A backreference to group
397 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
398 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
399 ptr is pointing at the \. On exit, it is on the final character of the escape
400 sequence.
401
402 Arguments:
403 ptrptr points to the pattern position pointer
404 errorcodeptr points to the errorcode variable
405 bracount number of previous extracting brackets
406 options the options bits
407 isclass TRUE if inside a character class
408
409 Returns: zero or positive => a data character
410 negative => a special escape sequence
411 on error, errorptr is set
412 */
413
414 static int
415 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
416 int options, BOOL isclass)
417 {
418 BOOL utf8 = (options & PCRE_UTF8) != 0;
419 const uschar *ptr = *ptrptr + 1;
420 int c, i;
421
422 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
423 ptr--; /* Set pointer back to the last byte */
424
425 /* If backslash is at the end of the pattern, it's an error. */
426
427 if (c == 0) *errorcodeptr = ERR1;
428
429 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
430 a table. A non-zero result is something that can be returned immediately.
431 Otherwise further processing may be required. */
432
433 #ifndef EBCDIC /* ASCII coding */
434 else if (c < '0' || c > 'z') {} /* Not alphameric */
435 else if ((i = escapes[c - '0']) != 0) c = i;
436
437 #else /* EBCDIC coding */
438 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
439 else if ((i = escapes[c - 0x48]) != 0) c = i;
440 #endif
441
442 /* Escapes that need further processing, or are illegal. */
443
444 else
445 {
446 const uschar *oldptr;
447 BOOL braced, negated;
448
449 switch (c)
450 {
451 /* A number of Perl escapes are not handled by PCRE. We give an explicit
452 error. */
453
454 case 'l':
455 case 'L':
456 case 'N':
457 case 'u':
458 case 'U':
459 *errorcodeptr = ERR37;
460 break;
461
462 /* \g must be followed by a number, either plain or braced. If positive, it
463 is an absolute backreference. If negative, it is a relative backreference.
464 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
465 reference to a named group. This is part of Perl's movement towards a
466 unified syntax for back references. As this is synonymous with \k{name}, we
467 fudge it up by pretending it really was \k. */
468
469 case 'g':
470 if (ptr[1] == '{')
471 {
472 const uschar *p;
473 for (p = ptr+2; *p != 0 && *p != '}'; p++)
474 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
475 if (*p != 0 && *p != '}')
476 {
477 c = -ESC_k;
478 break;
479 }
480 braced = TRUE;
481 ptr++;
482 }
483 else braced = FALSE;
484
485 if (ptr[1] == '-')
486 {
487 negated = TRUE;
488 ptr++;
489 }
490 else negated = FALSE;
491
492 c = 0;
493 while ((digitab[ptr[1]] & ctype_digit) != 0)
494 c = c * 10 + *(++ptr) - '0';
495
496 if (c == 0 || (braced && *(++ptr) != '}'))
497 {
498 *errorcodeptr = ERR57;
499 return 0;
500 }
501
502 if (negated)
503 {
504 if (c > bracount)
505 {
506 *errorcodeptr = ERR15;
507 return 0;
508 }
509 c = bracount - (c - 1);
510 }
511
512 c = -(ESC_REF + c);
513 break;
514
515 /* The handling of escape sequences consisting of a string of digits
516 starting with one that is not zero is not straightforward. By experiment,
517 the way Perl works seems to be as follows:
518
519 Outside a character class, the digits are read as a decimal number. If the
520 number is less than 10, or if there are that many previous extracting
521 left brackets, then it is a back reference. Otherwise, up to three octal
522 digits are read to form an escaped byte. Thus \123 is likely to be octal
523 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
524 value is greater than 377, the least significant 8 bits are taken. Inside a
525 character class, \ followed by a digit is always an octal number. */
526
527 case '1': case '2': case '3': case '4': case '5':
528 case '6': case '7': case '8': case '9':
529
530 if (!isclass)
531 {
532 oldptr = ptr;
533 c -= '0';
534 while ((digitab[ptr[1]] & ctype_digit) != 0)
535 c = c * 10 + *(++ptr) - '0';
536 if (c < 10 || c <= bracount)
537 {
538 c = -(ESC_REF + c);
539 break;
540 }
541 ptr = oldptr; /* Put the pointer back and fall through */
542 }
543
544 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
545 generates a binary zero byte and treats the digit as a following literal.
546 Thus we have to pull back the pointer by one. */
547
548 if ((c = *ptr) >= '8')
549 {
550 ptr--;
551 c = 0;
552 break;
553 }
554
555 /* \0 always starts an octal number, but we may drop through to here with a
556 larger first octal digit. The original code used just to take the least
557 significant 8 bits of octal numbers (I think this is what early Perls used
558 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
559 than 3 octal digits. */
560
561 case '0':
562 c -= '0';
563 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
564 c = c * 8 + *(++ptr) - '0';
565 if (!utf8 && c > 255) *errorcodeptr = ERR51;
566 break;
567
568 /* \x is complicated. \x{ddd} is a character number which can be greater
569 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
570 treated as a data character. */
571
572 case 'x':
573 if (ptr[1] == '{')
574 {
575 const uschar *pt = ptr + 2;
576 int count = 0;
577
578 c = 0;
579 while ((digitab[*pt] & ctype_xdigit) != 0)
580 {
581 register int cc = *pt++;
582 if (c == 0 && cc == '0') continue; /* Leading zeroes */
583 count++;
584
585 #ifndef EBCDIC /* ASCII coding */
586 if (cc >= 'a') cc -= 32; /* Convert to upper case */
587 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
588 #else /* EBCDIC coding */
589 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
590 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
591 #endif
592 }
593
594 if (*pt == '}')
595 {
596 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
597 ptr = pt;
598 break;
599 }
600
601 /* If the sequence of hex digits does not end with '}', then we don't
602 recognize this construct; fall through to the normal \x handling. */
603 }
604
605 /* Read just a single-byte hex-defined char */
606
607 c = 0;
608 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
609 {
610 int cc; /* Some compilers don't like ++ */
611 cc = *(++ptr); /* in initializers */
612 #ifndef EBCDIC /* ASCII coding */
613 if (cc >= 'a') cc -= 32; /* Convert to upper case */
614 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
615 #else /* EBCDIC coding */
616 if (cc <= 'z') cc += 64; /* Convert to upper case */
617 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
618 #endif
619 }
620 break;
621
622 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
623 This coding is ASCII-specific, but then the whole concept of \cx is
624 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
625
626 case 'c':
627 c = *(++ptr);
628 if (c == 0)
629 {
630 *errorcodeptr = ERR2;
631 return 0;
632 }
633
634 #ifndef EBCDIC /* ASCII coding */
635 if (c >= 'a' && c <= 'z') c -= 32;
636 c ^= 0x40;
637 #else /* EBCDIC coding */
638 if (c >= 'a' && c <= 'z') c += 64;
639 c ^= 0xC0;
640 #endif
641 break;
642
643 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
644 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
645 for Perl compatibility, it is a literal. This code looks a bit odd, but
646 there used to be some cases other than the default, and there may be again
647 in future, so I haven't "optimized" it. */
648
649 default:
650 if ((options & PCRE_EXTRA) != 0) switch(c)
651 {
652 default:
653 *errorcodeptr = ERR3;
654 break;
655 }
656 break;
657 }
658 }
659
660 *ptrptr = ptr;
661 return c;
662 }
663
664
665
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence (or, on error, at the character where scanning stopped).

The property name is either a single character, or a name in braces that may
be preceded by ^ for negation. A braced name must fit in a 32-byte buffer
including its terminating zero, so names of more than 31 characters are
rejected as malformed.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is always set, even when an error is returned
  dptr           points to an int that is set to the detailed property value
                   (set only when a property is recognized)
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching _pcre_utt entry, or -1 for an
                   invalid type, in which case *errorcodeptr is set to ERR46
                   (malformed sequence) or ERR47 (unknown property name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Give the caller a defined value on every path, including the early error
exit below. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy the name, leaving room for the terminating zero. If the loop runs
  to completion without seeing '}', the name is too long. */

  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = (char)c;
    }
  if (c != '}') goto ERROR_RETURN;
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = (char)c;
  name[1] = 0;
  }

/* The escape sequence is syntactically complete; ptr will not move again. */

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
755
756
757
758
759 /*************************************************
760 * Check for counted repeat *
761 *************************************************/
762
763 /* This function is called when a '{' is encountered in a place where it might
764 start a quantifier. It looks ahead to see if it really is a quantifier or not.
765 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
766 where the ddds are digits.
767
768 Arguments:
769 p pointer to the first char after '{'
770
771 Returns: TRUE or FALSE
772 */
773
774 static BOOL
775 is_counted_repeat(const uschar *p)
776 {
777 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
778 while ((digitab[*p] & ctype_digit) != 0) p++;
779 if (*p == '}') return TRUE;
780
781 if (*p++ != ',') return FALSE;
782 if (*p == '}') return TRUE;
783
784 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
785 while ((digitab[*p] & ctype_digit) != 0) p++;
786
787 return (*p == '}');
788 }
789
790
791
792 /*************************************************
793 * Read repeat counts *
794 *************************************************/
795
796 /* Read an item of the form {n,m} and return the values. This is called only
797 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
798 so the syntax is guaranteed to be correct, but we need to check the values.
799
800 Arguments:
801 p pointer to first char after '{'
802 minp pointer to int for min
803 maxp pointer to int for max
804 returned as -1 if no max
805 errorcodeptr points to error code variable
806
807 Returns: pointer to '}' on success;
808 current ptr on error, with errorcodeptr set non-zero
809 */
810
811 static const uschar *
812 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
813 {
814 int min = 0;
815 int max = -1;
816
817 /* Read the minimum value and do a paranoid check: a negative value indicates
818 an integer overflow. */
819
820 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
821 if (min < 0 || min > 65535)
822 {
823 *errorcodeptr = ERR5;
824 return p;
825 }
826
827 /* Read the maximum value if there is one, and again do a paranoid on its size.
828 Also, max must not be less than min. */
829
830 if (*p == '}') max = min; else
831 {
832 if (*(++p) != '}')
833 {
834 max = 0;
835 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
836 if (max < 0 || max > 65535)
837 {
838 *errorcodeptr = ERR5;
839 return p;
840 }
841 if (max < min)
842 {
843 *errorcodeptr = ERR4;
844 return p;
845 }
846 }
847 }
848
849 /* Fill in the required variables, and pass back the pointer to the terminating
850 '}'. */
851
852 *minp = min;
853 *maxp = max;
854 return p;
855 }
856
857
858
859 /*************************************************
860 * Find forward referenced subpattern *
861 *************************************************/
862
863 /* This function scans along a pattern's text looking for capturing
864 subpatterns, and counting them. If it finds a named pattern that matches the
865 name it is given, it returns its number. Alternatively, if the name is NULL, it
866 returns when it reaches a given numbered subpattern. This is used for forward
867 references to subpatterns. We know that if (?P< is encountered, the name will
868 be terminated by '>' because that is checked in the first pass.
869
870 Arguments:
871 ptr current position in the pattern
872 count current count of capturing parens so far encountered
873 name name to seek, or NULL if seeking a numbered subpattern
874 lorn name length, or subpattern number if name is NULL
875 xmode TRUE if we are in /x mode
876
877 Returns: the number of the named subpattern, or -1 if not found
878 */
879
880 static int
881 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
882 BOOL xmode)
883 {
884 const uschar *thisname;
885
886 for (; *ptr != 0; ptr++)
887 {
888 int term;
889
890 /* Skip over backslashed characters and also entire \Q...\E */
891
892 if (*ptr == '\\')
893 {
894 if (*(++ptr) == 0) return -1;
895 if (*ptr == 'Q') for (;;)
896 {
897 while (*(++ptr) != 0 && *ptr != '\\');
898 if (*ptr == 0) return -1;
899 if (*(++ptr) == 'E') break;
900 }
901 continue;
902 }
903
904 /* Skip over character classes */
905
906 if (*ptr == '[')
907 {
908 while (*(++ptr) != ']')
909 {
910 if (*ptr == '\\')
911 {
912 if (*(++ptr) == 0) return -1;
913 if (*ptr == 'Q') for (;;)
914 {
915 while (*(++ptr) != 0 && *ptr != '\\');
916 if (*ptr == 0) return -1;
917 if (*(++ptr) == 'E') break;
918 }
919 continue;
920 }
921 }
922 continue;
923 }
924
925 /* Skip comments in /x mode */
926
927 if (xmode && *ptr == '#')
928 {
929 while (*(++ptr) != 0 && *ptr != '\n');
930 if (*ptr == 0) return -1;
931 continue;
932 }
933
934 /* An opening parens must now be a real metacharacter */
935
936 if (*ptr != '(') continue;
937 if (ptr[1] != '?')
938 {
939 count++;
940 if (name == NULL && count == lorn) return count;
941 continue;
942 }
943
944 ptr += 2;
945 if (*ptr == 'P') ptr++; /* Allow optional P */
946
947 /* We have to disambiguate (?<! and (?<= from (?<name> */
948
949 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
950 *ptr != '\'')
951 continue;
952
953 count++;
954
955 if (name == NULL && count == lorn) return count;
956 term = *ptr++;
957 if (term == '<') term = '>';
958 thisname = ptr;
959 while (*ptr != term) ptr++;
960 if (name != NULL && lorn == ptr - thisname &&
961 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
962 return count;
963 }
964
965 return -1;
966 }
967
968
969
970 /*************************************************
971 * Find first significant op code *
972 *************************************************/
973
974 /* This is called by several functions that scan a compiled expression looking
975 for a fixed first character, or an anchoring op code etc. It skips over things
976 that do not influence this. For some calls, a change of option is important.
977 For some calls, it makes sense to skip negative forward and all backward
978 assertions, and also the \b assertion; for others it does not.
979
980 Arguments:
981 code pointer to the start of the group
982 options pointer to external options
983 optbit the option bit whose changing is significant, or
984 zero if none are
985 skipassert TRUE if certain assertions are to be skipped
986
987 Returns: pointer to the first significant opcode
988 */
989
990 static const uschar*
991 first_significant_code(const uschar *code, int *options, int optbit,
992 BOOL skipassert)
993 {
994 for (;;)
995 {
996 switch ((int)*code)
997 {
998 case OP_OPT:
999 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1000 *options = (int)code[1];
1001 code += 2;
1002 break;
1003
1004 case OP_ASSERT_NOT:
1005 case OP_ASSERTBACK:
1006 case OP_ASSERTBACK_NOT:
1007 if (!skipassert) return code;
1008 do code += GET(code, 1); while (*code == OP_ALT);
1009 code += _pcre_OP_lengths[*code];
1010 break;
1011
1012 case OP_WORD_BOUNDARY:
1013 case OP_NOT_WORD_BOUNDARY:
1014 if (!skipassert) return code;
1015 /* Fall through */
1016
1017 case OP_CALLOUT:
1018 case OP_CREF:
1019 case OP_RREF:
1020 case OP_DEF:
1021 code += _pcre_OP_lengths[*code];
1022 break;
1023
1024 default:
1025 return code;
1026 }
1027 }
1028 /* Control never reaches here */
1029 }
1030
1031
1032
1033
1034 /*************************************************
1035 * Find the fixed length of a pattern *
1036 *************************************************/
1037
1038 /* Scan a pattern and compute the fixed length of subject that will match it,
1039 if the length is fixed. This is needed for dealing with backward assertions.
1040 In UTF8 mode, the result is in characters rather than bytes.
1041
1042 Arguments:
1043 code points to the start of the pattern (the bracket)
1044 options the compiling options
1045
1046 Returns: the fixed length, or -1 if there is no fixed length,
1047 or -2 if \C was encountered
1048 */
1049
1050 static int
1051 find_fixedlength(uschar *code, int options)
1052 {
1053 int length = -1;
1054
1055 register int branchlength = 0;
1056 register uschar *cc = code + 1 + LINK_SIZE;
1057
1058 /* Scan along the opcodes for this branch. If we get to the end of the
1059 branch, check the length against that of the other branches. */
1060
1061 for (;;)
1062 {
1063 int d;
1064 register int op = *cc;
1065
1066 switch (op)
1067 {
1068 case OP_CBRA:
1069 case OP_BRA:
1070 case OP_ONCE:
1071 case OP_COND:
1072 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1073 if (d < 0) return d;
1074 branchlength += d;
1075 do cc += GET(cc, 1); while (*cc == OP_ALT);
1076 cc += 1 + LINK_SIZE;
1077 break;
1078
1079 /* Reached end of a branch; if it's a ket it is the end of a nested
1080 call. If it's ALT it is an alternation in a nested call. If it is
1081 END it's the end of the outer call. All can be handled by the same code. */
1082
1083 case OP_ALT:
1084 case OP_KET:
1085 case OP_KETRMAX:
1086 case OP_KETRMIN:
1087 case OP_END:
1088 if (length < 0) length = branchlength;
1089 else if (length != branchlength) return -1;
1090 if (*cc != OP_ALT) return length;
1091 cc += 1 + LINK_SIZE;
1092 branchlength = 0;
1093 break;
1094
1095 /* Skip over assertive subpatterns */
1096
1097 case OP_ASSERT:
1098 case OP_ASSERT_NOT:
1099 case OP_ASSERTBACK:
1100 case OP_ASSERTBACK_NOT:
1101 do cc += GET(cc, 1); while (*cc == OP_ALT);
1102 /* Fall through */
1103
1104 /* Skip over things that don't match chars */
1105
1106 case OP_REVERSE:
1107 case OP_CREF:
1108 case OP_RREF:
1109 case OP_DEF:
1110 case OP_OPT:
1111 case OP_CALLOUT:
1112 case OP_SOD:
1113 case OP_SOM:
1114 case OP_EOD:
1115 case OP_EODN:
1116 case OP_CIRC:
1117 case OP_DOLL:
1118 case OP_NOT_WORD_BOUNDARY:
1119 case OP_WORD_BOUNDARY:
1120 cc += _pcre_OP_lengths[*cc];
1121 break;
1122
1123 /* Handle literal characters */
1124
1125 case OP_CHAR:
1126 case OP_CHARNC:
1127 case OP_NOT:
1128 branchlength++;
1129 cc += 2;
1130 #ifdef SUPPORT_UTF8
1131 if ((options & PCRE_UTF8) != 0)
1132 {
1133 while ((*cc & 0xc0) == 0x80) cc++;
1134 }
1135 #endif
1136 break;
1137
1138 /* Handle exact repetitions. The count is already in characters, but we
1139 need to skip over a multibyte character in UTF8 mode. */
1140
1141 case OP_EXACT:
1142 branchlength += GET2(cc,1);
1143 cc += 4;
1144 #ifdef SUPPORT_UTF8
1145 if ((options & PCRE_UTF8) != 0)
1146 {
1147 while((*cc & 0x80) == 0x80) cc++;
1148 }
1149 #endif
1150 break;
1151
1152 case OP_TYPEEXACT:
1153 branchlength += GET2(cc,1);
1154 cc += 4;
1155 break;
1156
1157 /* Handle single-char matchers */
1158
1159 case OP_PROP:
1160 case OP_NOTPROP:
1161 cc += 2;
1162 /* Fall through */
1163
1164 case OP_NOT_DIGIT:
1165 case OP_DIGIT:
1166 case OP_NOT_WHITESPACE:
1167 case OP_WHITESPACE:
1168 case OP_NOT_WORDCHAR:
1169 case OP_WORDCHAR:
1170 case OP_ANY:
1171 branchlength++;
1172 cc++;
1173 break;
1174
1175 /* The single-byte matcher isn't allowed */
1176
1177 case OP_ANYBYTE:
1178 return -2;
1179
1180 /* Check a class for variable quantification */
1181
1182 #ifdef SUPPORT_UTF8
1183 case OP_XCLASS:
1184 cc += GET(cc, 1) - 33;
1185 /* Fall through */
1186 #endif
1187
1188 case OP_CLASS:
1189 case OP_NCLASS:
1190 cc += 33;
1191
1192 switch (*cc)
1193 {
1194 case OP_CRSTAR:
1195 case OP_CRMINSTAR:
1196 case OP_CRQUERY:
1197 case OP_CRMINQUERY:
1198 return -1;
1199
1200 case OP_CRRANGE:
1201 case OP_CRMINRANGE:
1202 if (GET2(cc,1) != GET2(cc,3)) return -1;
1203 branchlength += GET2(cc,1);
1204 cc += 5;
1205 break;
1206
1207 default:
1208 branchlength++;
1209 }
1210 break;
1211
1212 /* Anything else is variable length */
1213
1214 default:
1215 return -1;
1216 }
1217 }
1218 /* Control never gets here */
1219 }
1220
1221
1222
1223
1224 /*************************************************
1225 * Scan compiled regex for numbered bracket *
1226 *************************************************/
1227
1228 /* This little function scans through a compiled pattern until it finds a
1229 capturing bracket with the given number.
1230
1231 Arguments:
1232 code points to start of expression
1233 utf8 TRUE in UTF-8 mode
1234 number the required bracket number
1235
1236 Returns: pointer to the opcode for the bracket, or NULL if not found
1237 */
1238
1239 static const uschar *
1240 find_bracket(const uschar *code, BOOL utf8, int number)
1241 {
1242 for (;;)
1243 {
1244 register int c = *code;
1245 if (c == OP_END) return NULL;
1246
1247 /* XCLASS is used for classes that cannot be represented just by a bit
1248 map. This includes negated single high-valued characters. The length in
1249 the table is zero; the actual length is stored in the compiled code. */
1250
1251 if (c == OP_XCLASS) code += GET(code, 1);
1252
1253 /* Handle capturing bracket */
1254
1255 else if (c == OP_CBRA)
1256 {
1257 int n = GET2(code, 1+LINK_SIZE);
1258 if (n == number) return (uschar *)code;
1259 code += _pcre_OP_lengths[c];
1260 }
1261
1262 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1263 a multi-byte character. The length in the table is a minimum, so we have to
1264 arrange to skip the extra bytes. */
1265
1266 else
1267 {
1268 code += _pcre_OP_lengths[c];
1269 #ifdef SUPPORT_UTF8
1270 if (utf8) switch(c)
1271 {
1272 case OP_CHAR:
1273 case OP_CHARNC:
1274 case OP_EXACT:
1275 case OP_UPTO:
1276 case OP_MINUPTO:
1277 case OP_POSUPTO:
1278 case OP_STAR:
1279 case OP_MINSTAR:
1280 case OP_POSSTAR:
1281 case OP_PLUS:
1282 case OP_MINPLUS:
1283 case OP_POSPLUS:
1284 case OP_QUERY:
1285 case OP_MINQUERY:
1286 case OP_POSQUERY:
1287 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1288 break;
1289 }
1290 #endif
1291 }
1292 }
1293 }
1294
1295
1296
1297 /*************************************************
1298 * Scan compiled regex for recursion reference *
1299 *************************************************/
1300
1301 /* This little function scans through a compiled pattern until it finds an
1302 instance of OP_RECURSE.
1303
1304 Arguments:
1305 code points to start of expression
1306 utf8 TRUE in UTF-8 mode
1307
1308 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1309 */
1310
1311 static const uschar *
1312 find_recurse(const uschar *code, BOOL utf8)
1313 {
1314 for (;;)
1315 {
1316 register int c = *code;
1317 if (c == OP_END) return NULL;
1318 if (c == OP_RECURSE) return code;
1319
1320 /* XCLASS is used for classes that cannot be represented just by a bit
1321 map. This includes negated single high-valued characters. The length in
1322 the table is zero; the actual length is stored in the compiled code. */
1323
1324 if (c == OP_XCLASS) code += GET(code, 1);
1325
1326 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1327 that are followed by a character may be followed by a multi-byte character.
1328 The length in the table is a minimum, so we have to arrange to skip the extra
1329 bytes. */
1330
1331 else
1332 {
1333 code += _pcre_OP_lengths[c];
1334 #ifdef SUPPORT_UTF8
1335 if (utf8) switch(c)
1336 {
1337 case OP_CHAR:
1338 case OP_CHARNC:
1339 case OP_EXACT:
1340 case OP_UPTO:
1341 case OP_MINUPTO:
1342 case OP_POSUPTO:
1343 case OP_STAR:
1344 case OP_MINSTAR:
1345 case OP_POSSTAR:
1346 case OP_PLUS:
1347 case OP_MINPLUS:
1348 case OP_POSPLUS:
1349 case OP_QUERY:
1350 case OP_MINQUERY:
1351 case OP_POSQUERY:
1352 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1353 break;
1354 }
1355 #endif
1356 }
1357 }
1358 }
1359
1360
1361
1362 /*************************************************
1363 * Scan compiled branch for non-emptiness *
1364 *************************************************/
1365
1366 /* This function scans through a branch of a compiled pattern to see whether it
1367 can match the empty string or not. It is called from could_be_empty()
1368 below and from compile_branch() when checking for an unlimited repeat of a
1369 group that can match nothing. Note that first_significant_code() skips over
1370 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1371 struck an inner bracket whose current branch will already have been scanned.
1372
1373 Arguments:
1374 code points to start of search
1375 endcode points to where to stop
1376 utf8 TRUE if in UTF8 mode
1377
1378 Returns: TRUE if what is matched could be empty
1379 */
1380
1381 static BOOL
1382 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1383 {
1384 register int c;
1385 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1386 code < endcode;
1387 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1388 {
1389 const uschar *ccode;
1390
1391 c = *code;
1392
1393 /* Groups with zero repeats can of course be empty; skip them. */
1394
1395 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1396 {
1397 code += _pcre_OP_lengths[c];
1398 do code += GET(code, 1); while (*code == OP_ALT);
1399 c = *code;
1400 continue;
1401 }
1402
1403 /* For other groups, scan the branches. */
1404
1405 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1406 {
1407 BOOL empty_branch;
1408 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1409
1410 /* Scan a closed bracket */
1411
1412 empty_branch = FALSE;
1413 do
1414 {
1415 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1416 empty_branch = TRUE;
1417 code += GET(code, 1);
1418 }
1419 while (*code == OP_ALT);
1420 if (!empty_branch) return FALSE; /* All branches are non-empty */
1421 c = *code;
1422 continue;
1423 }
1424
1425 /* Handle the other opcodes */
1426
1427 switch (c)
1428 {
1429 /* Check for quantifiers after a class */
1430
1431 #ifdef SUPPORT_UTF8
1432 case OP_XCLASS:
1433 ccode = code + GET(code, 1);
1434 goto CHECK_CLASS_REPEAT;
1435 #endif
1436
1437 case OP_CLASS:
1438 case OP_NCLASS:
1439 ccode = code + 33;
1440
1441 #ifdef SUPPORT_UTF8
1442 CHECK_CLASS_REPEAT:
1443 #endif
1444
1445 switch (*ccode)
1446 {
1447 case OP_CRSTAR: /* These could be empty; continue */
1448 case OP_CRMINSTAR:
1449 case OP_CRQUERY:
1450 case OP_CRMINQUERY:
1451 break;
1452
1453 default: /* Non-repeat => class must match */
1454 case OP_CRPLUS: /* These repeats aren't empty */
1455 case OP_CRMINPLUS:
1456 return FALSE;
1457
1458 case OP_CRRANGE:
1459 case OP_CRMINRANGE:
1460 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1461 break;
1462 }
1463 break;
1464
1465 /* Opcodes that must match a character */
1466
1467 case OP_PROP:
1468 case OP_NOTPROP:
1469 case OP_EXTUNI:
1470 case OP_NOT_DIGIT:
1471 case OP_DIGIT:
1472 case OP_NOT_WHITESPACE:
1473 case OP_WHITESPACE:
1474 case OP_NOT_WORDCHAR:
1475 case OP_WORDCHAR:
1476 case OP_ANY:
1477 case OP_ANYBYTE:
1478 case OP_CHAR:
1479 case OP_CHARNC:
1480 case OP_NOT:
1481 case OP_PLUS:
1482 case OP_MINPLUS:
1483 case OP_POSPLUS:
1484 case OP_EXACT:
1485 case OP_NOTPLUS:
1486 case OP_NOTMINPLUS:
1487 case OP_NOTPOSPLUS:
1488 case OP_NOTEXACT:
1489 case OP_TYPEPLUS:
1490 case OP_TYPEMINPLUS:
1491 case OP_TYPEPOSPLUS:
1492 case OP_TYPEEXACT:
1493 return FALSE;
1494
1495 /* End of branch */
1496
1497 case OP_KET:
1498 case OP_KETRMAX:
1499 case OP_KETRMIN:
1500 case OP_ALT:
1501 return TRUE;
1502
1503 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1504 MINUPTO, and POSUPTO may be followed by a multibyte character */
1505
1506 #ifdef SUPPORT_UTF8
1507 case OP_STAR:
1508 case OP_MINSTAR:
1509 case OP_POSSTAR:
1510 case OP_QUERY:
1511 case OP_MINQUERY:
1512 case OP_POSQUERY:
1513 case OP_UPTO:
1514 case OP_MINUPTO:
1515 case OP_POSUPTO:
1516 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1517 break;
1518 #endif
1519 }
1520 }
1521
1522 return TRUE;
1523 }
1524
1525
1526
1527 /*************************************************
1528 * Scan compiled regex for non-emptiness *
1529 *************************************************/
1530
1531 /* This function is called to check for left recursive calls. We want to check
1532 the current branch of the current pattern to see if it could match the empty
1533 string. If it could, we must look outwards for branches at other levels,
1534 stopping when we pass beyond the bracket which is the subject of the recursion.
1535
1536 Arguments:
1537 code points to start of the recursion
1538 endcode points to where to stop (current RECURSE item)
1539 bcptr points to the chain of current (unclosed) branch starts
1540 utf8 TRUE if in UTF-8 mode
1541
1542 Returns: TRUE if what is matched could be empty
1543 */
1544
1545 static BOOL
1546 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1547 BOOL utf8)
1548 {
1549 while (bcptr != NULL && bcptr->current >= code)
1550 {
1551 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1552 bcptr = bcptr->outer;
1553 }
1554 return TRUE;
1555 }
1556
1557
1558
1559 /*************************************************
1560 * Check for POSIX class syntax *
1561 *************************************************/
1562
1563 /* This function is called when the sequence "[:" or "[." or "[=" is
1564 encountered in a character class. It checks whether this is followed by an
1565 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1566 ".]" or "=]".
1567
1568 Argument:
1569 ptr pointer to the initial [
1570 endptr where to return the end pointer
1571 cd pointer to compile data
1572
1573 Returns: TRUE or FALSE
1574 */
1575
1576 static BOOL
1577 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1578 {
1579 int terminator; /* Don't combine these lines; the Solaris cc */
1580 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1581 if (*(++ptr) == '^') ptr++;
1582 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1583 if (*ptr == terminator && ptr[1] == ']')
1584 {
1585 *endptr = ptr;
1586 return TRUE;
1587 }
1588 return FALSE;
1589 }
1590
1591
1592
1593
1594 /*************************************************
1595 * Check POSIX class name *
1596 *************************************************/
1597
1598 /* This function is called to check the name given in a POSIX-style class entry
1599 such as [:alnum:].
1600
1601 Arguments:
1602 ptr points to the first letter
1603 len the length of the name
1604
1605 Returns: a value representing the name, or -1 if unknown
1606 */
1607
1608 static int
1609 check_posix_name(const uschar *ptr, int len)
1610 {
1611 register int yield = 0;
1612 while (posix_name_lengths[yield] != 0)
1613 {
1614 if (len == posix_name_lengths[yield] &&
1615 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1616 yield++;
1617 }
1618 return -1;
1619 }
1620
1621
1622 /*************************************************
1623 * Adjust OP_RECURSE items in repeated group *
1624 *************************************************/
1625
1626 /* OP_RECURSE items contain an offset from the start of the regex to the group
1627 that is referenced. This means that groups can be replicated for fixed
1628 repetition simply by copying (because the recursion is allowed to refer to
1629 earlier groups that are outside the current group). However, when a group is
1630 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1631 it, after it has been compiled. This means that any OP_RECURSE items within it
1632 that refer to the group itself or any contained groups have to have their
1633 offsets adjusted. That one of the jobs of this function. Before it is called,
1634 the partially compiled regex must be temporarily terminated with OP_END.
1635
1636 This function has been extended with the possibility of forward references for
1637 recursions and subroutine calls. It must also check the list of such references
1638 for the group we are dealing with. If it finds that one of the recursions in
1639 the current group is on this list, it adjusts the offset in the list, not the
1640 value in the reference (which is a group number).
1641
1642 Arguments:
1643 group points to the start of the group
1644 adjust the amount by which the group is to be moved
1645 utf8 TRUE in UTF-8 mode
1646 cd contains pointers to tables etc.
1647 save_hwm the hwm forward reference pointer at the start of the group
1648
1649 Returns: nothing
1650 */
1651
1652 static void
1653 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1654 uschar *save_hwm)
1655 {
1656 uschar *ptr = group;
1657 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1658 {
1659 int offset;
1660 uschar *hc;
1661
1662 /* See if this recursion is on the forward reference list. If so, adjust the
1663 reference. */
1664
1665 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1666 {
1667 offset = GET(hc, 0);
1668 if (cd->start_code + offset == ptr + 1)
1669 {
1670 PUT(hc, 0, offset + adjust);
1671 break;
1672 }
1673 }
1674
1675 /* Otherwise, adjust the recursion offset if it's after the start of this
1676 group. */
1677
1678 if (hc >= cd->hwm)
1679 {
1680 offset = GET(ptr, 1);
1681 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1682 }
1683
1684 ptr += 1 + LINK_SIZE;
1685 }
1686 }
1687
1688
1689
1690 /*************************************************
1691 * Insert an automatic callout point *
1692 *************************************************/
1693
1694 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1695 callout points before each pattern item.
1696
1697 Arguments:
1698 code current code pointer
1699 ptr current pattern pointer
1700 cd pointers to tables etc
1701
1702 Returns: new code pointer
1703 */
1704
1705 static uschar *
1706 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1707 {
1708 *code++ = OP_CALLOUT;
1709 *code++ = 255;
1710 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1711 PUT(code, LINK_SIZE, 0); /* Default length */
1712 return code + 2*LINK_SIZE;
1713 }
1714
1715
1716
1717 /*************************************************
1718 * Complete a callout item *
1719 *************************************************/
1720
1721 /* A callout item contains the length of the next item in the pattern, which
1722 we can't fill in till after we have reached the relevant point. This is used
1723 for both automatic and manual callouts.
1724
1725 Arguments:
1726 previous_callout points to previous callout item
1727 ptr current pattern pointer
1728 cd pointers to tables etc
1729
1730 Returns: nothing
1731 */
1732
1733 static void
1734 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1735 {
1736 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1737 PUT(previous_callout, 2 + LINK_SIZE, length);
1738 }
1739
1740
1741
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It moves up the range to the first character that has an
"other" case, and then on for as long as the other cases of successive
characters are themselves consecutive. That run of other-case values is passed
back, and the start address is updated so that the next call carries on from
where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int first_other;
unsigned int expected;

/* Find the first character in the range that has another case. */

for (;;)
  {
  if (c > d) return FALSE;
  first_other = _pcre_ucp_othercase(c);
  if (first_other != NOTACHAR) break;
  c++;
  }

/* Extend for as long as the other cases remain consecutive. */

expected = first_other + 1;
c++;
while (c <= d && _pcre_ucp_othercase(c) == expected)
  {
  c++;
  expected++;
  }

*ocptr = first_other;
*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1787
1788
1789
1790 /*************************************************
1791 * Check if auto-possessifying is possible *
1792 *************************************************/
1793
1794 /* This function is called for unlimited repeats of certain items, to see
1795 whether the next thing could possibly match the repeated item. If not, it makes
1796 sense to automatically possessify the repeated item.
1797
1798 Arguments:
1799 op_code the repeated op code
1800 this data for this item, depends on the opcode
1801 utf8 TRUE in UTF-8 mode
1802 utf8_char used for utf8 character bytes, NULL if not relevant
1803 ptr next character in pattern
1804 options options bits
1805 cd contains pointers to tables etc.
1806
1807 Returns: TRUE if possessifying is wanted
1808 */
1809
1810 static BOOL
1811 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1812 const uschar *ptr, int options, compile_data *cd)
1813 {
1814 int next;
1815
1816 /* Skip whitespace and comments in extended mode */
1817
1818 if ((options & PCRE_EXTENDED) != 0)
1819 {
1820 for (;;)
1821 {
1822 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1823 if (*ptr == '#')
1824 {
1825 while (*(++ptr) != 0)
1826 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1827 }
1828 else break;
1829 }
1830 }
1831
1832 /* If the next item is one that we can handle, get its value. A non-negative
1833 value is a character, a negative value is an escape value. */
1834
1835 if (*ptr == '\\')
1836 {
1837 int temperrorcode = 0;
1838 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1839 if (temperrorcode != 0) return FALSE;
1840 ptr++; /* Point after the escape sequence */
1841 }
1842
1843 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1844 {
1845 #ifdef SUPPORT_UTF8
1846 if (utf8) { GETCHARINC(next, ptr); } else
1847 #endif
1848 next = *ptr++;
1849 }
1850
1851 else return FALSE;
1852
1853 /* Skip whitespace and comments in extended mode */
1854
1855 if ((options & PCRE_EXTENDED) != 0)
1856 {
1857 for (;;)
1858 {
1859 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1860 if (*ptr == '#')
1861 {
1862 while (*(++ptr) != 0)
1863 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1864 }
1865 else break;
1866 }
1867 }
1868
1869 /* If the next thing is itself optional, we have to give up. */
1870
1871 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1872 return FALSE;
1873
1874 /* Now compare the next item with the previous opcode. If the previous is a
1875 positive single character match, "item" either contains the character or, if
1876 "item" is greater than 127 in utf8 mode, the character's bytes are in
1877 utf8_char. */
1878
1879
1880 /* Handle cases when the next item is a character. */
1881
1882 if (next >= 0) switch(op_code)
1883 {
1884 case OP_CHAR:
1885 #ifdef SUPPORT_UTF8
1886 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1887 #endif
1888 return item != next;
1889
1890 /* For CHARNC (caseless character) we must check the other case. If we have
1891 Unicode property support, we can use it to test the other case of
1892 high-valued characters. */
1893
1894 case OP_CHARNC:
1895 #ifdef SUPPORT_UTF8
1896 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1897 #endif
1898 if (item == next) return FALSE;
1899 #ifdef SUPPORT_UTF8
1900 if (utf8)
1901 {
1902 unsigned int othercase;
1903 if (next < 128) othercase = cd->fcc[next]; else
1904 #ifdef SUPPORT_UCP
1905 othercase = _pcre_ucp_othercase((unsigned int)next);
1906 #else
1907 othercase = NOTACHAR;
1908 #endif
1909 return (unsigned int)item != othercase;
1910 }
1911 else
1912 #endif /* SUPPORT_UTF8 */
1913 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1914
1915 /* For OP_NOT, "item" must be a single-byte character. */
1916
1917 case OP_NOT:
1918 if (next < 0) return FALSE; /* Not a character */
1919 if (item == next) return TRUE;
1920 if ((options & PCRE_CASELESS) == 0) return FALSE;
1921 #ifdef SUPPORT_UTF8
1922 if (utf8)
1923 {
1924 unsigned int othercase;
1925 if (next < 128) othercase = cd->fcc[next]; else
1926 #ifdef SUPPORT_UCP
1927 othercase = _pcre_ucp_othercase(next);
1928 #else
1929 othercase = NOTACHAR;
1930 #endif
1931 return (unsigned int)item == othercase;
1932 }
1933 else
1934 #endif /* SUPPORT_UTF8 */
1935 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1936
1937 case OP_DIGIT:
1938 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1939
1940 case OP_NOT_DIGIT:
1941 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1942
1943 case OP_WHITESPACE:
1944 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1945
1946 case OP_NOT_WHITESPACE:
1947 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1948
1949 case OP_WORDCHAR:
1950 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1951
1952 case OP_NOT_WORDCHAR:
1953 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1954
1955 case OP_HSPACE:
1956 case OP_NOT_HSPACE:
1957 switch(next)
1958 {
1959 case 0x09:
1960 case 0x20:
1961 case 0xa0:
1962 case 0x1680:
1963 case 0x180e:
1964 case 0x2000:
1965 case 0x2001:
1966 case 0x2002:
1967 case 0x2003:
1968 case 0x2004:
1969 case 0x2005:
1970 case 0x2006:
1971 case 0x2007:
1972 case 0x2008:
1973 case 0x2009:
1974 case 0x200A:
1975 case 0x202f:
1976 case 0x205f:
1977 case 0x3000:
1978 return op_code != OP_HSPACE;
1979 default:
1980 return op_code == OP_HSPACE;
1981 }
1982
1983 case OP_VSPACE:
1984 case OP_NOT_VSPACE:
1985 switch(next)
1986 {
1987 case 0x0a:
1988 case 0x0b:
1989 case 0x0c:
1990 case 0x0d:
1991 case 0x85:
1992 case 0x2028:
1993 case 0x2029:
1994 return op_code != OP_VSPACE;
1995 default:
1996 return op_code == OP_VSPACE;
1997 }
1998
1999 default:
2000 return FALSE;
2001 }
2002
2003
2004 /* Handle the case when the next item is \d, \s, etc. */
2005
2006 switch(op_code)
2007 {
2008 case OP_CHAR:
2009 case OP_CHARNC:
2010 #ifdef SUPPORT_UTF8
2011 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2012 #endif
2013 switch(-next)
2014 {
2015 case ESC_d:
2016 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2017
2018 case ESC_D:
2019 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2020
2021 case ESC_s:
2022 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2023
2024 case ESC_S:
2025 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2026
2027 case ESC_w:
2028 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2029
2030 case ESC_W:
2031 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2032
2033 case ESC_h:
2034 case ESC_H:
2035 switch(item)
2036 {
2037 case 0x09:
2038 case 0x20:
2039 case 0xa0:
2040 case 0x1680:
2041 case 0x180e:
2042 case 0x2000:
2043 case 0x2001:
2044 case 0x2002:
2045 case 0x2003:
2046 case 0x2004:
2047 case 0x2005:
2048 case 0x2006:
2049 case 0x2007:
2050 case 0x2008:
2051 case 0x2009:
2052 case 0x200A:
2053 case 0x202f:
2054 case 0x205f:
2055 case 0x3000:
2056 return -next != ESC_h;
2057 default:
2058 return -next == ESC_h;
2059 }
2060
2061 case ESC_v:
2062 case ESC_V:
2063 switch(item)
2064 {
2065 case 0x0a:
2066 case 0x0b:
2067 case 0x0c:
2068 case 0x0d:
2069 case 0x85:
2070 case 0x2028:
2071 case 0x2029:
2072 return -next != ESC_v;
2073 default:
2074 return -next == ESC_v;
2075 }
2076
2077 default:
2078 return FALSE;
2079 }
2080
2081 case OP_DIGIT:
2082 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2083 next == -ESC_h || next == -ESC_v;
2084
2085 case OP_NOT_DIGIT:
2086 return next == -ESC_d;
2087
2088 case OP_WHITESPACE:
2089 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2090
2091 case OP_NOT_WHITESPACE:
2092 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2093
2094 case OP_HSPACE:
2095 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2096
2097 case OP_NOT_HSPACE:
2098 return next == -ESC_h;
2099
2100 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2101 case OP_VSPACE:
2102 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2103
2104 case OP_NOT_VSPACE:
2105 return next == -ESC_v;
2106
2107 case OP_WORDCHAR:
2108 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2109
2110 case OP_NOT_WORDCHAR:
2111 return next == -ESC_w || next == -ESC_d;
2112
2113 default:
2114 return FALSE;
2115 }
2116
2117 /* Control does not reach here */
2118 }
2119
2120
2121
2122 /*************************************************
2123 * Compile one branch *
2124 *************************************************/
2125
2126 /* Scan the pattern, compiling it into a vector. If the options are
2127 changed during the branch, the pointer is used to change the external options
2128 bits. This function is used during the pre-compile phase when we are trying
2129 to find out the amount of memory needed, as well as during the real compile
2130 phase. The value of lengthptr distinguishes the two phases.
2131
2132 Arguments:
2133 optionsptr pointer to the option bits
2134 codeptr points to the pointer to the current code point
2135 ptrptr points to the current pattern pointer
2136 errorcodeptr points to error code variable
2137 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2138 reqbyteptr set to the last literal character required, else < 0
2139 bcptr points to current branch chain
2140 cd contains pointers to tables etc.
2141 lengthptr NULL during the real compile phase
2142 points to length accumulator during pre-compile phase
2143
2144 Returns: TRUE on success
2145 FALSE, with *errorcodeptr set non-zero on error
2146 */
2147
2148 static BOOL
2149 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2150 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2151 compile_data *cd, int *lengthptr)
2152 {
2153 int repeat_type, op_type;
2154 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2155 int bravalue = 0;
2156 int greedy_default, greedy_non_default;
2157 int firstbyte, reqbyte;
2158 int zeroreqbyte, zerofirstbyte;
2159 int req_caseopt, reqvary, tempreqvary;
2160 int options = *optionsptr;
2161 int after_manual_callout = 0;
2162 int length_prevgroup = 0;
2163 register int c;
2164 register uschar *code = *codeptr;
2165 uschar *last_code = code;
2166 uschar *orig_code = code;
2167 uschar *tempcode;
2168 BOOL inescq = FALSE;
2169 BOOL groupsetfirstbyte = FALSE;
2170 const uschar *ptr = *ptrptr;
2171 const uschar *tempptr;
2172 uschar *previous = NULL;
2173 uschar *previous_callout = NULL;
2174 uschar *save_hwm = NULL;
2175 uschar classbits[32];
2176
2177 #ifdef SUPPORT_UTF8
2178 BOOL class_utf8;
2179 BOOL utf8 = (options & PCRE_UTF8) != 0;
2180 uschar *class_utf8data;
2181 uschar utf8_char[6];
2182 #else
2183 BOOL utf8 = FALSE;
2184 uschar *utf8_char = NULL;
2185 #endif
2186
2187 #ifdef DEBUG
2188 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2189 #endif
2190
2191 /* Set up the default and non-default settings for greediness */
2192
2193 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2194 greedy_non_default = greedy_default ^ 1;
2195
2196 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2197 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2198 matches a non-fixed first char; reqbyte just remains unset if we never
2199 find one.
2200
2201 When we hit a repeat whose minimum is zero, we may have to adjust these values
2202 to take the zero repeat into account. This is implemented by setting them to
2203 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2204 item types that can be repeated set these backoff variables appropriately. */
2205
2206 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2207
2208 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2209 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2210 value > 255. It is added into the firstbyte or reqbyte variables to record the
2211 case status of the value. This is used only for ASCII characters. */
2212
2213 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2214
2215 /* Switch on next character until the end of the branch */
2216
2217 for (;; ptr++)
2218 {
2219 BOOL negate_class;
2220 BOOL possessive_quantifier;
2221 BOOL is_quantifier;
2222 BOOL is_recurse;
2223 BOOL reset_bracount;
2224 int class_charcount;
2225 int class_lastchar;
2226 int newoptions;
2227 int recno;
2228 int refsign;
2229 int skipbytes;
2230 int subreqbyte;
2231 int subfirstbyte;
2232 int terminator;
2233 int mclength;
2234 uschar mcbuffer[8];
2235
2236 /* Get next byte in the pattern */
2237
2238 c = *ptr;
2239
2240 /* If we are in the pre-compile phase, accumulate the length used for the
2241 previous cycle of this loop. */
2242
2243 if (lengthptr != NULL)
2244 {
2245 #ifdef DEBUG
2246 if (code > cd->hwm) cd->hwm = code; /* High water info */
2247 #endif
2248 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2249 {
2250 *errorcodeptr = ERR52;
2251 goto FAILED;
2252 }
2253
2254 /* There is at least one situation where code goes backwards: this is the
2255 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2256 the class is simply eliminated. However, it is created first, so we have to
2257 allow memory for it. Therefore, don't ever reduce the length at this point.
2258 */
2259
2260 if (code < last_code) code = last_code;
2261 *lengthptr += code - last_code;
2262 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2263
2264 /* If "previous" is set and it is not at the start of the work space, move
2265 it back to there, in order to avoid filling up the work space. Otherwise,
2266 if "previous" is NULL, reset the current code pointer to the start. */
2267
2268 if (previous != NULL)
2269 {
2270 if (previous > orig_code)
2271 {
2272 memmove(orig_code, previous, code - previous);
2273 code -= previous - orig_code;
2274 previous = orig_code;
2275 }
2276 }
2277 else code = orig_code;
2278
2279 /* Remember where this code item starts so we can pick up the length
2280 next time round. */
2281
2282 last_code = code;
2283 }
2284
2285 /* In the real compile phase, just check the workspace used by the forward
2286 reference list. */
2287
2288 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2289 {
2290 *errorcodeptr = ERR52;
2291 goto FAILED;
2292 }
2293
2294 /* If in \Q...\E, check for the end; if not, we have a literal */
2295
2296 if (inescq && c != 0)
2297 {
2298 if (c == '\\' && ptr[1] == 'E')
2299 {
2300 inescq = FALSE;
2301 ptr++;
2302 continue;
2303 }
2304 else
2305 {
2306 if (previous_callout != NULL)
2307 {
2308 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2309 complete_callout(previous_callout, ptr, cd);
2310 previous_callout = NULL;
2311 }
2312 if ((options & PCRE_AUTO_CALLOUT) != 0)
2313 {
2314 previous_callout = code;
2315 code = auto_callout(code, ptr, cd);
2316 }
2317 goto NORMAL_CHAR;
2318 }
2319 }
2320
2321 /* Fill in length of a previous callout, except when the next thing is
2322 a quantifier. */
2323
2324 is_quantifier = c == '*' || c == '+' || c == '?' ||
2325 (c == '{' && is_counted_repeat(ptr+1));
2326
2327 if (!is_quantifier && previous_callout != NULL &&
2328 after_manual_callout-- <= 0)
2329 {
2330 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2331 complete_callout(previous_callout, ptr, cd);
2332 previous_callout = NULL;
2333 }
2334
2335 /* In extended mode, skip white space and comments */
2336
2337 if ((options & PCRE_EXTENDED) != 0)
2338 {
2339 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2340 if (c == '#')
2341 {
2342 while (*(++ptr) != 0)
2343 {
2344 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2345 }
2346 if (*ptr != 0) continue;
2347
2348 /* Else fall through to handle end of string */
2349 c = 0;
2350 }
2351 }
2352
2353 /* No auto callout for quantifiers. */
2354
2355 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2356 {
2357 previous_callout = code;
2358 code = auto_callout(code, ptr, cd);
2359 }
2360
2361 switch(c)
2362 {
2363 /* ===================================================================*/
2364 case 0: /* The branch terminates at string end */
2365 case '|': /* or | or ) */
2366 case ')':
2367 *firstbyteptr = firstbyte;
2368 *reqbyteptr = reqbyte;
2369 *codeptr = code;
2370 *ptrptr = ptr;
2371 if (lengthptr != NULL)
2372 {
2373 *lengthptr += code - last_code; /* To include callout length */
2374 DPRINTF((">> end branch\n"));
2375 }
2376 return TRUE;
2377
2378
2379 /* ===================================================================*/
2380 /* Handle single-character metacharacters. In multiline mode, ^ disables
2381 the setting of any following char as a first character. */
2382
2383 case '^':
2384 if ((options & PCRE_MULTILINE) != 0)
2385 {
2386 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2387 }
2388 previous = NULL;
2389 *code++ = OP_CIRC;
2390 break;
2391
2392 case '$':
2393 previous = NULL;
2394 *code++ = OP_DOLL;
2395 break;
2396
2397 /* There can never be a first char if '.' is first, whatever happens about
2398 repeats. The value of reqbyte doesn't change either. */
2399
2400 case '.':
2401 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2402 zerofirstbyte = firstbyte;
2403 zeroreqbyte = reqbyte;
2404 previous = code;
2405 *code++ = OP_ANY;
2406 break;
2407
2408
2409 /* ===================================================================*/
2410 /* Character classes. If the included characters are all < 256, we build a
2411 32-byte bitmap of the permitted characters, except in the special case
2412 where there is only one such character. For negated classes, we build the
2413 map as usual, then invert it at the end. However, we use a different opcode
2414 so that data characters > 255 can be handled correctly.
2415
2416 If the class contains characters outside the 0-255 range, a different
2417 opcode is compiled. It may optionally have a bit map for characters < 256,
2418 but those above are explicitly listed afterwards. A flag byte tells
2419 whether the bitmap is present, and whether this is a negated class or not.
2420 */
2421
2422 case '[':
2423 previous = code;
2424
2425 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2426 they are encountered at the top level, so we'll do that too. */
2427
2428 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2429 check_posix_syntax(ptr, &tempptr, cd))
2430 {
2431 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2432 goto FAILED;
2433 }
2434
2435 /* If the first character is '^', set the negation flag and skip it. */
2436
2437 if ((c = *(++ptr)) == '^')
2438 {
2439 negate_class = TRUE;
2440 c = *(++ptr);
2441 }
2442 else
2443 {
2444 negate_class = FALSE;
2445 }
2446
2447 /* Keep a count of chars with values < 256 so that we can optimize the case
2448 of just a single character (as long as it's < 256). However, for higher
2449 valued UTF-8 characters, we don't yet do any optimization. */
2450
2451 class_charcount = 0;
2452 class_lastchar = -1;
2453
2454 /* Initialize the 32-char bit map to all zeros. We build the map in a
2455 temporary bit of memory, in case the class contains only 1 character (less
2456 than 256), because in that case the compiled code doesn't use the bit map.
2457 */
2458
2459 memset(classbits, 0, 32 * sizeof(uschar));
2460
2461 #ifdef SUPPORT_UTF8
2462 class_utf8 = FALSE; /* No chars >= 256 */
2463 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2464 #endif
2465
2466 /* Process characters until ] is reached. By writing this as a "do" it
2467 means that an initial ] is taken as a data character. At the start of the
2468 loop, c contains the first byte of the character. */
2469
2470 if (c != 0) do
2471 {
2472 const uschar *oldptr;
2473
2474 #ifdef SUPPORT_UTF8
2475 if (utf8 && c > 127)
2476 { /* Braces are required because the */
2477 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2478 }
2479 #endif
2480
2481 /* Inside \Q...\E everything is literal except \E */
2482
2483 if (inescq)
2484 {
2485 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2486 {
2487 inescq = FALSE; /* Reset literal state */
2488 ptr++; /* Skip the 'E' */
2489 continue; /* Carry on with next */
2490 }
2491 goto CHECK_RANGE; /* Could be range if \E follows */
2492 }
2493
2494 /* Handle POSIX class names. Perl allows a negation extension of the
2495 form [:^name:]. A square bracket that doesn't match the syntax is
2496 treated as a literal. We also recognize the POSIX constructions
2497 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2498 5.6 and 5.8 do. */
2499
2500 if (c == '[' &&
2501 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2502 check_posix_syntax(ptr, &tempptr, cd))
2503 {
2504 BOOL local_negate = FALSE;
2505 int posix_class, taboffset, tabopt;
2506 register const uschar *cbits = cd->cbits;
2507 uschar pbits[32];
2508
2509 if (ptr[1] != ':')
2510 {
2511 *errorcodeptr = ERR31;
2512 goto FAILED;
2513 }
2514
2515 ptr += 2;
2516 if (*ptr == '^')
2517 {
2518 local_negate = TRUE;
2519 ptr++;
2520 }
2521
2522 posix_class = check_posix_name(ptr, tempptr - ptr);
2523 if (posix_class < 0)
2524 {
2525 *errorcodeptr = ERR30;
2526 goto FAILED;
2527 }
2528
2529 /* If matching is caseless, upper and lower are converted to
2530 alpha. This relies on the fact that the class table starts with
2531 alpha, lower, upper as the first 3 entries. */
2532
2533 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2534 posix_class = 0;
2535
2536 /* We build the bit map for the POSIX class in a chunk of local store
2537 because we may be adding and subtracting from it, and we don't want to
2538 subtract bits that may be in the main map already. At the end we or the
2539 result into the bit map that is being built. */
2540
2541 posix_class *= 3;
2542
2543 /* Copy in the first table (always present) */
2544
2545 memcpy(pbits, cbits + posix_class_maps[posix_class],
2546 32 * sizeof(uschar));
2547
2548 /* If there is a second table, add or remove it as required. */
2549
2550 taboffset = posix_class_maps[posix_class + 1];
2551 tabopt = posix_class_maps[posix_class + 2];
2552
2553 if (taboffset >= 0)
2554 {
2555 if (tabopt >= 0)
2556 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2557 else
2558 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2559 }
2560
2561 /* Now see if we need to remove any special characters. An option
2562 value of 1 removes vertical space and 2 removes underscore. */
2563
2564 if (tabopt < 0) tabopt = -tabopt;
2565 if (tabopt == 1) pbits[1] &= ~0x3c;
2566 else if (tabopt == 2) pbits[11] &= 0x7f;
2567
2568 /* Add the POSIX table or its complement into the main table that is
2569 being built and we are done. */
2570
2571 if (local_negate)
2572 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2573 else
2574 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2575
2576 ptr = tempptr + 1;
2577 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2578 continue; /* End of POSIX syntax handling */
2579 }
2580
2581 /* Backslash may introduce a single character, or it may introduce one
2582 of the specials, which just set a flag. The sequence \b is a special
2583 case. Inside a class (and only there) it is treated as backspace.
2584 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2585 to or into the one we are building. We assume they have more than one
2586 character in them, so set class_charcount bigger than one. */
2587
2588 if (c == '\\')
2589 {
2590 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2591 if (*errorcodeptr != 0) goto FAILED;
2592
2593 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2594 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2595 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2596 else if (-c == ESC_Q) /* Handle start of quoted string */
2597 {
2598 if (ptr[1] == '\\' && ptr[2] == 'E')
2599 {
2600 ptr += 2; /* avoid empty string */
2601 }
2602 else inescq = TRUE;
2603 continue;
2604 }
2605
2606 if (c < 0)
2607 {
2608 register const uschar *cbits = cd->cbits;
2609 class_charcount += 2; /* Greater than 1 is what matters */
2610
2611 /* Save time by not doing this in the pre-compile phase. */
2612
2613 if (lengthptr == NULL) switch (-c)
2614 {
2615 case ESC_d:
2616 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2617 continue;
2618
2619 case ESC_D:
2620 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2621 continue;
2622
2623 case ESC_w:
2624 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2625 continue;
2626
2627 case ESC_W:
2628 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2629 continue;
2630
2631 case ESC_s:
2632 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2633 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2634 continue;
2635
2636 case ESC_S:
2637 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2638 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2639 continue;
2640
2641 case ESC_E: /* Perl ignores an orphan \E */
2642 continue;
2643
2644 default: /* Not recognized; fall through */
2645 break; /* Need "default" setting to stop compiler warning. */
2646 }
2647
2648 /* In the pre-compile phase, just do the recognition. */
2649
2650 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2651 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2652
2653 /* We need to deal with \H, \h, \V, and \v in both phases because
2654 they use extra memory. */
2655
2656 if (-c == ESC_h)
2657 {
2658 SETBIT(classbits, 0x09); /* HT */
2659 SETBIT(classbits, 0x20); /* SPACE */
2660 SETBIT(classbits, 0xa0); /* NBSP */
2661 #ifdef SUPPORT_UTF8
2662 if (utf8)
2663 {
2664 class_utf8 = TRUE;
2665 *class_utf8data++ = XCL_SINGLE;
2666 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2667 *class_utf8data++ = XCL_SINGLE;
2668 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2669 *class_utf8data++ = XCL_RANGE;
2670 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2671 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2672 *class_utf8data++ = XCL_SINGLE;
2673 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2674 *class_utf8data++ = XCL_SINGLE;
2675 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2676 *class_utf8data++ = XCL_SINGLE;
2677 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2678 }
2679 #endif
2680 continue;
2681 }
2682
2683 if (-c == ESC_H)
2684 {
2685 for (c = 0; c < 32; c++)
2686 {
2687 int x = 0xff;
2688 switch (c)
2689 {
2690 case 0x09/8: x ^= 1 << (0x09%8); break;
2691 case 0x20/8: x ^= 1 << (0x20%8); break;
2692 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2693 default: break;
2694 }
2695 classbits[c] |= x;
2696 }
2697
2698 #ifdef SUPPORT_UTF8
2699 if (utf8)
2700 {
2701 class_utf8 = TRUE;
2702 *class_utf8data++ = XCL_RANGE;
2703 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2704 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2705 *class_utf8data++ = XCL_RANGE;
2706 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2707 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2708 *class_utf8data++ = XCL_RANGE;
2709 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2710 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2711 *class_utf8data++ = XCL_RANGE;
2712 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2713 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2714 *class_utf8data++ = XCL_RANGE;
2715 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2716 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2717 *class_utf8data++ = XCL_RANGE;
2718 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2719 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2720 *class_utf8data++ = XCL_RANGE;
2721 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2722 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2723 }
2724 #endif
2725 continue;
2726 }
2727
2728 if (-c == ESC_v)
2729 {
2730 SETBIT(classbits, 0x0a); /* LF */
2731 SETBIT(classbits, 0x0b); /* VT */
2732 SETBIT(classbits, 0x0c); /* FF */
2733 SETBIT(classbits, 0x0d); /* CR */
2734 SETBIT(classbits, 0x85); /* NEL */
2735 #ifdef SUPPORT_UTF8
2736 if (utf8)
2737 {
2738 class_utf8 = TRUE;
2739 *class_utf8data++ = XCL_RANGE;
2740 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2741 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2742 }
2743 #endif
2744 continue;
2745 }
2746
2747 if (-c == ESC_V)
2748 {
2749 for (c = 0; c < 32; c++)
2750 {
2751 int x = 0xff;
2752 switch (c)
2753 {
2754 case 0x0a/8: x ^= 1 << (0x0a%8);
2755 x ^= 1 << (0x0b%8);
2756 x ^= 1 << (0x0c%8);
2757 x ^= 1 << (0x0d%8);
2758 break;
2759 case 0x85/8: x ^= 1 << (0x85%8); break;
2760 default: break;
2761 }
2762 classbits[c] |= x;
2763 }
2764
2765 #ifdef SUPPORT_UTF8
2766 if (utf8)
2767 {
2768 class_utf8 = TRUE;
2769 *class_utf8data++ = XCL_RANGE;
2770 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2771 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2772 *class_utf8data++ = XCL_RANGE;
2773 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2774 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2775 }
2776 #endif
2777 continue;
2778 }
2779
2780 /* We need to deal with \P and \p in both phases. */
2781
2782 #ifdef SUPPORT_UCP
2783 if (-c == ESC_p || -c == ESC_P)
2784 {
2785 BOOL negated;
2786 int pdata;
2787 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2788 if (ptype < 0) goto FAILED;
2789 class_utf8 = TRUE;
2790 *class_utf8data++ = ((-c == ESC_p) != negated)?
2791 XCL_PROP : XCL_NOTPROP;
2792 *class_utf8data++ = ptype;
2793 *class_utf8data++ = pdata;
2794 class_charcount -= 2; /* Not a < 256 character */
2795 continue;
2796 }
2797 #endif
2798 /* Unrecognized escapes are faulted if PCRE is running in its
2799 strict mode. By default, for compatibility with Perl, they are
2800 treated as literals. */
2801
2802 if ((options & PCRE_EXTRA) != 0)
2803 {
2804 *errorcodeptr = ERR7;
2805 goto FAILED;
2806 }
2807
2808 class_charcount -= 2; /* Undo the default count from above */
2809 c = *ptr; /* Get the final character and fall through */
2810 }
2811
2812 /* Fall through if we have a single character (c >= 0). This may be
2813 greater than 256 in UTF-8 mode. */
2814
2815 } /* End of backslash handling */
2816
2817 /* A single character may be followed by '-' to form a range. However,
2818 Perl does not permit ']' to be the end of the range. A '-' character
2819 at the end is treated as a literal. Perl ignores orphaned \E sequences
2820 entirely. The code for handling \Q and \E is messy. */
2821
2822 CHECK_RANGE:
2823 while (ptr[1] == '\\' && ptr[2] == 'E')
2824 {
2825 inescq = FALSE;
2826 ptr += 2;
2827 }
2828
2829 oldptr = ptr;
2830
2831 if (!inescq && ptr[1] == '-')
2832 {
2833 int d;
2834 ptr += 2;
2835 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2836
2837 /* If we hit \Q (not followed by \E) at this point, go into escaped
2838 mode. */
2839
2840 while (*ptr == '\\' && ptr[1] == 'Q')
2841 {
2842 ptr += 2;
2843 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2844 inescq = TRUE;
2845 break;
2846 }
2847
2848 if (*ptr == 0 || (!inescq && *ptr == ']'))
2849 {
2850 ptr = oldptr;
2851 goto LONE_SINGLE_CHARACTER;
2852 }
2853
2854 #ifdef SUPPORT_UTF8
2855 if (utf8)
2856 { /* Braces are required because the */
2857 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2858 }
2859 else
2860 #endif
2861 d = *ptr; /* Not UTF-8 mode */
2862
2863 /* The second part of a range can be a single-character escape, but
2864 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2865 in such circumstances. */
2866
2867 if (!inescq && d == '\\')
2868 {
2869 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2870 if (*errorcodeptr != 0) goto FAILED;
2871
2872 /* \b is backspace; \X is literal X; \R is literal R; any other
2873 special means the '-' was literal */
2874
2875 if (d < 0)
2876 {
2877 if (d == -ESC_b) d = '\b';
2878 else if (d == -ESC_X) d = 'X';
2879 else if (d == -ESC_R) d = 'R'; else
2880 {
2881 ptr = oldptr;
2882 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2883 }
2884 }
2885 }
2886
2887 /* Check that the two values are in the correct order. Optimize
2888 one-character ranges */
2889
2890 if (d < c)
2891 {
2892 *errorcodeptr = ERR8;
2893 goto FAILED;
2894 }
2895
2896 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2897
2898 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2899 matching, we have to use an XCLASS with extra data items. Caseless
2900 matching for characters > 127 is available only if UCP support is
2901 available. */
2902
2903 #ifdef SUPPORT_UTF8
2904 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2905 {
2906 class_utf8 = TRUE;
2907
2908 /* With UCP support, we can find the other case equivalents of
2909 the relevant characters. There may be several ranges. Optimize how
2910 they fit with the basic range. */
2911
2912 #ifdef SUPPORT_UCP
2913 if ((options & PCRE_CASELESS) != 0)
2914 {
2915 unsigned int occ, ocd;
2916 unsigned int cc = c;
2917 unsigned int origd = d;
2918 while (get_othercase_range(&cc, origd, &occ, &ocd))
2919 {
2920 if (occ >= (unsigned int)c &&
2921 ocd <= (unsigned int)d)
2922 continue; /* Skip embedded ranges */
2923
2924 if (occ < (unsigned int)c &&
2925 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2926 { /* if there is overlap, */
2927 c = occ; /* noting that if occ < c */
2928 continue; /* we can't have ocd > d */
2929 } /* because a subrange is */
2930 if (ocd > (unsigned int)d &&
2931 occ <= (unsigned int)d + 1) /* always shorter than */
2932 { /* the basic range. */
2933 d = ocd;
2934 continue;
2935 }
2936
2937 if (occ == ocd)
2938 {
2939 *class_utf8data++ = XCL_SINGLE;
2940 }
2941 else
2942 {
2943 *class_utf8data++ = XCL_RANGE;
2944 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2945 }
2946 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2947 }
2948 }
2949 #endif /* SUPPORT_UCP */
2950
2951 /* Now record the original range, possibly modified for UCP caseless
2952 overlapping ranges. */
2953
2954 *class_utf8data++ = XCL_RANGE;
2955 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2956 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2957
2958 /* With UCP support, we are done. Without UCP support, there is no
2959 caseless matching for UTF-8 characters > 127; we can use the bit map
2960 for the smaller ones. */
2961
2962 #ifdef SUPPORT_UCP
2963 continue; /* With next character in the class */
2964 #else
2965 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2966
2967 /* Adjust upper limit and fall through to set up the map */
2968
2969 d = 127;
2970
2971 #endif /* SUPPORT_UCP */
2972 }
2973 #endif /* SUPPORT_UTF8 */
2974
2975 /* We use the bit map for all cases when not in UTF-8 mode; else
2976 ranges that lie entirely within 0-127 when there is UCP support; else
2977 for partial ranges without UCP support. */
2978
2979 class_charcount += d - c + 1;
2980 class_lastchar = d;
2981
2982 /* We can save a bit of time by skipping this in the pre-compile. */
2983
2984 if (lengthptr == NULL) for (; c <= d; c++)
2985 {
2986 classbits[c/8] |= (1 << (c&7));
2987 if ((options & PCRE_CASELESS) != 0)
2988 {
2989 int uc = cd->fcc[c]; /* flip case */
2990 classbits[uc/8] |= (1 << (uc&7));
2991 }
2992 }
2993
2994 continue; /* Go get the next char in the class */
2995 }
2996
2997 /* Handle a lone single character - we can get here for a normal
2998 non-escape char, or after \ that introduces a single character or for an
2999 apparent range that isn't. */
3000
3001 LONE_SINGLE_CHARACTER:
3002
3003 /* Handle a character that cannot go in the bit map */
3004
3005 #ifdef SUPPORT_UTF8
3006 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3007 {
3008 class_utf8 = TRUE;
3009 *class_utf8data++ = XCL_SINGLE;
3010 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3011
3012 #ifdef SUPPORT_UCP
3013 if ((options & PCRE_CASELESS) != 0)
3014 {
3015 unsigned int othercase;
3016 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3017 {
3018 *class_utf8data++ = XCL_SINGLE;
3019 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3020 }
3021 }
3022 #endif /* SUPPORT_UCP */
3023
3024 }
3025 else
3026 #endif /* SUPPORT_UTF8 */
3027
3028 /* Handle a single-byte character */
3029 {
3030 classbits[c/8] |= (1 << (c&7));
3031 if ((options & PCRE_CASELESS) != 0)
3032 {
3033 c = cd->fcc[c]; /* flip case */
3034 classbits[c/8] |= (1 << (c&7));
3035 }
3036 class_charcount++;
3037 class_lastchar = c;
3038 }
3039 }
3040
3041 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3042
3043 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3044
3045 if (c == 0) /* Missing terminating ']' */
3046 {
3047 *errorcodeptr = ERR6;
3048 goto FAILED;
3049 }
3050
3051 /* If class_charcount is 1, we saw precisely one character whose value is
3052 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3053 can optimize the negative case only if there were no characters >= 128
3054 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3055 single-bytes only. This is an historical hangover. Maybe one day we can
3056 tidy these opcodes to handle multi-byte characters.
3057
3058 The optimization throws away the bit map. We turn the item into a
3059 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3060 that OP_NOT does not support multibyte characters. In the positive case, it
3061 can cause firstbyte to be set. Otherwise, there can be no first char if
3062 this item is first, whatever repeat count may follow. In the case of
3063 reqbyte, save the previous value for reinstating. */
3064
3065 #ifdef SUPPORT_UTF8
3066 if (class_charcount == 1 &&
3067 (!utf8 ||
3068 (!class_utf8 && (!negate_class || class_lastchar < 128))))
3069
3070 #else
3071 if (class_charcount == 1)
3072 #endif
3073 {
3074 zeroreqbyte = reqbyte;
3075
3076 /* The OP_NOT opcode works on one-byte characters only. */
3077
3078 if (negate_class)
3079 {
3080 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3081 zerofirstbyte = firstbyte;
3082 *code++ = OP_NOT;
3083 *code++ = class_lastchar;
3084 break;
3085 }
3086
3087 /* For a single, positive character, get the value into mcbuffer, and
3088 then we can handle this with the normal one-character code. */
3089
3090 #ifdef SUPPORT_UTF8
3091 if (utf8 && class_lastchar > 127)
3092 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3093 else
3094 #endif
3095 {
3096 mcbuffer[0] = class_lastchar;
3097 mclength = 1;
3098 }
3099 goto ONE_CHAR;
3100 } /* End of 1-char optimization */
3101
3102 /* The general case - not the one-char optimization. If this is the first
3103 thing in the branch, there can be no first char setting, whatever the
3104 repeat count. Any reqbyte setting must remain unchanged after any kind of
3105 repeat. */
3106
3107 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3108 zerofirstbyte = firstbyte;
3109 zeroreqbyte = reqbyte;
3110
3111 /* If there are characters with values > 255, we have to compile an
3112 extended class, with its own opcode. If there are no characters < 256,
3113 we can omit the bitmap in the actual compiled code. */
3114
3115 #ifdef SUPPORT_UTF8
3116 if (class_utf8)
3117 {
3118 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3119 *code++ = OP_XCLASS;
3120 code += LINK_SIZE;
3121 *code = negate_class? XCL_NOT : 0;
3122
3123 /* If the map is required, move up the extra data to make room for it;
3124 otherwise just move the code pointer to the end of the extra data. */
3125
3126 if (class_charcount > 0)
3127 {
3128 *code++ |= XCL_MAP;
3129 memmove(code + 32, code, class_utf8data - code);
3130 memcpy(code, classbits, 32);
3131 code = class_utf8data + 32;
3132 }
3133 else code = class_utf8data;
3134
3135 /* Now fill in the complete length of the item */
3136
3137 PUT(previous, 1, code - previous);
3138 break; /* End of class handling */
3139 }
3140 #endif
3141
3142 /* If there are no characters > 255, negate the 32-byte map if necessary,
3143 and copy it into the code vector. If this is the first thing in the branch,
3144 there can be no first char setting, whatever the repeat count. Any reqbyte
3145 setting must remain unchanged after any kind of repeat. */
3146
3147 if (negate_class)
3148 {
3149 *code++ = OP_NCLASS;
3150 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3151 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3152 }
3153 else
3154 {
3155 *code++ = OP_CLASS;
3156 memcpy(code, classbits, 32);
3157 }
3158 code += 32;
3159 break;
3160
3161
3162 /* ===================================================================*/
3163 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3164 has been tested above. */
3165
3166 case '{':
3167 if (!is_quantifier) goto NORMAL_CHAR;
3168 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3169 if (*errorcodeptr != 0) goto FAILED;
3170 goto REPEAT;
3171
3172 case '*':
3173 repeat_min = 0;
3174 repeat_max = -1;
3175 goto REPEAT;
3176
3177 case '+':
3178 repeat_min = 1;
3179 repeat_max = -1;
3180 goto REPEAT;
3181
3182 case '?':
3183 repeat_min = 0;
3184 repeat_max = 1;
3185
3186 REPEAT:
3187 if (previous == NULL)
3188 {
3189 *errorcodeptr = ERR9;
3190 goto FAILED;
3191 }
3192
3193 if (repeat_min == 0)
3194 {
3195 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3196 reqbyte = zeroreqbyte; /* Ditto */
3197 }
3198
3199 /* Remember whether this is a variable length repeat */
3200
3201 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3202
3203 op_type = 0; /* Default single-char op codes */
3204 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3205
3206 /* Save start of previous item, in case we have to move it up to make space
3207 for an inserted OP_ONCE for the additional '+' extension. */
3208
3209 tempcode = previous;
3210
3211 /* If the next character is '+', we have a possessive quantifier. This
3212 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3213 If the next character is '?' this is a minimizing repeat, by default,
3214 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3215 repeat type to the non-default. */
3216
3217 if (ptr[1] == '+')
3218 {
3219 repeat_type = 0; /* Force greedy */
3220 possessive_quantifier = TRUE;
3221 ptr++;
3222 }
3223 else if (ptr[1] == '?')
3224 {
3225 repeat_type = greedy_non_default;
3226 ptr++;
3227 }
3228 else repeat_type = greedy_default;
3229
3230 /* If previous was a character match, abolish the item and generate a
3231 repeat item instead. If a char item has a minimum of more than one, ensure
3232 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3233 the first thing in a branch because the x will have gone into firstbyte
3234 instead. */
3235
3236 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3237 {
3238 /* Deal with UTF-8 characters that take up more than one byte. It's
3239 easier to write this out separately than try to macrify it. Use c to
3240 hold the length of the character in bytes, plus 0x80 to flag that it's a
3241 length rather than a small character. */
3242
3243 #ifdef SUPPORT_UTF8
3244 if (utf8 && (code[-1] & 0x80) != 0)
3245 {
3246 uschar *lastchar = code - 1;
3247 while((*lastchar & 0xc0) == 0x80) lastchar--;
3248 c = code - lastchar; /* Length of UTF-8 character */
3249 memcpy(utf8_char, lastchar, c); /* Save the char */
3250 c |= 0x80; /* Flag c as a length */
3251 }
3252 else
3253 #endif
3254
3255 /* Handle the case of a single byte - either with no UTF8 support, or
3256 with UTF-8 disabled, or for a UTF-8 character < 128. */
3257
3258 {
3259 c = code[-1];
3260 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3261 }
3262
3263 /* If the repetition is unlimited, it pays to see if the next thing on
3264 the line is something that cannot possibly match this character. If so,
3265 automatically possessifying this item gains some performance in the case
3266 where the match fails. */
3267
3268 if (!possessive_quantifier &&
3269 repeat_max < 0 &&
3270 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3271 options, cd))
3272 {
3273 repeat_type = 0; /* Force greedy */
3274 possessive_quantifier = TRUE;
3275 }
3276
3277 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3278 }
3279
3280 /* If previous was a single negated character ([^a] or similar), we use
3281 one of the special opcodes, replacing it. The code is shared with single-
3282 character repeats by setting op_type to add a suitable offset into
3283 repeat_type. We can also test for auto-possessification. OP_NOT is
3284 currently used only for single-byte chars. */
3285
3286 else if (*previous == OP_NOT)
3287 {
3288 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3289 c = previous[1];
3290 if (!possessive_quantifier &&
3291 repeat_max < 0 &&
3292 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3293 {
3294 repeat_type = 0; /* Force greedy */
3295 possessive_quantifier = TRUE;
3296 }
3297 goto OUTPUT_SINGLE_REPEAT;
3298 }
3299
3300 /* If previous was a character type match (\d or similar), abolish it and
3301 create a suitable repeat item. The code is shared with single-character
3302 repeats by setting op_type to add a suitable offset into repeat_type. Note
3303 that the Unicode property types will be present only when SUPPORT_UCP is
3304 defined, but we don't wrap the little bits of code here because it just
3305 makes it horribly messy. */
3306
3307 else if (*previous < OP_EODN)
3308 {
3309 uschar *oldcode;
3310 int prop_type, prop_value;
3311 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3312 c = *previous;
3313
3314 if (!possessive_quantifier &&
3315 repeat_max < 0 &&
3316 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3317 {
3318 repeat_type = 0; /* Force greedy */
3319 possessive_quantifier = TRUE;
3320 }
3321
3322 OUTPUT_SINGLE_REPEAT:
3323 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3324 {
3325 prop_type = previous[1];
3326 prop_value = previous[2];
3327 }
3328 else prop_type = prop_value = -1;
3329
3330 oldcode = code;
3331 code = previous; /* Usually overwrite previous item */
3332
3333 /* If the maximum is zero then the minimum must also be zero; Perl allows
3334 this case, so we do too - by simply omitting the item altogether. */
3335
3336 if (repeat_max == 0) goto END_REPEAT;
3337
3338 /* All real repeats make it impossible to handle partial matching (maybe
3339 one day we will be able to remove this restriction). */
3340
3341 if (repeat_max != 1) cd->nopartial = TRUE;
3342
3343 /* Combine the op_type with the repeat_type */
3344
3345 repeat_type += op_type;
3346
3347 /* A minimum of zero is handled either as the special case * or ?, or as
3348 an UPTO, with the maximum given. */
3349
3350 if (repeat_min == 0)
3351 {
3352 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3353 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3354 else
3355 {
3356 *code++ = OP_UPTO + repeat_type;
3357 PUT2INC(code, 0, repeat_max);
3358 }
3359 }
3360
3361 /* A repeat minimum of 1 is optimized into some special cases. If the
3362 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3363 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3364 one less than the maximum. */
3365
3366 else if (repeat_min == 1)
3367 {
3368 if (repeat_max == -1)
3369 *code++ = OP_PLUS + repeat_type;
3370 else
3371 {
3372 code = oldcode; /* leave previous item in place */
3373 if (repeat_max == 1) goto END_REPEAT;
3374 *code++ = OP_UPTO + repeat_type;
3375 PUT2INC(code, 0, repeat_max - 1);
3376 }
3377 }
3378
3379 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3380 handled as an EXACT followed by an UPTO. */
3381
3382 else
3383 {
3384 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3385 PUT2INC(code, 0, repeat_min);
3386
3387 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3388 we have to insert the character for the previous code. For a repeated
3389 Unicode property match, there are two extra bytes that define the
3390 required property. In UTF-8 mode, long characters have their length in
3391 c, with the 0x80 bit as a flag. */
3392
3393 if (repeat_max < 0)
3394 {
3395 #ifdef SUPPORT_UTF8
3396 if (utf8 && c >= 128)
3397 {
3398 memcpy(code, utf8_char, c & 7);
3399 code += c & 7;
3400 }
3401 else
3402 #endif
3403 {
3404 *code++ = c;
3405 if (prop_type >= 0)
3406 {
3407 *code++ = prop_type;
3408 *code++ = prop_value;
3409 }
3410 }
3411 *code++ = OP_STAR + repeat_type;
3412 }
3413
3414 /* Else insert an UPTO if the max is greater than the min, again
3415 preceded by the character, for the previously inserted code. If the
3416 UPTO is just for 1 instance, we can use QUERY instead. */
3417
3418 else if (repeat_max != repeat_min)
3419 {
3420 #ifdef SUPPORT_UTF8
3421 if (utf8 && c >= 128)
3422 {
3423 memcpy(code, utf8_char, c & 7);
3424 code += c & 7;
3425 }
3426 else
3427 #endif
3428 *code++ = c;
3429 if (prop_type >= 0)
3430 {
3431 *code++ = prop_type;
3432 *code++ = prop_value;
3433 }
3434 repeat_max -= repeat_min;
3435
3436 if (repeat_max == 1)
3437 {
3438 *code++ = OP_QUERY + repeat_type;
3439 }
3440 else
3441 {
3442 *code++ = OP_UPTO + repeat_type;
3443 PUT2INC(code, 0, repeat_max);
3444 }
3445 }
3446 }
3447
3448 /* The character or character type itself comes last in all cases. */
3449
3450 #ifdef SUPPORT_UTF8
3451 if (utf8 && c >= 128)
3452 {
3453 memcpy(code, utf8_char, c & 7);
3454 code += c & 7;
3455 }
3456 else
3457 #endif
3458 *code++ = c;
3459
3460 /* For a repeated Unicode property match, there are two extra bytes that
3461 define the required property. */
3462
3463 #ifdef SUPPORT_UCP
3464 if (prop_type >= 0)
3465 {
3466 *code++ = prop_type;
3467 *code++ = prop_value;
3468 }
3469 #endif
3470 }
3471
3472 /* If previous was a character class or a back reference, we put the repeat
3473 stuff after it, but just skip the item if the repeat was {0,0}. */
3474
3475 else if (*previous == OP_CLASS ||
3476 *previous == OP_NCLASS ||
3477 #ifdef SUPPORT_UTF8
3478 *previous == OP_XCLASS ||
3479 #endif
3480 *previous == OP_REF)
3481 {
3482 if (repeat_max == 0)
3483 {
3484 code = previous;
3485 goto END_REPEAT;
3486 }
3487
3488 /* All real repeats make it impossible to handle partial matching (maybe
3489 one day we will be able to remove this restriction). */
3490
3491 if (repeat_max != 1) cd->nopartial = TRUE;
3492
3493 if (repeat_min == 0 && repeat_max == -1)
3494 *code++ = OP_CRSTAR + repeat_type;
3495 else if (repeat_min == 1 && repeat_max == -1)
3496 *code++ = OP_CRPLUS + repeat_type;
3497 else if (repeat_min == 0 && repeat_max == 1)
3498 *code++ = OP_CRQUERY + repeat_type;
3499 else
3500 {
3501 *code++ = OP_CRRANGE + repeat_type;
3502 PUT2INC(code, 0, repeat_min);
3503 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3504 PUT2INC(code, 0, repeat_max);
3505 }
3506 }
3507
3508 /* If previous was a bracket group, we may have to replicate it in certain
3509 cases. */
3510
3511 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3512 *previous == OP_ONCE || *previous == OP_COND)
3513 {
3514 register int i;
3515 int ketoffset = 0;
3516 int len = code - previous;
3517 uschar *bralink = NULL;
3518
3519 /* Repeating a DEFINE group is pointless */
3520
3521 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3522 {
3523 *errorcodeptr = ERR55;
3524 goto FAILED;
3525 }
3526
3527 /* This is a paranoid check to stop integer overflow later on */
3528
3529 if (len > MAX_DUPLENGTH)
3530 {
3531 *errorcodeptr = ERR50;
3532 goto FAILED;
3533 }
3534
3535 /* If the maximum repeat count is unlimited, find the end of the bracket
3536 by scanning through from the start, and compute the offset back to it
3537 from the current code pointer. There may be an OP_OPT setting following
3538 the final KET, so we can't find the end just by going back from the code
3539 pointer. */
3540
3541 if (repeat_max == -1)
3542 {
3543 register uschar *ket = previous;
3544 do ket += GET(ket, 1); while (*ket != OP_KET);
3545 ketoffset = code - ket;
3546 }
3547
3548 /* The case of a zero minimum is special because of the need to stick
3549 OP_BRAZERO in front of it, and because the group appears once in the
3550 data, whereas in other cases it appears the minimum number of times. For
3551 this reason, it is simplest to treat this case separately, as otherwise
3552 the code gets far too messy. There are several special subcases when the
3553 minimum is zero. */
3554
3555 if (repeat_min == 0)
3556 {
3557 /* If the maximum is also zero, we just omit the group from the output
3558 altogether. */
3559
3560 if (repeat_max == 0)
3561 {
3562 code = previous;
3563 goto END_REPEAT;
3564 }
3565
3566 /* If the maximum is 1 or unlimited, we just have to stick in the
3567 BRAZERO and do no more at this point. However, we do need to adjust
3568 any OP_RECURSE calls inside the group that refer to the group itself or
3569 any internal or forward referenced group, because the offset is from
3570 the start of the whole regex. Temporarily terminate the pattern while
3571 doing this. */
3572
3573 if (repeat_max <= 1)
3574 {
3575 *code = OP_END;
3576 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3577 memmove(previous+1, previous, len);
3578 code++;
3579 *previous++ = OP_BRAZERO + repeat_type;
3580 }
3581
3582 /* If the maximum is greater than 1 and limited, we have to replicate
3583 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3584 The first one has to be handled carefully because it's the original
3585 copy, which has to be moved up. The remainder can be handled by code
3586 that is common with the non-zero minimum case below. We have to
3587 adjust the value of repeat_max, since one less copy is required. Once
3588 again, we may have to adjust any OP_RECURSE calls inside the group. */
3589
3590 else
3591 {
3592 int offset;
3593 *code = OP_END;
3594 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3595 memmove(previous + 2 + LINK_SIZE, previous, len);
3596 code += 2 + LINK_SIZE;
3597 *previous++ = OP_BRAZERO + repeat_type;
3598 *previous++ = OP_BRA;
3599
3600 /* We chain together the bracket offset fields that have to be
3601 filled in later when the ends of the brackets are reached. */
3602
3603 offset = (bralink == NULL)? 0 : previous - bralink;
3604 bralink = previous;
3605 PUTINC(previous, 0, offset);
3606 }
3607
3608 repeat_max--;
3609 }
3610
3611 /* If the minimum is greater than zero, replicate the group as many
3612 times as necessary, and adjust the maximum to the number of subsequent
3613 copies that we need. If we set a first char from the group, and didn't
3614 set a required char, copy the latter from the former. If there are any
3615 forward reference subroutine calls in the group, there will be entries on
3616 the workspace list; replicate these with an appropriate increment. */
3617
3618 else
3619 {
3620 if (repeat_min > 1)
3621 {
3622 /* In the pre-compile phase, we don't actually do the replication. We
3623 just adjust the length as if we had. */
3624
3625 if (lengthptr != NULL)
3626 *lengthptr += (repeat_min - 1)*length_prevgroup;
3627
3628 /* This is compiling for real */
3629
3630 else
3631 {
3632 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3633 for (i = 1; i < repeat_min; i++)
3634 {
3635 uschar *hc;
3636 uschar *this_hwm = cd->hwm;
3637 memcpy(code, previous, len);
3638 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3639 {
3640 PUT(cd->hwm, 0, GET(hc, 0) + len);
3641 cd->hwm += LINK_SIZE;
3642 }
3643 save_hwm = this_hwm;
3644 code += len;
3645 }
3646 }
3647 }
3648
3649 if (repeat_max > 0) repeat_max -= repeat_min;
3650 }
3651
3652 /* This code is common to both the zero and non-zero minimum cases. If
3653 the maximum is limited, it replicates the group in a nested fashion,
3654 remembering the bracket starts on a stack. In the case of a zero minimum,
3655 the first one was set up above. In all cases the repeat_max now specifies
3656 the number of additional copies needed. Again, we must remember to
3657 replicate entries on the forward reference list. */
3658
3659 if (repeat_max >= 0)
3660 {
3661 /* In the pre-compile phase, we don't actually do the replication. We
3662 just adjust the length as if we had. For each repetition we must add 1
3663 to the length for BRAZERO and for all but the last repetition we must
3664 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3665
3666 if (lengthptr != NULL && repeat_max > 0)
3667 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3668 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3669
3670 /* This is compiling for real */
3671
3672 else for (i = repeat_max - 1; i >= 0; i--)
3673 {
3674 uschar *hc;
3675 uschar *this_hwm = cd->hwm;
3676
3677 *code++ = OP_BRAZERO + repeat_type;
3678
3679 /* All but the final copy start a new nesting, maintaining the
3680 chain of brackets outstanding. */
3681
3682 if (i != 0)
3683 {
3684 int offset;
3685 *code++ = OP_BRA;
3686 offset = (bralink == NULL)? 0 : code - bralink;
3687 bralink = code;
3688 PUTINC(code, 0, offset);
3689 }
3690
3691 memcpy(code, previous, len);
3692 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3693 {
3694 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3695 cd->hwm += LINK_SIZE;
3696 }
3697 save_hwm = this_hwm;
3698 code += len;
3699 }
3700
3701 /* Now chain through the pending brackets, and fill in their length
3702 fields (which are holding the chain links pro tem). */
3703
3704 while (bralink != NULL)
3705 {
3706 int oldlinkoffset;
3707 int offset = code - bralink + 1;
3708 uschar *bra = code - offset;
3709 oldlinkoffset = GET(bra, 1);
3710 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3711 *code++ = OP_KET;
3712 PUTINC(code, 0, offset);
3713 PUT(bra, 1, offset);
3714 }
3715 }
3716
3717 /* If the maximum is unlimited, set a repeater in the final copy. We
3718 can't just offset backwards from the current code point, because we
3719 don't know if there's been an options resetting after the ket. The
3720 correct offset was computed above.
3721
3722 Then, when we are doing the actual compile phase, check to see whether
3723 this group is a non-atomic one that could match an empty string. If so,
3724 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3725 that runtime checking can be done. [This check is also applied to
3726 atomic groups at runtime, but in a different way.] */
3727
3728 else
3729 {
3730 uschar *ketcode = code - ketoffset;
3731 uschar *bracode = ketcode - GET(ketcode, 1);
3732 *ketcode = OP_KETRMAX + repeat_type;
3733 if (lengthptr == NULL && *bracode != OP_ONCE)
3734 {
3735 uschar *scode = bracode;
3736 do
3737 {
3738 if (could_be_empty_branch(scode, ketcode, utf8))
3739 {
3740 *bracode += OP_SBRA - OP_BRA;
3741 break;
3742 }
3743 scode += GET(scode, 1);
3744 }
3745 while (*scode == OP_ALT);
3746 }
3747 }
3748 }
3749
3750 /* Else there's some kind of shambles */
3751
3752 else
3753 {
3754 *errorcodeptr = ERR11;
3755 goto FAILED;
3756 }
3757
3758 /* If the character following a repeat is '+', or if certain optimization
3759 tests above succeeded, possessive_quantifier is TRUE. For some of the
3760 simpler opcodes, there is a special alternative opcode for this. For
3761 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3762 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3763 but the special opcodes can optimize it a bit. The repeated item starts at
3764 tempcode, not at previous, which might be the first part of a string whose
3765 (former) last char we repeated.
3766
3767 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3768 an 'upto' may follow. We skip over an 'exact' item, and then test the
3769 length of what remains before proceeding. */
3770
3771 if (possessive_quantifier)
3772 {
3773 int len;
3774 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3775 *tempcode == OP_NOTEXACT)
3776 tempcode += _pcre_OP_lengths[*tempcode];
3777 len = code - tempcode;
3778 if (len > 0) switch (*tempcode)
3779 {
3780 case OP_STAR: *tempcode = OP_POSSTAR; break;
3781 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3782 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3783 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3784
3785 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3786 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3787 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3788 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3789
3790 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3791 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3792 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3793 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3794
3795 default:
3796 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3797 code += 1 + LINK_SIZE;
3798 len += 1 + LINK_SIZE;
3799 tempcode[0] = OP_ONCE;
3800 *code++ = OP_KET;
3801 PUTINC(code, 0, len);
3802 PUT(tempcode, 1, len);
3803 break;
3804 }
3805 }
3806
3807 /* In all cases we no longer have a previous item. We also set the
3808 "follows varying string" flag for subsequently encountered reqbytes if
3809 it isn't already set and we have just passed a varying length item. */
3810
3811 END_REPEAT:
3812 previous = NULL;
3813 cd->req_varyopt |= reqvary;
3814 break;
3815
3816
3817 /* ===================================================================*/
3818 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3819 lookbehind or option setting or condition or all the other extended
3820 parenthesis forms. First deal with the specials; all are introduced by ?,
3821 and the appearance of any of them means that this is not a capturing
3822 group. */
3823
3824 case '(':
3825 newoptions = options;
3826 skipbytes = 0;
3827 bravalue = OP_CBRA;
3828 save_hwm = cd->hwm;
3829 reset_bracount = FALSE;
3830
3831 if (*(++ptr) == '?')
3832 {
3833 int i, set, unset, namelen;
3834 int *optset;
3835 const uschar *name;
3836 uschar *slot;
3837
3838 switch (*(++ptr))
3839 {
3840 case '#': /* Comment; skip to ket */
3841 ptr++;
3842 while (*ptr != 0 && *ptr != ')') ptr++;
3843 if (*ptr == 0)
3844 {
3845 *errorcodeptr = ERR18;
3846 goto FAILED;
3847 }
3848 continue;
3849
3850
3851 /* ------------------------------------------------------------ */
3852 case '|': /* Reset capture count for each branch */
3853 reset_bracount = TRUE;
3854 /* Fall through */
3855
3856 /* ------------------------------------------------------------ */
3857 case ':': /* Non-capturing bracket */
3858 bravalue = OP_BRA;
3859 ptr++;
3860 break;
3861
3862
3863 /* ------------------------------------------------------------ */
3864 case '(':
3865 bravalue = OP_COND; /* Conditional group */
3866
3867 /* A condition can be an assertion, a number (referring to a numbered
3868 group), a name (referring to a named group), or 'R', referring to
3869 recursion. R<digits> and R&name are also permitted for recursion tests.
3870
3871 There are several syntaxes for testing a named group: (?(name)) is used
3872 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3873
3874 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3875 be the recursive thing or the name 'R' (and similarly for 'R' followed
3876 by digits), and (b) a number could be a name that consists of digits.
3877 In both cases, we look for a name first; if not found, we try the other
3878 cases. */
3879
3880 /* For conditions that are assertions, check the syntax, and then exit
3881 the switch. This will take control down to where bracketed groups,
3882 including assertions, are processed. */
3883
3884 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3885 break;
3886
3887 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3888 below), and all need to skip 3 bytes at the start of the group. */
3889
3890 code[1+LINK_SIZE] = OP_CREF;
3891 skipbytes = 3;
3892 refsign = -1;
3893
3894 /* Check for a test for recursion in a named group. */
3895
3896 if (ptr[1] == 'R' && ptr[2] == '&')
3897 {
3898 terminator = -1;
3899 ptr += 2;
3900 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3901 }
3902
3903 /* Check for a test for a named group's having been set, using the Perl
3904 syntax (?(<name>) or (?('name') */
3905
3906 else if (ptr[1] == '<')
3907 {
3908 terminator = '>';
3909 ptr++;
3910 }
3911 else if (ptr[1] == '\'')
3912 {
3913 terminator = '\'';
3914 ptr++;
3915 }
3916 else
3917 {
3918 terminator = 0;
3919 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3920 }
3921
3922 /* We now expect to read a name; anything else is an error */
3923
3924 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3925 {
3926 ptr += 1; /* To get the right offset */
3927 *errorcodeptr = ERR28;
3928 goto FAILED;
3929 }
3930
3931 /* Read the name, but also get it as a number if it's all digits */
3932
3933 recno = 0;
3934 name = ++ptr;
3935 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3936 {
3937 if (recno >= 0)
3938 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3939 recno * 10 + *ptr - '0' : -1;
3940 ptr++;
3941 }
3942 namelen = ptr - name;
3943
3944 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3945 {
3946 ptr--; /* Error offset */
3947 *errorcodeptr = ERR26;
3948 goto FAILED;
3949 }
3950
3951 /* Do no further checking in the pre-compile phase. */
3952
3953 if (lengthptr != NULL) break;
3954
3955 /* In the real compile we do the work of looking for the actual
3956 reference. If the string started with "+" or "-" we require the rest to
3957 be digits, in which case recno will be set. */
3958
3959 if (refsign > 0)
3960 {
3961 if (recno <= 0)
3962 {
3963 *errorcodeptr = ERR58;
3964 goto FAILED;
3965 }
3966 if (refsign == '-')
3967 {
3968 recno = cd->bracount - recno + 1;
3969 if (recno <= 0)
3970 {
3971 *errorcodeptr = ERR15;
3972 goto FAILED;
3973 }
3974 }
3975 else recno += cd->bracount;
3976 PUT2(code, 2+LINK_SIZE, recno);
3977 break;
3978 }
3979
3980 /* Otherwise (did not start with "+" or "-"), start by looking for the
3981 name. */
3982
3983 slot = cd->name_table;
3984 for (i = 0; i < cd->names_found; i++)
3985 {
3986 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3987 slot += cd->name_entry_size;
3988 }
3989
3990 /* Found a previous named subpattern */
3991
3992 if (i < cd->names_found)
3993 {
3994 recno = GET2(slot, 0);
3995 PUT2(code, 2+LINK_SIZE, recno);
3996 }
3997
3998 /* Search the pattern for a forward reference */
3999
4000 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4001 (options & PCRE_EXTENDED) != 0)) > 0)
4002 {
4003 PUT2(code, 2+LINK_SIZE, i);
4004 }
4005
4006 /* If terminator == 0 it means that the name followed directly after
4007 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4008 some further alternatives to try. For the cases where terminator != 0
4009 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4010 now checked all the possibilities, so give an error. */
4011
4012 else if (terminator != 0)
4013 {
4014 *errorcodeptr = ERR15;
4015 goto FAILED;
4016 }
4017
4018 /* Check for (?(R) for recursion. Allow digits after R to specify a
4019 specific group number. */
4020
4021 else if (*name == 'R')
4022 {
4023 recno = 0;
4024 for (i = 1; i < namelen; i++)
4025 {
4026 if ((digitab[name[i]] & ctype_digit) == 0)
4027 {
4028 *errorcodeptr = ERR15;
4029 goto FAILED;
4030 }
4031 recno = recno * 10 + name[i] - '0';
4032 }
4033 if (recno == 0) recno = RREF_ANY;
4034 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4035 PUT2(code, 2+LINK_SIZE, recno);
4036 }
4037
4038 /* Similarly, check for the (?(DEFINE) "condition", which is always
4039 false. */
4040
4041 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4042 {
4043 code[1+LINK_SIZE] = OP_DEF;
4044 skipbytes = 1;
4045 }
4046
4047 /* Check for the "name" actually being a subpattern number. */
4048
4049 else if (recno > 0)
4050 {
4051 PUT2(code, 2+LINK_SIZE, recno);
4052 }
4053
4054 /* Either an unidentified subpattern, or a reference to (?(0) */
4055
4056 else
4057 {
4058 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4059 goto FAILED;
4060 }
4061 break;
4062
4063
4064 /* ------------------------------------------------------------ */
4065 case '=': /* Positive lookahead */
4066 bravalue = OP_ASSERT;
4067 ptr++;
4068 break;
4069
4070
4071 /* ------------------------------------------------------------ */
4072 case '!': /* Negative lookahead */
4073 bravalue = OP_ASSERT_NOT;
4074 ptr++;
4075 break;
4076
4077
4078 /* ------------------------------------------------------------ */
4079 case '<': /* Lookbehind or named define */
4080 switch (ptr[1])
4081 {
4082 case '=': /* Positive lookbehind */
4083 bravalue = OP_ASSERTBACK;
4084 ptr += 2;
4085 break;
4086
4087 case '!': /* Negative lookbehind */
4088 bravalue = OP_ASSERTBACK_NOT;
4089 ptr += 2;
4090 break;
4091
4092 default: /* Could be name define, else bad */
4093 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4094 ptr++; /* Correct offset for error */
4095 *errorcodeptr = ERR24;
4096 goto FAILED;
4097 }
4098 break;
4099
4100
4101 /* ------------------------------------------------------------ */
4102 case '>': /* One-time brackets */
4103 bravalue = OP_ONCE;
4104 ptr++;
4105 break;
4106
4107
4108 /* ------------------------------------------------------------ */
4109 case 'C': /* Callout - may be followed by digits; */
4110 previous_callout = code; /* Save for later completion */
4111 after_manual_callout = 1; /* Skip one item before completing */
4112 *code++ = OP_CALLOUT;
4113 {
4114 int n = 0;
4115 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4116 n = n * 10 + *ptr - '0';
4117 if (*ptr != ')')
4118 {
4119 *errorcodeptr = ERR39;
4120 goto FAILED;
4121 }
4122 if (n > 255)
4123 {
4124 *errorcodeptr = ERR38;
4125 goto FAILED;
4126 }
4127 *code++ = n;
4128 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4129 PUT(code, LINK_SIZE, 0); /* Default length */
4130 code += 2 * LINK_SIZE;
4131 }
4132 previous = NULL;
4133 continue;
4134
4135
4136 /* ------------------------------------------------------------ */
4137 case 'P': /* Python-style named subpattern handling */
4138 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4139 {
4140 is_recurse = *ptr == '>';
4141 terminator = ')';
4142 goto NAMED_REF_OR_RECURSE;
4143 }
4144 else if (*ptr != '<') /* Test for Python-style definition */
4145 {
4146 *errorcodeptr = ERR41;
4147 goto FAILED;
4148 }
4149 /* Fall through to handle (?P< as (?< is handled */
4150
4151
4152 /* ------------------------------------------------------------ */
4153 DEFINE_NAME: /* Come here from (?< handling */
4154 case '\'':
4155 {
4156 terminator = (*ptr == '<')? '>' : '\'';
4157 name = ++ptr;
4158
4159 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4160 namelen = ptr - name;
4161
4162 /* In the pre-compile phase, just do a syntax check. */
4163
4164 if (lengthptr != NULL)
4165 {
4166 if (*ptr != terminator)
4167 {
4168 *errorcodeptr = ERR42;
4169 goto FAILED;
4170 }
4171 if (cd->names_found >= MAX_NAME_COUNT)
4172 {
4173 *errorcodeptr = ERR49;
4174 goto FAILED;
4175 }
4176 if (namelen + 3 > cd->name_entry_size)
4177 {
4178 cd->name_entry_size = namelen + 3;
4179 if (namelen > MAX_NAME_SIZE)
4180 {
4181 *errorcodeptr = ERR48;
4182 goto FAILED;
4183 }
4184 }
4185 }
4186
4187 /* In the real compile, create the entry in the table */
4188
4189 else
4190 {
4191 slot = cd->name_table;
4192 for (i = 0; i < cd->names_found; i++)
4193 {
4194 int crc = memcmp(name, slot+2, namelen);
4195 if (crc == 0)
4196 {
4197 if (slot[2+namelen] == 0)
4198 {
4199 if ((options & PCRE_DUPNAMES) == 0)
4200 {
4201 *errorcodeptr = ERR43;
4202 goto FAILED;
4203 }
4204 }
4205 else crc = -1; /* Current name is substring */
4206 }
4207 if (crc < 0)
4208 {
4209 memmove(slot + cd->name_entry_size, slot,
4210 (cd->names_found - i) * cd->name_entry_size);
4211 break;
4212 }
4213 slot += cd->name_entry_size;
4214 }
4215
4216 PUT2(slot, 0, cd->bracount + 1);
4217 memcpy(slot + 2, name, namelen);
4218 slot[2+namelen] = 0;
4219 }
4220 }
4221
4222 /* In both cases, count the number of names we've encountered. */
4223
4224 ptr++; /* Move past > or ' */
4225 cd->names_found++;
4226 goto NUMBERED_GROUP;
4227
4228
4229 /* ------------------------------------------------------------ */
4230 case '&': /* Perl recursion/subroutine syntax */
4231 terminator = ')';
4232 is_recurse = TRUE;
4233 /* Fall through */
4234
4235 /* We come here from the Python syntax above that handles both
4236 references (?P=name) and recursion (?P>name), as well as falling
4237 through from the Perl recursion syntax (?&name). */
4238
4239 NAMED_REF_OR_RECURSE:
4240 name = ++ptr;
4241 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4242 namelen = ptr - name;
4243
4244 /* In the pre-compile phase, do a syntax check and set a dummy
4245 reference number. */
4246
4247 if (lengthptr != NULL)
4248 {
4249 if (*ptr != terminator)
4250 {
4251 *errorcodeptr = ERR42;
4252 goto FAILED;
4253 }
4254 if (namelen > MAX_NAME_SIZE)
4255 {
4256 *errorcodeptr = ERR48;
4257 goto FAILED;
4258 }
4259 recno = 0;
4260 }
4261
4262 /* In the real compile, seek the name in the table */
4263
4264 else
4265 {
4266 slot = cd->name_table;
4267 for (i = 0; i < cd->names_found; i++)
4268 {
4269 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4270 slot += cd->name_entry_size;
4271 }
4272
4273 if (i < cd->names_found) /* Back reference */
4274 {
4275 recno = GET2(slot, 0);
4276 }
4277 else if ((recno = /* Forward back reference */
4278 find_parens(ptr, cd->bracount, name, namelen,
4279 (options & PCRE_EXTENDED) != 0)) <= 0)
4280 {
4281 *errorcodeptr = ERR15;
4282 goto FAILED;
4283 }
4284 }
4285
4286 /* In both phases, we can now go to the code that handles numerical
4287 recursion or backreferences. */
4288
4289 if (is_recurse) goto HANDLE_RECURSION;
4290 else goto HANDLE_REFERENCE;
4291
4292
4293 /* ------------------------------------------------------------ */
4294 case 'R': /* Recursion */
4295 ptr++; /* Same as (?0) */
4296 /* Fall through */
4297
4298
4299 /* ------------------------------------------------------------ */
4300 case '-': case '+':
4301 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4302 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4303 {
4304 const uschar *called;
4305
4306 if ((refsign = *ptr) == '+') ptr++;
4307 else if (refsign == '-')
4308 {
4309 if ((digitab[ptr[1]] & ctype_digit) == 0)
4310 goto OTHER_CHAR_AFTER_QUERY;
4311 ptr++;
4312 }
4313
4314 recno = 0;
4315 while((digitab[*ptr] & ctype_digit) != 0)
4316 recno = recno * 10 + *ptr++ - '0';
4317
4318 if (*ptr != ')')
4319 {
4320 *errorcodeptr = ERR29;
4321 goto FAILED;
4322 }
4323
4324 if (refsign == '-')
4325 {
4326 if (recno == 0)
4327 {
4328 *errorcodeptr = ERR58;
4329 goto FAILED;
4330 }
4331 recno = cd->bracount - recno + 1;
4332 if (recno <= 0)
4333 {
4334 *errorcodeptr = ERR15;
4335 goto FAILED;
4336 }
4337 }
4338 else if (refsign == '+')
4339 {
4340 if (recno == 0)
4341 {
4342 *errorcodeptr = ERR58;
4343 goto FAILED;
4344 }
4345 recno += cd->bracount;
4346 }
4347
4348 /* Come here from code above that handles a named recursion */
4349
4350 HANDLE_RECURSION:
4351
4352 previous = code;
4353 called = cd->start_code;
4354
4355 /* When we are actually compiling, find the bracket that is being
4356 referenced. Temporarily end the regex in case it doesn't exist before
4357 this point. If we end up with a forward reference, first check that
4358 the bracket does occur later so we can give the error (and position)
4359 now. Then remember this forward reference in the workspace so it can
4360 be filled in at the end. */
4361
4362 if (lengthptr == NULL)
4363 {
4364 *code = OP_END;
4365 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4366
4367 /* Forward reference */
4368
4369 if (called == NULL)
4370 {
4371 if (find_parens(ptr, cd->bracount, NULL, recno,
4372 (options & PCRE_EXTENDED) != 0) < 0)
4373 {
4374 *errorcodeptr = ERR15;
4375 goto FAILED;
4376 }
4377 called = cd->start_code + recno;
4378 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4379 }
4380
4381 /* If not a forward reference, and the subpattern is still open,
4382 this is a recursive call. We check to see if this is a left
4383 recursion that could loop for ever, and diagnose that case. */
4384
4385 else if (GET(called, 1) == 0 &&
4386 could_be_empty(called, code, bcptr, utf8))
4387 {
4388 *errorcodeptr = ERR40;
4389 goto FAILED;
4390 }
4391 }
4392
4393 /* Insert the recursion/subroutine item, automatically wrapped inside
4394 "once" brackets. Set up a "previous group" length so that a
4395 subsequent quantifier will work. */
4396
4397 *code = OP_ONCE;
4398 PUT(code, 1, 2 + 2*LINK_SIZE);
4399 code += 1 + LINK_SIZE;
4400
4401 *code = OP_RECURSE;
4402 PUT(code, 1, called - cd->start_code);
4403 code += 1 + LINK_SIZE;
4404
4405 *code = OP_KET;
4406 PUT(code, 1, 2 + 2*LINK_SIZE);
4407 code += 1 + LINK_SIZE;
4408
4409 length_prevgroup = 3 + 3*LINK_SIZE;
4410 }
4411
4412 /* Can't determine a first byte now */
4413
4414 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4415 continue;
4416
4417
4418 /* ------------------------------------------------------------ */
4419 default: /* Other characters: check option setting */
4420 OTHER_CHAR_AFTER_QUERY:
4421 set = unset = 0;
4422 optset = &set;
4423
4424 while (*ptr != ')' && *ptr != ':')
4425 {
4426 switch (*ptr++)
4427 {
4428 case '-': optset = &unset; break;
4429
4430 case 'J': /* Record that it changed in the external options */
4431 *optset |= PCRE_DUPNAMES;
4432 cd->external_options |= PCRE_JCHANGED;
4433 break;
4434
4435 case 'i': *optset |= PCRE_CASELESS; break;
4436 case 'm': *optset |= PCRE_MULTILINE; break;
4437 case 's': *optset |= PCRE_DOTALL; break;
4438 case 'x': *optset |= PCRE_EXTENDED; break;
4439 case 'U': *optset |= PCRE_UNGREEDY; break;
4440 case 'X': *optset |= PCRE_EXTRA; break;
4441
4442 default: *errorcodeptr = ERR12;
4443 ptr--; /* Correct the offset */
4444 goto FAILED;
4445 }
4446 }
4447
4448 /* Set up the changed option bits, but don't change anything yet. */
4449
4450 newoptions = (options | set) & (~unset);
4451
4452 /* If the options ended with ')' this is not the start of a nested
4453 group with option changes, so the options change at this level. If this
4454 item is right at the start of the pattern, the options can be
4455 abstracted and made external in the pre-compile phase, and ignored in
4456 the compile phase. This can be helpful when matching -- for instance in
4457 caseless checking of required bytes.
4458
4459 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4460 definitely *not* at the start of the pattern because something has been
4461 compiled. In the pre-compile phase, however, the code pointer can have
4462 that value after the start, because it gets reset as code is discarded
4463 during the pre-compile. However, this can happen only at top level - if
4464 we are within parentheses, the starting BRA will still be present. At
4465 any parenthesis level, the length value can be used to test if anything
4466 has been compiled at that level. Thus, a test for both these conditions
4467 is necessary to ensure we correctly detect the start of the pattern in
4468 both phases.
4469
4470 If we are not at the pattern start, compile code to change the ims
4471 options if this setting actually changes any of them. We also pass the
4472 new setting back so that it can be put at the start of any following
4473 branches, and when this group ends (if we are in a group), a resetting
4474 item can be compiled. */
4475
4476 if (*ptr == ')')
4477 {
4478 if (code == cd->start_code + 1 + LINK_SIZE &&
4479 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4480 {
4481 cd->external_options = newoptions;
4482 options = newoptions;
4483 }
4484 else
4485 {
4486 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4487 {
4488 *code++ = OP_OPT;
4489 *code++ = newoptions & PCRE_IMS;
4490 }
4491
4492 /* Change options at this level, and pass them back for use
4493 in subsequent branches. Reset the greedy defaults and the case
4494 value for firstbyte and reqbyte. */
4495
4496 *optionsptr = options = newoptions;
4497 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4498 greedy_non_default = greedy_default ^ 1;
4499 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4500 }
4501
4502 previous = NULL; /* This item can't be repeated */
4503 continue; /* It is complete */
4504 }
4505
4506 /* If the options ended with ':' we are heading into a nested group
4507 with possible change of options. Such groups are non-capturing and are
4508 not assertions of any kind. All we need to do is skip over the ':';
4509 the newoptions value is handled below. */
4510
4511 bravalue = OP_BRA;
4512 ptr++;
4513 } /* End of switch for character following (? */
4514 } /* End of (? handling */
4515
4516 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4517 all unadorned brackets become non-capturing and behave like (?:...)
4518 brackets. */
4519
4520 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4521 {
4522 bravalue = OP_BRA;
4523 }
4524
4525 /* Else we have a capturing group. */
4526
4527 else
4528 {
4529 NUMBERED_GROUP:
4530 cd->bracount += 1;
4531 PUT2(code, 1+LINK_SIZE, cd->bracount);
4532 skipbytes = 2;
4533 }
4534
4535 /* Process nested bracketed regex. Assertions may not be repeated, but
4536 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4537 non-register variable in order to be able to pass its address because some
4538 compilers complain otherwise. Pass in a new setting for the ims options if
4539 they have changed. */
4540
4541 previous = (bravalue >= OP_ONCE)? code : NULL;
4542 *code = bravalue;
4543 tempcode = code;
4544 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4545 length_prevgroup = 0; /* Initialize for pre-compile phase */
4546
4547 if (!compile_regex(
4548 newoptions, /* The complete new option state */
4549 options & PCRE_IMS, /* The previous ims option state */
4550 &tempcode, /* Where to put code (updated) */
4551 &ptr, /* Input pointer (updated) */
4552 errorcodeptr, /* Where to put an error message */
4553 (bravalue == OP_ASSERTBACK ||
4554 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4555 reset_bracount, /* True if (?| group */
4556 skipbytes, /* Skip over bracket number */
4557 &subfirstbyte, /* For possible first char */
4558 &subreqbyte, /* For possible last char */
4559 bcptr, /* Current branch chain */
4560 cd, /* Tables block */
4561 (lengthptr == NULL)? NULL : /* Actual compile phase */
4562 &length_prevgroup /* Pre-compile phase */
4563 ))
4564 goto FAILED;
4565
4566 /* At the end of compiling, code is still pointing to the start of the
4567 group, while tempcode has been updated to point past the end of the group
4568 and any option resetting that may follow it. The pattern pointer (ptr)
4569 is on the bracket. */
4570
4571 /* If this is a conditional bracket, check that there are no more than
4572 two branches in the group, or just one if it's a DEFINE group. We do this
4573 in the real compile phase, not in the pre-pass, where the whole group may
4574 not be available. */
4575
4576 if (bravalue == OP_COND && lengthptr == NULL)
4577 {
4578 uschar *tc = code;
4579 int condcount = 0;
4580
4581 do {
4582 condcount++;
4583 tc += GET(tc,1);
4584 }
4585 while (*tc != OP_KET);
4586
4587 /* A DEFINE group is never obeyed inline (the "condition" is always
4588 false). It must have only one branch. */
4589
4590 if (code[LINK_SIZE+1] == OP_DEF)
4591 {
4592 if (condcount > 1)
4593 {
4594 *errorcodeptr = ERR54;
4595 goto FAILED;
4596 }
4597 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4598 }
4599
4600 /* A "normal" conditional group. If there is just one branch, we must not
4601 make use of its firstbyte or reqbyte, because this is equivalent to an
4602 empty second branch. */
4603
4604 else
4605 {
4606 if (condcount > 2)
4607 {
4608 *errorcodeptr = ERR27;
4609 goto FAILED;
4610 }
4611 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4612 }
4613 }
4614
4615 /* Error if hit end of pattern */
4616
4617 if (*ptr != ')')
4618 {
4619 *errorcodeptr = ERR14;
4620 goto FAILED;
4621 }
4622
4623 /* In the pre-compile phase, update the length by the length of the nested
4624 group, less the brackets at either end. Then reduce the compiled code to
4625 just the brackets so that it doesn't use much memory if it is duplicated by
4626 a quantifier. */
4627
4628 if (lengthptr != NULL)
4629 {
4630 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4631 code++;
4632 PUTINC(code, 0, 1 + LINK_SIZE);
4633 *code++ = OP_KET;
4634 PUTINC(code, 0, 1 + LINK_SIZE);
4635 }
4636
4637 /* Otherwise update the main code pointer to the end of the group. */
4638
4639 else code = tempcode;
4640
4641 /* For a DEFINE group, required and first character settings are not
4642 relevant. */
4643
4644 if (bravalue == OP_DEF) break;
4645
4646 /* Handle updating of the required and first characters for other types of
4647 group. Update for normal brackets of all kinds, and conditions with two
4648 branches (see code above). If the bracket is followed by a quantifier with
4649 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4650 zerofirstbyte outside the main loop so that they can be accessed for the
4651 back off. */
4652
4653 zeroreqbyte = reqbyte;
4654 zerofirstbyte = firstbyte;
4655 groupsetfirstbyte = FALSE;
4656
4657 if (bravalue >= OP_ONCE)
4658 {
4659 /* If we have not yet set a firstbyte in this branch, take it from the
4660 subpattern, remembering that it was set here so that a repeat of more
4661 than one can replicate it as reqbyte if necessary. If the subpattern has
4662 no firstbyte, set "none" for the whole branch. In both cases, a zero
4663 repeat forces firstbyte to "none". */
4664
4665 if (firstbyte == REQ_UNSET)
4666 {
4667 if (subfirstbyte >= 0)
4668 {
4669 firstbyte = subfirstbyte;
4670 groupsetfirstbyte = TRUE;
4671 }
4672 else firstbyte = REQ_NONE;
4673 zerofirstbyte = REQ_NONE;
4674 }
4675
4676 /* If firstbyte was previously set, convert the subpattern's firstbyte
4677 into reqbyte if there wasn't one, using the vary flag that was in
4678 existence beforehand. */
4679
4680 else if (subfirstbyte >= 0 && subreqbyte < 0)
4681 subreqbyte = subfirstbyte | tempreqvary;
4682
4683 /* If the subpattern set a required byte (or set a first byte that isn't
4684 really the first byte - see above), set it. */
4685
4686 if (subreqbyte >= 0) reqbyte = subreqbyte;
4687 }
4688
4689 /* For a forward assertion, we take the reqbyte, if set. This can be
4690 helpful if the pattern that follows the assertion doesn't set a different
4691 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4692 for an assertion, however because it leads to incorrect effect for patterns
4693 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4694 of a firstbyte. This is overcome by a scan at the end if there's no
4695 firstbyte, looking for an asserted first char. */
4696
4697 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4698 break; /* End of processing '(' */
4699
4700
4701 /* ===================================================================*/
4702 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4703 are arranged to be the negation of the corresponding OP_values. For the
4704 back references, the values are ESC_REF plus the reference number. Only
4705 back references and those types that consume a character may be repeated.
4706 We can test for values between ESC_b and ESC_Z for the latter; this may
4707 have to change if any new ones are ever created. */
4708
4709 case '\\':
4710 tempptr = ptr;
4711 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4712 if (*errorcodeptr != 0) goto FAILED;
4713
4714 if (c < 0)
4715 {
4716 if (-c == ESC_Q) /* Handle start of quoted string */
4717 {
4718 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4719 else inescq = TRUE;
4720 continue;
4721 }
4722
4723 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4724
4725 /* For metasequences that actually match a character, we disable the
4726 setting of a first character if it hasn't already been set. */
4727
4728 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4729 firstbyte = REQ_NONE;
4730
4731 /* Set values to reset to if this is followed by a zero repeat. */
4732
4733 zerofirstbyte = firstbyte;
4734 zeroreqbyte = reqbyte;
4735
4736 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4737 We also support \k{name} (.NET syntax) */
4738
4739 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4740 {
4741 is_recurse = FALSE;
4742 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4743 goto NAMED_REF_OR_RECURSE;
4744 }
4745
4746 /* Back references are handled specially; must disable firstbyte if
4747 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4748 ':' later. */
4749
4750 if (-c >= ESC_REF)
4751 {
4752 recno = -c - ESC_REF;
4753
4754 HANDLE_REFERENCE: /* Come here from named backref handling */
4755 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4756 previous = code;
4757 *code++ = OP_REF;
4758 PUT2INC(code, 0, recno);
4759 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4760 if (recno > cd->top_backref) cd->top_backref = recno;
4761 }
4762
4763 /* So are Unicode property matches, if supported. */
4764
4765 #ifdef SUPPORT_UCP
4766 else if (-c == ESC_P || -c == ESC_p)
4767 {
4768 BOOL negated;
4769 int pdata;
4770 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4771 if (ptype < 0) goto FAILED;
4772 previous = code;
4773 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4774 *code++ = ptype;
4775 *code++ = pdata;
4776 }
4777 #else
4778
4779 /* If Unicode properties are not supported, \X, \P, and \p are not
4780 allowed. */
4781
4782 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4783 {
4784 *errorcodeptr = ERR45;
4785 goto FAILED;
4786 }
4787 #endif
4788
4789 /* For the rest (including \X when Unicode properties are supported), we
4790 can obtain the OP value by negating the escape value. */
4791
4792 else
4793 {
4794 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4795 *code++ = -c;
4796 }
4797 continue;
4798 }
4799
4800 /* We have a data character whose value is in c. In UTF-8 mode it may have
4801 a value > 127. We set its representation in the length/buffer, and then
4802 handle it as a data character. */
4803
4804 #ifdef SUPPORT_UTF8
4805 if (utf8 && c > 127)
4806 mclength = _pcre_ord2utf8(c, mcbuffer);
4807 else
4808 #endif
4809
4810 {
4811 mcbuffer[0] = c;
4812 mclength = 1;
4813 }
4814 goto ONE_CHAR;
4815
4816
4817 /* ===================================================================*/
4818 /* Handle a literal character. It is guaranteed not to be whitespace or #
4819 when the extended flag is set. If we are in UTF-8 mode, it may be a
4820 multi-byte literal character. */
4821
4822 default:
4823 NORMAL_CHAR:
4824 mclength = 1;
4825 mcbuffer[0] = c;
4826
4827 #ifdef SUPPORT_UTF8
4828 if (utf8 && c >= 0xc0)
4829 {
4830 while ((ptr[1] & 0xc0) == 0x80)
4831 mcbuffer[mclength++] = *(++ptr);
4832 }
4833 #endif
4834
4835 /* At this point we have the character's bytes in mcbuffer, and the length
4836 in mclength. When not in UTF-8 mode, the length is always 1. */
4837
4838 ONE_CHAR:
4839 previous = code;
4840 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4841 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4842
4843 /* Set the first and required bytes appropriately. If no previous first
4844 byte, set it from this character, but revert to none on a zero repeat.
4845 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4846 repeat. */
4847
4848 if (firstbyte == REQ_UNSET)
4849 {
4850 zerofirstbyte = REQ_NONE;
4851 zeroreqbyte = reqbyte;
4852
4853 /* If the character is more than one byte long, we can set firstbyte
4854 only if it is not to be matched caselessly. */
4855
4856 if (mclength == 1 || req_caseopt == 0)
4857 {
4858 firstbyte = mcbuffer[0] | req_caseopt;
4859 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4860 }
4861 else firstbyte = reqbyte = REQ_NONE;
4862 }
4863
4864 /* firstbyte was previously set; we can set reqbyte only if the length is
4865 1 or the matching is caseful. */
4866
4867 else
4868 {
4869 zerofirstbyte = firstbyte;
4870 zeroreqbyte = reqbyte;
4871 if (mclength == 1 || req_caseopt == 0)
4872 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4873 }
4874
4875 break; /* End of literal character handling */
4876 }
4877 } /* end of big loop */
4878
4879
4880 /* Control never reaches here by falling through, only by a goto for all the
4881 error states. Pass back the position in the pattern so that it can be displayed
4882 to the user for diagnosing the error. */
4883
4884 FAILED:
4885 *ptrptr = ptr;
4886 return FALSE;
4887 }
4888
4889
4890
4891
4892 /*************************************************
4893 * Compile sequence of alternatives *
4894 *************************************************/
4895
4896 /* On entry, ptr is pointing past the bracket character, but on return it
4897 points to the closing bracket, or vertical bar, or end of string. The code
4898 variable is pointing at the byte into which the BRA operator has been stored.
4899 If the ims options are changed at the start (for a (?ims: group) or during any
4900 branch, we need to insert an OP_OPT item at the start of every following branch
4901 to ensure they get set correctly at run time, and also pass the new options
4902 into every subsequent branch compile.
4903
4904 This function is used during the pre-compile phase when we are trying to find
4905 out the amount of memory needed, as well as during the real compile phase. The
4906 value of lengthptr distinguishes the two phases.
4907
4908 Arguments:
4909 options option bits, including any changes for this subpattern
4910 oldims previous settings of ims option bits
4911 codeptr -> the address of the current code pointer
4912 ptrptr -> the address of the current pattern pointer
4913 errorcodeptr -> pointer to error code variable
4914 lookbehind TRUE if this is a lookbehind assertion
4915 reset_bracount TRUE to reset the count for each branch
4916 skipbytes skip this many bytes at start (for brackets and OP_COND)
4917 firstbyteptr place to put the first required character, or a negative number
4918 reqbyteptr place to put the last required character, or a negative number
4919 bcptr pointer to the chain of currently open branches
4920 cd points to the data block with tables pointers etc.
4921 lengthptr NULL during the real compile phase
4922 points to length accumulator during pre-compile phase
4923
4924 Returns: TRUE on success
4925 */
4926
4927 static BOOL
4928 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4929 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4930 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4931 int *lengthptr)
4932 {
4933 const uschar *ptr = *ptrptr;
4934 uschar *code = *codeptr;
4935 uschar *last_branch = code;
4936 uschar *start_bracket = code;
4937 uschar *reverse_count = NULL;
4938 int firstbyte, reqbyte;
4939 int branchfirstbyte, branchreqbyte;
4940 int length;
4941 int orig_bracount;
4942 int max_bracount;
4943 branch_chain bc;
4944
4945 bc.outer = bcptr;
4946 bc.current = code;
4947
4948 firstbyte = reqbyte = REQ_UNSET;
4949
4950 /* Accumulate the length for use in the pre-compile phase. Start with the
4951 length of the BRA and KET and any extra bytes that are required at the
4952 beginning. We accumulate in a local variable to save frequent testing of
4953 lenthptr for NULL. We cannot do this by looking at the value of code at the
4954 start and end of each alternative, because compiled items are discarded during
4955 the pre-compile phase so that the work space is not exceeded. */
4956
4957 length = 2 + 2*LINK_SIZE + skipbytes;
4958
4959 /* WARNING: If the above line is changed for any reason, you must also change
4960 the code that abstracts option settings at the start of the pattern and makes
4961 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4962 pre-compile phase to find out whether anything has yet been compiled or not. */
4963
4964 /* Offset is set zero to mark that this bracket is still open */
4965
4966 PUT(code, 1, 0);
4967 code += 1 + LINK_SIZE + skipbytes;
4968
4969 /* Loop for each alternative branch */
4970
4971 orig_bracount = max_bracount = cd->bracount;
4972 for (;;)
4973 {
4974 /* For a (?| group, reset the capturing bracket count so that each branch
4975 uses the same numbers. */
4976
4977 if (reset_bracount) cd->bracount = orig_bracount;
4978
4979 /* Handle a change of ims options at the start of the branch */
4980
4981 if ((options & PCRE_IMS) != oldims)
4982 {
4983 *code++ = OP_OPT;
4984 *code++ = options & PCRE_IMS;
4985 length += 2;
4986 }
4987
4988 /* Set up dummy OP_REVERSE if lookbehind assertion */
4989
4990 if (lookbehind)
4991 {
4992 *code++ = OP_REVERSE;
4993 reverse_count = code;
4994 PUTINC(code, 0, 0);
4995 length += 1 + LINK_SIZE;
4996 }
4997
4998 /* Now compile the branch; in the pre-compile phase its length gets added
4999 into the length. */
5000
5001 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5002 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5003 {
5004 *ptrptr = ptr;
5005 return FALSE;
5006 }
5007
5008 /* Keep the highest bracket count in case (?| was used and some branch
5009 has fewer than the rest. */
5010
5011 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5012
5013 /* In the real compile phase, there is some post-processing to be done. */
5014
5015 if (lengthptr == NULL)
5016 {
5017 /* If this is the first branch, the firstbyte and reqbyte values for the
5018 branch become the values for the regex. */
5019
5020 if (*last_branch != OP_ALT)
5021 {
5022 firstbyte = branchfirstbyte;
5023 reqbyte = branchreqbyte;
5024 }
5025
5026 /* If this is not the first branch, the first char and reqbyte have to
5027 match the values from all the previous branches, except that if the
5028 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5029 and we set REQ_VARY for the regex. */
5030
5031 else
5032 {
5033 /* If we previously had a firstbyte, but it doesn't match the new branch,
5034 we have to abandon the firstbyte for the regex, but if there was
5035 previously no reqbyte, it takes on the value of the old firstbyte. */
5036
5037 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5038 {
5039 if (reqbyte < 0) reqbyte = firstbyte;
5040 firstbyte = REQ_NONE;
5041 }
5042
5043 /* If we (now or from before) have no firstbyte, a firstbyte from the
5044 branch becomes a reqbyte if there isn't a branch reqbyte. */
5045
5046 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5047 branchreqbyte = branchfirstbyte;
5048
5049 /* Now ensure that the reqbytes match */
5050
5051 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5052 reqbyte = REQ_NONE;
5053 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5054 }
5055
5056 /* If lookbehind, check that this branch matches a fixed-length string, and
5057 put the length into the OP_REVERSE item. Temporarily mark the end of the
5058 branch with OP_END. */
5059
5060 if (lookbehind)
5061 {
5062 int fixed_length;
5063 *code = OP_END;
5064 fixed_length = find_fixedlength(last_branch, options);
5065 DPRINTF(("fixed length = %d\n", fixed_length));
5066 if (fixed_length < 0)
5067 {
5068 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5069 *ptrptr = ptr;
5070 return FALSE;
5071 }
5072 PUT(reverse_count, 0, fixed_length);
5073 }
5074 }
5075
5076 /* Reached end of expression, either ')' or end of pattern. In the real
5077 compile phase, go back through the alternative branches and reverse the chain
5078 of offsets, with the field in the BRA item now becoming an offset to the
5079 first alternative. If there are no alternatives, it points to the end of the
5080 group. The length in the terminating ket is always the length of the whole
5081 bracketed item. If any of the ims options were changed inside the group,
5082 compile a resetting op-code following, except at the very end of the pattern.
5083 Return leaving the pointer at the terminating char. */
5084
5085 if (*ptr != '|')
5086 {
5087 if (lengthptr == NULL)
5088 {
5089 int branch_length = code - last_branch;
5090 do
5091 {
5092 int prev_length = GET(last_branch, 1);
5093 PUT(last_branch, 1, branch_length);
5094 branch_length = prev_length;
5095 last_branch -= branch_length;
5096 }
5097 while (branch_length > 0);
5098 }
5099
5100 /* Fill in the ket */
5101
5102 *code = OP_KET;
5103 PUT(code, 1, code - start_bracket);
5104 code += 1 + LINK_SIZE;
5105
5106 /* Resetting option if needed */
5107
5108 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5109 {
5110 *code++ = OP_OPT;
5111 *code++ = oldims;
5112 length += 2;
5113 }
5114
5115 /* Retain the highest bracket number, in case resetting was used. */
5116
5117 cd->bracount = max_bracount;
5118
5119 /* Set values to pass back */
5120
5121 *codeptr = code;
5122 *ptrptr = ptr;
5123 *firstbyteptr = firstbyte;
5124 *reqbyteptr = reqbyte;
5125 if (lengthptr != NULL) *lengthptr += length;
5126 return TRUE;
5127 }
5128
5129 /* Another branch follows. In the pre-compile phase, we can move the code
5130 pointer back to where it was for the start of the first branch. (That is,
5131 pretend that each branch is the only one.)
5132
5133 In the real compile phase, insert an ALT node. Its length field points back
5134 to the previous branch while the bracket remains open. At the end the chain
5135 is reversed. It's done like this so that the start of the bracket has a
5136 zero offset until it is closed, making it possible to detect recursion. */
5137
5138 if (lengthptr != NULL)
5139 {
5140 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5141 length += 1 + LINK_SIZE;
5142 }
5143 else
5144 {
5145 *code = OP_ALT;
5146 PUT(code, 1, code - last_branch);
5147 bc.current = last_branch = code;
5148 code += 1 + LINK_SIZE;
5149 }
5150
5151 ptr++;
5152 }
5153 /* Control never reaches here */
5154 }
5155
5156
5157
5158
5159 /*************************************************
5160 * Check for anchored expression *
5161 *************************************************/
5162
5163 /* Try to find out if this is an anchored regular expression. Consider each
5164 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5165 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5166 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5167 counts, since OP_CIRC can match in the middle.
5168
5169 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5170 This is the code for \G, which means "match at start of match position, taking
5171 into account the match offset".
5172
5173 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5174 because that will try the rest of the pattern at all possible matching points,
5175 so there is no point trying again.... er ....
5176
5177 .... except when the .* appears inside capturing parentheses, and there is a
5178 subsequent back reference to those parentheses. We haven't enough information
5179 to catch that case precisely.
5180
5181 At first, the best we could do was to detect when .* was in capturing brackets
5182 and the highest back reference was greater than or equal to that level.
5183 However, by keeping a bitmap of the first 31 back references, we can catch some
5184 of the more common cases more precisely.
5185
5186 Arguments:
5187 code points to start of expression (the bracket)
5188 options points to the options setting
5189 bracket_map a bitmap of which brackets we are inside while testing; this
5190 handles up to substring 31; after that we just have to take
5191 the less precise approach
5192 backref_map the back reference bitmap
5193
5194 Returns: TRUE or FALSE
5195 */
5196
5197 static BOOL
5198 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5199 unsigned int backref_map)
5200 {
5201 do {
5202 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5203 options, PCRE_MULTILINE, FALSE);
5204 register int op = *scode;
5205
5206 /* Non-capturing brackets */
5207
5208 if (op == OP_BRA)
5209 {
5210 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5211 }
5212
5213 /* Capturing brackets */
5214
5215 else if (op == OP_CBRA)
5216 {
5217 int n = GET2(scode, 1+LINK_SIZE);
5218 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5219 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5220 }
5221
5222 /* Other brackets */
5223
5224 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5225 {
5226 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5227 }
5228
5229 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5230 are or may be referenced. */
5231
5232 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5233 op == OP_TYPEPOSSTAR) &&
5234 (*options & PCRE_DOTALL) != 0)
5235 {
5236 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5237 }
5238
5239 /* Check for explicit anchoring */
5240
5241 else if (op != OP_SOD && op != OP_SOM &&
5242 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5243 return FALSE;
5244 code += GET(code, 1);
5245 }
5246 while (*code == OP_ALT); /* Loop for each alternative */
5247 return TRUE;
5248 }
5249
5250
5251
5252 /*************************************************
5253 * Check for starting with ^ or .* *
5254 *************************************************/
5255
5256 /* This is called to find out if every branch starts with ^ or .* so that
5257 "first char" processing can be done to speed things up in multiline
5258 matching and for non-DOTALL patterns that start with .* (which must start at
5259 the beginning or after \n). As in the case of is_anchored() (see above), we
5260 have to take account of back references to capturing brackets that contain .*
5261 because in that case we can't make the assumption.
5262
5263 Arguments:
5264 code points to start of expression (the bracket)
5265 bracket_map a bitmap of which brackets we are inside while testing; this
5266 handles up to substring 31; after that we just have to take
5267 the less precise approach
5268 backref_map the back reference bitmap
5269
5270 Returns: TRUE or FALSE
5271 */
5272
5273 static BOOL
5274 is_startline(const uschar *code, unsigned int bracket_map,
5275 unsigned int backref_map)
5276 {
5277 do {
5278 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5279 NULL, 0, FALSE);
5280 register int op = *scode;
5281
5282 /* Non-capturing brackets */
5283
5284 if (op == OP_BRA)
5285 {
5286 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5287 }
5288
5289 /* Capturing brackets */
5290
5291 else if (op == OP_CBRA)
5292 {
5293 int n = GET2(scode, 1+LINK_SIZE);
5294 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5295 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5296 }
5297
5298 /* Other brackets */
5299
5300 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5301 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5302
5303 /* .* means "start at start or after \n" if it isn't in brackets that
5304 may be referenced. */
5305
5306 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5307 {
5308 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5309 }
5310
5311 /* Check for explicit circumflex */
5312
5313 else if (op != OP_CIRC) return FALSE;
5314
5315 /* Move on to the next alternative */
5316
5317 code += GET(code, 1);
5318 }
5319 while (*code == OP_ALT); /* Loop for each alternative */
5320 return TRUE;
5321 }
5322
5323
5324
5325 /*************************************************
5326 * Check for asserted fixed first char *
5327 *************************************************/
5328
5329 /* During compilation, the "first char" settings from forward assertions are
5330 discarded, because they can cause conflicts with actual literals that follow.
5331 However, if we end up without a first char setting for an unanchored pattern,
5332 it is worth scanning the regex to see if there is an initial asserted first
5333 char. If all branches start with the same asserted char, or with a bracket all
5334 of whose alternatives start with the same asserted char (recurse ad lib), then
5335 we return that char, otherwise -1.
5336
5337 Arguments:
5338 code points to start of expression (the bracket)
5339 options pointer to the options (used to check casing changes)
5340 inassert TRUE if in an assertion
5341
5342 Returns: -1 or the fixed first char
5343 */
5344
5345 static int
5346 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5347 {
5348 register int c = -1;
5349 do {
5350 int d;
5351 const uschar *scode =
5352 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5353 register int op = *scode;
5354
5355 switch(op)
5356 {
5357 default:
5358 return -1;
5359
5360 case OP_BRA:
5361 case OP_CBRA:
5362 case OP_ASSERT:
5363 case OP_ONCE:
5364 case OP_COND:
5365 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5366 return -1;
5367 if (c < 0) c = d; else if (c != d) return -1;
5368 break;
5369
5370 case OP_EXACT: /* Fall through */
5371 scode += 2;
5372
5373 case OP_CHAR:
5374 case OP_CHARNC:
5375 case OP_PLUS:
5376 case OP_MINPLUS:
5377 case OP_POSPLUS:
5378 if (!inassert) return -1;
5379 if (c < 0)
5380 {
5381 c = scode[1];
5382 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5383 }
5384 else if (c != scode[1]) return -1;
5385 break;
5386 }
5387
5388 code += GET(code, 1);
5389 }
5390 while (*code == OP_ALT);
5391 return c;
5392 }
5393
5394
5395
5396 /*************************************************
5397 * Compile a Regular Expression *
5398 *************************************************/
5399
5400 /* This function takes a string and returns a pointer to a block of store
5401 holding a compiled version of the expression. The original API for this
5402 function had no error code return variable; it is retained for backwards
5403 compatibility. The new function is given a new name.
5404
5405 Arguments:
5406 pattern the regular expression
5407 options various option bits
5408 errorcodeptr pointer to error code variable (pcre_compile2() only)
5409 can be NULL if you don't want a code value
5410 errorptr pointer to pointer to error text
5411 erroroffset ptr offset in pattern where error was detected
5412 tables pointer to character tables or NULL
5413
5414 Returns: pointer to compiled data block, or NULL on error,
5415 with errorptr and erroroffset set
5416 */
5417
5418 PCRE_EXP_DEFN pcre *
5419 pcre_compile(const char *pattern, int options, const char **errorptr,
5420 int *erroroffset, const unsigned char *tables)
5421 {
5422 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5423 }
5424
5425
5426 PCRE_EXP_DEFN pcre *
5427 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5428 const char **errorptr, int *erroroffset, const unsigned char *tables)
5429 {
5430 real_pcre *re;
5431 int length = 1; /* For final END opcode */
5432 int firstbyte, reqbyte, newline;
5433 int errorcode = 0;
5434 #ifdef SUPPORT_UTF8
5435 BOOL utf8;
5436 #endif
5437 size_t size;
5438 uschar *code;
5439 const uschar *codestart;
5440 const uschar *ptr;
5441 compile_data compile_block;
5442 compile_data *cd = &compile_block;
5443
5444 /* This space is used for "compiling" into during the first phase, when we are
5445 computing the amount of memory that is needed. Compiled items are thrown away
5446 as soon as possible, so that a fairly large buffer should be sufficient for
5447 this purpose. The same space is used in the second phase for remembering where
5448 to fill in forward references to subpatterns. */
5449
5450 uschar cworkspace[COMPILE_WORK_SIZE];
5451
5452
5453 /* Set this early so that early errors get offset 0. */
5454
5455 ptr = (const uschar *)pattern;
5456
5457 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5458 can do is just return NULL, but we can set a code value if there is a code
5459 pointer. */
5460
5461 if (errorptr == NULL)
5462 {
5463 if (errorcodeptr != NULL) *errorcodeptr = 99;
5464 return NULL;
5465 }
5466
5467 *errorptr = NULL;
5468 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5469
5470 /* However, we can give a message for this error */
5471
5472 if (erroroffset == NULL)
5473 {
5474 errorcode = ERR16;
5475 goto PCRE_EARLY_ERROR_RETURN2;
5476 }
5477
5478 *erroroffset = 0;
5479
5480 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5481
5482 #ifdef SUPPORT_UTF8
5483 utf8 = (options & PCRE_UTF8) != 0;
5484 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5485 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5486 {
5487 errorcode = ERR44;
5488 goto PCRE_EARLY_ERROR_RETURN2;
5489 }
5490 #else
5491 if ((options & PCRE_UTF8) != 0)
5492 {
5493 errorcode = ERR32;
5494 goto PCRE_EARLY_ERROR_RETURN;
5495 }
5496 #endif
5497
5498 if ((options & ~PUBLIC_OPTIONS) != 0)
5499 {
5500 errorcode = ERR17;
5501 goto PCRE_EARLY_ERROR_RETURN;
5502 }
5503
5504 /* Set up pointers to the individual character tables */
5505
5506 if (tables == NULL) tables = _pcre_default_tables;
5507 cd->lcc = tables + lcc_offset;
5508 cd->fcc = tables + fcc_offset;
5509 cd->cbits = tables + cbits_offset;
5510 cd->ctypes = tables + ctypes_offset;
5511
5512 /* Handle different types of newline. The three bits give seven cases. The
5513 current code allows for fixed one- or two-byte sequences, plus "any" and
5514 "anycrlf". */
5515
5516 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5517 {
5518 case 0: newline = NEWLINE; break; /* Compile-time default */
5519 case PCRE_NEWLINE_CR: newline = '\r'; break;
5520 case PCRE_NEWLINE_LF: newline = '\n'; break;
5521 case PCRE_NEWLINE_CR+
5522 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5523 case PCRE_NEWLINE_ANY: newline = -1; break;
5524 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5525 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5526 }
5527
5528 if (newline == -2)
5529 {
5530 cd->nltype = NLTYPE_ANYCRLF;
5531 }
5532 else if (newline < 0)
5533 {
5534 cd->nltype = NLTYPE_ANY;
5535 }
5536 else
5537 {
5538 cd->nltype = NLTYPE_FIXED;
5539 if (newline > 255)
5540 {
5541 cd->nllen = 2;
5542 cd->nl[0] = (newline >> 8) & 255;
5543 cd->nl[1] = newline & 255;
5544 }
5545 else
5546 {
5547 cd->nllen = 1;
5548 cd->nl[0] = newline;
5549 }
5550 }
5551
5552 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5553 references to help in deciding whether (.*) can be treated as anchored or not.
5554 */
5555
5556 cd->top_backref = 0;
5557 cd->backref_map = 0;
5558
5559 /* Reflect pattern for debugging output */
5560
5561 DPRINTF(("------------------------------------------------------------------\n"));
5562 DPRINTF(("%s\n", pattern));
5563
5564 /* Pretend to compile the pattern while actually just accumulating the length
5565 of memory required. This behaviour is triggered by passing a non-NULL final
5566 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5567 to compile parts of the pattern into; the compiled code is discarded when it is
5568 no longer needed, so hopefully this workspace will never overflow, though there
5569 is a test for its doing so. */
5570
5571 cd->bracount = 0;
5572 cd->names_found = 0;
5573 cd->name_entry_size = 0;
5574 cd->name_table = NULL;
5575 cd->start_workspace = cworkspace;
5576 cd->start_code = cworkspace;
5577 cd->hwm = cworkspace;
5578 cd->start_pattern = (const uschar *)pattern;
5579 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5580 cd->req_varyopt = 0;
5581 cd->nopartial = FALSE;
5582 cd->external_options = options;
5583
5584 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5585 don't need to look at the result of the function here. The initial options have
5586 been put into the cd block so that they can be changed if an option setting is
5587 found within the regex right at the beginning. Bringing initial option settings
5588 outside can help speed up starting point checks. */
5589
5590 code = cworkspace;
5591 *code = OP_BRA;
5592 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5593 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5594 &length);
5595 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5596
5597 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5598 cd->hwm - cworkspace));
5599
5600 if (length > MAX_PATTERN_SIZE)
5601 {
5602 errorcode = ERR20;
5603 goto PCRE_EARLY_ERROR_RETURN;
5604 }
5605
5606 /* Compute the size of data block needed and get it, either from malloc or
5607 externally provided function. Integer overflow should no longer be possible
5608 because nowadays we limit the maximum value of cd->names_found and
5609 cd->name_entry_size. */
5610
5611 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5612 re = (real_pcre *)(pcre_malloc)(size);
5613
5614 if (re == NULL)
5615 {
5616 errorcode = ERR21;
5617 goto PCRE_EARLY_ERROR_RETURN;
5618 }
5619
5620 /* Put in the magic number, and save the sizes, initial options, and character
5621 table pointer. NULL is used for the default character tables. The nullpad field
5622 is at the end; it's there to help in the case when a regex compiled on a system
5623 with 4-byte pointers is run on another with 8-byte pointers. */
5624
5625 re->magic_number = MAGIC_NUMBER;
5626 re->size = size;
5627 re->options = cd->external_options;
5628 re->dummy1 = 0;
5629 re->first_byte = 0;
5630 re->req_byte = 0;
5631 re->name_table_offset = sizeof(real_pcre);
5632 re->name_entry_size = cd->name_entry_size;
5633 re->name_count = cd->names_found;
5634 re->ref_count = 0;
5635 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5636 re->nullpad = NULL;
5637
5638 /* The starting points of the name/number translation table and of the code are
5639 passed around in the compile data block. The start/end pattern and initial
5640 options are already set from the pre-compile phase, as is the name_entry_size
5641 field. Reset the bracket count and the names_found field. Also reset the hwm
5642 field; this time it's used for remembering forward references to subpatterns.
5643 */
5644
5645 cd->bracount = 0;
5646 cd->names_found = 0;
5647 cd->name_table = (uschar *)re + re->name_table_offset;
5648 codestart = cd->name_table + re->name_entry_size * re->name_count;
5649 cd->start_code = codestart;
5650 cd->hwm = cworkspace;
5651 cd->req_varyopt = 0;
5652 cd->nopartial = FALSE;
5653
5654 /* Set up a starting, non-extracting bracket, then compile the expression. On
5655 error, errorcode will be set non-zero, so we don't need to look at the result
5656 of the function here. */
5657
5658 ptr = (const uschar *)pattern;
5659 code = (uschar *)codestart;
5660 *code = OP_BRA;
5661 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5662 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5663 re->top_bracket = cd->bracount;
5664 re->top_backref = cd->top_backref;
5665
5666 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5667
5668 /* If not reached end of pattern on success, there's an excess bracket. */
5669
5670 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5671
5672 /* Fill in the terminating state and check for disastrous overflow, but
5673 if debugging, leave the test till after things are printed out. */
5674
5675 *code++ = OP_END;
5676
5677 #ifndef DEBUG
5678 if (code - codestart > length) errorcode = ERR23;
5679 #endif
5680
5681 /* Fill in any forward references that are required. */
5682
5683 while (errorcode == 0 && cd->hwm > cworkspace)
5684 {
5685 int offset, recno;
5686 const uschar *groupptr;
5687 cd->hwm -= LINK_SIZE;
5688 offset = GET(cd->hwm, 0);
5689 recno = GET(codestart, offset);
5690 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5691 if (groupptr == NULL) errorcode = ERR53;
5692 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5693 }
5694
5695 /* Give an error if there's back reference to a non-existent capturing
5696 subpattern. */
5697
5698 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5699
5700 /* Failed to compile, or error while post-processing */
5701
5702 if (errorcode != 0)
5703 {
5704 (pcre_free)(re);
5705 PCRE_EARLY_ERROR_RETURN:
5706 *erroroffset = ptr - (const uschar *)pattern;
5707 PCRE_EARLY_ERROR_RETURN2:
5708 *errorptr = error_texts[errorcode];
5709 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5710 return NULL;
5711 }
5712
5713 /* If the anchored option was not passed, set the flag if we can determine that
5714 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5715 as starting with .* when DOTALL is set).
5716
5717 Otherwise, if we know what the first byte has to be, save it, because that
5718 speeds up unanchored matches no end. If not, see if we can set the
5719 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5720 start with ^. and also when all branches start with .* for non-DOTALL matches.
5721 */
5722
5723 if ((re->options & PCRE_ANCHORED) == 0)
5724 {
5725 int temp_options = re->options; /* May get changed during these scans */
5726 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5727 re->options |= PCRE_ANCHORED;
5728 else
5729 {
5730 if (firstbyte < 0)
5731 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5732 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5733 {
5734 int ch = firstbyte & 255;
5735 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5736 cd->fcc[ch] == ch)? ch : firstbyte;
5737 re->options |= PCRE_FIRSTSET;
5738 }
5739 else if (is_startline(codestart, 0, cd->backref_map))
5740 re->options |= PCRE_STARTLINE;
5741 }
5742 }
5743
5744 /* For an anchored pattern, we use the "required byte" only if it follows a
5745 variable length item in the regex. Remove the caseless flag for non-caseable
5746 bytes. */
5747
5748 if (reqbyte >= 0 &&
5749 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5750 {
5751 int ch = reqbyte & 255;
5752 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5753 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5754 re->options |= PCRE_REQCHSET;
5755 }
5756
5757 /* Print out the compiled data if debugging is enabled. This is never the
5758 case when building a production library. */
5759
5760 #ifdef DEBUG
5761
5762 printf("Length = %d top_bracket = %d top_backref = %d\n",
5763 length, re->top_bracket, re->top_backref);
5764
5765 if (re->options != 0)
5766 {
5767 printf("%s%s%s%s%s%s%s%s%s\n",
5768 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5769 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5770 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5771 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5772 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5773 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5774 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5775 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5776 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5777 }
5778
5779 if ((re->options & PCRE_FIRSTSET) != 0)
5780 {
5781 int ch = re->first_byte & 255;
5782 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5783 "" : " (caseless)";
5784 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5785 else printf("First char = \\x%02x%s\n", ch, caseless);
5786 }
5787
5788 if ((re->options & PCRE_REQCHSET) != 0)
5789 {
5790 int ch = re->req_byte & 255;
5791 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5792 "" : " (caseless)";
5793 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5794 else printf("Req char = \\x%02x%s\n", ch, caseless);
5795 }
5796
5797 pcre_printint(re, stdout, TRUE);
5798
5799 /* This check is done here in the debugging case so that the code that
5800 was compiled can be seen. */
5801
5802 if (code - codestart > length)
5803 {
5804 (pcre_free)(re);
5805 *errorptr = error_texts[ERR23];
5806 *erroroffset = ptr - (uschar *)pattern;
5807 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5808 return NULL;
5809 }
5810 #endif /* DEBUG */
5811
5812 return (pcre *)re;
5813 }
5814
5815 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12