/[pcre]/code/tags/pcre-7.8/pcre_compile.c
ViewVC logotype

Contents of /code/tags/pcre-7.8/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 176 - (show annotations) (download)
Mon Jun 11 13:48:37 2007 UTC (6 years, 10 months ago) by ph10
Original Path: code/trunk/pcre_compile.c
File MIME type: text/plain
File size: 176819 byte(s)
Inserted some (unsigned int) casts to kill compiler warnings.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #define NLBLOCK cd /* Block containing newline information */
46 #define PSSTART start_pattern /* Field containing processed string start */
47 #define PSEND end_pattern /* Field containing processed string end */
48
49
50 #include "pcre_internal.h"
51
52
53 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54 used by pcretest. DEBUG is not defined when building a production library. */
55
56 #ifdef DEBUG
57 #include "pcre_printint.src"
58 #endif
59
60
61 /*************************************************
62 * Code parameters and static tables *
63 *************************************************/
64
65 /* This value specifies the size of stack workspace that is used during the
66 first pre-compile phase that determines how much memory is required. The regex
67 is partly compiled into this space, but the compiled parts are discarded as
68 soon as they can be, so that hopefully there will never be an overrun. The code
69 does, however, check for an overrun. The largest amount I've seen used is 218,
70 so this number is very generous.
71
72 The same workspace is used during the second, actual compile phase for
73 remembering forward references to groups so that they can be filled in at the
74 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75 is 4 there is plenty of room. */
76
77 #define COMPILE_WORK_SIZE (4096)
78
79
80 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81 are simple data values; negative values are for special things like \d and so
82 on. Zero means further processing is needed (for things like \x), or the escape
83 is invalid. */
84
85 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
86 static const short int escapes[] = {
87 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90 0, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
91 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96 0, 0, -ESC_z /* x - z */
97 };
98
99 #else /* This is the "abnormal" table for EBCDIC systems */
100 static const short int escapes[] = {
101 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
124 };
125 #endif
126
127
128 /* Tables of names of POSIX character classes and their lengths. The list is
129 terminated by a zero length entry. The first three must be alpha, lower, upper,
130 as this is assumed for handling case independence. */
131
static const char *const posix_names[] = {
  /* These three must stay first, in this order. */
  "alpha",
  "lower",
  "upper",
  /* The remaining classes. */
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
136
137 static const uschar posix_name_lengths[] = {
138 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139
140 /* Table of class bit maps for each POSIX class. Each class is formed from a
141 base map, with an optional addition or removal of another map. Then, for some
142 classes, there is some additional tweaking: for [:blank:] the vertical space
143 characters are removed, and for [:alpha:] and [:alnum:] the underscore
144 character is removed. The triples in the table consist of the base map offset,
145 second map offset or -1 if no second map, and a non-negative value for map
146 addition or a negative value for map subtraction (if there are two maps). The
147 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148 remove vertical space characters, 2 => remove underscore. */
149
150 static const int posix_class_maps[] = {
151 cbit_word, cbit_digit, -2, /* alpha */
152 cbit_lower, -1, 0, /* lower */
153 cbit_upper, -1, 0, /* upper */
154 cbit_word, -1, 2, /* alnum - word without underscore */
155 cbit_print, cbit_cntrl, 0, /* ascii */
156 cbit_space, -1, 1, /* blank - a GNU extension */
157 cbit_cntrl, -1, 0, /* cntrl */
158 cbit_digit, -1, 0, /* digit */
159 cbit_graph, -1, 0, /* graph */
160 cbit_print, -1, 0, /* print */
161 cbit_punct, -1, 0, /* punct */
162 cbit_space, -1, 0, /* space */
163 cbit_word, -1, 0, /* word - a Perl extension */
164 cbit_xdigit,-1, 0 /* xdigit */
165 };
166
167
/* Two-level stringification so that a macro argument is expanded before it is
turned into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used.

The array itself is const as well as the strings it points to: nothing in the
library has any business changing which text belongs to which error number,
and this matches the declaration of posix_names. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
};
248
249
250 /* Table to identify digits and hex digits. This is used when compiling
251 patterns. Note that the tables in chartables are dependent on the locale, and
252 may mark arbitrary characters as digits - but the PCRE compiling code expects
253 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
254 a private table here. It costs 256 bytes, but it is a lot faster than doing
255 character value tests (at least in some simple cases I timed), and in some
256 applications one wants PCRE to compile efficiently as well as match
257 efficiently.
258
259 For convenience, we use the same bit definitions as in chartables:
260
261 0x04 decimal digit
262 0x08 hexadecimal digit
263
264 Then we can use ctype_digit and ctype_xdigit in the code. */
265
266 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* digitab is indexed by character value. In the ASCII table 0x0c marks '0'-'9'
(decimal and hexadecimal digit) and 0x08 marks 'A'-'F' and 'a'-'f' (hexadecimal
digit only); every other entry is zero. */
267 static const unsigned char digitab[] =
268 {
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
275 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
276 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
277 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
281 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
301
302 #else /* This is the "abnormal" case, for EBCDIC systems */
/* Same bit meanings as the ASCII table, laid out for EBCDIC code points: the
0x08 entries sit at 0x81-0x86 and 0xC1-0xC6, the 0x0c entries at 0xF0-0xF9. */
303 static const unsigned char digitab[] =
304 {
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
315 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
316 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
317 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
321 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
329 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
335 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
336 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
337
/* ebcdic_chartab exists only in EBCDIC builds. check_escape() indexes it by
character value and treats the character as alphameric when any of the 0x0E
bits is set in its entry. */
338 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
339 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
340 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
341 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
343 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
347 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
348 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
350 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
352 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
355 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
356 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
357 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
358 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
359 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
360 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
361 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
362 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
363 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
364 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
365 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
366 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
367 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
368 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
369 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
370 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
371 #endif
372
373
374 /* Definition to allow mutual recursion */
375
376 static BOOL
377 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
378 int *, int *, branch_chain *, compile_data *, int *);
379
380
381
382 /*************************************************
383 * Handle escapes *
384 *************************************************/
385
386 /* This function is called when a \ has been encountered. It either returns a
387 positive value for a simple escape such as \n, or a negative value which
388 encodes one of the more complicated things such as \d. A backreference to group
389 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391 ptr is pointing at the \. On exit, it is on the final character of the escape
392 sequence.
393
394 Arguments:
395 ptrptr points to the pattern position pointer
396 errorcodeptr points to the errorcode variable
397 bracount number of previous extracting brackets
398 options the options bits
399 isclass TRUE if inside a character class
400
401 Returns: zero or positive => a data character
402 negative => a special escape sequence
403 on error, errorptr is set
404 */
405
406 static int
407 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
408 int options, BOOL isclass)
409 {
410 BOOL utf8 = (options & PCRE_UTF8) != 0;
411 const uschar *ptr = *ptrptr + 1;
412 int c, i;
413
414 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
415 ptr--; /* Set pointer back to the last byte */
416
417 /* If backslash is at the end of the pattern, it's an error. */
418
419 if (c == 0) *errorcodeptr = ERR1;
420
421 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
422 a table. A non-zero result is something that can be returned immediately.
423 Otherwise further processing may be required. */
424
425 #ifndef EBCDIC /* ASCII coding */
426 else if (c < '0' || c > 'z') {} /* Not alphameric */
427 else if ((i = escapes[c - '0']) != 0) c = i;
428
429 #else /* EBCDIC coding */
430 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
431 else if ((i = escapes[c - 0x48]) != 0) c = i;
432 #endif
433
434 /* Escapes that need further processing, or are illegal. */
435
436 else
437 {
438 const uschar *oldptr;
439 BOOL braced, negated;
440
441 switch (c)
442 {
443 /* A number of Perl escapes are not handled by PCRE. We give an explicit
444 error. */
445
446 case 'l':
447 case 'L':
448 case 'N':
449 case 'u':
450 case 'U':
451 *errorcodeptr = ERR37;
452 break;
453
454 /* \g must be followed by a number, either plain or braced. If positive, it
455 is an absolute backreference. If negative, it is a relative backreference.
456 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457 reference to a named group. This is part of Perl's movement towards a
458 unified syntax for back references. As this is synonymous with \k{name}, we
459 fudge it up by pretending it really was \k. */
460
461 case 'g':
462 if (ptr[1] == '{')
463 {
464 const uschar *p;
465 for (p = ptr+2; *p != 0 && *p != '}'; p++)
466 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467 if (*p != 0 && *p != '}')
468 {
469 c = -ESC_k;
470 break;
471 }
472 braced = TRUE;
473 ptr++;
474 }
475 else braced = FALSE;
476
477 if (ptr[1] == '-')
478 {
479 negated = TRUE;
480 ptr++;
481 }
482 else negated = FALSE;
483
484 c = 0;
485 while ((digitab[ptr[1]] & ctype_digit) != 0)
486 c = c * 10 + *(++ptr) - '0';
487
488 if (c == 0 || (braced && *(++ptr) != '}'))
489 {
490 *errorcodeptr = ERR57;
491 return 0;
492 }
493
494 if (negated)
495 {
496 if (c > bracount)
497 {
498 *errorcodeptr = ERR15;
499 return 0;
500 }
501 c = bracount - (c - 1);
502 }
503
504 c = -(ESC_REF + c);
505 break;
506
507 /* The handling of escape sequences consisting of a string of digits
508 starting with one that is not zero is not straightforward. By experiment,
509 the way Perl works seems to be as follows:
510
511 Outside a character class, the digits are read as a decimal number. If the
512 number is less than 10, or if there are that many previous extracting
513 left brackets, then it is a back reference. Otherwise, up to three octal
514 digits are read to form an escaped byte. Thus \123 is likely to be octal
515 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
516 value is greater than 377, the least significant 8 bits are taken. Inside a
517 character class, \ followed by a digit is always an octal number. */
518
519 case '1': case '2': case '3': case '4': case '5':
520 case '6': case '7': case '8': case '9':
521
522 if (!isclass)
523 {
524 oldptr = ptr;
525 c -= '0';
526 while ((digitab[ptr[1]] & ctype_digit) != 0)
527 c = c * 10 + *(++ptr) - '0';
528 if (c < 10 || c <= bracount)
529 {
530 c = -(ESC_REF + c);
531 break;
532 }
533 ptr = oldptr; /* Put the pointer back and fall through */
534 }
535
536 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
537 generates a binary zero byte and treats the digit as a following literal.
538 Thus we have to pull back the pointer by one. */
539
540 if ((c = *ptr) >= '8')
541 {
542 ptr--;
543 c = 0;
544 break;
545 }
546
547 /* \0 always starts an octal number, but we may drop through to here with a
548 larger first octal digit. The original code used just to take the least
549 significant 8 bits of octal numbers (I think this is what early Perls used
550 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
551 than 3 octal digits. */
552
553 case '0':
554 c -= '0';
555 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
556 c = c * 8 + *(++ptr) - '0';
557 if (!utf8 && c > 255) *errorcodeptr = ERR51;
558 break;
559
560 /* \x is complicated. \x{ddd} is a character number which can be greater
561 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
562 treated as a data character. */
563
564 case 'x':
565 if (ptr[1] == '{')
566 {
567 const uschar *pt = ptr + 2;
568 int count = 0;
569
570 c = 0;
571 while ((digitab[*pt] & ctype_xdigit) != 0)
572 {
573 register int cc = *pt++;
574 if (c == 0 && cc == '0') continue; /* Leading zeroes */
575 count++;
576
577 #ifndef EBCDIC /* ASCII coding */
578 if (cc >= 'a') cc -= 32; /* Convert to upper case */
579 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
580 #else /* EBCDIC coding */
581 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
582 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
583 #endif
584 }
585
586 if (*pt == '}')
587 {
588 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
589 ptr = pt;
590 break;
591 }
592
593 /* If the sequence of hex digits does not end with '}', then we don't
594 recognize this construct; fall through to the normal \x handling. */
595 }
596
597 /* Read just a single-byte hex-defined char */
598
599 c = 0;
600 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
601 {
602 int cc; /* Some compilers don't like ++ */
603 cc = *(++ptr); /* in initializers */
604 #ifndef EBCDIC /* ASCII coding */
605 if (cc >= 'a') cc -= 32; /* Convert to upper case */
606 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
607 #else /* EBCDIC coding */
608 if (cc <= 'z') cc += 64; /* Convert to upper case */
609 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
610 #endif
611 }
612 break;
613
614 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
615 This coding is ASCII-specific, but then the whole concept of \cx is
616 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
617
618 case 'c':
619 c = *(++ptr);
620 if (c == 0)
621 {
622 *errorcodeptr = ERR2;
623 return 0;
624 }
625
626 #ifndef EBCDIC /* ASCII coding */
627 if (c >= 'a' && c <= 'z') c -= 32;
628 c ^= 0x40;
629 #else /* EBCDIC coding */
630 if (c >= 'a' && c <= 'z') c += 64;
631 c ^= 0xC0;
632 #endif
633 break;
634
635 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
636 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
637 for Perl compatibility, it is a literal. This code looks a bit odd, but
638 there used to be some cases other than the default, and there may be again
639 in future, so I haven't "optimized" it. */
640
641 default:
642 if ((options & PCRE_EXTRA) != 0) switch(c)
643 {
644 default:
645 *errorcodeptr = ERR3;
646 break;
647 }
648 break;
649 }
650 }
651
652 *ptrptr = ptr;
653 return c;
654 }
655
656
657
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, *ptrptr is
pointing at the P or p. On exit, it is pointing at the final character that was
examined: the last character of the escape sequence on success, or the point
at which the sequence was found to be bad.

The property name is either a single character, or a name in braces that may
be preceded by ^ for negation. Names are collected into a 32-byte buffer, so a
braced name whose closing } has not been seen within 31 characters is treated
as malformed.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is always written, including on the error paths
  dptr           points to an int that is set to the detailed property value
                   (written only when the name is found)
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching _pcre_utt entry, or
                 -1 with *errorcodeptr set to ERR46 (malformed sequence) or
                   ERR47 (unknown property name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Give the caller a defined value even if we fail straight away. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }
  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c !='}') goto ERROR_RETURN;     /* Buffer filled before } was seen */
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

/* The sequence is well formed, so from here on the position reported to the
caller is its final character, whether or not the name turns out to exist. */

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
747
748
749
750
751 /*************************************************
752 * Check for counted repeat *
753 *************************************************/
754
755 /* This function is called when a '{' is encountered in a place where it might
756 start a quantifier. It looks ahead to see if it really is a quantifier or not.
757 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
758 where the ddds are digits.
759
760 Arguments:
761 p pointer to the first char after '{'
762
763 Returns: TRUE or FALSE
764 */
765
766 static BOOL
767 is_counted_repeat(const uschar *p)
768 {
769 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
770 while ((digitab[*p] & ctype_digit) != 0) p++;
771 if (*p == '}') return TRUE;
772
773 if (*p++ != ',') return FALSE;
774 if (*p == '}') return TRUE;
775
776 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
777 while ((digitab[*p] & ctype_digit) != 0) p++;
778
779 return (*p == '}');
780 }
781
782
783
784 /*************************************************
785 * Read repeat counts *
786 *************************************************/
787
788 /* Read an item of the form {n,m} and return the values. This is called only
789 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
790 so the syntax is guaranteed to be correct, but we need to check the values.
791
792 Arguments:
793 p pointer to first char after '{'
794 minp pointer to int for min
795 maxp pointer to int for max
796 returned as -1 if no max
797 errorcodeptr points to error code variable
798
799 Returns: pointer to '}' on success;
800 current ptr on error, with errorcodeptr set non-zero
801 */
802
803 static const uschar *
804 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
805 {
806 int min = 0;
807 int max = -1;
808
809 /* Read the minimum value and do a paranoid check: a negative value indicates
810 an integer overflow. */
811
812 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
813 if (min < 0 || min > 65535)
814 {
815 *errorcodeptr = ERR5;
816 return p;
817 }
818
819 /* Read the maximum value if there is one, and again do a paranoid on its size.
820 Also, max must not be less than min. */
821
822 if (*p == '}') max = min; else
823 {
824 if (*(++p) != '}')
825 {
826 max = 0;
827 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
828 if (max < 0 || max > 65535)
829 {
830 *errorcodeptr = ERR5;
831 return p;
832 }
833 if (max < min)
834 {
835 *errorcodeptr = ERR4;
836 return p;
837 }
838 }
839 }
840
841 /* Fill in the required variables, and pass back the pointer to the terminating
842 '}'. */
843
844 *minp = min;
845 *maxp = max;
846 return p;
847 }
848
849
850
851 /*************************************************
852 * Find forward referenced subpattern *
853 *************************************************/
854
855 /* This function scans along a pattern's text looking for capturing
856 subpatterns, and counting them. If it finds a named pattern that matches the
857 name it is given, it returns its number. Alternatively, if the name is NULL, it
858 returns when it reaches a given numbered subpattern. This is used for forward
859 references to subpatterns. We know that if (?P< is encountered, the name will
860 be terminated by '>' because that is checked in the first pass.
861
862 Arguments:
863 ptr current position in the pattern
864 count current count of capturing parens so far encountered
865 name name to seek, or NULL if seeking a numbered subpattern
866 lorn name length, or subpattern number if name is NULL
867 xmode TRUE if we are in /x mode
868
869 Returns: the number of the named subpattern, or -1 if not found
870 */
871
872 static int
873 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
874 BOOL xmode)
875 {
876 const uschar *thisname;
877
878 for (; *ptr != 0; ptr++)
879 {
880 int term;
881
882 /* Skip over backslashed characters and also entire \Q...\E */
883
884 if (*ptr == '\\')
885 {
886 if (*(++ptr) == 0) return -1;
887 if (*ptr == 'Q') for (;;)
888 {
889 while (*(++ptr) != 0 && *ptr != '\\');
890 if (*ptr == 0) return -1;
891 if (*(++ptr) == 'E') break;
892 }
893 continue;
894 }
895
896 /* Skip over character classes */
897
898 if (*ptr == '[')
899 {
900 while (*(++ptr) != ']')
901 {
902 if (*ptr == '\\')
903 {
904 if (*(++ptr) == 0) return -1;
905 if (*ptr == 'Q') for (;;)
906 {
907 while (*(++ptr) != 0 && *ptr != '\\');
908 if (*ptr == 0) return -1;
909 if (*(++ptr) == 'E') break;
910 }
911 continue;
912 }
913 }
914 continue;
915 }
916
917 /* Skip comments in /x mode */
918
919 if (xmode && *ptr == '#')
920 {
921 while (*(++ptr) != 0 && *ptr != '\n');
922 if (*ptr == 0) return -1;
923 continue;
924 }
925
926 /* An opening parens must now be a real metacharacter */
927
928 if (*ptr != '(') continue;
929 if (ptr[1] != '?')
930 {
931 count++;
932 if (name == NULL && count == lorn) return count;
933 continue;
934 }
935
936 ptr += 2;
937 if (*ptr == 'P') ptr++; /* Allow optional P */
938
939 /* We have to disambiguate (?<! and (?<= from (?<name> */
940
941 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
942 *ptr != '\'')
943 continue;
944
945 count++;
946
947 if (name == NULL && count == lorn) return count;
948 term = *ptr++;
949 if (term == '<') term = '>';
950 thisname = ptr;
951 while (*ptr != term) ptr++;
952 if (name != NULL && lorn == ptr - thisname &&
953 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
954 return count;
955 }
956
957 return -1;
958 }
959
960
961
962 /*************************************************
963 * Find first significant op code *
964 *************************************************/
965
966 /* This is called by several functions that scan a compiled expression looking
967 for a fixed first character, or an anchoring op code etc. It skips over things
968 that do not influence this. For some calls, a change of option is important.
969 For some calls, it makes sense to skip negative forward and all backward
970 assertions, and also the \b assertion; for others it does not.
971
972 Arguments:
973 code pointer to the start of the group
974 options pointer to external options
975 optbit the option bit whose changing is significant, or
976 zero if none are
977 skipassert TRUE if certain assertions are to be skipped
978
979 Returns: pointer to the first significant opcode
980 */
981
982 static const uschar*
983 first_significant_code(const uschar *code, int *options, int optbit,
984 BOOL skipassert)
985 {
986 for (;;)
987 {
988 switch ((int)*code)
989 {
990 case OP_OPT:
991 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
992 *options = (int)code[1];
993 code += 2;
994 break;
995
996 case OP_ASSERT_NOT:
997 case OP_ASSERTBACK:
998 case OP_ASSERTBACK_NOT:
999 if (!skipassert) return code;
1000 do code += GET(code, 1); while (*code == OP_ALT);
1001 code += _pcre_OP_lengths[*code];
1002 break;
1003
1004 case OP_WORD_BOUNDARY:
1005 case OP_NOT_WORD_BOUNDARY:
1006 if (!skipassert) return code;
1007 /* Fall through */
1008
1009 case OP_CALLOUT:
1010 case OP_CREF:
1011 case OP_RREF:
1012 case OP_DEF:
1013 code += _pcre_OP_lengths[*code];
1014 break;
1015
1016 default:
1017 return code;
1018 }
1019 }
1020 /* Control never reaches here */
1021 }
1022
1023
1024
1025
1026 /*************************************************
1027 * Find the fixed length of a pattern *
1028 *************************************************/
1029
1030 /* Scan a pattern and compute the fixed length of subject that will match it,
1031 if the length is fixed. This is needed for dealing with backward assertions.
1032 In UTF8 mode, the result is in characters rather than bytes.
1033
1034 Arguments:
1035 code points to the start of the pattern (the bracket)
1036 options the compiling options
1037
1038 Returns: the fixed length, or -1 if there is no fixed length,
1039 or -2 if \C was encountered
1040 */
1041
1042 static int
1043 find_fixedlength(uschar *code, int options)
1044 {
1045 int length = -1;
1046
1047 register int branchlength = 0;
1048 register uschar *cc = code + 1 + LINK_SIZE;
1049
1050 /* Scan along the opcodes for this branch. If we get to the end of the
1051 branch, check the length against that of the other branches. */
1052
1053 for (;;)
1054 {
1055 int d;
1056 register int op = *cc;
1057
1058 switch (op)
1059 {
1060 case OP_CBRA:
1061 case OP_BRA:
1062 case OP_ONCE:
1063 case OP_COND:
1064 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1065 if (d < 0) return d;
1066 branchlength += d;
1067 do cc += GET(cc, 1); while (*cc == OP_ALT);
1068 cc += 1 + LINK_SIZE;
1069 break;
1070
1071 /* Reached end of a branch; if it's a ket it is the end of a nested
1072 call. If it's ALT it is an alternation in a nested call. If it is
1073 END it's the end of the outer call. All can be handled by the same code. */
1074
1075 case OP_ALT:
1076 case OP_KET:
1077 case OP_KETRMAX:
1078 case OP_KETRMIN:
1079 case OP_END:
1080 if (length < 0) length = branchlength;
1081 else if (length != branchlength) return -1;
1082 if (*cc != OP_ALT) return length;
1083 cc += 1 + LINK_SIZE;
1084 branchlength = 0;
1085 break;
1086
1087 /* Skip over assertive subpatterns */
1088
1089 case OP_ASSERT:
1090 case OP_ASSERT_NOT:
1091 case OP_ASSERTBACK:
1092 case OP_ASSERTBACK_NOT:
1093 do cc += GET(cc, 1); while (*cc == OP_ALT);
1094 /* Fall through */
1095
1096 /* Skip over things that don't match chars */
1097
1098 case OP_REVERSE:
1099 case OP_CREF:
1100 case OP_RREF:
1101 case OP_DEF:
1102 case OP_OPT:
1103 case OP_CALLOUT:
1104 case OP_SOD:
1105 case OP_SOM:
1106 case OP_EOD:
1107 case OP_EODN:
1108 case OP_CIRC:
1109 case OP_DOLL:
1110 case OP_NOT_WORD_BOUNDARY:
1111 case OP_WORD_BOUNDARY:
1112 cc += _pcre_OP_lengths[*cc];
1113 break;
1114
1115 /* Handle literal characters */
1116
1117 case OP_CHAR:
1118 case OP_CHARNC:
1119 case OP_NOT:
1120 branchlength++;
1121 cc += 2;
1122 #ifdef SUPPORT_UTF8
1123 if ((options & PCRE_UTF8) != 0)
1124 {
1125 while ((*cc & 0xc0) == 0x80) cc++;
1126 }
1127 #endif
1128 break;
1129
1130 /* Handle exact repetitions. The count is already in characters, but we
1131 need to skip over a multibyte character in UTF8 mode. */
1132
1133 case OP_EXACT:
1134 branchlength += GET2(cc,1);
1135 cc += 4;
1136 #ifdef SUPPORT_UTF8
1137 if ((options & PCRE_UTF8) != 0)
1138 {
1139 while((*cc & 0x80) == 0x80) cc++;
1140 }
1141 #endif
1142 break;
1143
1144 case OP_TYPEEXACT:
1145 branchlength += GET2(cc,1);
1146 cc += 4;
1147 break;
1148
1149 /* Handle single-char matchers */
1150
1151 case OP_PROP:
1152 case OP_NOTPROP:
1153 cc += 2;
1154 /* Fall through */
1155
1156 case OP_NOT_DIGIT:
1157 case OP_DIGIT:
1158 case OP_NOT_WHITESPACE:
1159 case OP_WHITESPACE:
1160 case OP_NOT_WORDCHAR:
1161 case OP_WORDCHAR:
1162 case OP_ANY:
1163 branchlength++;
1164 cc++;
1165 break;
1166
1167 /* The single-byte matcher isn't allowed */
1168
1169 case OP_ANYBYTE:
1170 return -2;
1171
1172 /* Check a class for variable quantification */
1173
1174 #ifdef SUPPORT_UTF8
1175 case OP_XCLASS:
1176 cc += GET(cc, 1) - 33;
1177 /* Fall through */
1178 #endif
1179
1180 case OP_CLASS:
1181 case OP_NCLASS:
1182 cc += 33;
1183
1184 switch (*cc)
1185 {
1186 case OP_CRSTAR:
1187 case OP_CRMINSTAR:
1188 case OP_CRQUERY:
1189 case OP_CRMINQUERY:
1190 return -1;
1191
1192 case OP_CRRANGE:
1193 case OP_CRMINRANGE:
1194 if (GET2(cc,1) != GET2(cc,3)) return -1;
1195 branchlength += GET2(cc,1);
1196 cc += 5;
1197 break;
1198
1199 default:
1200 branchlength++;
1201 }
1202 break;
1203
1204 /* Anything else is variable length */
1205
1206 default:
1207 return -1;
1208 }
1209 }
1210 /* Control never gets here */
1211 }
1212
1213
1214
1215
1216 /*************************************************
1217 * Scan compiled regex for numbered bracket *
1218 *************************************************/
1219
1220 /* This little function scans through a compiled pattern until it finds a
1221 capturing bracket with the given number.
1222
1223 Arguments:
1224 code points to start of expression
1225 utf8 TRUE in UTF-8 mode
1226 number the required bracket number
1227
1228 Returns: pointer to the opcode for the bracket, or NULL if not found
1229 */
1230
1231 static const uschar *
1232 find_bracket(const uschar *code, BOOL utf8, int number)
1233 {
1234 for (;;)
1235 {
1236 register int c = *code;
1237 if (c == OP_END) return NULL;
1238
1239 /* XCLASS is used for classes that cannot be represented just by a bit
1240 map. This includes negated single high-valued characters. The length in
1241 the table is zero; the actual length is stored in the compiled code. */
1242
1243 if (c == OP_XCLASS) code += GET(code, 1);
1244
1245 /* Handle capturing bracket */
1246
1247 else if (c == OP_CBRA)
1248 {
1249 int n = GET2(code, 1+LINK_SIZE);
1250 if (n == number) return (uschar *)code;
1251 code += _pcre_OP_lengths[c];
1252 }
1253
1254 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1255 a multi-byte character. The length in the table is a minimum, so we have to
1256 arrange to skip the extra bytes. */
1257
1258 else
1259 {
1260 code += _pcre_OP_lengths[c];
1261 #ifdef SUPPORT_UTF8
1262 if (utf8) switch(c)
1263 {
1264 case OP_CHAR:
1265 case OP_CHARNC:
1266 case OP_EXACT:
1267 case OP_UPTO:
1268 case OP_MINUPTO:
1269 case OP_POSUPTO:
1270 case OP_STAR:
1271 case OP_MINSTAR:
1272 case OP_POSSTAR:
1273 case OP_PLUS:
1274 case OP_MINPLUS:
1275 case OP_POSPLUS:
1276 case OP_QUERY:
1277 case OP_MINQUERY:
1278 case OP_POSQUERY:
1279 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1280 break;
1281 }
1282 #endif
1283 }
1284 }
1285 }
1286
1287
1288
1289 /*************************************************
1290 * Scan compiled regex for recursion reference *
1291 *************************************************/
1292
1293 /* This little function scans through a compiled pattern until it finds an
1294 instance of OP_RECURSE.
1295
1296 Arguments:
1297 code points to start of expression
1298 utf8 TRUE in UTF-8 mode
1299
1300 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1301 */
1302
1303 static const uschar *
1304 find_recurse(const uschar *code, BOOL utf8)
1305 {
1306 for (;;)
1307 {
1308 register int c = *code;
1309 if (c == OP_END) return NULL;
1310 if (c == OP_RECURSE) return code;
1311
1312 /* XCLASS is used for classes that cannot be represented just by a bit
1313 map. This includes negated single high-valued characters. The length in
1314 the table is zero; the actual length is stored in the compiled code. */
1315
1316 if (c == OP_XCLASS) code += GET(code, 1);
1317
1318 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1319 that are followed by a character may be followed by a multi-byte character.
1320 The length in the table is a minimum, so we have to arrange to skip the extra
1321 bytes. */
1322
1323 else
1324 {
1325 code += _pcre_OP_lengths[c];
1326 #ifdef SUPPORT_UTF8
1327 if (utf8) switch(c)
1328 {
1329 case OP_CHAR:
1330 case OP_CHARNC:
1331 case OP_EXACT:
1332 case OP_UPTO:
1333 case OP_MINUPTO:
1334 case OP_POSUPTO:
1335 case OP_STAR:
1336 case OP_MINSTAR:
1337 case OP_POSSTAR:
1338 case OP_PLUS:
1339 case OP_MINPLUS:
1340 case OP_POSPLUS:
1341 case OP_QUERY:
1342 case OP_MINQUERY:
1343 case OP_POSQUERY:
1344 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1345 break;
1346 }
1347 #endif
1348 }
1349 }
1350 }
1351
1352
1353
1354 /*************************************************
1355 * Scan compiled branch for non-emptiness *
1356 *************************************************/
1357
1358 /* This function scans through a branch of a compiled pattern to see whether it
1359 can match the empty string or not. It is called from could_be_empty()
1360 below and from compile_branch() when checking for an unlimited repeat of a
1361 group that can match nothing. Note that first_significant_code() skips over
1362 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1363 struck an inner bracket whose current branch will already have been scanned.
1364
1365 Arguments:
1366 code points to start of search
1367 endcode points to where to stop
1368 utf8 TRUE if in UTF8 mode
1369
1370 Returns: TRUE if what is matched could be empty
1371 */
1372
1373 static BOOL
1374 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1375 {
1376 register int c;
1377 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1378 code < endcode;
1379 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1380 {
1381 const uschar *ccode;
1382
1383 c = *code;
1384
1385 /* Groups with zero repeats can of course be empty; skip them. */
1386
1387 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388 {
1389 code += _pcre_OP_lengths[c];
1390 do code += GET(code, 1); while (*code == OP_ALT);
1391 c = *code;
1392 continue;
1393 }
1394
1395 /* For other groups, scan the branches. */
1396
1397 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1398 {
1399 BOOL empty_branch;
1400 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1401
1402 /* Scan a closed bracket */
1403
1404 empty_branch = FALSE;
1405 do
1406 {
1407 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1408 empty_branch = TRUE;
1409 code += GET(code, 1);
1410 }
1411 while (*code == OP_ALT);
1412 if (!empty_branch) return FALSE; /* All branches are non-empty */
1413 c = *code;
1414 continue;
1415 }
1416
1417 /* Handle the other opcodes */
1418
1419 switch (c)
1420 {
1421 /* Check for quantifiers after a class */
1422
1423 #ifdef SUPPORT_UTF8
1424 case OP_XCLASS:
1425 ccode = code + GET(code, 1);
1426 goto CHECK_CLASS_REPEAT;
1427 #endif
1428
1429 case OP_CLASS:
1430 case OP_NCLASS:
1431 ccode = code + 33;
1432
1433 #ifdef SUPPORT_UTF8
1434 CHECK_CLASS_REPEAT:
1435 #endif
1436
1437 switch (*ccode)
1438 {
1439 case OP_CRSTAR: /* These could be empty; continue */
1440 case OP_CRMINSTAR:
1441 case OP_CRQUERY:
1442 case OP_CRMINQUERY:
1443 break;
1444
1445 default: /* Non-repeat => class must match */
1446 case OP_CRPLUS: /* These repeats aren't empty */
1447 case OP_CRMINPLUS:
1448 return FALSE;
1449
1450 case OP_CRRANGE:
1451 case OP_CRMINRANGE:
1452 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1453 break;
1454 }
1455 break;
1456
1457 /* Opcodes that must match a character */
1458
1459 case OP_PROP:
1460 case OP_NOTPROP:
1461 case OP_EXTUNI:
1462 case OP_NOT_DIGIT:
1463 case OP_DIGIT:
1464 case OP_NOT_WHITESPACE:
1465 case OP_WHITESPACE:
1466 case OP_NOT_WORDCHAR:
1467 case OP_WORDCHAR:
1468 case OP_ANY:
1469 case OP_ANYBYTE:
1470 case OP_CHAR:
1471 case OP_CHARNC:
1472 case OP_NOT:
1473 case OP_PLUS:
1474 case OP_MINPLUS:
1475 case OP_POSPLUS:
1476 case OP_EXACT:
1477 case OP_NOTPLUS:
1478 case OP_NOTMINPLUS:
1479 case OP_NOTPOSPLUS:
1480 case OP_NOTEXACT:
1481 case OP_TYPEPLUS:
1482 case OP_TYPEMINPLUS:
1483 case OP_TYPEPOSPLUS:
1484 case OP_TYPEEXACT:
1485 return FALSE;
1486
1487 /* End of branch */
1488
1489 case OP_KET:
1490 case OP_KETRMAX:
1491 case OP_KETRMIN:
1492 case OP_ALT:
1493 return TRUE;
1494
1495 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1496 MINUPTO, and POSUPTO may be followed by a multibyte character */
1497
1498 #ifdef SUPPORT_UTF8
1499 case OP_STAR:
1500 case OP_MINSTAR:
1501 case OP_POSSTAR:
1502 case OP_QUERY:
1503 case OP_MINQUERY:
1504 case OP_POSQUERY:
1505 case OP_UPTO:
1506 case OP_MINUPTO:
1507 case OP_POSUPTO:
1508 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1509 break;
1510 #endif
1511 }
1512 }
1513
1514 return TRUE;
1515 }
1516
1517
1518
1519 /*************************************************
1520 * Scan compiled regex for non-emptiness *
1521 *************************************************/
1522
1523 /* This function is called to check for left recursive calls. We want to check
1524 the current branch of the current pattern to see if it could match the empty
1525 string. If it could, we must look outwards for branches at other levels,
1526 stopping when we pass beyond the bracket which is the subject of the recursion.
1527
1528 Arguments:
1529 code points to start of the recursion
1530 endcode points to where to stop (current RECURSE item)
1531 bcptr points to the chain of current (unclosed) branch starts
1532 utf8 TRUE if in UTF-8 mode
1533
1534 Returns: TRUE if what is matched could be empty
1535 */
1536
1537 static BOOL
1538 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1539 BOOL utf8)
1540 {
1541 while (bcptr != NULL && bcptr->current >= code)
1542 {
1543 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1544 bcptr = bcptr->outer;
1545 }
1546 return TRUE;
1547 }
1548
1549
1550
1551 /*************************************************
1552 * Check for POSIX class syntax *
1553 *************************************************/
1554
1555 /* This function is called when the sequence "[:" or "[." or "[=" is
1556 encountered in a character class. It checks whether this is followed by an
1557 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1558 ".]" or "=]".
1559
1560 Argument:
1561 ptr pointer to the initial [
1562 endptr where to return the end pointer
1563 cd pointer to compile data
1564
1565 Returns: TRUE or FALSE
1566 */
1567
1568 static BOOL
1569 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1570 {
1571 int terminator; /* Don't combine these lines; the Solaris cc */
1572 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1573 if (*(++ptr) == '^') ptr++;
1574 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1575 if (*ptr == terminator && ptr[1] == ']')
1576 {
1577 *endptr = ptr;
1578 return TRUE;
1579 }
1580 return FALSE;
1581 }
1582
1583
1584
1585
1586 /*************************************************
1587 * Check POSIX class name *
1588 *************************************************/
1589
1590 /* This function is called to check the name given in a POSIX-style class entry
1591 such as [:alnum:].
1592
1593 Arguments:
1594 ptr points to the first letter
1595 len the length of the name
1596
1597 Returns: a value representing the name, or -1 if unknown
1598 */
1599
1600 static int
1601 check_posix_name(const uschar *ptr, int len)
1602 {
1603 register int yield = 0;
1604 while (posix_name_lengths[yield] != 0)
1605 {
1606 if (len == posix_name_lengths[yield] &&
1607 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1608 yield++;
1609 }
1610 return -1;
1611 }
1612
1613
1614 /*************************************************
1615 * Adjust OP_RECURSE items in repeated group *
1616 *************************************************/
1617
1618 /* OP_RECURSE items contain an offset from the start of the regex to the group
1619 that is referenced. This means that groups can be replicated for fixed
1620 repetition simply by copying (because the recursion is allowed to refer to
1621 earlier groups that are outside the current group). However, when a group is
1622 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1623 it, after it has been compiled. This means that any OP_RECURSE items within it
1624 that refer to the group itself or any contained groups have to have their
1625 offsets adjusted. That one of the jobs of this function. Before it is called,
1626 the partially compiled regex must be temporarily terminated with OP_END.
1627
1628 This function has been extended with the possibility of forward references for
1629 recursions and subroutine calls. It must also check the list of such references
1630 for the group we are dealing with. If it finds that one of the recursions in
1631 the current group is on this list, it adjusts the offset in the list, not the
1632 value in the reference (which is a group number).
1633
1634 Arguments:
1635 group points to the start of the group
1636 adjust the amount by which the group is to be moved
1637 utf8 TRUE in UTF-8 mode
1638 cd contains pointers to tables etc.
1639 save_hwm the hwm forward reference pointer at the start of the group
1640
1641 Returns: nothing
1642 */
1643
1644 static void
1645 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1646 uschar *save_hwm)
1647 {
1648 uschar *ptr = group;
1649 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1650 {
1651 int offset;
1652 uschar *hc;
1653
1654 /* See if this recursion is on the forward reference list. If so, adjust the
1655 reference. */
1656
1657 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1658 {
1659 offset = GET(hc, 0);
1660 if (cd->start_code + offset == ptr + 1)
1661 {
1662 PUT(hc, 0, offset + adjust);
1663 break;
1664 }
1665 }
1666
1667 /* Otherwise, adjust the recursion offset if it's after the start of this
1668 group. */
1669
1670 if (hc >= cd->hwm)
1671 {
1672 offset = GET(ptr, 1);
1673 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1674 }
1675
1676 ptr += 1 + LINK_SIZE;
1677 }
1678 }
1679
1680
1681
1682 /*************************************************
1683 * Insert an automatic callout point *
1684 *************************************************/
1685
1686 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1687 callout points before each pattern item.
1688
1689 Arguments:
1690 code current code pointer
1691 ptr current pattern pointer
1692 cd pointers to tables etc
1693
1694 Returns: new code pointer
1695 */
1696
1697 static uschar *
1698 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1699 {
1700 *code++ = OP_CALLOUT;
1701 *code++ = 255;
1702 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1703 PUT(code, LINK_SIZE, 0); /* Default length */
1704 return code + 2*LINK_SIZE;
1705 }
1706
1707
1708
1709 /*************************************************
1710 * Complete a callout item *
1711 *************************************************/
1712
1713 /* A callout item contains the length of the next item in the pattern, which
1714 we can't fill in till after we have reached the relevant point. This is used
1715 for both automatic and manual callouts.
1716
1717 Arguments:
1718 previous_callout points to previous callout item
1719 ptr current pattern pointer
1720 cd pointers to tables etc
1721
1722 Returns: nothing
1723 */
1724
1725 static void
1726 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1727 {
1728 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1729 PUT(previous_callout, 2 + LINK_SIZE, length);
1730 }
1731
1732
1733
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It moves up from the starting value to the first character
that has an "other" case, and then onwards for as long as the other-case
values of successive characters are themselves consecutive. That run of
other-case values is handed back, and the starting value is updated so that
the next call carries on from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Find the first character that has another case */

while (ch <= d && (first_other = _pcre_ucp_othercase(ch)) == NOTACHAR) ch++;
if (ch > d) return FALSE;

/* Extend the run while the other-case values remain consecutive */

*ocptr = first_other;
expected = first_other + 1;
ch++;

while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  ch++;
  expected++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1779
1780
1781
1782 /*************************************************
1783 * Check if auto-possessifying is possible *
1784 *************************************************/
1785
1786 /* This function is called for unlimited repeats of certain items, to see
1787 whether the next thing could possibly match the repeated item. If not, it makes
1788 sense to automatically possessify the repeated item.
1789
1790 Arguments:
1791 op_code the repeated op code
1792 this data for this item, depends on the opcode
1793 utf8 TRUE in UTF-8 mode
1794 utf8_char used for utf8 character bytes, NULL if not relevant
1795 ptr next character in pattern
1796 options options bits
1797 cd contains pointers to tables etc.
1798
1799 Returns: TRUE if possessifying is wanted
1800 */
1801
1802 static BOOL
1803 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1804 const uschar *ptr, int options, compile_data *cd)
1805 {
1806 int next;
1807
1808 /* Skip whitespace and comments in extended mode */
1809
1810 if ((options & PCRE_EXTENDED) != 0)
1811 {
1812 for (;;)
1813 {
1814 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1815 if (*ptr == '#')
1816 {
1817 while (*(++ptr) != 0)
1818 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1819 }
1820 else break;
1821 }
1822 }
1823
1824 /* If the next item is one that we can handle, get its value. A non-negative
1825 value is a character, a negative value is an escape value. */
1826
1827 if (*ptr == '\\')
1828 {
1829 int temperrorcode = 0;
1830 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1831 if (temperrorcode != 0) return FALSE;
1832 ptr++; /* Point after the escape sequence */
1833 }
1834
1835 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1836 {
1837 #ifdef SUPPORT_UTF8
1838 if (utf8) { GETCHARINC(next, ptr); } else
1839 #endif
1840 next = *ptr++;
1841 }
1842
1843 else return FALSE;
1844
1845 /* Skip whitespace and comments in extended mode */
1846
1847 if ((options & PCRE_EXTENDED) != 0)
1848 {
1849 for (;;)
1850 {
1851 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1852 if (*ptr == '#')
1853 {
1854 while (*(++ptr) != 0)
1855 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1856 }
1857 else break;
1858 }
1859 }
1860
1861 /* If the next thing is itself optional, we have to give up. */
1862
1863 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1864 return FALSE;
1865
1866 /* Now compare the next item with the previous opcode. If the previous is a
1867 positive single character match, "item" either contains the character or, if
1868 "item" is greater than 127 in utf8 mode, the character's bytes are in
1869 utf8_char. */
1870
1871
1872 /* Handle cases when the next item is a character. */
1873
1874 if (next >= 0) switch(op_code)
1875 {
1876 case OP_CHAR:
1877 #ifdef SUPPORT_UTF8
1878 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1879 #endif
1880 return item != next;
1881
1882 /* For CHARNC (caseless character) we must check the other case. If we have
1883 Unicode property support, we can use it to test the other case of
1884 high-valued characters. */
1885
1886 case OP_CHARNC:
1887 #ifdef SUPPORT_UTF8
1888 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1889 #endif
1890 if (item == next) return FALSE;
1891 #ifdef SUPPORT_UTF8
1892 if (utf8)
1893 {
1894 unsigned int othercase;
1895 if (next < 128) othercase = cd->fcc[next]; else
1896 #ifdef SUPPORT_UCP
1897 othercase = _pcre_ucp_othercase((unsigned int)next);
1898 #else
1899 othercase = NOTACHAR;
1900 #endif
1901 return (unsigned int)item != othercase;
1902 }
1903 else
1904 #endif /* SUPPORT_UTF8 */
1905 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1906
1907 /* For OP_NOT, "item" must be a single-byte character. */
1908
1909 case OP_NOT:
1910 if (next < 0) return FALSE; /* Not a character */
1911 if (item == next) return TRUE;
1912 if ((options & PCRE_CASELESS) == 0) return FALSE;
1913 #ifdef SUPPORT_UTF8
1914 if (utf8)
1915 {
1916 unsigned int othercase;
1917 if (next < 128) othercase = cd->fcc[next]; else
1918 #ifdef SUPPORT_UCP
1919 othercase = _pcre_ucp_othercase(next);
1920 #else
1921 othercase = NOTACHAR;
1922 #endif
1923 return (unsigned int)item == othercase;
1924 }
1925 else
1926 #endif /* SUPPORT_UTF8 */
1927 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1928
1929 case OP_DIGIT:
1930 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1931
1932 case OP_NOT_DIGIT:
1933 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1934
1935 case OP_WHITESPACE:
1936 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1937
1938 case OP_NOT_WHITESPACE:
1939 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1940
1941 case OP_WORDCHAR:
1942 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1943
1944 case OP_NOT_WORDCHAR:
1945 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1946
1947 default:
1948 return FALSE;
1949 }
1950
1951
1952 /* Handle the case when the next item is \d, \s, etc. */
1953
1954 switch(op_code)
1955 {
1956 case OP_CHAR:
1957 case OP_CHARNC:
1958 #ifdef SUPPORT_UTF8
1959 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1960 #endif
1961 switch(-next)
1962 {
1963 case ESC_d:
1964 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1965
1966 case ESC_D:
1967 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1968
1969 case ESC_s:
1970 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1971
1972 case ESC_S:
1973 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1974
1975 case ESC_w:
1976 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1977
1978 case ESC_W:
1979 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1980
1981 default:
1982 return FALSE;
1983 }
1984
1985 case OP_DIGIT:
1986 return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1987
1988 case OP_NOT_DIGIT:
1989 return next == -ESC_d;
1990
1991 case OP_WHITESPACE:
1992 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1993
1994 case OP_NOT_WHITESPACE:
1995 return next == -ESC_s;
1996
1997 case OP_WORDCHAR:
1998 return next == -ESC_W || next == -ESC_s;
1999
2000 case OP_NOT_WORDCHAR:
2001 return next == -ESC_w || next == -ESC_d;
2002
2003 default:
2004 return FALSE;
2005 }
2006
2007 /* Control does not reach here */
2008 }
2009
2010
2011
2012 /*************************************************
2013 * Compile one branch *
2014 *************************************************/
2015
2016 /* Scan the pattern, compiling it into a vector. If the options are
2017 changed during the branch, the pointer is used to change the external options
2018 bits. This function is used during the pre-compile phase when we are trying
2019 to find out the amount of memory needed, as well as during the real compile
2020 phase. The value of lengthptr distinguishes the two phases.
2021
2022 Arguments:
2023 optionsptr pointer to the option bits
2024 codeptr points to the pointer to the current code point
2025 ptrptr points to the current pattern pointer
2026 errorcodeptr points to error code variable
2027 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2028 reqbyteptr set to the last literal character required, else < 0
2029 bcptr points to current branch chain
2030 cd contains pointers to tables etc.
2031 lengthptr NULL during the real compile phase
2032 points to length accumulator during pre-compile phase
2033
2034 Returns: TRUE on success
2035 FALSE, with *errorcodeptr set non-zero on error
2036 */
2037
2038 static BOOL
2039 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2040 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2041 compile_data *cd, int *lengthptr)
2042 {
2043 int repeat_type, op_type;
2044 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2045 int bravalue = 0;
2046 int greedy_default, greedy_non_default;
2047 int firstbyte, reqbyte;
2048 int zeroreqbyte, zerofirstbyte;
2049 int req_caseopt, reqvary, tempreqvary;
2050 int options = *optionsptr;
2051 int after_manual_callout = 0;
2052 int length_prevgroup = 0;
2053 register int c;
2054 register uschar *code = *codeptr;
2055 uschar *last_code = code;
2056 uschar *orig_code = code;
2057 uschar *tempcode;
2058 BOOL inescq = FALSE;
2059 BOOL groupsetfirstbyte = FALSE;
2060 const uschar *ptr = *ptrptr;
2061 const uschar *tempptr;
2062 uschar *previous = NULL;
2063 uschar *previous_callout = NULL;
2064 uschar *save_hwm = NULL;
2065 uschar classbits[32];
2066
2067 #ifdef SUPPORT_UTF8
2068 BOOL class_utf8;
2069 BOOL utf8 = (options & PCRE_UTF8) != 0;
2070 uschar *class_utf8data;
2071 uschar utf8_char[6];
2072 #else
2073 BOOL utf8 = FALSE;
2074 uschar *utf8_char = NULL;
2075 #endif
2076
2077 #ifdef DEBUG
2078 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2079 #endif
2080
2081 /* Set up the default and non-default settings for greediness */
2082
2083 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2084 greedy_non_default = greedy_default ^ 1;
2085
2086 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2087 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2088 matches a non-fixed char first char; reqbyte just remains unset if we never
2089 find one.
2090
2091 When we hit a repeat whose minimum is zero, we may have to adjust these values
2092 to take the zero repeat into account. This is implemented by setting them to
2093 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2094 item types that can be repeated set these backoff variables appropriately. */
2095
2096 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2097
2098 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2099 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2100 value > 255. It is added into the firstbyte or reqbyte variables to record the
2101 case status of the value. This is used only for ASCII characters. */
2102
2103 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2104
2105 /* Switch on next character until the end of the branch */
2106
2107 for (;; ptr++)
2108 {
2109 BOOL negate_class;
2110 BOOL possessive_quantifier;
2111 BOOL is_quantifier;
2112 BOOL is_recurse;
2113 BOOL reset_bracount;
2114 int class_charcount;
2115 int class_lastchar;
2116 int newoptions;
2117 int recno;
2118 int refsign;
2119 int skipbytes;
2120 int subreqbyte;
2121 int subfirstbyte;
2122 int terminator;
2123 int mclength;
2124 uschar mcbuffer[8];
2125
2126 /* Get next byte in the pattern */
2127
2128 c = *ptr;
2129
2130 /* If we are in the pre-compile phase, accumulate the length used for the
2131 previous cycle of this loop. */
2132
2133 if (lengthptr != NULL)
2134 {
2135 #ifdef DEBUG
2136 if (code > cd->hwm) cd->hwm = code; /* High water info */
2137 #endif
2138 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2139 {
2140 *errorcodeptr = ERR52;
2141 goto FAILED;
2142 }
2143
2144 /* There is at least one situation where code goes backwards: this is the
2145 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2146 the class is simply eliminated. However, it is created first, so we have to
2147 allow memory for it. Therefore, don't ever reduce the length at this point.
2148 */
2149
2150 if (code < last_code) code = last_code;
2151 *lengthptr += code - last_code;
2152 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2153
2154 /* If "previous" is set and it is not at the start of the work space, move
2155 it back to there, in order to avoid filling up the work space. Otherwise,
2156 if "previous" is NULL, reset the current code pointer to the start. */
2157
2158 if (previous != NULL)
2159 {
2160 if (previous > orig_code)
2161 {
2162 memmove(orig_code, previous, code - previous);
2163 code -= previous - orig_code;
2164 previous = orig_code;
2165 }
2166 }
2167 else code = orig_code;
2168
2169 /* Remember where this code item starts so we can pick up the length
2170 next time round. */
2171
2172 last_code = code;
2173 }
2174
2175 /* In the real compile phase, just check the workspace used by the forward
2176 reference list. */
2177
2178 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2179 {
2180 *errorcodeptr = ERR52;
2181 goto FAILED;
2182 }
2183
2184 /* If in \Q...\E, check for the end; if not, we have a literal */
2185
2186 if (inescq && c != 0)
2187 {
2188 if (c == '\\' && ptr[1] == 'E')
2189 {
2190 inescq = FALSE;
2191 ptr++;
2192 continue;
2193 }
2194 else
2195 {
2196 if (previous_callout != NULL)
2197 {
2198 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2199 complete_callout(previous_callout, ptr, cd);
2200 previous_callout = NULL;
2201 }
2202 if ((options & PCRE_AUTO_CALLOUT) != 0)
2203 {
2204 previous_callout = code;
2205 code = auto_callout(code, ptr, cd);
2206 }
2207 goto NORMAL_CHAR;
2208 }
2209 }
2210
2211 /* Fill in length of a previous callout, except when the next thing is
2212 a quantifier. */
2213
2214 is_quantifier = c == '*' || c == '+' || c == '?' ||
2215 (c == '{' && is_counted_repeat(ptr+1));
2216
2217 if (!is_quantifier && previous_callout != NULL &&
2218 after_manual_callout-- <= 0)
2219 {
2220 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2221 complete_callout(previous_callout, ptr, cd);
2222 previous_callout = NULL;
2223 }
2224
2225 /* In extended mode, skip white space and comments */
2226
2227 if ((options & PCRE_EXTENDED) != 0)
2228 {
2229 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2230 if (c == '#')
2231 {
2232 while (*(++ptr) != 0)
2233 {
2234 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2235 }
2236 if (*ptr != 0) continue;
2237
2238 /* Else fall through to handle end of string */
2239 c = 0;
2240 }
2241 }
2242
2243 /* No auto callout for quantifiers. */
2244
2245 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2246 {
2247 previous_callout = code;
2248 code = auto_callout(code, ptr, cd);
2249 }
2250
2251 switch(c)
2252 {
2253 /* ===================================================================*/
2254 case 0: /* The branch terminates at string end */
2255 case '|': /* or | or ) */
2256 case ')':
2257 *firstbyteptr = firstbyte;
2258 *reqbyteptr = reqbyte;
2259 *codeptr = code;
2260 *ptrptr = ptr;
2261 if (lengthptr != NULL)
2262 {
2263 *lengthptr += code - last_code; /* To include callout length */
2264 DPRINTF((">> end branch\n"));
2265 }
2266 return TRUE;
2267
2268
2269 /* ===================================================================*/
2270 /* Handle single-character metacharacters. In multiline mode, ^ disables
2271 the setting of any following char as a first character. */
2272
2273 case '^':
2274 if ((options & PCRE_MULTILINE) != 0)
2275 {
2276 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2277 }
2278 previous = NULL;
2279 *code++ = OP_CIRC;
2280 break;
2281
2282 case '$':
2283 previous = NULL;
2284 *code++ = OP_DOLL;
2285 break;
2286
2287 /* There can never be a first char if '.' is first, whatever happens about
2288 repeats. The value of reqbyte doesn't change either. */
2289
2290 case '.':
2291 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2292 zerofirstbyte = firstbyte;
2293 zeroreqbyte = reqbyte;
2294 previous = code;
2295 *code++ = OP_ANY;
2296 break;
2297
2298
2299 /* ===================================================================*/
2300 /* Character classes. If the included characters are all < 256, we build a
2301 32-byte bitmap of the permitted characters, except in the special case
2302 where there is only one such character. For negated classes, we build the
2303 map as usual, then invert it at the end. However, we use a different opcode
2304 so that data characters > 255 can be handled correctly.
2305
2306 If the class contains characters outside the 0-255 range, a different
2307 opcode is compiled. It may optionally have a bit map for characters < 256,
2308 but those above are explicitly listed afterwards. A flag byte tells
2309 whether the bitmap is present, and whether this is a negated class or not.
2310 */
2311
2312 case '[':
2313 previous = code;
2314
2315 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2316 they are encountered at the top level, so we'll do that too. */
2317
2318 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2319 check_posix_syntax(ptr, &tempptr, cd))
2320 {
2321 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2322 goto FAILED;
2323 }
2324
2325 /* If the first character is '^', set the negation flag and skip it. */
2326
2327 if ((c = *(++ptr)) == '^')
2328 {
2329 negate_class = TRUE;
2330 c = *(++ptr);
2331 }
2332 else
2333 {
2334 negate_class = FALSE;
2335 }
2336
2337 /* Keep a count of chars with values < 256 so that we can optimize the case
2338 of just a single character (as long as it's < 256). However, for higher
2339 valued UTF-8 characters, we don't yet do any optimization. */
2340
2341 class_charcount = 0;
2342 class_lastchar = -1;
2343
2344 /* Initialize the 32-char bit map to all zeros. We build the map in a
2345 temporary bit of memory, in case the class contains only 1 character (less
2346 than 256), because in that case the compiled code doesn't use the bit map.
2347 */
2348
2349 memset(classbits, 0, 32 * sizeof(uschar));
2350
2351 #ifdef SUPPORT_UTF8
2352 class_utf8 = FALSE; /* No chars >= 256 */
2353 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2354 #endif
2355
2356 /* Process characters until ] is reached. By writing this as a "do" it
2357 means that an initial ] is taken as a data character. At the start of the
2358 loop, c contains the first byte of the character. */
2359
2360 if (c != 0) do
2361 {
2362 const uschar *oldptr;
2363
2364 #ifdef SUPPORT_UTF8
2365 if (utf8 && c > 127)
2366 { /* Braces are required because the */
2367 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2368 }
2369 #endif
2370
2371 /* Inside \Q...\E everything is literal except \E */
2372
2373 if (inescq)
2374 {
2375 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2376 {
2377 inescq = FALSE; /* Reset literal state */
2378 ptr++; /* Skip the 'E' */
2379 continue; /* Carry on with next */
2380 }
2381 goto CHECK_RANGE; /* Could be range if \E follows */
2382 }
2383
2384 /* Handle POSIX class names. Perl allows a negation extension of the
2385 form [:^name:]. A square bracket that doesn't match the syntax is
2386 treated as a literal. We also recognize the POSIX constructions
2387 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2388 5.6 and 5.8 do. */
2389
2390 if (c == '[' &&
2391 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2392 check_posix_syntax(ptr, &tempptr, cd))
2393 {
2394 BOOL local_negate = FALSE;
2395 int posix_class, taboffset, tabopt;
2396 register const uschar *cbits = cd->cbits;
2397 uschar pbits[32];
2398
2399 if (ptr[1] != ':')
2400 {
2401 *errorcodeptr = ERR31;
2402 goto FAILED;
2403 }
2404
2405 ptr += 2;
2406 if (*ptr == '^')
2407 {
2408 local_negate = TRUE;
2409 ptr++;
2410 }
2411
2412 posix_class = check_posix_name(ptr, tempptr - ptr);
2413 if (posix_class < 0)
2414 {
2415 *errorcodeptr = ERR30;
2416 goto FAILED;
2417 }
2418
2419 /* If matching is caseless, upper and lower are converted to
2420 alpha. This relies on the fact that the class table starts with
2421 alpha, lower, upper as the first 3 entries. */
2422
2423 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2424 posix_class = 0;
2425
2426 /* We build the bit map for the POSIX class in a chunk of local store
2427 because we may be adding and subtracting from it, and we don't want to
2428 subtract bits that may be in the main map already. At the end we or the
2429 result into the bit map that is being built. */
2430
2431 posix_class *= 3;
2432
2433 /* Copy in the first table (always present) */
2434
2435 memcpy(pbits, cbits + posix_class_maps[posix_class],
2436 32 * sizeof(uschar));
2437
2438 /* If there is a second table, add or remove it as required. */
2439
2440 taboffset = posix_class_maps[posix_class + 1];
2441 tabopt = posix_class_maps[posix_class + 2];
2442
2443 if (taboffset >= 0)
2444 {
2445 if (tabopt >= 0)
2446 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2447 else
2448 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2449 }
2450
2451 /* Now see if we need to remove any special characters. An option
2452 value of 1 removes vertical space and 2 removes underscore. */
2453
2454 if (tabopt < 0) tabopt = -tabopt;
2455 if (tabopt == 1) pbits[1] &= ~0x3c;
2456 else if (tabopt == 2) pbits[11] &= 0x7f;
2457
2458 /* Add the POSIX table or its complement into the main table that is
2459 being built and we are done. */
2460
2461 if (local_negate)
2462 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2463 else
2464 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2465
2466 ptr = tempptr + 1;
2467 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2468 continue; /* End of POSIX syntax handling */
2469 }
2470
2471 /* Backslash may introduce a single character, or it may introduce one
2472 of the specials, which just set a flag. The sequence \b is a special
2473 case. Inside a class (and only there) it is treated as backspace.
2474 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2475 to or into the one we are building. We assume they have more than one
2476 character in them, so set class_charcount bigger than one. */
2477
2478 if (c == '\\')
2479 {
2480 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2481 if (*errorcodeptr != 0) goto FAILED;
2482
2483 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2484 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2485 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2486 else if (-c == ESC_Q) /* Handle start of quoted string */
2487 {
2488 if (ptr[1] == '\\' && ptr[2] == 'E')
2489 {
2490 ptr += 2; /* avoid empty string */
2491 }
2492 else inescq = TRUE;
2493 continue;
2494 }
2495
2496 if (c < 0)
2497 {
2498 register const uschar *cbits = cd->cbits;
2499 class_charcount += 2; /* Greater than 1 is what matters */
2500
2501 /* Save time by not doing this in the pre-compile phase. */
2502
2503 if (lengthptr == NULL) switch (-c)
2504 {
2505 case ESC_d:
2506 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2507 continue;
2508
2509 case ESC_D:
2510 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2511 continue;
2512
2513 case ESC_w:
2514 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2515 continue;
2516
2517 case ESC_W:
2518 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2519 continue;
2520
2521 case ESC_s:
2522 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2523 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2524 continue;
2525
2526 case ESC_S:
2527 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2528 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2529 continue;
2530
2531 case ESC_E: /* Perl ignores an orphan \E */
2532 continue;
2533
2534 default: /* Not recognized; fall through */
2535 break; /* Need "default" setting to stop compiler warning. */
2536 }
2537
2538 /* In the pre-compile phase, just do the recognition. */
2539
2540 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2541 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2542
2543 /* We need to deal with \P and \p in both phases. */
2544
2545 #ifdef SUPPORT_UCP
2546 if (-c == ESC_p || -c == ESC_P)
2547 {
2548 BOOL negated;
2549 int pdata;
2550 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2551 if (ptype < 0) goto FAILED;
2552 class_utf8 = TRUE;
2553 *class_utf8data++ = ((-c == ESC_p) != negated)?
2554 XCL_PROP : XCL_NOTPROP;
2555 *class_utf8data++ = ptype;
2556 *class_utf8data++ = pdata;
2557 class_charcount -= 2; /* Not a < 256 character */
2558 continue;
2559 }
2560 #endif
2561 /* Unrecognized escapes are faulted if PCRE is running in its
2562 strict mode. By default, for compatibility with Perl, they are
2563 treated as literals. */
2564
2565 if ((options & PCRE_EXTRA) != 0)
2566 {
2567 *errorcodeptr = ERR7;
2568 goto FAILED;
2569 }
2570
2571 class_charcount -= 2; /* Undo the default count from above */
2572 c = *ptr; /* Get the final character and fall through */
2573 }
2574
2575 /* Fall through if we have a single character (c >= 0). This may be
2576 greater than 256 in UTF-8 mode. */
2577
2578 } /* End of backslash handling */
2579
2580 /* A single character may be followed by '-' to form a range. However,
2581 Perl does not permit ']' to be the end of the range. A '-' character
2582 at the end is treated as a literal. Perl ignores orphaned \E sequences
2583 entirely. The code for handling \Q and \E is messy. */
2584
2585 CHECK_RANGE:
2586 while (ptr[1] == '\\' && ptr[2] == 'E')
2587 {
2588 inescq = FALSE;
2589 ptr += 2;
2590 }
2591
2592 oldptr = ptr;
2593
2594 if (!inescq && ptr[1] == '-')
2595 {
2596 int d;
2597 ptr += 2;
2598 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2599
2600 /* If we hit \Q (not followed by \E) at this point, go into escaped
2601 mode. */
2602
2603 while (*ptr == '\\' && ptr[1] == 'Q')
2604 {
2605 ptr += 2;
2606 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2607 inescq = TRUE;
2608 break;
2609 }
2610
2611 if (*ptr == 0 || (!inescq && *ptr == ']'))
2612 {
2613 ptr = oldptr;
2614 goto LONE_SINGLE_CHARACTER;
2615 }
2616
2617 #ifdef SUPPORT_UTF8
2618 if (utf8)
2619 { /* Braces are required because the */
2620 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2621 }
2622 else
2623 #endif
2624 d = *ptr; /* Not UTF-8 mode */
2625
2626 /* The second part of a range can be a single-character escape, but
2627 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2628 in such circumstances. */
2629
2630 if (!inescq && d == '\\')
2631 {
2632 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2633 if (*errorcodeptr != 0) goto FAILED;
2634
2635 /* \b is backspace; \X is literal X; \R is literal R; any other
2636 special means the '-' was literal */
2637
2638 if (d < 0)
2639 {
2640 if (d == -ESC_b) d = '\b';
2641 else if (d == -ESC_X) d = 'X';
2642 else if (d == -ESC_R) d = 'R'; else
2643 {
2644 ptr = oldptr;
2645 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2646 }
2647 }
2648 }
2649
2650 /* Check that the two values are in the correct order. Optimize
2651 one-character ranges */
2652
2653 if (d < c)
2654 {
2655 *errorcodeptr = ERR8;
2656 goto FAILED;
2657 }
2658
2659 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2660
2661 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2662 matching, we have to use an XCLASS with extra data items. Caseless
2663 matching for characters > 127 is available only if UCP support is
2664 available. */
2665
2666 #ifdef SUPPORT_UTF8
2667 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2668 {
2669 class_utf8 = TRUE;
2670
2671 /* With UCP support, we can find the other case equivalents of
2672 the relevant characters. There may be several ranges. Optimize how
2673 they fit with the basic range. */
2674
2675 #ifdef SUPPORT_UCP
2676 if ((options & PCRE_CASELESS) != 0)
2677 {
2678 unsigned int occ, ocd;
2679 unsigned int cc = c;
2680 unsigned int origd = d;
2681 while (get_othercase_range(&cc, origd, &occ, &ocd))
2682 {
2683 if (occ >= (unsigned int)c &&
2684 ocd <= (unsigned int)d)
2685 continue; /* Skip embedded ranges */
2686
2687 if (occ < (unsigned int)c &&
2688 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2689 { /* if there is overlap, */
2690 c = occ; /* noting that if occ < c */
2691 continue; /* we can't have ocd > d */
2692 } /* because a subrange is */
2693 if (ocd > (unsigned int)d &&
2694 occ <= (unsigned int)d + 1) /* always shorter than */
2695 { /* the basic range. */
2696 d = ocd;
2697 continue;
2698 }
2699
2700 if (occ == ocd)
2701 {
2702 *class_utf8data++ = XCL_SINGLE;
2703 }
2704 else
2705 {
2706 *class_utf8data++ = XCL_RANGE;
2707 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2708 }
2709 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2710 }
2711 }
2712 #endif /* SUPPORT_UCP */
2713
2714 /* Now record the original range, possibly modified for UCP caseless
2715 overlapping ranges. */
2716
2717 *class_utf8data++ = XCL_RANGE;
2718 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2719 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2720
2721 /* With UCP support, we are done. Without UCP support, there is no
2722 caseless matching for UTF-8 characters > 127; we can use the bit map
2723 for the smaller ones. */
2724
2725 #ifdef SUPPORT_UCP
2726 continue; /* With next character in the class */
2727 #else
2728 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2729
2730 /* Adjust upper limit and fall through to set up the map */
2731
2732 d = 127;
2733
2734 #endif /* SUPPORT_UCP */
2735 }
2736 #endif /* SUPPORT_UTF8 */
2737
2738 /* We use the bit map for all cases when not in UTF-8 mode; else
2739 ranges that lie entirely within 0-127 when there is UCP support; else
2740 for partial ranges without UCP support. */
2741
2742 class_charcount += d - c + 1;
2743 class_lastchar = d;
2744
2745 /* We can save a bit of time by skipping this in the pre-compile. */
2746
2747 if (lengthptr == NULL) for (; c <= d; c++)
2748 {
2749 classbits[c/8] |= (1 << (c&7));
2750 if ((options & PCRE_CASELESS) != 0)
2751 {
2752 int uc = cd->fcc[c]; /* flip case */
2753 classbits[uc/8] |= (1 << (uc&7));
2754 }
2755 }
2756
2757 continue; /* Go get the next char in the class */
2758 }
2759
2760 /* Handle a lone single character - we can get here for a normal
2761 non-escape char, or after \ that introduces a single character or for an
2762 apparent range that isn't. */
2763
2764 LONE_SINGLE_CHARACTER:
2765
2766 /* Handle a character that cannot go in the bit map */
2767
2768 #ifdef SUPPORT_UTF8
2769 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2770 {
2771 class_utf8 = TRUE;
2772 *class_utf8data++ = XCL_SINGLE;
2773 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2774
2775 #ifdef SUPPORT_UCP
2776 if ((options & PCRE_CASELESS) != 0)
2777 {
2778 unsigned int othercase;
2779 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2780 {
2781 *class_utf8data++ = XCL_SINGLE;
2782 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2783 }
2784 }
2785 #endif /* SUPPORT_UCP */
2786
2787 }
2788 else
2789 #endif /* SUPPORT_UTF8 */
2790
2791 /* Handle a single-byte character */
2792 {
2793 classbits[c/8] |= (1 << (c&7));
2794 if ((options & PCRE_CASELESS) != 0)
2795 {
2796 c = cd->fcc[c]; /* flip case */
2797 classbits[c/8] |= (1 << (c&7));
2798 }
2799 class_charcount++;
2800 class_lastchar = c;
2801 }
2802 }
2803
2804 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2805
2806 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2807
2808 if (c == 0) /* Missing terminating ']' */
2809 {
2810 *errorcodeptr = ERR6;
2811 goto FAILED;
2812 }
2813
2814 /* If class_charcount is 1, we saw precisely one character whose value is
2815 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2816 can optimize the negative case only if there were no characters >= 128
2817 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2818 single-bytes only. This is an historical hangover. Maybe one day we can
2819 tidy these opcodes to handle multi-byte characters.
2820
2821 The optimization throws away the bit map. We turn the item into a
2822 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2823 that OP_NOT does not support multibyte characters. In the positive case, it
2824 can cause firstbyte to be set. Otherwise, there can be no first char if
2825 this item is first, whatever repeat count may follow. In the case of
2826 reqbyte, save the previous value for reinstating. */
2827
2828 #ifdef SUPPORT_UTF8
2829 if (class_charcount == 1 &&
2830 (!utf8 ||
2831 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2832
2833 #else
2834 if (class_charcount == 1)
2835 #endif
2836 {
2837 zeroreqbyte = reqbyte;
2838
2839 /* The OP_NOT opcode works on one-byte characters only. */
2840
2841 if (negate_class)
2842 {
2843 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2844 zerofirstbyte = firstbyte;
2845 *code++ = OP_NOT;
2846 *code++ = class_lastchar;
2847 break;
2848 }
2849
2850 /* For a single, positive character, get the value into mcbuffer, and
2851 then we can handle this with the normal one-character code. */
2852
2853 #ifdef SUPPORT_UTF8
2854 if (utf8 && class_lastchar > 127)
2855 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2856 else
2857 #endif
2858 {
2859 mcbuffer[0] = class_lastchar;
2860 mclength = 1;
2861 }
2862 goto ONE_CHAR;
2863 } /* End of 1-char optimization */
2864
2865 /* The general case - not the one-char optimization. If this is the first
2866 thing in the branch, there can be no first char setting, whatever the
2867 repeat count. Any reqbyte setting must remain unchanged after any kind of
2868 repeat. */
2869
2870 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2871 zerofirstbyte = firstbyte;
2872 zeroreqbyte = reqbyte;
2873
2874 /* If there are characters with values > 255, we have to compile an
2875 extended class, with its own opcode. If there are no characters < 256,
2876 we can omit the bitmap in the actual compiled code. */
2877
2878 #ifdef SUPPORT_UTF8
2879 if (class_utf8)
2880 {
2881 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2882 *code++ = OP_XCLASS;
2883 code += LINK_SIZE;
2884 *code = negate_class? XCL_NOT : 0;
2885
2886 /* If the map is required, move up the extra data to make room for it;
2887 otherwise just move the code pointer to the end of the extra data. */
2888
2889 if (class_charcount > 0)
2890 {
2891 *code++ |= XCL_MAP;
2892 memmove(code + 32, code, class_utf8data - code);
2893 memcpy(code, classbits, 32);
2894 code = class_utf8data + 32;
2895 }
2896 else code = class_utf8data;
2897
2898 /* Now fill in the complete length of the item */
2899
2900 PUT(previous, 1, code - previous);
2901 break; /* End of class handling */
2902 }
2903 #endif
2904
2905 /* If there are no characters > 255, negate the 32-byte map if necessary,
2906 and copy it into the code vector. If this is the first thing in the branch,
2907 there can be no first char setting, whatever the repeat count. Any reqbyte
2908 setting must remain unchanged after any kind of repeat. */
2909
2910 if (negate_class)
2911 {
2912 *code++ = OP_NCLASS;
2913 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2914 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2915 }
2916 else
2917 {
2918 *code++ = OP_CLASS;
2919 memcpy(code, classbits, 32);
2920 }
2921 code += 32;
2922 break;
2923
2924
2925 /* ===================================================================*/
2926 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2927 has been tested above. */
2928
2929 case '{':
2930 if (!is_quantifier) goto NORMAL_CHAR;
2931 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2932 if (*errorcodeptr != 0) goto FAILED;
2933 goto REPEAT;
2934
2935 case '*':
2936 repeat_min = 0;
2937 repeat_max = -1;
2938 goto REPEAT;
2939
2940 case '+':
2941 repeat_min = 1;
2942 repeat_max = -1;
2943 goto REPEAT;
2944
2945 case '?':
2946 repeat_min = 0;
2947 repeat_max = 1;
2948
2949 REPEAT:
2950 if (previous == NULL)
2951 {
2952 *errorcodeptr = ERR9;
2953 goto FAILED;
2954 }
2955
2956 if (repeat_min == 0)
2957 {
2958 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2959 reqbyte = zeroreqbyte; /* Ditto */
2960 }
2961
2962 /* Remember whether this is a variable length repeat */
2963
2964 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2965
2966 op_type = 0; /* Default single-char op codes */
2967 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2968
2969 /* Save start of previous item, in case we have to move it up to make space
2970 for an inserted OP_ONCE for the additional '+' extension. */
2971
2972 tempcode = previous;
2973
2974 /* If the next character is '+', we have a possessive quantifier. This
2975 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2976 If the next character is '?' this is a minimizing repeat, by default,
2977 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2978 repeat type to the non-default. */
2979
2980 if (ptr[1] == '+')
2981 {
2982 repeat_type = 0; /* Force greedy */
2983 possessive_quantifier = TRUE;
2984 ptr++;
2985 }
2986 else if (ptr[1] == '?')
2987 {
2988 repeat_type = greedy_non_default;
2989 ptr++;
2990 }
2991 else repeat_type = greedy_default;
2992
2993 /* If previous was a character match, abolish the item and generate a
2994 repeat item instead. If a char item has a minimum of more than one, ensure
2995 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2996 the first thing in a branch because the x will have gone into firstbyte
2997 instead. */
2998
2999 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3000 {
3001 /* Deal with UTF-8 characters that take up more than one byte. It's
3002 easier to write this out separately than try to macrify it. Use c to
3003 hold the length of the character in bytes, plus 0x80 to flag that it's a
3004 length rather than a small character. */
3005
3006 #ifdef SUPPORT_UTF8
3007 if (utf8 && (code[-1] & 0x80) != 0)
3008 {
3009 uschar *lastchar = code - 1;
3010 while((*lastchar & 0xc0) == 0x80) lastchar--;
3011 c = code - lastchar; /* Length of UTF-8 character */
3012 memcpy(utf8_char, lastchar, c); /* Save the char */
3013 c |= 0x80; /* Flag c as a length */
3014 }
3015 else
3016 #endif
3017
3018 /* Handle the case of a single byte - either with no UTF8 support, or
3019 with UTF-8 disabled, or for a UTF-8 character < 128. */
3020
3021 {
3022 c = code[-1];
3023 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3024 }
3025
3026 /* If the repetition is unlimited, it pays to see if the next thing on
3027 the line is something that cannot possibly match this character. If so,
3028 automatically possessifying this item gains some performance in the case
3029 where the match fails. */
3030
3031 if (!possessive_quantifier &&
3032 repeat_max < 0 &&
3033 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3034 options, cd))
3035 {
3036 repeat_type = 0; /* Force greedy */
3037 possessive_quantifier = TRUE;
3038 }
3039
3040 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3041 }
3042
3043 /* If previous was a single negated character ([^a] or similar), we use
3044 one of the special opcodes, replacing it. The code is shared with single-
3045 character repeats by setting op_type to add a suitable offset into
3046 repeat_type. We can also test for auto-possessification. OP_NOT is
3047 currently used only for single-byte chars. */
3048
3049 else if (*previous == OP_NOT)
3050 {
3051 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3052 c = previous[1];
3053 if (!possessive_quantifier &&
3054 repeat_max < 0 &&
3055 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3056 {
3057 repeat_type = 0; /* Force greedy */
3058 possessive_quantifier = TRUE;
3059 }
3060 goto OUTPUT_SINGLE_REPEAT;
3061 }
3062
3063 /* If previous was a character type match (\d or similar), abolish it and
3064 create a suitable repeat item. The code is shared with single-character
3065 repeats by setting op_type to add a suitable offset into repeat_type. Note
3066 that the Unicode property types will be present only when SUPPORT_UCP is
3067 defined, but we don't wrap the little bits of code here because it just
3068 makes it horribly messy. */
3069
3070 else if (*previous < OP_EODN)
3071 {
3072 uschar *oldcode;
3073 int prop_type, prop_value;
3074 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3075 c = *previous;
3076
3077 if (!possessive_quantifier &&
3078 repeat_max < 0 &&
3079 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3080 {
3081 repeat_type = 0; /* Force greedy */
3082 possessive_quantifier = TRUE;
3083 }
3084
3085 OUTPUT_SINGLE_REPEAT:
3086 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3087 {
3088 prop_type = previous[1];
3089 prop_value = previous[2];
3090 }
3091 else prop_type = prop_value = -1;
3092
3093 oldcode = code;
3094 code = previous; /* Usually overwrite previous item */
3095
3096 /* If the maximum is zero then the minimum must also be zero; Perl allows
3097 this case, so we do too - by simply omitting the item altogether. */
3098
3099 if (repeat_max == 0) goto END_REPEAT;
3100
3101 /* All real repeats make it impossible to handle partial matching (maybe
3102 one day we will be able to remove this restriction). */
3103
3104 if (repeat_max != 1) cd->nopartial = TRUE;
3105
3106 /* Combine the op_type with the repeat_type */
3107
3108 repeat_type += op_type;
3109
3110 /* A minimum of zero is handled either as the special case * or ?, or as
3111 an UPTO, with the maximum given. */
3112
3113 if (repeat_min == 0)
3114 {
3115 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3116 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3117 else
3118 {
3119 *code++ = OP_UPTO + repeat_type;
3120 PUT2INC(code, 0, repeat_max);
3121 }
3122 }
3123
3124 /* A repeat minimum of 1 is optimized into some special cases. If the
3125 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3126 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3127 one less than the maximum. */
3128
3129 else if (repeat_min == 1)
3130 {
3131 if (repeat_max == -1)
3132 *code++ = OP_PLUS + repeat_type;
3133 else
3134 {
3135 code = oldcode; /* leave previous item in place */
3136 if (repeat_max == 1) goto END_REPEAT;
3137 *code++ = OP_UPTO + repeat_type;
3138 PUT2INC(code, 0, repeat_max - 1);
3139 }
3140 }
3141
3142 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3143 handled as an EXACT followed by an UPTO. */
3144
3145 else
3146 {
3147 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3148 PUT2INC(code, 0, repeat_min);
3149
3150 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3151 we have to insert the character for the previous code. For a repeated
3152 Unicode property match, there are two extra bytes that define the
3153 required property. In UTF-8 mode, long characters have their length in
3154 c, with the 0x80 bit as a flag. */
3155
3156 if (repeat_max < 0)
3157 {
3158 #ifdef SUPPORT_UTF8
3159 if (utf8 && c >= 128)
3160 {
3161 memcpy(code, utf8_char, c & 7);
3162 code += c & 7;
3163 }
3164 else
3165 #endif
3166 {
3167 *code++ = c;
3168 if (prop_type >= 0)
3169 {
3170 *code++ = prop_type;
3171 *code++ = prop_value;
3172 }
3173 }
3174 *code++ = OP_STAR + repeat_type;
3175 }
3176
3177 /* Else insert an UPTO if the max is greater than the min, again
3178 preceded by the character, for the previously inserted code. If the
3179 UPTO is just for 1 instance, we can use QUERY instead. */
3180
3181 else if (repeat_max != repeat_min)
3182 {
3183 #ifdef SUPPORT_UTF8
3184 if (utf8 && c >= 128)
3185 {
3186 memcpy(code, utf8_char, c & 7);
3187 code += c & 7;
3188 }
3189 else
3190 #endif
3191 *code++ = c;
3192 if (prop_type >= 0)
3193 {
3194 *code++ = prop_type;
3195 *code++ = prop_value;
3196 }
3197 repeat_max -= repeat_min;
3198
3199 if (repeat_max == 1)
3200 {
3201 *code++ = OP_QUERY + repeat_type;
3202 }
3203 else
3204 {
3205 *code++ = OP_UPTO + repeat_type;
3206 PUT2INC(code, 0, repeat_max);
3207 }
3208 }
3209 }
3210
3211 /* The character or character type itself comes last in all cases. */
3212
3213 #ifdef SUPPORT_UTF8
3214 if (utf8 && c >= 128)
3215 {
3216 memcpy(code, utf8_char, c & 7);
3217 code += c & 7;
3218 }
3219 else
3220 #endif
3221 *code++ = c;
3222
3223 /* For a repeated Unicode property match, there are two extra bytes that
3224 define the required property. */
3225
3226 #ifdef SUPPORT_UCP
3227 if (prop_type >= 0)
3228 {
3229 *code++ = prop_type;
3230 *code++ = prop_value;
3231 }
3232 #endif
3233 }
3234
3235 /* If previous was a character class or a back reference, we put the repeat
3236 stuff after it, but just skip the item if the repeat was {0,0}. */
3237
3238 else if (*previous == OP_CLASS ||
3239 *previous == OP_NCLASS ||
3240 #ifdef SUPPORT_UTF8
3241 *previous == OP_XCLASS ||
3242 #endif
3243 *previous == OP_REF)
3244 {
3245 if (repeat_max == 0)
3246 {
3247 code = previous;
3248 goto END_REPEAT;
3249 }
3250
3251 /* All real repeats make it impossible to handle partial matching (maybe
3252 one day we will be able to remove this restriction). */
3253
3254 if (repeat_max != 1) cd->nopartial = TRUE;
3255
3256 if (repeat_min == 0 && repeat_max == -1)
3257 *code++ = OP_CRSTAR + repeat_type;
3258 else if (repeat_min == 1 && repeat_max == -1)
3259 *code++ = OP_CRPLUS + repeat_type;
3260 else if (repeat_min == 0 && repeat_max == 1)
3261 *code++ = OP_CRQUERY + repeat_type;
3262 else
3263 {
3264 *code++ = OP_CRRANGE + repeat_type;
3265 PUT2INC(code, 0, repeat_min);
3266 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3267 PUT2INC(code, 0, repeat_max);
3268 }
3269 }
3270
3271 /* If previous was a bracket group, we may have to replicate it in certain
3272 cases. */
3273
3274 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3275 *previous == OP_ONCE || *previous == OP_COND)
3276 {
3277 register int i;
3278 int ketoffset = 0;
3279 int len = code - previous;
3280 uschar *bralink = NULL;
3281
3282 /* Repeating a DEFINE group is pointless */
3283
3284 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3285 {
3286 *errorcodeptr = ERR55;
3287 goto FAILED;
3288 }
3289
3290 /* This is a paranoid check to stop integer overflow later on */
3291
3292 if (len > MAX_DUPLENGTH)
3293 {
3294 *errorcodeptr = ERR50;
3295 goto FAILED;
3296 }
3297
3298 /* If the maximum repeat count is unlimited, find the end of the bracket
3299 by scanning through from the start, and compute the offset back to it
3300 from the current code pointer. There may be an OP_OPT setting following
3301 the final KET, so we can't find the end just by going back from the code
3302 pointer. */
3303
3304 if (repeat_max == -1)
3305 {
3306 register uschar *ket = previous;
3307 do ket += GET(ket, 1); while (*ket != OP_KET);
3308 ketoffset = code - ket;
3309 }
3310
3311 /* The case of a zero minimum is special because of the need to stick
3312 OP_BRAZERO in front of it, and because the group appears once in the
3313 data, whereas in other cases it appears the minimum number of times. For
3314 this reason, it is simplest to treat this case separately, as otherwise
3315 the code gets far too messy. There are several special subcases when the
3316 minimum is zero. */
3317
3318 if (repeat_min == 0)
3319 {
3320 /* If the maximum is also zero, we just omit the group from the output
3321 altogether. */
3322
3323 if (repeat_max == 0)
3324 {
3325 code = previous;
3326 goto END_REPEAT;
3327 }
3328
3329 /* If the maximum is 1 or unlimited, we just have to stick in the
3330 BRAZERO and do no more at this point. However, we do need to adjust
3331 any OP_RECURSE calls inside the group that refer to the group itself or
3332 any internal or forward referenced group, because the offset is from
3333 the start of the whole regex. Temporarily terminate the pattern while
3334 doing this. */
3335
3336 if (repeat_max <= 1)
3337 {
3338 *code = OP_END;
3339 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3340 memmove(previous+1, previous, len);
3341 code++;
3342 *previous++ = OP_BRAZERO + repeat_type;
3343 }
3344
3345 /* If the maximum is greater than 1 and limited, we have to replicate
3346 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3347 The first one has to be handled carefully because it's the original
3348 copy, which has to be moved up. The remainder can be handled by code
3349 that is common with the non-zero minimum case below. We have to
3350 adjust the value of repeat_max, since one less copy is required. Once
3351 again, we may have to adjust any OP_RECURSE calls inside the group. */
3352
3353 else
3354 {
3355 int offset;
3356 *code = OP_END;
3357 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3358 memmove(previous + 2 + LINK_SIZE, previous, len);
3359 code += 2 + LINK_SIZE;
3360 *previous++ = OP_BRAZERO + repeat_type;
3361 *previous++ = OP_BRA;
3362
3363 /* We chain together the bracket offset fields that have to be
3364 filled in later when the ends of the brackets are reached. */
3365
3366 offset = (bralink == NULL)? 0 : previous - bralink;
3367 bralink = previous;
3368 PUTINC(previous, 0, offset);
3369 }
3370
3371 repeat_max--;
3372 }
3373
3374 /* If the minimum is greater than zero, replicate the group as many
3375 times as necessary, and adjust the maximum to the number of subsequent
3376 copies that we need. If we set a first char from the group, and didn't
3377 set a required char, copy the latter from the former. If there are any
3378 forward reference subroutine calls in the group, there will be entries on
3379 the workspace list; replicate these with an appropriate increment. */
3380
3381 else
3382 {
3383 if (repeat_min > 1)
3384 {
3385 /* In the pre-compile phase, we don't actually do the replication. We
3386 just adjust the length as if we had. */
3387
3388 if (lengthptr != NULL)
3389 *lengthptr += (repeat_min - 1)*length_prevgroup;
3390
3391 /* This is compiling for real */
3392
3393 else
3394 {
3395 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3396 for (i = 1; i < repeat_min; i++)
3397 {
3398 uschar *hc;
3399 uschar *this_hwm = cd->hwm;
3400 memcpy(code, previous, len);
3401 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3402 {
3403 PUT(cd->hwm, 0, GET(hc, 0) + len);
3404 cd->hwm += LINK_SIZE;
3405 }
3406 save_hwm = this_hwm;
3407 code += len;
3408 }
3409 }
3410 }
3411
3412 if (repeat_max > 0) repeat_max -= repeat_min;
3413 }
3414
3415 /* This code is common to both the zero and non-zero minimum cases. If
3416 the maximum is limited, it replicates the group in a nested fashion,
3417 remembering the bracket starts on a stack. In the case of a zero minimum,
3418 the first one was set up above. In all cases the repeat_max now specifies
3419 the number of additional copies needed. Again, we must remember to
3420 replicate entries on the forward reference list. */
3421
3422 if (repeat_max >= 0)
3423 {
3424 /* In the pre-compile phase, we don't actually do the replication. We
3425 just adjust the length as if we had. For each repetition we must add 1
3426 to the length for BRAZERO and for all but the last repetition we must
3427 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3428
3429 if (lengthptr != NULL && repeat_max > 0)
3430 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3431 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3432
3433 /* This is compiling for real */
3434
3435 else for (i = repeat_max - 1; i >= 0; i--)
3436 {
3437 uschar *hc;
3438 uschar *this_hwm = cd->hwm;
3439
3440 *code++ = OP_BRAZERO + repeat_type;
3441
3442 /* All but the final copy start a new nesting, maintaining the
3443 chain of brackets outstanding. */
3444
3445 if (i != 0)
3446 {
3447 int offset;
3448 *code++ = OP_BRA;
3449 offset = (bralink == NULL)? 0 : code - bralink;
3450 bralink = code;
3451 PUTINC(code, 0, offset);
3452 }
3453
3454 memcpy(code, previous, len);
3455 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3456 {
3457 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3458 cd->hwm += LINK_SIZE;
3459 }
3460 save_hwm = this_hwm;
3461 code += len;
3462 }
3463
3464 /* Now chain through the pending brackets, and fill in their length
3465 fields (which are holding the chain links pro tem). */
3466
3467 while (bralink != NULL)
3468 {
3469 int oldlinkoffset;
3470 int offset = code - bralink + 1;
3471 uschar *bra = code - offset;
3472 oldlinkoffset = GET(bra, 1);
3473 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3474 *code++ = OP_KET;
3475 PUTINC(code, 0, offset);
3476 PUT(bra, 1, offset);
3477 }
3478 }
3479
3480 /* If the maximum is unlimited, set a repeater in the final copy. We
3481 can't just offset backwards from the current code point, because we
3482 don't know if there's been an options resetting after the ket. The
3483 correct offset was computed above.
3484
3485 Then, when we are doing the actual compile phase, check to see whether
3486 this group is a non-atomic one that could match an empty string. If so,
3487 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3488 that runtime checking can be done. [This check is also applied to
3489 atomic groups at runtime, but in a different way.] */
3490
3491 else
3492 {
3493 uschar *ketcode = code - ketoffset;
3494 uschar *bracode = ketcode - GET(ketcode, 1);
3495 *ketcode = OP_KETRMAX + repeat_type;
3496 if (lengthptr == NULL && *bracode != OP_ONCE)
3497 {
3498 uschar *scode = bracode;
3499 do
3500 {
3501 if (could_be_empty_branch(scode, ketcode, utf8))
3502 {
3503 *bracode += OP_SBRA - OP_BRA;
3504 break;
3505 }
3506 scode += GET(scode, 1);
3507 }
3508 while (*scode == OP_ALT);
3509 }
3510 }
3511 }
3512
3513 /* Else there's some kind of shambles */
3514
3515 else
3516 {
3517 *errorcodeptr = ERR11;
3518 goto FAILED;
3519 }
3520
3521 /* If the character following a repeat is '+', or if certain optimization
3522 tests above succeeded, possessive_quantifier is TRUE. For some of the
3523 simpler opcodes, there is a special alternative opcode for this. For
3524 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3525 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3526 but the special opcodes can optimize it a bit. The repeated item starts at
3527 tempcode, not at previous, which might be the first part of a string whose
3528 (former) last char we repeated.
3529
3530 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3531 an 'upto' may follow. We skip over an 'exact' item, and then test the
3532 length of what remains before proceeding. */
3533
3534 if (possessive_quantifier)
3535 {
3536 int len;
3537 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3538 *tempcode == OP_NOTEXACT)
3539 tempcode += _pcre_OP_lengths[*tempcode];
3540 len = code - tempcode;
3541 if (len > 0) switch (*tempcode)
3542 {
3543 case OP_STAR: *tempcode = OP_POSSTAR; break;
3544 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3545 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3546 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3547
3548 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3549 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3550 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3551 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3552
3553 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3554 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3555 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3556 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3557
3558 default:
3559 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3560 code += 1 + LINK_SIZE;
3561 len += 1 + LINK_SIZE;
3562 tempcode[0] = OP_ONCE;
3563 *code++ = OP_KET;
3564 PUTINC(code, 0, len);
3565 PUT(tempcode, 1, len);
3566 break;
3567 }
3568 }
3569
3570 /* In all cases we no longer have a previous item. We also set the
3571 "follows varying string" flag for subsequently encountered reqbytes if
3572 it isn't already set and we have just passed a varying length item. */
3573
3574 END_REPEAT:
3575 previous = NULL;
3576 cd->req_varyopt |= reqvary;
3577 break;
3578
3579
3580 /* ===================================================================*/
3581 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3582 lookbehind or option setting or condition or all the other extended
3583 parenthesis forms. First deal with the specials; all are introduced by ?,
3584 and the appearance of any of them means that this is not a capturing
3585 group. */
3586
3587 case '(':
3588 newoptions = options;
3589 skipbytes = 0;
3590 bravalue = OP_CBRA;
3591 save_hwm = cd->hwm;
3592 reset_bracount = FALSE;
3593
3594 if (*(++ptr) == '?')
3595 {
3596 int i, set, unset, namelen;
3597 int *optset;
3598 const uschar *name;
3599 uschar *slot;
3600
3601 switch (*(++ptr))
3602 {
3603 case '#': /* Comment; skip to ket */
3604 ptr++;
3605 while (*ptr != 0 && *ptr != ')') ptr++;
3606 if (*ptr == 0)
3607 {
3608 *errorcodeptr = ERR18;
3609 goto FAILED;
3610 }
3611 continue;
3612
3613
3614 /* ------------------------------------------------------------ */
3615 case '|': /* Reset capture count for each branch */
3616 reset_bracount = TRUE;
3617 /* Fall through */
3618
3619 /* ------------------------------------------------------------ */
3620 case ':': /* Non-capturing bracket */
3621 bravalue = OP_BRA;
3622 ptr++;
3623 break;
3624
3625
3626 /* ------------------------------------------------------------ */
3627 case '(':
3628 bravalue = OP_COND; /* Conditional group */
3629
3630 /* A condition can be an assertion, a number (referring to a numbered
3631 group), a name (referring to a named group), or 'R', referring to
3632 recursion. R<digits> and R&name are also permitted for recursion tests.
3633
3634 There are several syntaxes for testing a named group: (?(name)) is used
3635 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3636
3637 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3638 be the recursive thing or the name 'R' (and similarly for 'R' followed
3639 by digits), and (b) a number could be a name that consists of digits.
3640 In both cases, we look for a name first; if not found, we try the other
3641 cases. */
3642
3643 /* For conditions that are assertions, check the syntax, and then exit
3644 the switch. This will take control down to where bracketed groups,
3645 including assertions, are processed. */
3646
3647 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3648 break;
3649
3650 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3651 below), and all need to skip 3 bytes at the start of the group. */
3652
3653 code[1+LINK_SIZE] = OP_CREF;
3654 skipbytes = 3;
3655 refsign = -1;
3656
3657 /* Check for a test for recursion in a named group. */
3658
3659 if (ptr[1] == 'R' && ptr[2] == '&')
3660 {
3661 terminator = -1;
3662 ptr += 2;
3663 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3664 }
3665
3666 /* Check for a test for a named group's having been set, using the Perl
3667 syntax (?(<name>) or (?('name') */
3668
3669 else if (ptr[1] == '<')
3670 {
3671 terminator = '>';
3672 ptr++;
3673 }
3674 else if (ptr[1] == '\'')
3675 {
3676 terminator = '\'';
3677 ptr++;
3678 }
3679 else
3680 {
3681 terminator = 0;
3682 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3683 }
3684
3685 /* We now expect to read a name; anything else is an error */
3686
3687 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3688 {
3689 ptr += 1; /* To get the right offset */
3690 *errorcodeptr = ERR28;
3691 goto FAILED;
3692 }
3693
3694 /* Read the name, but also get it as a number if it's all digits */
3695
3696 recno = 0;
3697 name = ++ptr;
3698 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3699 {
3700 if (recno >= 0)
3701 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3702 recno * 10 + *ptr - '0' : -1;
3703 ptr++;
3704 }
3705 namelen = ptr - name;
3706
3707 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3708 {
3709 ptr--; /* Error offset */
3710 *errorcodeptr = ERR26;
3711 goto FAILED;
3712 }
3713
3714 /* Do no further checking in the pre-compile phase. */
3715
3716 if (lengthptr != NULL) break;
3717
3718 /* In the real compile we do the work of looking for the actual
3719 reference. If the string started with "+" or "-" we require the rest to
3720 be digits, in which case recno will be set. */
3721
3722 if (refsign > 0)
3723 {
3724 if (recno <= 0)
3725 {
3726 *errorcodeptr = ERR58;
3727 goto FAILED;
3728 }
3729 if (refsign == '-')
3730 {
3731 recno = cd->bracount - recno + 1;
3732 if (recno <= 0)
3733 {
3734 *errorcodeptr = ERR15;
3735 goto FAILED;
3736 }
3737 }
3738 else recno += cd->bracount;
3739 PUT2(code, 2+LINK_SIZE, recno);
3740 break;
3741 }
3742
3743 /* Otherwise (did not start with "+" or "-"), start by looking for the
3744 name. */
3745
3746 slot = cd->name_table;
3747 for (i = 0; i < cd->names_found; i++)
3748 {
3749 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3750 slot += cd->name_entry_size;
3751 }
3752
3753 /* Found a previous named subpattern */
3754
3755 if (i < cd->names_found)
3756 {
3757 recno = GET2(slot, 0);
3758 PUT2(code, 2+LINK_SIZE, recno);
3759 }
3760
3761 /* Search the pattern for a forward reference */
3762
3763 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3764 (options & PCRE_EXTENDED) != 0)) > 0)
3765 {
3766 PUT2(code, 2+LINK_SIZE, i);
3767 }
3768
3769 /* If terminator == 0 it means that the name followed directly after
3770 the opening parenthesis [e.g. (?(abc)...] and in this case there are
3771 some further alternatives to try. For the cases where terminator != 0
3772 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3773 now checked all the possibilities, so give an error. */
3774
3775 else if (terminator != 0)
3776 {
3777 *errorcodeptr = ERR15;
3778 goto FAILED;
3779 }
3780
3781 /* Check for (?(R) for recursion. Allow digits after R to specify a
3782 specific group number. */
3783
3784 else if (*name == 'R')
3785 {
3786 recno = 0;
3787 for (i = 1; i < namelen; i++)
3788 {
3789 if ((digitab[name[i]] & ctype_digit) == 0)
3790 {
3791 *errorcodeptr = ERR15;
3792 goto FAILED;
3793 }
3794 recno = recno * 10 + name[i] - '0';
3795 }
3796 if (recno == 0) recno = RREF_ANY;
3797 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3798 PUT2(code, 2+LINK_SIZE, recno);
3799 }
3800
3801 /* Similarly, check for the (?(DEFINE) "condition", which is always
3802 false. */
3803
3804 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3805 {
3806 code[1+LINK_SIZE] = OP_DEF;
3807 skipbytes = 1;
3808 }
3809
3810 /* Check for the "name" actually being a subpattern number. */
3811
3812 else if (recno > 0)
3813 {
3814 PUT2(code, 2+LINK_SIZE, recno);
3815 }
3816
3817 /* Either an unidentified subpattern, or a reference to (?(0) */
3818
3819 else
3820 {
3821 *errorcodeptr = (recno == 0)? ERR35: ERR15;
3822 goto FAILED;
3823 }
3824 break;
3825
3826
3827 /* ------------------------------------------------------------ */
3828 case '=': /* Positive lookahead */
3829 bravalue = OP_ASSERT;
3830 ptr++;
3831 break;
3832
3833
3834 /* ------------------------------------------------------------ */
3835 case '!': /* Negative lookahead */
3836 bravalue = OP_ASSERT_NOT;
3837 ptr++;
3838 break;
3839
3840
3841 /* ------------------------------------------------------------ */
3842 case '<': /* Lookbehind or named define */
3843 switch (ptr[1])
3844 {
3845 case '=': /* Positive lookbehind */
3846 bravalue = OP_ASSERTBACK;
3847 ptr += 2;
3848 break;
3849
3850 case '!': /* Negative lookbehind */
3851 bravalue = OP_ASSERTBACK_NOT;
3852 ptr += 2;
3853 break;
3854
3855 default: /* Could be name define, else bad */
3856 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3857 ptr++; /* Correct offset for error */
3858 *errorcodeptr = ERR24;
3859 goto FAILED;
3860 }
3861 break;
3862
3863
3864 /* ------------------------------------------------------------ */
3865 case '>': /* One-time brackets */
3866 bravalue = OP_ONCE;
3867 ptr++;
3868 break;
3869
3870
3871 /* ------------------------------------------------------------ */
3872 case 'C': /* Callout - may be followed by digits; */
3873 previous_callout = code; /* Save for later completion */
3874 after_manual_callout = 1; /* Skip one item before completing */
3875 *code++ = OP_CALLOUT;
3876 {
3877 int n = 0;
3878 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3879 n = n * 10 + *ptr - '0';
3880 if (*ptr != ')')
3881 {
3882 *errorcodeptr = ERR39;
3883 goto FAILED;
3884 }
3885 if (n > 255)
3886 {
3887 *errorcodeptr = ERR38;
3888 goto FAILED;
3889 }
3890 *code++ = n;
3891 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3892 PUT(code, LINK_SIZE, 0); /* Default length */
3893 code += 2 * LINK_SIZE;
3894 }
3895 previous = NULL;
3896 continue;
3897
3898
3899 /* ------------------------------------------------------------ */
3900 case 'P': /* Python-style named subpattern handling */
3901 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3902 {
3903 is_recurse = *ptr == '>';
3904 terminator = ')';
3905 goto NAMED_REF_OR_RECURSE;
3906 }
3907 else if (*ptr != '<') /* Test for Python-style definition */
3908 {
3909 *errorcodeptr = ERR41;
3910 goto FAILED;
3911 }
3912 /* Fall through to handle (?P< as (?< is handled */
3913
3914
3915 /* ------------------------------------------------------------ */
3916 DEFINE_NAME: /* Come here from (?< handling */
3917 case '\'':
3918 {
3919 terminator = (*ptr == '<')? '>' : '\'';
3920 name = ++ptr;
3921
3922 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3923 namelen = ptr - name;
3924
3925 /* In the pre-compile phase, just do a syntax check. */
3926
3927 if (lengthptr != NULL)
3928 {
3929 if (*ptr != terminator)
3930 {
3931 *errorcodeptr = ERR42;
3932 goto FAILED;
3933 }
3934 if (cd->names_found >= MAX_NAME_COUNT)
3935 {
3936 *errorcodeptr = ERR49;
3937 goto FAILED;
3938 }
3939 if (namelen + 3 > cd->name_entry_size)
3940 {
3941 cd->name_entry_size = namelen + 3;
3942 if (namelen > MAX_NAME_SIZE)
3943 {
3944 *errorcodeptr = ERR48;
3945 goto FAILED;
3946 }
3947 }
3948 }
3949
3950 /* In the real compile, create the entry in the table */
3951
3952 else
3953 {
3954 slot = cd->name_table;
3955 for (i = 0; i < cd->names_found; i++)
3956 {
3957 int crc = memcmp(name, slot+2, namelen);
3958 if (crc == 0)
3959 {
3960 if (slot[2+namelen] == 0)
3961 {
3962 if ((options & PCRE_DUPNAMES) == 0)
3963 {
3964 *errorcodeptr = ERR43;
3965 goto FAILED;
3966 }
3967 }
3968 else crc = -1; /* Current name is substring */
3969 }
3970 if (crc < 0)
3971 {
3972 memmove(slot + cd->name_entry_size, slot,
3973 (cd->names_found - i) * cd->name_entry_size);
3974 break;
3975 }
3976 slot += cd->name_entry_size;
3977 }
3978
3979 PUT2(slot, 0, cd->bracount + 1);
3980 memcpy(slot + 2, name, namelen);
3981 slot[2+namelen] = 0;
3982 }
3983 }
3984
3985 /* In both cases, count the number of names we've encountered. */
3986
3987 ptr++; /* Move past > or ' */
3988 cd->names_found++;
3989 goto NUMBERED_GROUP;
3990
3991
3992 /* ------------------------------------------------------------ */
3993 case '&': /* Perl recursion/subroutine syntax */
3994 terminator = ')';
3995 is_recurse = TRUE;
3996 /* Fall through */
3997
3998 /* We come here from the Python syntax above that handles both
3999 references (?P=name) and recursion (?P>name), as well as falling
4000 through from the Perl recursion syntax (?&name). */
4001
4002 NAMED_REF_OR_RECURSE:
4003 name = ++ptr;
4004 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4005 namelen = ptr - name;
4006
4007 /* In the pre-compile phase, do a syntax check and set a dummy
4008 reference number. */
4009
4010 if (lengthptr != NULL)
4011 {
4012 if (*ptr != terminator)
4013 {
4014 *errorcodeptr = ERR42;
4015 goto FAILED;
4016 }
4017 if (namelen > MAX_NAME_SIZE)
4018 {
4019 *errorcodeptr = ERR48;
4020 goto FAILED;
4021 }
4022 recno = 0;
4023 }
4024
4025 /* In the real compile, seek the name in the table */
4026
4027 else
4028 {
4029 slot = cd->name_table;
4030 for (i = 0; i < cd->names_found; i++)
4031 {
4032 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4033 slot += cd->name_entry_size;
4034 }
4035
4036 if (i < cd->names_found) /* Back reference */
4037 {
4038 recno = GET2(slot, 0);
4039 }
4040 else if ((recno = /* Forward back reference */
4041 find_parens(ptr, cd->bracount, name, namelen,
4042 (options & PCRE_EXTENDED) != 0)) <= 0)
4043 {
4044 *errorcodeptr = ERR15;
4045 goto FAILED;
4046 }
4047 }
4048
4049 /* In both phases, we can now go to the code that handles numerical
4050 recursion or backreferences. */
4051
4052 if (is_recurse) goto HANDLE_RECURSION;
4053 else goto HANDLE_REFERENCE;
4054
4055
4056 /* ------------------------------------------------------------ */
4057 case 'R': /* Recursion */
4058 ptr++; /* Same as (?0) */
4059 /* Fall through */
4060
4061
4062 /* ------------------------------------------------------------ */
4063 case '-': case '+':
4064 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4065 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4066 {
4067 const uschar *called;
4068
4069 if ((refsign = *ptr) == '+') ptr++;
4070 else if (refsign == '-')
4071 {
4072 if ((digitab[ptr[1]] & ctype_digit) == 0)
4073 goto OTHER_CHAR_AFTER_QUERY;
4074 ptr++;
4075 }
4076
4077 recno = 0;
4078 while((digitab[*ptr] & ctype_digit) != 0)
4079 recno = recno * 10 + *ptr++ - '0';
4080
4081 if (*ptr != ')')
4082 {
4083 *errorcodeptr = ERR29;
4084 goto FAILED;
4085 }
4086
4087 if (refsign == '-')
4088 {
4089 if (recno == 0)
4090 {
4091 *errorcodeptr = ERR58;
4092 goto FAILED;
4093 }
4094 recno = cd->bracount - recno + 1;
4095 if (recno <= 0)
4096 {
4097 *errorcodeptr = ERR15;
4098 goto FAILED;
4099 }
4100 }
4101 else if (refsign == '+')
4102 {
4103 if (recno == 0)
4104 {
4105 *errorcodeptr = ERR58;
4106 goto FAILED;
4107 }
4108 recno += cd->bracount;
4109 }
4110
4111 /* Come here from code above that handles a named recursion */
4112
4113 HANDLE_RECURSION:
4114
4115 previous = code;
4116 called = cd->start_code;
4117
4118 /* When we are actually compiling, find the bracket that is being
4119 referenced. Temporarily end the regex in case it doesn't exist before
4120 this point. If we end up with a forward reference, first check that
4121 the bracket does occur later so we can give the error (and position)
4122 now. Then remember this forward reference in the workspace so it can
4123 be filled in at the end. */
4124
4125 if (lengthptr == NULL)
4126 {
4127 *code = OP_END;
4128 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4129
4130 /* Forward reference */
4131
4132 if (called == NULL)
4133 {
4134 if (find_parens(ptr, cd->bracount, NULL, recno,
4135 (options & PCRE_EXTENDED) != 0) < 0)
4136 {
4137 *errorcodeptr = ERR15;
4138 goto FAILED;
4139 }
4140 called = cd->start_code + recno;
4141 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4142 }
4143
4144 /* If not a forward reference, and the subpattern is still open,
4145 this is a recursive call. We check to see if this is a left
4146 recursion that could loop for ever, and diagnose that case. */
4147
4148 else if (GET(called, 1) == 0 &&
4149 could_be_empty(called, code, bcptr, utf8))
4150 {
4151 *errorcodeptr = ERR40;
4152 goto FAILED;
4153 }
4154 }
4155
4156 /* Insert the recursion/subroutine item, automatically wrapped inside
4157 "once" brackets. Set up a "previous group" length so that a
4158 subsequent quantifier will work. */
4159
4160 *code = OP_ONCE;
4161 PUT(code, 1, 2 + 2*LINK_SIZE);
4162 code += 1 + LINK_SIZE;
4163
4164 *code = OP_RECURSE;
4165 PUT(code, 1, called - cd->start_code);
4166 code += 1 + LINK_SIZE;
4167
4168 *code = OP_KET;
4169 PUT(code, 1, 2 + 2*LINK_SIZE);
4170 code += 1 + LINK_SIZE;
4171
4172 length_prevgroup = 3 + 3*LINK_SIZE;
4173 }
4174
4175 /* Can't determine a first byte now */
4176
4177 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4178 continue;
4179
4180
4181 /* ------------------------------------------------------------ */
4182 default: /* Other characters: check option setting */
4183 OTHER_CHAR_AFTER_QUERY:
4184 set = unset = 0;
4185 optset = &set;
4186
4187 while (*ptr != ')' && *ptr != ':')
4188 {
4189 switch (*ptr++)
4190 {
4191 case '-': optset = &unset; break;
4192
4193 case 'J': /* Record that it changed in the external options */
4194 *optset |= PCRE_DUPNAMES;
4195 cd->external_options |= PCRE_JCHANGED;
4196 break;
4197
4198 case 'i': *optset |= PCRE_CASELESS; break;
4199 case 'm': *optset |= PCRE_MULTILINE; break;
4200 case 's': *optset |= PCRE_DOTALL; break;
4201 case 'x': *optset |= PCRE_EXTENDED; break;
4202 case 'U': *optset |= PCRE_UNGREEDY; break;
4203 case 'X': *optset |= PCRE_EXTRA; break;
4204
4205 default: *errorcodeptr = ERR12;
4206 ptr--; /* Correct the offset */
4207 goto FAILED;
4208 }
4209 }
4210
4211 /* Set up the changed option bits, but don't change anything yet. */
4212
4213 newoptions = (options | set) & (~unset);
4214
4215 /* If the options ended with ')' this is not the start of a nested
4216 group with option changes, so the options change at this level. If this
4217 item is right at the start of the pattern, the options can be
4218 abstracted and made external in the pre-compile phase, and ignored in
4219 the compile phase. This can be helpful when matching -- for instance in
4220 caseless checking of required bytes.
4221
4222 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4223 definitely *not* at the start of the pattern because something has been
4224 compiled. In the pre-compile phase, however, the code pointer can have
4225 that value after the start, because it gets reset as code is discarded
4226 during the pre-compile. However, this can happen only at top level - if
4227 we are within parentheses, the starting BRA will still be present. At
4228 any parenthesis level, the length value can be used to test if anything
4229 has been compiled at that level. Thus, a test for both these conditions
4230 is necessary to ensure we correctly detect the start of the pattern in
4231 both phases.
4232
4233 If we are not at the pattern start, compile code to change the ims
4234 options if this setting actually changes any of them. We also pass the
4235 new setting back so that it can be put at the start of any following
4236 branches, and when this group ends (if we are in a group), a resetting
4237 item can be compiled. */
4238
4239 if (*ptr == ')')
4240 {
4241 if (code == cd->start_code + 1 + LINK_SIZE &&
4242 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4243 {
4244 cd->external_options = newoptions;
4245 options = newoptions;
4246 }
4247 else
4248 {
4249 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4250 {
4251 *code++ = OP_OPT;
4252 *code++ = newoptions & PCRE_IMS;
4253 }
4254
4255 /* Change options at this level, and pass them back for use
4256 in subsequent branches. Reset the greedy defaults and the case
4257 value for firstbyte and reqbyte. */
4258
4259 *optionsptr = options = newoptions;
4260 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4261 greedy_non_default = greedy_default ^ 1;
4262 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4263 }
4264
4265 previous = NULL; /* This item can't be repeated */
4266 continue; /* It is complete */
4267 }
4268
4269 /* If the options ended with ':' we are heading into a nested group
4270 with possible change of options. Such groups are non-capturing and are
4271 not assertions of any kind. All we need to do is skip over the ':';
4272 the newoptions value is handled below. */
4273
4274 bravalue = OP_BRA;
4275 ptr++;
4276 } /* End of switch for character following (? */
4277 } /* End of (? handling */
4278
4279 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4280 all unadorned brackets become non-capturing and behave like (?:...)
4281 brackets. */
4282
4283 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4284 {
4285 bravalue = OP_BRA;
4286 }
4287
4288 /* Else we have a capturing group. */
4289
4290 else
4291 {
4292 NUMBERED_GROUP:
4293 cd->bracount += 1;
4294 PUT2(code, 1+LINK_SIZE, cd->bracount);
4295 skipbytes = 2;
4296 }
4297
4298 /* Process nested bracketed regex. Assertions may not be repeated, but
4299 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4300 non-register variable in order to be able to pass its address because some
4301 compilers complain otherwise. Pass in a new setting for the ims options if
4302 they have changed. */
4303
4304 previous = (bravalue >= OP_ONCE)? code : NULL;
4305 *code = bravalue;
4306 tempcode = code;
4307 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4308 length_prevgroup = 0; /* Initialize for pre-compile phase */
4309
4310 if (!compile_regex(
4311 newoptions, /* The complete new option state */
4312 options & PCRE_IMS, /* The previous ims option state */
4313 &tempcode, /* Where to put code (updated) */
4314 &ptr, /* Input pointer (updated) */
4315 errorcodeptr, /* Where to put an error message */
4316 (bravalue == OP_ASSERTBACK ||
4317 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4318 reset_bracount, /* True if (?| group */
4319 skipbytes, /* Skip over bracket number */
4320 &subfirstbyte, /* For possible first char */
4321 &subreqbyte, /* For possible last char */
4322 bcptr, /* Current branch chain */
4323 cd, /* Tables block */
4324 (lengthptr == NULL)? NULL : /* Actual compile phase */
4325 &length_prevgroup /* Pre-compile phase */
4326 ))
4327 goto FAILED;
4328
4329 /* At the end of compiling, code is still pointing to the start of the
4330 group, while tempcode has been updated to point past the end of the group
4331 and any option resetting that may follow it. The pattern pointer (ptr)
4332 is on the bracket. */
4333
4334 /* If this is a conditional bracket, check that there are no more than
4335 two branches in the group, or just one if it's a DEFINE group. We do this
4336 in the real compile phase, not in the pre-pass, where the whole group may
4337 not be available. */
4338
4339 if (bravalue == OP_COND && lengthptr == NULL)
4340 {
4341 uschar *tc = code;
4342 int condcount = 0;
4343
4344 do {
4345 condcount++;
4346 tc += GET(tc,1);
4347 }
4348 while (*tc != OP_KET);
4349
4350 /* A DEFINE group is never obeyed inline (the "condition" is always
4351 false). It must have only one branch. */
4352
4353 if (code[LINK_SIZE+1] == OP_DEF)
4354 {
4355 if (condcount > 1)
4356 {
4357 *errorcodeptr = ERR54;
4358 goto FAILED;
4359 }
4360 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4361 }
4362
4363 /* A "normal" conditional group. If there is just one branch, we must not
4364 make use of its firstbyte or reqbyte, because this is equivalent to an
4365 empty second branch. */
4366
4367 else
4368 {
4369 if (condcount > 2)
4370 {
4371 *errorcodeptr = ERR27;
4372 goto FAILED;
4373 }
4374 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4375 }
4376 }
4377
4378 /* Error if hit end of pattern */
4379
4380 if (*ptr != ')')
4381 {
4382 *errorcodeptr = ERR14;
4383 goto FAILED;
4384 }
4385
4386 /* In the pre-compile phase, update the length by the length of the nested
4387 group, less the brackets at either end. Then reduce the compiled code to
4388 just the brackets so that it doesn't use much memory if it is duplicated by
4389 a quantifier. */
4390
4391 if (lengthptr != NULL)
4392 {
4393 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4394 code++;
4395 PUTINC(code, 0, 1 + LINK_SIZE);
4396 *code++ = OP_KET;
4397 PUTINC(code, 0, 1 + LINK_SIZE);
4398 }
4399
4400 /* Otherwise update the main code pointer to the end of the group. */
4401
4402 else code = tempcode;
4403
4404 /* For a DEFINE group, required and first character settings are not
4405 relevant. */
4406
4407 if (bravalue == OP_DEF) break;
4408
4409 /* Handle updating of the required and first characters for other types of
4410 group. Update for normal brackets of all kinds, and conditions with two
4411 branches (see code above). If the bracket is followed by a quantifier with
4412 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4413 zerofirstbyte outside the main loop so that they can be accessed for the
4414 back off. */
4415
4416 zeroreqbyte = reqbyte;
4417 zerofirstbyte = firstbyte;
4418 groupsetfirstbyte = FALSE;
4419
4420 if (bravalue >= OP_ONCE)
4421 {
4422 /* If we have not yet set a firstbyte in this branch, take it from the
4423 subpattern, remembering that it was set here so that a repeat of more
4424 than one can replicate it as reqbyte if necessary. If the subpattern has
4425 no firstbyte, set "none" for the whole branch. In both cases, a zero
4426 repeat forces firstbyte to "none". */
4427
4428 if (firstbyte == REQ_UNSET)
4429 {
4430 if (subfirstbyte >= 0)
4431 {
4432 firstbyte = subfirstbyte;
4433 groupsetfirstbyte = TRUE;
4434 }
4435 else firstbyte = REQ_NONE;
4436 zerofirstbyte = REQ_NONE;
4437 }
4438
4439 /* If firstbyte was previously set, convert the subpattern's firstbyte
4440 into reqbyte if there wasn't one, using the vary flag that was in
4441 existence beforehand. */
4442
4443 else if (subfirstbyte >= 0 && subreqbyte < 0)
4444 subreqbyte = subfirstbyte | tempreqvary;
4445
4446 /* If the subpattern set a required byte (or set a first byte that isn't
4447 really the first byte - see above), set it. */
4448
4449 if (subreqbyte >= 0) reqbyte = subreqbyte;
4450 }
4451
4452 /* For a forward assertion, we take the reqbyte, if set. This can be
4453 helpful if the pattern that follows the assertion doesn't set a different
4454 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4455 for an assertion, however because it leads to incorrect effect for patterns
4456 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4457 of a firstbyte. This is overcome by a scan at the end if there's no
4458 firstbyte, looking for an asserted first char. */
4459
4460 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4461 break; /* End of processing '(' */
4462
4463
4464 /* ===================================================================*/
4465 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4466 are arranged to be the negation of the corresponding OP_values. For the
4467 back references, the values are ESC_REF plus the reference number. Only
4468 back references and those types that consume a character may be repeated.
4469 We can test for values between ESC_b and ESC_Z for the latter; this may
4470 have to change if any new ones are ever created. */
4471
4472 case '\\':
4473 tempptr = ptr;
4474 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4475 if (*errorcodeptr != 0) goto FAILED;
4476
4477 if (c < 0)
4478 {
4479 if (-c == ESC_Q) /* Handle start of quoted string */
4480 {
4481 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4482 else inescq = TRUE;
4483 continue;
4484 }
4485
4486 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4487
4488 /* For metasequences that actually match a character, we disable the
4489 setting of a first character if it hasn't already been set. */
4490
4491 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4492 firstbyte = REQ_NONE;
4493
4494 /* Set values to reset to if this is followed by a zero repeat. */
4495
4496 zerofirstbyte = firstbyte;
4497 zeroreqbyte = reqbyte;
4498
4499 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4500 We also support \k{name} (.NET syntax) */
4501
4502 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4503 {
4504 is_recurse = FALSE;
4505 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4506 goto NAMED_REF_OR_RECURSE;
4507 }
4508
4509 /* Back references are handled specially; must disable firstbyte if
4510 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4511 ':' later. */
4512
4513 if (-c >= ESC_REF)
4514 {
4515 recno = -c - ESC_REF;
4516
4517 HANDLE_REFERENCE: /* Come here from named backref handling */
4518 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4519 previous = code;
4520 *code++ = OP_REF;
4521 PUT2INC(code, 0, recno);
4522 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4523 if (recno > cd->top_backref) cd->top_backref = recno;
4524 }
4525
4526 /* So are Unicode property matches, if supported. */
4527
4528 #ifdef SUPPORT_UCP
4529 else if (-c == ESC_P || -c == ESC_p)
4530 {
4531 BOOL negated;
4532 int pdata;
4533 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4534 if (ptype < 0) goto FAILED;
4535 previous = code;
4536 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4537 *code++ = ptype;
4538 *code++ = pdata;
4539 }
4540 #else
4541
4542 /* If Unicode properties are not supported, \X, \P, and \p are not
4543 allowed. */
4544
4545 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4546 {
4547 *errorcodeptr = ERR45;
4548 goto FAILED;
4549 }
4550 #endif
4551
4552 /* For the rest (including \X when Unicode properties are supported), we
4553 can obtain the OP value by negating the escape value. */
4554
4555 else
4556 {
4557 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4558 *code++ = -c;
4559 }
4560 continue;
4561 }
4562
4563 /* We have a data character whose value is in c. In UTF-8 mode it may have
4564 a value > 127. We set its representation in the length/buffer, and then
4565 handle it as a data character. */
4566
4567 #ifdef SUPPORT_UTF8
4568 if (utf8 && c > 127)
4569 mclength = _pcre_ord2utf8(c, mcbuffer);
4570 else
4571 #endif
4572
4573 {
4574 mcbuffer[0] = c;
4575 mclength = 1;
4576 }
4577 goto ONE_CHAR;
4578
4579
4580 /* ===================================================================*/
4581 /* Handle a literal character. It is guaranteed not to be whitespace or #
4582 when the extended flag is set. If we are in UTF-8 mode, it may be a
4583 multi-byte literal character. */
4584
4585 default:
4586 NORMAL_CHAR:
4587 mclength = 1;
4588 mcbuffer[0] = c;
4589
4590 #ifdef SUPPORT_UTF8
4591 if (utf8 && c >= 0xc0)
4592 {
4593 while ((ptr[1] & 0xc0) == 0x80)
4594 mcbuffer[mclength++] = *(++ptr);
4595 }
4596 #endif
4597
4598 /* At this point we have the character's bytes in mcbuffer, and the length
4599 in mclength. When not in UTF-8 mode, the length is always 1. */
4600
4601 ONE_CHAR:
4602 previous = code;
4603 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4604 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4605
4606 /* Set the first and required bytes appropriately. If no previous first
4607 byte, set it from this character, but revert to none on a zero repeat.
4608 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4609 repeat. */
4610
4611 if (firstbyte == REQ_UNSET)
4612 {
4613 zerofirstbyte = REQ_NONE;
4614 zeroreqbyte = reqbyte;
4615
4616 /* If the character is more than one byte long, we can set firstbyte
4617 only if it is not to be matched caselessly. */
4618
4619 if (mclength == 1 || req_caseopt == 0)
4620 {
4621 firstbyte = mcbuffer[0] | req_caseopt;
4622 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4623 }
4624 else firstbyte = reqbyte = REQ_NONE;
4625 }
4626
4627 /* firstbyte was previously set; we can set reqbyte only the length is
4628 1 or the matching is caseful. */
4629
4630 else
4631 {
4632 zerofirstbyte = firstbyte;
4633 zeroreqbyte = reqbyte;
4634 if (mclength == 1 || req_caseopt == 0)
4635 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4636 }
4637
4638 break; /* End of literal character handling */
4639 }
4640 } /* end of big loop */
4641
4642
4643 /* Control never reaches here by falling through, only by a goto for all the
4644 error states. Pass back the position in the pattern so that it can be displayed
4645 to the user for diagnosing the error. */
4646
4647 FAILED:
4648 *ptrptr = ptr;
4649 return FALSE;
4650 }
4651
4652
4653
4654
4655 /*************************************************
4656 * Compile sequence of alternatives *
4657 *************************************************/
4658
4659 /* On entry, ptr is pointing past the bracket character, but on return it
4660 points to the closing bracket, or vertical bar, or end of string. The code
4661 variable is pointing at the byte into which the BRA operator has been stored.
4662 If the ims options are changed at the start (for a (?ims: group) or during any
4663 branch, we need to insert an OP_OPT item at the start of every following branch
4664 to ensure they get set correctly at run time, and also pass the new options
4665 into every subsequent branch compile.
4666
4667 This function is used during the pre-compile phase when we are trying to find
4668 out the amount of memory needed, as well as during the real compile phase. The
4669 value of lengthptr distinguishes the two phases.
4670
4671 Arguments:
4672 options option bits, including any changes for this subpattern
4673 oldims previous settings of ims option bits
4674 codeptr -> the address of the current code pointer
4675 ptrptr -> the address of the current pattern pointer
4676 errorcodeptr -> pointer to error code variable
4677 lookbehind TRUE if this is a lookbehind assertion
4678 reset_bracount TRUE to reset the count for each branch
4679 skipbytes skip this many bytes at start (for brackets and OP_COND)
4680 firstbyteptr place to put the first required character, or a negative number
4681 reqbyteptr place to put the last required character, or a negative number
4682 bcptr pointer to the chain of currently open branches
4683 cd points to the data block with tables pointers etc.
4684 lengthptr NULL during the real compile phase
4685 points to length accumulator during pre-compile phase
4686
4687 Returns: TRUE on success
4688 */
4689
4690 static BOOL
4691 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4692 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4693 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4694 int *lengthptr)
4695 {
4696 const uschar *ptr = *ptrptr;
4697 uschar *code = *codeptr;
4698 uschar *last_branch = code;
4699 uschar *start_bracket = code;
4700 uschar *reverse_count = NULL;
4701 int firstbyte, reqbyte;
4702 int branchfirstbyte, branchreqbyte;
4703 int length;
4704 int orig_bracount;
4705 int max_bracount;
4706 branch_chain bc;
4707
4708 bc.outer = bcptr;
4709 bc.current = code;
4710
4711 firstbyte = reqbyte = REQ_UNSET;
4712
4713 /* Accumulate the length for use in the pre-compile phase. Start with the
4714 length of the BRA and KET and any extra bytes that are required at the
4715 beginning. We accumulate in a local variable to save frequent testing of
4716 lenthptr for NULL. We cannot do this by looking at the value of code at the
4717 start and end of each alternative, because compiled items are discarded during
4718 the pre-compile phase so that the work space is not exceeded. */
4719
4720 length = 2 + 2*LINK_SIZE + skipbytes;
4721
4722 /* WARNING: If the above line is changed for any reason, you must also change
4723 the code that abstracts option settings at the start of the pattern and makes
4724 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4725 pre-compile phase to find out whether anything has yet been compiled or not. */
4726
4727 /* Offset is set zero to mark that this bracket is still open */
4728
4729 PUT(code, 1, 0);
4730 code += 1 + LINK_SIZE + skipbytes;
4731
4732 /* Loop for each alternative branch */
4733
4734 orig_bracount = max_bracount = cd->bracount;
4735 for (;;)
4736 {
4737 /* For a (?| group, reset the capturing bracket count so that each branch
4738 uses the same numbers. */
4739
4740 if (reset_bracount) cd->bracount = orig_bracount;
4741
4742 /* Handle a change of ims options at the start of the branch */
4743
4744 if ((options & PCRE_IMS) != oldims)
4745 {
4746 *code++ = OP_OPT;
4747 *code++ = options & PCRE_IMS;
4748 length += 2;
4749 }
4750
4751 /* Set up dummy OP_REVERSE if lookbehind assertion */
4752
4753 if (lookbehind)
4754 {
4755 *code++ = OP_REVERSE;
4756 reverse_count = code;
4757 PUTINC(code, 0, 0);
4758 length += 1 + LINK_SIZE;
4759 }
4760
4761 /* Now compile the branch; in the pre-compile phase its length gets added
4762 into the length. */
4763
4764 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4765 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4766 {
4767 *ptrptr = ptr;
4768 return FALSE;
4769 }
4770
4771 /* Keep the highest bracket count in case (?| was used and some branch
4772 has fewer than the rest. */
4773
4774 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
4775
4776 /* In the real compile phase, there is some post-processing to be done. */
4777
4778 if (lengthptr == NULL)
4779 {
4780 /* If this is the first branch, the firstbyte and reqbyte values for the
4781 branch become the values for the regex. */
4782
4783 if (*last_branch != OP_ALT)
4784 {
4785 firstbyte = branchfirstbyte;
4786 reqbyte = branchreqbyte;
4787 }
4788
4789 /* If this is not the first branch, the first char and reqbyte have to
4790 match the values from all the previous branches, except that if the
4791 previous value for reqbyte didn't have REQ_VARY set, it can still match,
4792 and we set REQ_VARY for the regex. */
4793
4794 else
4795 {
4796 /* If we previously had a firstbyte, but it doesn't match the new branch,
4797 we have to abandon the firstbyte for the regex, but if there was
4798 previously no reqbyte, it takes on the value of the old firstbyte. */
4799
4800 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4801 {
4802 if (reqbyte < 0) reqbyte = firstbyte;
4803 firstbyte = REQ_NONE;
4804 }
4805
4806 /* If we (now or from before) have no firstbyte, a firstbyte from the
4807 branch becomes a reqbyte if there isn't a branch reqbyte. */
4808
4809 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4810 branchreqbyte = branchfirstbyte;
4811
4812 /* Now ensure that the reqbytes match */
4813
4814 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4815 reqbyte = REQ_NONE;
4816 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4817 }
4818
4819 /* If lookbehind, check that this branch matches a fixed-length string, and
4820 put the length into the OP_REVERSE item. Temporarily mark the end of the
4821 branch with OP_END. */
4822
4823 if (lookbehind)
4824 {
4825 int fixed_length;
4826 *code = OP_END;
4827 fixed_length = find_fixedlength(last_branch, options);
4828 DPRINTF(("fixed length = %d\n", fixed_length));
4829 if (fixed_length < 0)
4830 {
4831 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4832 *ptrptr = ptr;
4833 return FALSE;
4834 }
4835 PUT(reverse_count, 0, fixed_length);
4836 }
4837 }
4838
4839 /* Reached end of expression, either ')' or end of pattern. In the real
4840 compile phase, go back through the alternative branches and reverse the chain
4841 of offsets, with the field in the BRA item now becoming an offset to the
4842 first alternative. If there are no alternatives, it points to the end of the
4843 group. The length in the terminating ket is always the length of the whole
4844 bracketed item. If any of the ims options were changed inside the group,
4845 compile a resetting op-code following, except at the very end of the pattern.
4846 Return leaving the pointer at the terminating char. */
4847
4848 if (*ptr != '|')
4849 {
4850 if (lengthptr == NULL)
4851 {
4852 int branch_length = code - last_branch;
4853 do
4854 {
4855 int prev_length = GET(last_branch, 1);
4856 PUT(last_branch, 1, branch_length);
4857 branch_length = prev_length;
4858 last_branch -= branch_length;
4859 }
4860 while (branch_length > 0);
4861 }
4862
4863 /* Fill in the ket */
4864
4865 *code = OP_KET;
4866 PUT(code, 1, code - start_bracket);
4867 code += 1 + LINK_SIZE;
4868
4869 /* Resetting option if needed */
4870
4871 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4872 {
4873 *code++ = OP_OPT;
4874 *code++ = oldims;
4875 length += 2;
4876 }
4877
4878 /* Retain the highest bracket number, in case resetting was used. */
4879
4880 cd->bracount = max_bracount;
4881
4882 /* Set values to pass back */
4883
4884 *codeptr = code;
4885 *ptrptr = ptr;
4886 *firstbyteptr = firstbyte;
4887 *reqbyteptr = reqbyte;
4888 if (lengthptr != NULL) *lengthptr += length;
4889 return TRUE;
4890 }
4891
4892 /* Another branch follows. In the pre-compile phase, we can move the code
4893 pointer back to where it was for the start of the first branch. (That is,
4894 pretend that each branch is the only one.)
4895
4896 In the real compile phase, insert an ALT node. Its length field points back
4897 to the previous branch while the bracket remains open. At the end the chain
4898 is reversed. It's done like this so that the start of the bracket has a
4899 zero offset until it is closed, making it possible to detect recursion. */
4900
4901 if (lengthptr != NULL)
4902 {
4903 code = *codeptr + 1 + LINK_SIZE + skipbytes;
4904 length += 1 + LINK_SIZE;
4905 }
4906 else
4907 {
4908 *code = OP_ALT;
4909 PUT(code, 1, code - last_branch);
4910 bc.current = last_branch = code;
4911 code += 1 + LINK_SIZE;
4912 }
4913
4914 ptr++;
4915 }
4916 /* Control never reaches here */
4917 }
4918
4919
4920
4921
4922 /*************************************************
4923 * Check for anchored expression *
4924 *************************************************/
4925
4926 /* Try to find out if this is an anchored regular expression. Consider each
4927 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4928 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4929 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4930 counts, since OP_CIRC can match in the middle.
4931
4932 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4933 This is the code for \G, which means "match at start of match position, taking
4934 into account the match offset".
4935
4936 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4937 because that will try the rest of the pattern at all possible matching points,
4938 so there is no point trying again.... er ....
4939
4940 .... except when the .* appears inside capturing parentheses, and there is a
4941 subsequent back reference to those parentheses. We haven't enough information
4942 to catch that case precisely.
4943
4944 At first, the best we could do was to detect when .* was in capturing brackets
4945 and the highest back reference was greater than or equal to that level.
4946 However, by keeping a bitmap of the first 31 back references, we can catch some
4947 of the more common cases more precisely.
4948
4949 Arguments:
4950 code points to start of expression (the bracket)
4951 options points to the options setting
4952 bracket_map a bitmap of which brackets we are inside while testing; this
4953 handles up to substring 31; after that we just have to take
4954 the less precise approach
4955 backref_map the back reference bitmap
4956
4957 Returns: TRUE or FALSE
4958 */
4959
4960 static BOOL
4961 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4962 unsigned int backref_map)
4963 {
4964 do {
4965 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4966 options, PCRE_MULTILINE, FALSE);
4967 register int op = *scode;
4968
4969 /* Non-capturing brackets */
4970
4971 if (op == OP_BRA)
4972 {
4973 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4974 }
4975
4976 /* Capturing brackets */
4977
4978 else if (op == OP_CBRA)
4979 {
4980 int n = GET2(scode, 1+LINK_SIZE);
4981 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4982 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4983 }
4984
4985 /* Other brackets */
4986
4987 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4988 {
4989 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4990 }
4991
4992 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4993 are or may be referenced. */
4994
4995 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4996 op == OP_TYPEPOSSTAR) &&
4997 (*options & PCRE_DOTALL) != 0)
4998 {
4999 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5000 }
5001
5002 /* Check for explicit anchoring */
5003
5004 else if (op != OP_SOD && op != OP_SOM &&
5005 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5006 return FALSE;
5007 code += GET(code, 1);
5008 }
5009 while (*code == OP_ALT); /* Loop for each alternative */
5010 return TRUE;
5011 }
5012
5013
5014
5015 /*************************************************
5016 * Check for starting with ^ or .* *
5017 *************************************************/
5018
5019 /* This is called to find out if every branch starts with ^ or .* so that
5020 "first char" processing can be done to speed things up in multiline
5021 matching and for non-DOTALL patterns that start with .* (which must start at
5022 the beginning or after \n). As in the case of is_anchored() (see above), we
5023 have to take account of back references to capturing brackets that contain .*
5024 because in that case we can't make the assumption.
5025
5026 Arguments:
5027 code points to start of expression (the bracket)
5028 bracket_map a bitmap of which brackets we are inside while testing; this
5029 handles up to substring 31; after that we just have to take
5030 the less precise approach
5031 backref_map the back reference bitmap
5032
5033 Returns: TRUE or FALSE
5034 */
5035
5036 static BOOL
5037 is_startline(const uschar *code, unsigned int bracket_map,
5038 unsigned int backref_map)
5039 {
5040 do {
5041 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5042 NULL, 0, FALSE);
5043 register int op = *scode;
5044
5045 /* Non-capturing brackets */
5046
5047 if (op == OP_BRA)
5048 {
5049 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5050 }
5051
5052 /* Capturing brackets */
5053
5054 else if (op == OP_CBRA)
5055 {
5056 int n = GET2(scode, 1+LINK_SIZE);
5057 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5058 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5059 }
5060
5061 /* Other brackets */
5062
5063 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5064 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5065
5066 /* .* means "start at start or after \n" if it isn't in brackets that
5067 may be referenced. */
5068
5069 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5070 {
5071 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5072 }
5073
5074 /* Check for explicit circumflex */
5075
5076 else if (op != OP_CIRC) return FALSE;
5077
5078 /* Move on to the next alternative */
5079
5080 code += GET(code, 1);
5081 }
5082 while (*code == OP_ALT); /* Loop for each alternative */
5083 return TRUE;
5084 }
5085
5086
5087
5088 /*************************************************
5089 * Check for asserted fixed first char *
5090 *************************************************/
5091
5092 /* During compilation, the "first char" settings from forward assertions are
5093 discarded, because they can cause conflicts with actual literals that follow.
5094 However, if we end up without a first char setting for an unanchored pattern,
5095 it is worth scanning the regex to see if there is an initial asserted first
5096 char. If all branches start with the same asserted char, or with a bracket all
5097 of whose alternatives start with the same asserted char (recurse ad lib), then
5098 we return that char, otherwise -1.
5099
5100 Arguments:
5101 code points to start of expression (the bracket)
5102 options pointer to the options (used to check casing changes)
5103 inassert TRUE if in an assertion
5104
5105 Returns: -1 or the fixed first char
5106 */
5107
5108 static int
5109 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5110 {
5111 register int c = -1;
5112 do {
5113 int d;
5114 const uschar *scode =
5115 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5116 register int op = *scode;
5117
5118 switch(op)
5119 {
5120 default:
5121 return -1;
5122
5123 case OP_BRA:
5124 case OP_CBRA:
5125 case OP_ASSERT:
5126 case OP_ONCE:
5127 case OP_COND:
5128 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5129 return -1;
5130 if (c < 0) c = d; else if (c != d) return -1;
5131 break;
5132
5133 case OP_EXACT: /* Fall through */
5134 scode += 2;
5135
5136 case OP_CHAR:
5137 case OP_CHARNC:
5138 case OP_PLUS:
5139 case OP_MINPLUS:
5140 case OP_POSPLUS:
5141 if (!inassert) return -1;
5142 if (c < 0)
5143 {
5144 c = scode[1];
5145 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5146 }
5147 else if (c != scode[1]) return -1;
5148 break;
5149 }
5150
5151 code += GET(code, 1);
5152 }
5153 while (*code == OP_ALT);
5154 return c;
5155 }
5156
5157
5158
5159 /*************************************************
5160 * Compile a Regular Expression *
5161 *************************************************/
5162
5163 /* This function takes a string and returns a pointer to a block of store
5164 holding a compiled version of the expression. The original API for this
5165 function had no error code return variable; it is retained for backwards
5166 compatibility. The new function is given a new name.
5167
5168 Arguments:
5169 pattern the regular expression
5170 options various option bits
5171 errorcodeptr pointer to error code variable (pcre_compile2() only)
5172 can be NULL if you don't want a code value
5173 errorptr pointer to pointer to error text
5174 erroroffset ptr offset in pattern where error was detected
5175 tables pointer to character tables or NULL
5176
5177 Returns: pointer to compiled data block, or NULL on error,
5178 with errorptr and erroroffset set
5179 */
5180
5181 PCRE_EXP_DEFN pcre *
5182 pcre_compile(const char *pattern, int options, const char **errorptr,
5183 int *erroroffset, const unsigned char *tables)
5184 {
5185 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5186 }
5187
5188
/* pcre_compile2(): compile a pattern, optionally reporting a numeric error
code as well as an error message.

The work is done by two calls to compile_regex(). The first is given &length
as its final argument and compiles into the local cworkspace buffer, purely to
accumulate the amount of memory needed. The second is given NULL as its final
argument and compiles into the block obtained from pcre_malloc.

Arguments:
  pattern       the regular expression; it is passed to strlen(), so it must
                  be a zero-terminated string
  options       various option bits; any bit outside PUBLIC_OPTIONS gives ERR17
  errorcodeptr  where to put the numeric error code, or NULL if not wanted;
                  when non-NULL it is set to ERR0 before compilation starts
  errorptr      where to put the pointer to the error text; if this is NULL
                  the function returns NULL at once
  erroroffset   where to put the offset in the pattern at which an error was
                  detected; if this is NULL the function fails with ERR16
  tables        pointer to character tables, or NULL for _pcre_default_tables

Returns:        pointer to the compiled data block, or NULL on error
*/

5189 PCRE_EXP_DEFN pcre *
5190 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5191 const char **errorptr, int *erroroffset, const unsigned char *tables)
5192 {
5193 real_pcre *re;
5194 int length = 1; /* For final END opcode */
5195 int firstbyte, reqbyte, newline;
5196 int errorcode = 0;
5197 #ifdef SUPPORT_UTF8
5198 BOOL utf8;
5199 #endif
5200 size_t size;
5201 uschar *code;
5202 const uschar *codestart;
5203 const uschar *ptr;
5204 compile_data compile_block;
5205 compile_data *cd = &compile_block;
5206
5207 /* This space is used for "compiling" into during the first phase, when we are
5208 computing the amount of memory that is needed. Compiled items are thrown away
5209 as soon as possible, so that a fairly large buffer should be sufficient for
5210 this purpose. The same space is used in the second phase for remembering where
5211 to fill in forward references to subpatterns. */
5212
5213 uschar cworkspace[COMPILE_WORK_SIZE];
5214
5215
5216 /* Set this early so that early errors get offset 0. */
5217
5218 ptr = (const uschar *)pattern;
5219
5220 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5221 can do is just return NULL, but we can set a code value if there is a code
5222 pointer. */
5223
5224 if (errorptr == NULL)
5225 {
/* Note that 99 is written here as a bare literal, not as one of the ERRnn
names that are used for every other error in this function. */
5226 if (errorcodeptr != NULL) *errorcodeptr = 99;
5227 return NULL;
5228 }
5229
5230 *errorptr = NULL;
5231 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5232
5233 /* However, we can give a message for this error */
5234
5235 if (erroroffset == NULL)
5236 {
5237 errorcode = ERR16;
/* The RETURN2 label is used rather than RETURN because the statement at
RETURN stores through erroroffset, which is NULL on this path. */
5238 goto PCRE_EARLY_ERROR_RETURN2;
5239 }
5240
5241 *erroroffset = 0;
5242
5243 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5244
5245 #ifdef SUPPORT_UTF8
5246 utf8 = (options & PCRE_UTF8) != 0;
5247 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5248 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5249 {
5250 errorcode = ERR44;
/* *erroroffset already holds the non-negative value returned by
_pcre_valid_utf8(); jumping to RETURN2 avoids overwriting it. */
5251 goto PCRE_EARLY_ERROR_RETURN2;
5252 }
5253 #else
5254 if ((options & PCRE_UTF8) != 0)
5255 {
5256 errorcode = ERR32;
5257 goto PCRE_EARLY_ERROR_RETURN;
5258 }
5259 #endif
5260
5261 if ((options & ~PUBLIC_OPTIONS) != 0)
5262 {
5263 errorcode = ERR17;
5264 goto PCRE_EARLY_ERROR_RETURN;
5265 }
5266
5267 /* Set up pointers to the individual character tables */
5268
5269 if (tables == NULL) tables = _pcre_default_tables;
5270 cd->lcc = tables + lcc_offset;
5271 cd->fcc = tables + fcc_offset;
5272 cd->cbits = tables + cbits_offset;
5273 cd->ctypes = tables + ctypes_offset;
5274
5275 /* Handle different types of newline. The three bits give seven cases. The
5276 current code allows for fixed one- or two-byte sequences, plus "any" and
5277 "anycrlf". */
5278
5279 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5280 {
5281 case 0: newline = NEWLINE; break; /* Compile-time default */
5282 case PCRE_NEWLINE_CR: newline = '\r'; break;
5283 case PCRE_NEWLINE_LF: newline = '\n'; break;
5284 case PCRE_NEWLINE_CR+
5285 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5286 case PCRE_NEWLINE_ANY: newline = -1; break;
5287 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5288 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5289 }
5290
5291 if (newline == -2)
5292 {
5293 cd->nltype = NLTYPE_ANYCRLF;
5294 }
5295 else if (newline < 0)
5296 {
5297 cd->nltype = NLTYPE_ANY;
5298 }
5299 else
5300 {
5301 cd->nltype = NLTYPE_FIXED;
/* A value above 255 is a two-byte sequence, packed as (first << 8) | second
in the same way as the CR+LF case of the switch above. */
5302 if (newline > 255)
5303 {
5304 cd->nllen = 2;
5305 cd->nl[0] = (newline >> 8) & 255;
5306 cd->nl[1] = newline & 255;
5307 }
5308 else
5309 {
5310 cd->nllen = 1;
5311 cd->nl[0] = newline;
5312 }
5313 }
5314
5315 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5316 references to help in deciding whether (.*) can be treated as anchored or not.
5317 */
5318
5319 cd->top_backref = 0;
5320 cd->backref_map = 0;
5321
5322 /* Reflect pattern for debugging output */
5323
5324 DPRINTF(("------------------------------------------------------------------\n"));
5325 DPRINTF(("%s\n", pattern));
5326
5327 /* Pretend to compile the pattern while actually just accumulating the length
5328 of memory required. This behaviour is triggered by passing a non-NULL final
5329 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5330 to compile parts of the pattern into; the compiled code is discarded when it is
5331 no longer needed, so hopefully this workspace will never overflow, though there
5332 is a test for its doing so. */
5333
5334 cd->bracount = 0;
5335 cd->names_found = 0;
5336 cd->name_entry_size = 0;
5337 cd->name_table = NULL;
5338 cd->start_workspace = cworkspace;
5339 cd->start_code = cworkspace;
5340 cd->hwm = cworkspace;
5341 cd->start_pattern = (const uschar *)pattern;
5342 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5343 cd->req_varyopt = 0;
5344 cd->nopartial = FALSE;
5345 cd->external_options = options;
5346
5347 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5348 don't need to look at the result of the function here. The initial options have
5349 been put into the cd block so that they can be changed if an option setting is
5350 found within the regex right at the beginning. Bringing initial option settings
5351 outside can help speed up starting point checks. */
5352
5353 code = cworkspace;
5354 *code = OP_BRA;
5355 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5356 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5357 &length);
5358 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5359
5360 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5361 cd->hwm - cworkspace));
5362
5363 if (length > MAX_PATTERN_SIZE)
5364 {
5365 errorcode = ERR20;
5366 goto PCRE_EARLY_ERROR_RETURN;
5367 }
5368
5369 /* Compute the size of data block needed and get it, either from malloc or
5370 externally provided function. Integer overflow should no longer be possible
5371 because nowadays we limit the maximum value of cd->names_found and
5372 cd->name_entry_size. */
5373
5374 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5375 re = (real_pcre *)(pcre_malloc)(size);
5376
5377 if (re == NULL)
5378 {
5379 errorcode = ERR21;
5380 goto PCRE_EARLY_ERROR_RETURN;
5381 }
5382
5383 /* Put in the magic number, and save the sizes, initial options, and character
5384 table pointer. NULL is used for the default character tables. The nullpad field
5385 is at the end; it's there to help in the case when a regex compiled on a system
5386 with 4-byte pointers is run on another with 8-byte pointers. */
5387
5388 re->magic_number = MAGIC_NUMBER;
5389 re->size = size;
5390 re->options = cd->external_options;
5391 re->dummy1 = 0;
5392 re->first_byte = 0;
5393 re->req_byte = 0;
5394 re->name_table_offset = sizeof(real_pcre);
5395 re->name_entry_size = cd->name_entry_size;
5396 re->name_count = cd->names_found;
5397 re->ref_count = 0;
5398 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5399 re->nullpad = NULL;
5400
5401 /* The starting points of the name/number translation table and of the code are
5402 passed around in the compile data block. The start/end pattern and initial
5403 options are already set from the pre-compile phase, as is the name_entry_size
5404 field. Reset the bracket count and the names_found field. Also reset the hwm
5405 field; this time it's used for remembering forward references to subpatterns.
5406 */
5407
5408 cd->bracount = 0;
5409 cd->names_found = 0;
5410 cd->name_table = (uschar *)re + re->name_table_offset;
5411 codestart = cd->name_table + re->name_entry_size * re->name_count;
5412 cd->start_code = codestart;
5413 cd->hwm = cworkspace;
5414 cd->req_varyopt = 0;
5415 cd->nopartial = FALSE;
5416
5417 /* Set up a starting, non-extracting bracket, then compile the expression. On
5418 error, errorcode will be set non-zero, so we don't need to look at the result
5419 of the function here. */
5420
5421 ptr = (const uschar *)pattern;
5422 code = (uschar *)codestart;
5423 *code = OP_BRA;
5424 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5425 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5426 re->top_bracket = cd->bracount;
5427 re->top_backref = cd->top_backref;
5428
5429 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5430
5431 /* If not reached end of pattern on success, there's an excess bracket. */
5432
5433 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5434
5435 /* Fill in the terminating state and check for disastrous overflow, but
5436 if debugging, leave the test till after things are printed out. */
5437
5438 *code++ = OP_END;
5439
5440 #ifndef DEBUG
5441 if (code - codestart > length) errorcode = ERR23;
5442 #endif
5443
5444 /* Fill in any forward references that are required. */
5445
/* cd->hwm is stepped back LINK_SIZE bytes at a time until it reaches the start
of cworkspace. Each entry read from it is an offset into the compiled code; the
value found at that offset is passed to find_bracket() as a group number, and
is then overwritten with the offset of the group that was found. */
5446 while (errorcode == 0 && cd->hwm > cworkspace)
5447 {
5448 int offset, recno;
5449 const uschar *groupptr;
5450 cd->hwm -= LINK_SIZE;
5451 offset = GET(cd->hwm, 0);
5452 recno = GET(codestart, offset);
5453 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5454 if (groupptr == NULL) errorcode = ERR53;
5455 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5456 }
5457
5458 /* Give an error if there's back reference to a non-existent capturing
5459 subpattern. */
5460
5461 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5462
5463 /* Failed to compile, or error while post-processing */
5464
/* The two labels below sit inside this block so that failures detected before
"re" was allocated can jump in past the pcre_free() call. RETURN records the
current position of ptr as the error offset; RETURN2 leaves *erroroffset
untouched. */
5465 if (errorcode != 0)
5466 {
5467 (pcre_free)(re);
5468 PCRE_EARLY_ERROR_RETURN:
5469 *erroroffset = ptr - (const uschar *)pattern;
5470 PCRE_EARLY_ERROR_RETURN2:
5471 *errorptr = error_texts[errorcode];
5472 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5473 return NULL;
5474 }
5475
5476 /* If the anchored option was not passed, set the flag if we can determine that
5477 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5478 as starting with .* when DOTALL is set).
5479
5480 Otherwise, if we know what the first byte has to be, save it, because that
5481 speeds up unanchored matches no end. If not, see if we can set the
5482 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5483 start with ^. and also when all branches start with .* for non-DOTALL matches.
5484 */
5485
5486 if ((re->options & PCRE_ANCHORED) == 0)
5487 {
5488 int temp_options = re->options; /* May get changed during these scans */
5489 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5490 re->options |= PCRE_ANCHORED;
5491 else
5492 {
5493 if (firstbyte < 0)
5494 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5495 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5496 {
5497 int ch = firstbyte & 255;
/* When the caseless flag is set but cd->fcc[ch] == ch, only the low byte is
stored; otherwise firstbyte is stored unchanged. */
5498 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5499 cd->fcc[ch] == ch)? ch : firstbyte;
5500 re->options |= PCRE_FIRSTSET;
5501 }
5502 else if (is_startline(codestart, 0, cd->backref_map))
5503 re->options |= PCRE_STARTLINE;
5504 }
5505 }
5506
5507 /* For an anchored pattern, we use the "required byte" only if it follows a
5508 variable length item in the regex. Remove the caseless flag for non-caseable
5509 bytes. */
5510
5511 if (reqbyte >= 0 &&
5512 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5513 {
5514 int ch = reqbyte & 255;
/* Unlike the first_byte case above, only the REQ_CASELESS bit is cleared here;
any other bits above the low byte of reqbyte are kept. */
5515 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5516 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5517 re->options |= PCRE_REQCHSET;
5518 }
5519
5520 /* Print out the compiled data if debugging is enabled. This is never the
5521 case when building a production library. */
5522
5523 #ifdef DEBUG
5524
5525 printf("Length = %d top_bracket = %d top_backref = %d\n",
5526 length, re->top_bracket, re->top_backref);
5527
5528 if (re->options != 0)
5529 {
5530 printf("%s%s%s%s%s%s%s%s%s\n",
5531 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5532 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5533 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5534 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5535 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5536 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5537 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5538 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5539 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5540 }
5541
5542 if ((re->options & PCRE_FIRSTSET) != 0)
5543 {
5544 int ch = re->first_byte & 255;
5545 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5546 "" : " (caseless)";
5547 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5548 else printf("First char = \\x%02x%s\n", ch, caseless);
5549 }
5550
5551 if ((re->options & PCRE_REQCHSET) != 0)
5552 {
5553 int ch = re->req_byte & 255;
5554 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5555 "" : " (caseless)";
5556 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5557 else printf("Req char = \\x%02x%s\n", ch, caseless);
5558 }
5559
5560 pcre_printint(re, stdout, TRUE);
5561
5562 /* This check is done here in the debugging case so that the code that
5563 was compiled can be seen. */
5564
5565 if (code - codestart > length)
5566 {
5567 (pcre_free)(re);
5568 *errorptr = error_texts[ERR23];
5569 *erroroffset = ptr - (uschar *)pattern;
5570 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5571 return NULL;
5572 }
5573 #endif /* DEBUG */
5574
5575 return (pcre *)re;
5576 }
5577
5578 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12