/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 171 - (show annotations) (download)
Mon Jun 4 14:28:58 2007 UTC (7 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 175192 byte(s)
Support \k{name} and \g{name} a la Perl 5.10.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #define NLBLOCK cd /* Block containing newline information */
46 #define PSSTART start_pattern /* Field containing processed string start */
47 #define PSEND end_pattern /* Field containing processed string end */
48
49
50 #include "pcre_internal.h"
51
52
53 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54 used by pcretest. DEBUG is not defined when building a production library. */
55
56 #ifdef DEBUG
57 #include "pcre_printint.src"
58 #endif
59
60
61 /*************************************************
62 * Code parameters and static tables *
63 *************************************************/
64
65 /* This value specifies the size of stack workspace that is used during the
66 first pre-compile phase that determines how much memory is required. The regex
67 is partly compiled into this space, but the compiled parts are discarded as
68 soon as they can be, so that hopefully there will never be an overrun. The code
69 does, however, check for an overrun. The largest amount I've seen used is 218,
70 so this number is very generous.
71
72 The same workspace is used during the second, actual compile phase for
73 remembering forward references to groups so that they can be filled in at the
74 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75 is 4 there is plenty of room. */
76
77 #define COMPILE_WORK_SIZE (4096)
78
79
80 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81 are simple data values; negative values are for special things like \d and so
82 on. Zero means further processing is needed (for things like \x), or the escape
83 is invalid. */
84
85 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
/* ASCII variant. check_escape() indexes this table with (c - '0'), so entry 0
is '0' and the last entry is 'z'. A zero entry sends the character on to the
switch in check_escape(); that is how the digits and the letters c, g and x
get their extra processing. */
static const short int escapes[] = {
     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
   '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
     0,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
-ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
-ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
   '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
     0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
-ESC_p,      0,  ESC_r, -ESC_s, ESC_tee,     0,      0, -ESC_w,   /* p - w */
     0,      0, -ESC_z                                            /* x - z */
};
98
99 #else /* This is the "abnormal" table for EBCDIC systems */
/* EBCDIC variant. check_escape() indexes this table with (c - 0x48); the
comment at the start of each row is the character code of its first entry.

NOTE(review): 0x93 ('l') holds the literal 'l', so \l is returned as a literal
here, whereas the ASCII table holds 0 for 'l' and check_escape() then gives
ERR37 - confirm which is intended. There is also no -ESC_K entry in this
table (the ASCII table has one for 'K') - confirm whether \K is meant to be
unsupported on EBCDIC systems. */
static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
125 #endif
126
127
128 /* Tables of names of POSIX character classes and their lengths. The list is
129 terminated by a zero length entry. The first three must be alpha, lower, upper,
130 as this is assumed for handling case independence. */
131
/* POSIX class names. There are 14 entries and no terminating element here;
the end of the list is marked by the zero in posix_name_lengths. The order
matches the rows of posix_class_maps. */
static const char *const posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  "print", "punct", "space", "word",  "xdigit" };
136
/* Length of each entry in posix_names, in the same order, followed by a zero
that terminates the list. */
static const uschar posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139
140 /* Table of class bit maps for each POSIX class. Each class is formed from a
141 base map, with an optional addition or removal of another map. Then, for some
142 classes, there is some additional tweaking: for [:blank:] the vertical space
143 characters are removed, and for [:alpha:] and [:alnum:] the underscore
144 character is removed. The triples in the table consist of the base map offset,
145 second map offset or -1 if no second map, and a non-negative value for map
146 addition or a negative value for map subtraction (if there are two maps). The
147 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148 remove vertical space characters, 2 => remove underscore. */
149
/* Three ints per class, one row per entry of posix_names and in the same
order: base map offset, second map offset (or -1), and the combined
add/subtract sign and tweak code described in the comment above. */
static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha - word minus digit, no underscore */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii - print plus cntrl */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
166
167
/* Two-level stringification. XSTRING(s) lets its argument be macro-expanded
before STRING turns it into a string literal, so that the value of a macro such
as MAX_NAME_SIZE (rather than its name) appears in the error texts below. */
#define STRING(a)  # a
#define XSTRING(s) STRING(s)
170
171 /* The texts of compile-time error messages. These are "char *" because they
172 are passed to the outside world. Do not ever re-use any error number, because
173 they are documented. Always add a new error instead. Messages marked DEAD below
174 are no longer used. */
175
/* One message per compile-time error, indexed by error number; entry 0 is the
"no error" text. The numbered comments mark every fifth index to make counting
easier. New messages must be appended at the end so that existing numbers
never change. */
static const char *error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
};
248
249
250 /* Table to identify digits and hex digits. This is used when compiling
251 patterns. Note that the tables in chartables are dependent on the locale, and
252 may mark arbitrary characters as digits - but the PCRE compiling code expects
253 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
254 a private table here. It costs 256 bytes, but it is a lot faster than doing
255 character value tests (at least in some simple cases I timed), and in some
256 applications one wants PCRE to compile efficiently as well as match
257 efficiently.
258
259 For convenience, we use the same bit definitions as in chartables:
260
261 0x04 decimal digit
262 0x08 hexadecimal digit
263
264 Then we can use ctype_digit and ctype_xdigit in the code. */
265
266 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* ASCII variant, indexed by character code (256 entries). 0x0c (decimal digit
plus hexadecimal digit) marks '0'-'9'; 0x08 (hexadecimal digit only) marks
'A'-'F' and 'a'-'f'; every other entry is zero. */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - ' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
301
302 #else /* This is the "abnormal" case, for EBCDIC systems */
/* EBCDIC variant, indexed by character code (256 entries). The bit meanings
are the same as in the ASCII table: 0x0c marks the decimal digits (0xF0-0xF9)
and 0x08 marks the six hexadecimal letters in each case (0x81-0x86 and
0xC1-0xC6). The second column of each row comment is the hexadecimal code of
the first entry of every other row. */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
337
/* EBCDIC-only table, indexed by character code (256 entries). check_escape()
tests entries against the mask 0x0E to decide whether the character after a
backslash is alphameric; in this table the letters hold 0x12 or 0x1a and the
digits hold 0x1c, so all of them pass that test.
NOTE(review): the meaning of the individual bits is presumably that of the
ctype bits in chartables - confirm against pcre_internal.h. */
static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
371 #endif
372
373
374 /* Definition to allow mutual recursion */
375
376 static BOOL
377 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
378 int *, branch_chain *, compile_data *, int *);
379
380
381
382 /*************************************************
383 * Handle escapes *
384 *************************************************/
385
386 /* This function is called when a \ has been encountered. It either returns a
387 positive value for a simple escape such as \n, or a negative value which
388 encodes one of the more complicated things such as \d. A backreference to group
389 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391 ptr is pointing at the \. On exit, it is on the final character of the escape
392 sequence.
393
394 Arguments:
395 ptrptr points to the pattern position pointer
396 errorcodeptr points to the errorcode variable
397 bracount number of previous extracting brackets
398 options the options bits
399 isclass TRUE if inside a character class
400
401 Returns: zero or positive => a data character
402 negative => a special escape sequence
403 on error, errorptr is set
404 */
405
406 static int
407 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
408 int options, BOOL isclass)
409 {
410 BOOL utf8 = (options & PCRE_UTF8) != 0;
411 const uschar *ptr = *ptrptr + 1;
412 int c, i;
413
414 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
415 ptr--; /* Set pointer back to the last byte */
416
417 /* If backslash is at the end of the pattern, it's an error. */
418
419 if (c == 0) *errorcodeptr = ERR1;
420
421 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
422 a table. A non-zero result is something that can be returned immediately.
423 Otherwise further processing may be required. */
424
425 #ifndef EBCDIC /* ASCII coding */
426 else if (c < '0' || c > 'z') {} /* Not alphameric */
427 else if ((i = escapes[c - '0']) != 0) c = i;
428
429 #else /* EBCDIC coding */
430 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
431 else if ((i = escapes[c - 0x48]) != 0) c = i;
432 #endif
433
434 /* Escapes that need further processing, or are illegal. */
435
436 else
437 {
438 const uschar *oldptr;
439 BOOL braced, negated;
440
441 switch (c)
442 {
443 /* A number of Perl escapes are not handled by PCRE. We give an explicit
444 error. */
445
446 case 'l':
447 case 'L':
448 case 'N':
449 case 'u':
450 case 'U':
451 *errorcodeptr = ERR37;
452 break;
453
454 /* \g must be followed by a number, either plain or braced. If positive, it
455 is an absolute backreference. If negative, it is a relative backreference.
456 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457 reference to a named group. This is part of Perl's movement towards a
458 unified syntax for back references. As this is synonymous with \k{name}, we
459 fudge it up by pretending it really was \k. */
460
461 case 'g':
462 if (ptr[1] == '{')
463 {
464 const uschar *p;
465 for (p = ptr+2; *p != 0 && *p != '}'; p++)
466 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467 if (*p != 0 && *p != '}')
468 {
469 c = -ESC_k;
470 break;
471 }
472 braced = TRUE;
473 ptr++;
474 }
475 else braced = FALSE;
476
477 if (ptr[1] == '-')
478 {
479 negated = TRUE;
480 ptr++;
481 }
482 else negated = FALSE;
483
484 c = 0;
485 while ((digitab[ptr[1]] & ctype_digit) != 0)
486 c = c * 10 + *(++ptr) - '0';
487
488 if (c == 0 || (braced && *(++ptr) != '}'))
489 {
490 *errorcodeptr = ERR57;
491 return 0;
492 }
493
494 if (negated)
495 {
496 if (c > bracount)
497 {
498 *errorcodeptr = ERR15;
499 return 0;
500 }
501 c = bracount - (c - 1);
502 }
503
504 c = -(ESC_REF + c);
505 break;
506
507 /* The handling of escape sequences consisting of a string of digits
508 starting with one that is not zero is not straightforward. By experiment,
509 the way Perl works seems to be as follows:
510
511 Outside a character class, the digits are read as a decimal number. If the
512 number is less than 10, or if there are that many previous extracting
513 left brackets, then it is a back reference. Otherwise, up to three octal
514 digits are read to form an escaped byte. Thus \123 is likely to be octal
515 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
516 value is greater than 377, the least significant 8 bits are taken. Inside a
517 character class, \ followed by a digit is always an octal number. */
518
519 case '1': case '2': case '3': case '4': case '5':
520 case '6': case '7': case '8': case '9':
521
522 if (!isclass)
523 {
524 oldptr = ptr;
525 c -= '0';
526 while ((digitab[ptr[1]] & ctype_digit) != 0)
527 c = c * 10 + *(++ptr) - '0';
528 if (c < 10 || c <= bracount)
529 {
530 c = -(ESC_REF + c);
531 break;
532 }
533 ptr = oldptr; /* Put the pointer back and fall through */
534 }
535
536 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
537 generates a binary zero byte and treats the digit as a following literal.
538 Thus we have to pull back the pointer by one. */
539
540 if ((c = *ptr) >= '8')
541 {
542 ptr--;
543 c = 0;
544 break;
545 }
546
547 /* \0 always starts an octal number, but we may drop through to here with a
548 larger first octal digit. The original code used just to take the least
549 significant 8 bits of octal numbers (I think this is what early Perls used
550 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
551 than 3 octal digits. */
552
553 case '0':
554 c -= '0';
555 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
556 c = c * 8 + *(++ptr) - '0';
557 if (!utf8 && c > 255) *errorcodeptr = ERR51;
558 break;
559
560 /* \x is complicated. \x{ddd} is a character number which can be greater
561 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
562 treated as a data character. */
563
564 case 'x':
565 if (ptr[1] == '{')
566 {
567 const uschar *pt = ptr + 2;
568 int count = 0;
569
570 c = 0;
571 while ((digitab[*pt] & ctype_xdigit) != 0)
572 {
573 register int cc = *pt++;
574 if (c == 0 && cc == '0') continue; /* Leading zeroes */
575 count++;
576
577 #ifndef EBCDIC /* ASCII coding */
578 if (cc >= 'a') cc -= 32; /* Convert to upper case */
579 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
580 #else /* EBCDIC coding */
581 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
582 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
583 #endif
584 }
585
586 if (*pt == '}')
587 {
588 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
589 ptr = pt;
590 break;
591 }
592
593 /* If the sequence of hex digits does not end with '}', then we don't
594 recognize this construct; fall through to the normal \x handling. */
595 }
596
597 /* Read just a single-byte hex-defined char */
598
599 c = 0;
600 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
601 {
602 int cc; /* Some compilers don't like ++ */
603 cc = *(++ptr); /* in initializers */
604 #ifndef EBCDIC /* ASCII coding */
605 if (cc >= 'a') cc -= 32; /* Convert to upper case */
606 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
607 #else /* EBCDIC coding */
608 if (cc <= 'z') cc += 64; /* Convert to upper case */
609 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
610 #endif
611 }
612 break;
613
614 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
615 This coding is ASCII-specific, but then the whole concept of \cx is
616 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
617
618 case 'c':
619 c = *(++ptr);
620 if (c == 0)
621 {
622 *errorcodeptr = ERR2;
623 return 0;
624 }
625
626 #ifndef EBCDIC /* ASCII coding */
627 if (c >= 'a' && c <= 'z') c -= 32;
628 c ^= 0x40;
629 #else /* EBCDIC coding */
630 if (c >= 'a' && c <= 'z') c += 64;
631 c ^= 0xC0;
632 #endif
633 break;
634
635 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
636 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
637 for Perl compatibility, it is a literal. This code looks a bit odd, but
638 there used to be some cases other than the default, and there may be again
639 in future, so I haven't "optimized" it. */
640
641 default:
642 if ((options & PCRE_EXTRA) != 0) switch(c)
643 {
644 default:
645 *errorcodeptr = ERR3;
646 break;
647 }
648 break;
649 }
650 }
651
652 *ptrptr = ptr;
653 return c;
654 }
655
656
657
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

The property name is either the single character that follows, or a name in
braces, optionally preceded by ^ for negation. A braced name may be at most
30 characters long, because it is copied into a 32-byte buffer and the closing
brace must be seen before the buffer is full.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is always written, even when an error is returned
  dptr           points to an int that is set to the detailed property value;
                   it is written only when the name is recognized
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence and ERR47 for an unknown name

Returns:         the type field of the matching _pcre_utt entry, or -1 on error
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Give the caller a defined value on every path, including the early error
return just below. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy at most sizeof(name)-1 characters. If the loop runs to completion
  without meeting '}', the name is too long and is rejected below. */

  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c !='}') goto ERROR_RETURN;
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

/* The sequence is syntactically complete; the position handed back is the
same whether or not the name turns out to be known. */

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
747
748
749
750
751 /*************************************************
752 * Check for counted repeat *
753 *************************************************/
754
755 /* This function is called when a '{' is encountered in a place where it might
756 start a quantifier. It looks ahead to see if it really is a quantifier or not.
757 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
758 where the ddds are digits.
759
760 Arguments:
761 p pointer to the first char after '{'
762
763 Returns: TRUE or FALSE
764 */
765
766 static BOOL
767 is_counted_repeat(const uschar *p)
768 {
769 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
770 while ((digitab[*p] & ctype_digit) != 0) p++;
771 if (*p == '}') return TRUE;
772
773 if (*p++ != ',') return FALSE;
774 if (*p == '}') return TRUE;
775
776 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
777 while ((digitab[*p] & ctype_digit) != 0) p++;
778
779 return (*p == '}');
780 }
781
782
783
784 /*************************************************
785 * Read repeat counts *
786 *************************************************/
787
788 /* Read an item of the form {n,m} and return the values. This is called only
789 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
790 so the syntax is guaranteed to be correct, but we need to check the values.
791
792 Arguments:
793 p pointer to first char after '{'
794 minp pointer to int for min
795 maxp pointer to int for max
796 returned as -1 if no max
797 errorcodeptr points to error code variable
798
799 Returns: pointer to '}' on success;
800 current ptr on error, with errorcodeptr set non-zero
801 */
802
803 static const uschar *
804 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
805 {
806 int min = 0;
807 int max = -1;
808
809 /* Read the minimum value and do a paranoid check: a negative value indicates
810 an integer overflow. */
811
812 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
813 if (min < 0 || min > 65535)
814 {
815 *errorcodeptr = ERR5;
816 return p;
817 }
818
819 /* Read the maximum value if there is one, and again do a paranoid on its size.
820 Also, max must not be less than min. */
821
822 if (*p == '}') max = min; else
823 {
824 if (*(++p) != '}')
825 {
826 max = 0;
827 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
828 if (max < 0 || max > 65535)
829 {
830 *errorcodeptr = ERR5;
831 return p;
832 }
833 if (max < min)
834 {
835 *errorcodeptr = ERR4;
836 return p;
837 }
838 }
839 }
840
841 /* Fill in the required variables, and pass back the pointer to the terminating
842 '}'. */
843
844 *minp = min;
845 *maxp = max;
846 return p;
847 }
848
849
850
851 /*************************************************
852 * Find forward referenced subpattern *
853 *************************************************/
854
855 /* This function scans along a pattern's text looking for capturing
856 subpatterns, and counting them. If it finds a named pattern that matches the
857 name it is given, it returns its number. Alternatively, if the name is NULL, it
858 returns when it reaches a given numbered subpattern. This is used for forward
859 references to subpatterns. We know that if (?P< is encountered, the name will
860 be terminated by '>' because that is checked in the first pass.
861
862 Arguments:
863 ptr current position in the pattern
864 count current count of capturing parens so far encountered
865 name name to seek, or NULL if seeking a numbered subpattern
866 lorn name length, or subpattern number if name is NULL
867 xmode TRUE if we are in /x mode
868
869 Returns: the number of the named subpattern, or -1 if not found
870 */
871
872 static int
873 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
874 BOOL xmode)
875 {
876 const uschar *thisname;
877
878 for (; *ptr != 0; ptr++)
879 {
880 int term;
881
882 /* Skip over backslashed characters and also entire \Q...\E */
883
884 if (*ptr == '\\')
885 {
886 if (*(++ptr) == 0) return -1;
887 if (*ptr == 'Q') for (;;)
888 {
889 while (*(++ptr) != 0 && *ptr != '\\');
890 if (*ptr == 0) return -1;
891 if (*(++ptr) == 'E') break;
892 }
893 continue;
894 }
895
896 /* Skip over character classes */
897
898 if (*ptr == '[')
899 {
900 while (*(++ptr) != ']')
901 {
902 if (*ptr == '\\')
903 {
904 if (*(++ptr) == 0) return -1;
905 if (*ptr == 'Q') for (;;)
906 {
907 while (*(++ptr) != 0 && *ptr != '\\');
908 if (*ptr == 0) return -1;
909 if (*(++ptr) == 'E') break;
910 }
911 continue;
912 }
913 }
914 continue;
915 }
916
917 /* Skip comments in /x mode */
918
919 if (xmode && *ptr == '#')
920 {
921 while (*(++ptr) != 0 && *ptr != '\n');
922 if (*ptr == 0) return -1;
923 continue;
924 }
925
926 /* An opening parens must now be a real metacharacter */
927
928 if (*ptr != '(') continue;
929 if (ptr[1] != '?')
930 {
931 count++;
932 if (name == NULL && count == lorn) return count;
933 continue;
934 }
935
936 ptr += 2;
937 if (*ptr == 'P') ptr++; /* Allow optional P */
938
939 /* We have to disambiguate (?<! and (?<= from (?<name> */
940
941 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
942 *ptr != '\'')
943 continue;
944
945 count++;
946
947 if (name == NULL && count == lorn) return count;
948 term = *ptr++;
949 if (term == '<') term = '>';
950 thisname = ptr;
951 while (*ptr != term) ptr++;
952 if (name != NULL && lorn == ptr - thisname &&
953 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
954 return count;
955 }
956
957 return -1;
958 }
959
960
961
962 /*************************************************
963 * Find first significant op code *
964 *************************************************/
965
966 /* This is called by several functions that scan a compiled expression looking
967 for a fixed first character, or an anchoring op code etc. It skips over things
968 that do not influence this. For some calls, a change of option is important.
969 For some calls, it makes sense to skip negative forward and all backward
970 assertions, and also the \b assertion; for others it does not.
971
972 Arguments:
973 code pointer to the start of the group
974 options pointer to external options
975 optbit the option bit whose changing is significant, or
976 zero if none are
977 skipassert TRUE if certain assertions are to be skipped
978
979 Returns: pointer to the first significant opcode
980 */
981
982 static const uschar*
983 first_significant_code(const uschar *code, int *options, int optbit,
984 BOOL skipassert)
985 {
986 for (;;)
987 {
988 switch ((int)*code)
989 {
990 case OP_OPT:
991 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
992 *options = (int)code[1];
993 code += 2;
994 break;
995
996 case OP_ASSERT_NOT:
997 case OP_ASSERTBACK:
998 case OP_ASSERTBACK_NOT:
999 if (!skipassert) return code;
1000 do code += GET(code, 1); while (*code == OP_ALT);
1001 code += _pcre_OP_lengths[*code];
1002 break;
1003
1004 case OP_WORD_BOUNDARY:
1005 case OP_NOT_WORD_BOUNDARY:
1006 if (!skipassert) return code;
1007 /* Fall through */
1008
1009 case OP_CALLOUT:
1010 case OP_CREF:
1011 case OP_RREF:
1012 case OP_DEF:
1013 code += _pcre_OP_lengths[*code];
1014 break;
1015
1016 default:
1017 return code;
1018 }
1019 }
1020 /* Control never reaches here */
1021 }
1022
1023
1024
1025
1026 /*************************************************
1027 * Find the fixed length of a pattern *
1028 *************************************************/
1029
1030 /* Scan a pattern and compute the fixed length of subject that will match it,
1031 if the length is fixed. This is needed for dealing with backward assertions.
1032 In UTF8 mode, the result is in characters rather than bytes.
1033
1034 Arguments:
1035 code points to the start of the pattern (the bracket)
1036 options the compiling options
1037
1038 Returns: the fixed length, or -1 if there is no fixed length,
1039 or -2 if \C was encountered
1040 */
1041
1042 static int
1043 find_fixedlength(uschar *code, int options)
1044 {
1045 int length = -1;
1046
1047 register int branchlength = 0;
1048 register uschar *cc = code + 1 + LINK_SIZE;
1049
1050 /* Scan along the opcodes for this branch. If we get to the end of the
1051 branch, check the length against that of the other branches. */
1052
1053 for (;;)
1054 {
1055 int d;
1056 register int op = *cc;
1057
1058 switch (op)
1059 {
1060 case OP_CBRA:
1061 case OP_BRA:
1062 case OP_ONCE:
1063 case OP_COND:
1064 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1065 if (d < 0) return d;
1066 branchlength += d;
1067 do cc += GET(cc, 1); while (*cc == OP_ALT);
1068 cc += 1 + LINK_SIZE;
1069 break;
1070
1071 /* Reached end of a branch; if it's a ket it is the end of a nested
1072 call. If it's ALT it is an alternation in a nested call. If it is
1073 END it's the end of the outer call. All can be handled by the same code. */
1074
1075 case OP_ALT:
1076 case OP_KET:
1077 case OP_KETRMAX:
1078 case OP_KETRMIN:
1079 case OP_END:
1080 if (length < 0) length = branchlength;
1081 else if (length != branchlength) return -1;
1082 if (*cc != OP_ALT) return length;
1083 cc += 1 + LINK_SIZE;
1084 branchlength = 0;
1085 break;
1086
1087 /* Skip over assertive subpatterns */
1088
1089 case OP_ASSERT:
1090 case OP_ASSERT_NOT:
1091 case OP_ASSERTBACK:
1092 case OP_ASSERTBACK_NOT:
1093 do cc += GET(cc, 1); while (*cc == OP_ALT);
1094 /* Fall through */
1095
1096 /* Skip over things that don't match chars */
1097
1098 case OP_REVERSE:
1099 case OP_CREF:
1100 case OP_RREF:
1101 case OP_DEF:
1102 case OP_OPT:
1103 case OP_CALLOUT:
1104 case OP_SOD:
1105 case OP_SOM:
1106 case OP_EOD:
1107 case OP_EODN:
1108 case OP_CIRC:
1109 case OP_DOLL:
1110 case OP_NOT_WORD_BOUNDARY:
1111 case OP_WORD_BOUNDARY:
1112 cc += _pcre_OP_lengths[*cc];
1113 break;
1114
1115 /* Handle literal characters */
1116
1117 case OP_CHAR:
1118 case OP_CHARNC:
1119 case OP_NOT:
1120 branchlength++;
1121 cc += 2;
1122 #ifdef SUPPORT_UTF8
1123 if ((options & PCRE_UTF8) != 0)
1124 {
1125 while ((*cc & 0xc0) == 0x80) cc++;
1126 }
1127 #endif
1128 break;
1129
1130 /* Handle exact repetitions. The count is already in characters, but we
1131 need to skip over a multibyte character in UTF8 mode. */
1132
1133 case OP_EXACT:
1134 branchlength += GET2(cc,1);
1135 cc += 4;
1136 #ifdef SUPPORT_UTF8
1137 if ((options & PCRE_UTF8) != 0)
1138 {
1139 while((*cc & 0x80) == 0x80) cc++;
1140 }
1141 #endif
1142 break;
1143
1144 case OP_TYPEEXACT:
1145 branchlength += GET2(cc,1);
1146 cc += 4;
1147 break;
1148
1149 /* Handle single-char matchers */
1150
1151 case OP_PROP:
1152 case OP_NOTPROP:
1153 cc += 2;
1154 /* Fall through */
1155
1156 case OP_NOT_DIGIT:
1157 case OP_DIGIT:
1158 case OP_NOT_WHITESPACE:
1159 case OP_WHITESPACE:
1160 case OP_NOT_WORDCHAR:
1161 case OP_WORDCHAR:
1162 case OP_ANY:
1163 branchlength++;
1164 cc++;
1165 break;
1166
1167 /* The single-byte matcher isn't allowed */
1168
1169 case OP_ANYBYTE:
1170 return -2;
1171
1172 /* Check a class for variable quantification */
1173
1174 #ifdef SUPPORT_UTF8
1175 case OP_XCLASS:
1176 cc += GET(cc, 1) - 33;
1177 /* Fall through */
1178 #endif
1179
1180 case OP_CLASS:
1181 case OP_NCLASS:
1182 cc += 33;
1183
1184 switch (*cc)
1185 {
1186 case OP_CRSTAR:
1187 case OP_CRMINSTAR:
1188 case OP_CRQUERY:
1189 case OP_CRMINQUERY:
1190 return -1;
1191
1192 case OP_CRRANGE:
1193 case OP_CRMINRANGE:
1194 if (GET2(cc,1) != GET2(cc,3)) return -1;
1195 branchlength += GET2(cc,1);
1196 cc += 5;
1197 break;
1198
1199 default:
1200 branchlength++;
1201 }
1202 break;
1203
1204 /* Anything else is variable length */
1205
1206 default:
1207 return -1;
1208 }
1209 }
1210 /* Control never gets here */
1211 }
1212
1213
1214
1215
1216 /*************************************************
1217 * Scan compiled regex for numbered bracket *
1218 *************************************************/
1219
1220 /* This little function scans through a compiled pattern until it finds a
1221 capturing bracket with the given number.
1222
1223 Arguments:
1224 code points to start of expression
1225 utf8 TRUE in UTF-8 mode
1226 number the required bracket number
1227
1228 Returns: pointer to the opcode for the bracket, or NULL if not found
1229 */
1230
1231 static const uschar *
1232 find_bracket(const uschar *code, BOOL utf8, int number)
1233 {
1234 for (;;)
1235 {
1236 register int c = *code;
1237 if (c == OP_END) return NULL;
1238
1239 /* XCLASS is used for classes that cannot be represented just by a bit
1240 map. This includes negated single high-valued characters. The length in
1241 the table is zero; the actual length is stored in the compiled code. */
1242
1243 if (c == OP_XCLASS) code += GET(code, 1);
1244
1245 /* Handle capturing bracket */
1246
1247 else if (c == OP_CBRA)
1248 {
1249 int n = GET2(code, 1+LINK_SIZE);
1250 if (n == number) return (uschar *)code;
1251 code += _pcre_OP_lengths[c];
1252 }
1253
1254 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1255 a multi-byte character. The length in the table is a minimum, so we have to
1256 arrange to skip the extra bytes. */
1257
1258 else
1259 {
1260 code += _pcre_OP_lengths[c];
1261 #ifdef SUPPORT_UTF8
1262 if (utf8) switch(c)
1263 {
1264 case OP_CHAR:
1265 case OP_CHARNC:
1266 case OP_EXACT:
1267 case OP_UPTO:
1268 case OP_MINUPTO:
1269 case OP_POSUPTO:
1270 case OP_STAR:
1271 case OP_MINSTAR:
1272 case OP_POSSTAR:
1273 case OP_PLUS:
1274 case OP_MINPLUS:
1275 case OP_POSPLUS:
1276 case OP_QUERY:
1277 case OP_MINQUERY:
1278 case OP_POSQUERY:
1279 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1280 break;
1281 }
1282 #endif
1283 }
1284 }
1285 }
1286
1287
1288
1289 /*************************************************
1290 * Scan compiled regex for recursion reference *
1291 *************************************************/
1292
1293 /* This little function scans through a compiled pattern until it finds an
1294 instance of OP_RECURSE.
1295
1296 Arguments:
1297 code points to start of expression
1298 utf8 TRUE in UTF-8 mode
1299
1300 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1301 */
1302
1303 static const uschar *
1304 find_recurse(const uschar *code, BOOL utf8)
1305 {
1306 for (;;)
1307 {
1308 register int c = *code;
1309 if (c == OP_END) return NULL;
1310 if (c == OP_RECURSE) return code;
1311
1312 /* XCLASS is used for classes that cannot be represented just by a bit
1313 map. This includes negated single high-valued characters. The length in
1314 the table is zero; the actual length is stored in the compiled code. */
1315
1316 if (c == OP_XCLASS) code += GET(code, 1);
1317
1318 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1319 that are followed by a character may be followed by a multi-byte character.
1320 The length in the table is a minimum, so we have to arrange to skip the extra
1321 bytes. */
1322
1323 else
1324 {
1325 code += _pcre_OP_lengths[c];
1326 #ifdef SUPPORT_UTF8
1327 if (utf8) switch(c)
1328 {
1329 case OP_CHAR:
1330 case OP_CHARNC:
1331 case OP_EXACT:
1332 case OP_UPTO:
1333 case OP_MINUPTO:
1334 case OP_POSUPTO:
1335 case OP_STAR:
1336 case OP_MINSTAR:
1337 case OP_POSSTAR:
1338 case OP_PLUS:
1339 case OP_MINPLUS:
1340 case OP_POSPLUS:
1341 case OP_QUERY:
1342 case OP_MINQUERY:
1343 case OP_POSQUERY:
1344 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1345 break;
1346 }
1347 #endif
1348 }
1349 }
1350 }
1351
1352
1353
1354 /*************************************************
1355 * Scan compiled branch for non-emptiness *
1356 *************************************************/
1357
1358 /* This function scans through a branch of a compiled pattern to see whether it
1359 can match the empty string or not. It is called from could_be_empty()
1360 below and from compile_branch() when checking for an unlimited repeat of a
1361 group that can match nothing. Note that first_significant_code() skips over
1362 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1363 struck an inner bracket whose current branch will already have been scanned.
1364
1365 Arguments:
1366 code points to start of search
1367 endcode points to where to stop
1368 utf8 TRUE if in UTF8 mode
1369
1370 Returns: TRUE if what is matched could be empty
1371 */
1372
1373 static BOOL
1374 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1375 {
1376 register int c;
1377 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1378 code < endcode;
1379 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1380 {
1381 const uschar *ccode;
1382
1383 c = *code;
1384
1385 /* Groups with zero repeats can of course be empty; skip them. */
1386
1387 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388 {
1389 do code += GET(code, 1); while (*code == OP_ALT);
1390 c = *code;
1391 continue;
1392 }
1393
1394 /* For other groups, scan the branches. */
1395
1396 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1397 {
1398 BOOL empty_branch;
1399 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1400
1401 /* Scan a closed bracket */
1402
1403 empty_branch = FALSE;
1404 do
1405 {
1406 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1407 empty_branch = TRUE;
1408 code += GET(code, 1);
1409 }
1410 while (*code == OP_ALT);
1411 if (!empty_branch) return FALSE; /* All branches are non-empty */
1412 c = *code;
1413 continue;
1414 }
1415
1416 /* Handle the other opcodes */
1417
1418 switch (c)
1419 {
1420 /* Check for quantifiers after a class */
1421
1422 #ifdef SUPPORT_UTF8
1423 case OP_XCLASS:
1424 ccode = code + GET(code, 1);
1425 goto CHECK_CLASS_REPEAT;
1426 #endif
1427
1428 case OP_CLASS:
1429 case OP_NCLASS:
1430 ccode = code + 33;
1431
1432 #ifdef SUPPORT_UTF8
1433 CHECK_CLASS_REPEAT:
1434 #endif
1435
1436 switch (*ccode)
1437 {
1438 case OP_CRSTAR: /* These could be empty; continue */
1439 case OP_CRMINSTAR:
1440 case OP_CRQUERY:
1441 case OP_CRMINQUERY:
1442 break;
1443
1444 default: /* Non-repeat => class must match */
1445 case OP_CRPLUS: /* These repeats aren't empty */
1446 case OP_CRMINPLUS:
1447 return FALSE;
1448
1449 case OP_CRRANGE:
1450 case OP_CRMINRANGE:
1451 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1452 break;
1453 }
1454 break;
1455
1456 /* Opcodes that must match a character */
1457
1458 case OP_PROP:
1459 case OP_NOTPROP:
1460 case OP_EXTUNI:
1461 case OP_NOT_DIGIT:
1462 case OP_DIGIT:
1463 case OP_NOT_WHITESPACE:
1464 case OP_WHITESPACE:
1465 case OP_NOT_WORDCHAR:
1466 case OP_WORDCHAR:
1467 case OP_ANY:
1468 case OP_ANYBYTE:
1469 case OP_CHAR:
1470 case OP_CHARNC:
1471 case OP_NOT:
1472 case OP_PLUS:
1473 case OP_MINPLUS:
1474 case OP_POSPLUS:
1475 case OP_EXACT:
1476 case OP_NOTPLUS:
1477 case OP_NOTMINPLUS:
1478 case OP_NOTPOSPLUS:
1479 case OP_NOTEXACT:
1480 case OP_TYPEPLUS:
1481 case OP_TYPEMINPLUS:
1482 case OP_TYPEPOSPLUS:
1483 case OP_TYPEEXACT:
1484 return FALSE;
1485
1486 /* End of branch */
1487
1488 case OP_KET:
1489 case OP_KETRMAX:
1490 case OP_KETRMIN:
1491 case OP_ALT:
1492 return TRUE;
1493
1494 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1495 MINUPTO, and POSUPTO may be followed by a multibyte character */
1496
1497 #ifdef SUPPORT_UTF8
1498 case OP_STAR:
1499 case OP_MINSTAR:
1500 case OP_POSSTAR:
1501 case OP_QUERY:
1502 case OP_MINQUERY:
1503 case OP_POSQUERY:
1504 case OP_UPTO:
1505 case OP_MINUPTO:
1506 case OP_POSUPTO:
1507 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1508 break;
1509 #endif
1510 }
1511 }
1512
1513 return TRUE;
1514 }
1515
1516
1517
1518 /*************************************************
1519 * Scan compiled regex for non-emptiness *
1520 *************************************************/
1521
1522 /* This function is called to check for left recursive calls. We want to check
1523 the current branch of the current pattern to see if it could match the empty
1524 string. If it could, we must look outwards for branches at other levels,
1525 stopping when we pass beyond the bracket which is the subject of the recursion.
1526
1527 Arguments:
1528 code points to start of the recursion
1529 endcode points to where to stop (current RECURSE item)
1530 bcptr points to the chain of current (unclosed) branch starts
1531 utf8 TRUE if in UTF-8 mode
1532
1533 Returns: TRUE if what is matched could be empty
1534 */
1535
1536 static BOOL
1537 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1538 BOOL utf8)
1539 {
1540 while (bcptr != NULL && bcptr->current >= code)
1541 {
1542 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1543 bcptr = bcptr->outer;
1544 }
1545 return TRUE;
1546 }
1547
1548
1549
1550 /*************************************************
1551 * Check for POSIX class syntax *
1552 *************************************************/
1553
1554 /* This function is called when the sequence "[:" or "[." or "[=" is
1555 encountered in a character class. It checks whether this is followed by an
1556 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1557 ".]" or "=]".
1558
1559 Argument:
1560 ptr pointer to the initial [
1561 endptr where to return the end pointer
1562 cd pointer to compile data
1563
1564 Returns: TRUE or FALSE
1565 */
1566
1567 static BOOL
1568 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1569 {
1570 int terminator; /* Don't combine these lines; the Solaris cc */
1571 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1572 if (*(++ptr) == '^') ptr++;
1573 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1574 if (*ptr == terminator && ptr[1] == ']')
1575 {
1576 *endptr = ptr;
1577 return TRUE;
1578 }
1579 return FALSE;
1580 }
1581
1582
1583
1584
1585 /*************************************************
1586 * Check POSIX class name *
1587 *************************************************/
1588
1589 /* This function is called to check the name given in a POSIX-style class entry
1590 such as [:alnum:].
1591
1592 Arguments:
1593 ptr points to the first letter
1594 len the length of the name
1595
1596 Returns: a value representing the name, or -1 if unknown
1597 */
1598
1599 static int
1600 check_posix_name(const uschar *ptr, int len)
1601 {
1602 register int yield = 0;
1603 while (posix_name_lengths[yield] != 0)
1604 {
1605 if (len == posix_name_lengths[yield] &&
1606 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1607 yield++;
1608 }
1609 return -1;
1610 }
1611
1612
1613 /*************************************************
1614 * Adjust OP_RECURSE items in repeated group *
1615 *************************************************/
1616
1617 /* OP_RECURSE items contain an offset from the start of the regex to the group
1618 that is referenced. This means that groups can be replicated for fixed
1619 repetition simply by copying (because the recursion is allowed to refer to
1620 earlier groups that are outside the current group). However, when a group is
1621 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1622 it, after it has been compiled. This means that any OP_RECURSE items within it
1623 that refer to the group itself or any contained groups have to have their
1624 offsets adjusted. That one of the jobs of this function. Before it is called,
1625 the partially compiled regex must be temporarily terminated with OP_END.
1626
1627 This function has been extended with the possibility of forward references for
1628 recursions and subroutine calls. It must also check the list of such references
1629 for the group we are dealing with. If it finds that one of the recursions in
1630 the current group is on this list, it adjusts the offset in the list, not the
1631 value in the reference (which is a group number).
1632
1633 Arguments:
1634 group points to the start of the group
1635 adjust the amount by which the group is to be moved
1636 utf8 TRUE in UTF-8 mode
1637 cd contains pointers to tables etc.
1638 save_hwm the hwm forward reference pointer at the start of the group
1639
1640 Returns: nothing
1641 */
1642
1643 static void
1644 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1645 uschar *save_hwm)
1646 {
1647 uschar *ptr = group;
1648 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1649 {
1650 int offset;
1651 uschar *hc;
1652
1653 /* See if this recursion is on the forward reference list. If so, adjust the
1654 reference. */
1655
1656 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1657 {
1658 offset = GET(hc, 0);
1659 if (cd->start_code + offset == ptr + 1)
1660 {
1661 PUT(hc, 0, offset + adjust);
1662 break;
1663 }
1664 }
1665
1666 /* Otherwise, adjust the recursion offset if it's after the start of this
1667 group. */
1668
1669 if (hc >= cd->hwm)
1670 {
1671 offset = GET(ptr, 1);
1672 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1673 }
1674
1675 ptr += 1 + LINK_SIZE;
1676 }
1677 }
1678
1679
1680
1681 /*************************************************
1682 * Insert an automatic callout point *
1683 *************************************************/
1684
1685 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1686 callout points before each pattern item.
1687
1688 Arguments:
1689 code current code pointer
1690 ptr current pattern pointer
1691 cd pointers to tables etc
1692
1693 Returns: new code pointer
1694 */
1695
1696 static uschar *
1697 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1698 {
1699 *code++ = OP_CALLOUT;
1700 *code++ = 255;
1701 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1702 PUT(code, LINK_SIZE, 0); /* Default length */
1703 return code + 2*LINK_SIZE;
1704 }
1705
1706
1707
1708 /*************************************************
1709 * Complete a callout item *
1710 *************************************************/
1711
1712 /* A callout item contains the length of the next item in the pattern, which
1713 we can't fill in till after we have reached the relevant point. This is used
1714 for both automatic and manual callouts.
1715
1716 Arguments:
1717 previous_callout points to previous callout item
1718 ptr current pattern pointer
1719 cd pointers to tables etc
1720
1721 Returns: nothing
1722 */
1723
1724 static void
1725 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1726 {
1727 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1728 PUT(previous_callout, 2 + LINK_SIZE, length);
1729 }
1730
1731
1732
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address so that a further call carries on from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
  unsigned int c = *cptr;
  unsigned int first = NOTACHAR;
  unsigned int expected;

  /* Find the first character that has an other case. */

  while (c <= d && (first = _pcre_ucp_othercase(c)) == NOTACHAR) c++;
  if (c > d) return FALSE;

  *ocptr = first;

  /* Extend the range for as long as the other cases of the following
  characters carry on consecutively. */

  expected = first + 1;
  c++;
  while (c <= d && _pcre_ucp_othercase(c) == expected)
  {
    c++;
    expected++;
  }

  *odptr = expected - 1;
  *cptr = c;

  return TRUE;
}
#endif  /* SUPPORT_UCP */
1778
1779
1780
1781 /*************************************************
1782 * Check if auto-possessifying is possible *
1783 *************************************************/
1784
1785 /* This function is called for unlimited repeats of certain items, to see
1786 whether the next thing could possibly match the repeated item. If not, it makes
1787 sense to automatically possessify the repeated item.
1788
1789 Arguments:
1790 op_code the repeated op code
1791 this data for this item, depends on the opcode
1792 utf8 TRUE in UTF-8 mode
1793 utf8_char used for utf8 character bytes, NULL if not relevant
1794 ptr next character in pattern
1795 options options bits
1796 cd contains pointers to tables etc.
1797
1798 Returns: TRUE if possessifying is wanted
1799 */
1800
1801 static BOOL
1802 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1803 const uschar *ptr, int options, compile_data *cd)
1804 {
1805 int next;
1806
1807 /* Skip whitespace and comments in extended mode */
1808
1809 if ((options & PCRE_EXTENDED) != 0)
1810 {
1811 for (;;)
1812 {
1813 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1814 if (*ptr == '#')
1815 {
1816 while (*(++ptr) != 0)
1817 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1818 }
1819 else break;
1820 }
1821 }
1822
1823 /* If the next item is one that we can handle, get its value. A non-negative
1824 value is a character, a negative value is an escape value. */
1825
1826 if (*ptr == '\\')
1827 {
1828 int temperrorcode = 0;
1829 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1830 if (temperrorcode != 0) return FALSE;
1831 ptr++; /* Point after the escape sequence */
1832 }
1833
1834 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1835 {
1836 #ifdef SUPPORT_UTF8
1837 if (utf8) { GETCHARINC(next, ptr); } else
1838 #endif
1839 next = *ptr++;
1840 }
1841
1842 else return FALSE;
1843
1844 /* Skip whitespace and comments in extended mode */
1845
1846 if ((options & PCRE_EXTENDED) != 0)
1847 {
1848 for (;;)
1849 {
1850 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1851 if (*ptr == '#')
1852 {
1853 while (*(++ptr) != 0)
1854 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1855 }
1856 else break;
1857 }
1858 }
1859
1860 /* If the next thing is itself optional, we have to give up. */
1861
1862 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1863 return FALSE;
1864
1865 /* Now compare the next item with the previous opcode. If the previous is a
1866 positive single character match, "item" either contains the character or, if
1867 "item" is greater than 127 in utf8 mode, the character's bytes are in
1868 utf8_char. */
1869
1870
1871 /* Handle cases when the next item is a character. */
1872
1873 if (next >= 0) switch(op_code)
1874 {
1875 case OP_CHAR:
1876 #ifdef SUPPORT_UTF8
1877 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1878 #endif
1879 return item != next;
1880
1881 /* For CHARNC (caseless character) we must check the other case. If we have
1882 Unicode property support, we can use it to test the other case of
1883 high-valued characters. */
1884
1885 case OP_CHARNC:
1886 #ifdef SUPPORT_UTF8
1887 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1888 #endif
1889 if (item == next) return FALSE;
1890 #ifdef SUPPORT_UTF8
1891 if (utf8)
1892 {
1893 unsigned int othercase;
1894 if (next < 128) othercase = cd->fcc[next]; else
1895 #ifdef SUPPORT_UCP
1896 othercase = _pcre_ucp_othercase((unsigned int)next);
1897 #else
1898 othercase = NOTACHAR;
1899 #endif
1900 return (unsigned int)item != othercase;
1901 }
1902 else
1903 #endif /* SUPPORT_UTF8 */
1904 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1905
1906 /* For OP_NOT, "item" must be a single-byte character. */
1907
1908 case OP_NOT:
1909 if (next < 0) return FALSE; /* Not a character */
1910 if (item == next) return TRUE;
1911 if ((options & PCRE_CASELESS) == 0) return FALSE;
1912 #ifdef SUPPORT_UTF8
1913 if (utf8)
1914 {
1915 unsigned int othercase;
1916 if (next < 128) othercase = cd->fcc[next]; else
1917 #ifdef SUPPORT_UCP
1918 othercase = _pcre_ucp_othercase(next);
1919 #else
1920 othercase = NOTACHAR;
1921 #endif
1922 return (unsigned int)item == othercase;
1923 }
1924 else
1925 #endif /* SUPPORT_UTF8 */
1926 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1927
1928 case OP_DIGIT:
1929 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1930
1931 case OP_NOT_DIGIT:
1932 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1933
1934 case OP_WHITESPACE:
1935 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1936
1937 case OP_NOT_WHITESPACE:
1938 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1939
1940 case OP_WORDCHAR:
1941 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1942
1943 case OP_NOT_WORDCHAR:
1944 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1945
1946 default:
1947 return FALSE;
1948 }
1949
1950
1951 /* Handle the case when the next item is \d, \s, etc. */
1952
1953 switch(op_code)
1954 {
1955 case OP_CHAR:
1956 case OP_CHARNC:
1957 #ifdef SUPPORT_UTF8
1958 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1959 #endif
1960 switch(-next)
1961 {
1962 case ESC_d:
1963 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1964
1965 case ESC_D:
1966 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1967
1968 case ESC_s:
1969 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1970
1971 case ESC_S:
1972 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1973
1974 case ESC_w:
1975 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1976
1977 case ESC_W:
1978 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1979
1980 default:
1981 return FALSE;
1982 }
1983
1984 case OP_DIGIT:
1985 return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1986
1987 case OP_NOT_DIGIT:
1988 return next == -ESC_d;
1989
1990 case OP_WHITESPACE:
1991 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1992
1993 case OP_NOT_WHITESPACE:
1994 return next == -ESC_s;
1995
1996 case OP_WORDCHAR:
1997 return next == -ESC_W || next == -ESC_s;
1998
1999 case OP_NOT_WORDCHAR:
2000 return next == -ESC_w || next == -ESC_d;
2001
2002 default:
2003 return FALSE;
2004 }
2005
2006 /* Control does not reach here */
2007 }
2008
2009
2010
2011 /*************************************************
2012 * Compile one branch *
2013 *************************************************/
2014
2015 /* Scan the pattern, compiling it into a vector. If the options are
2016 changed during the branch, the pointer is used to change the external options
2017 bits. This function is used during the pre-compile phase when we are trying
2018 to find out the amount of memory needed, as well as during the real compile
2019 phase. The value of lengthptr distinguishes the two phases.
2020
2021 Arguments:
2022 optionsptr pointer to the option bits
2023 codeptr points to the pointer to the current code point
2024 ptrptr points to the current pattern pointer
2025 errorcodeptr points to error code variable
2026 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2027 reqbyteptr set to the last literal character required, else < 0
2028 bcptr points to current branch chain
2029 cd contains pointers to tables etc.
2030 lengthptr NULL during the real compile phase
2031 points to length accumulator during pre-compile phase
2032
2033 Returns: TRUE on success
2034 FALSE, with *errorcodeptr set non-zero on error
2035 */
2036
2037 static BOOL
2038 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2039 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2040 compile_data *cd, int *lengthptr)
2041 {
2042 int repeat_type, op_type;
2043 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2044 int bravalue = 0;
2045 int greedy_default, greedy_non_default;
2046 int firstbyte, reqbyte;
2047 int zeroreqbyte, zerofirstbyte;
2048 int req_caseopt, reqvary, tempreqvary;
2049 int options = *optionsptr;
2050 int after_manual_callout = 0;
2051 int length_prevgroup = 0;
2052 register int c;
2053 register uschar *code = *codeptr;
2054 uschar *last_code = code;
2055 uschar *orig_code = code;
2056 uschar *tempcode;
2057 BOOL inescq = FALSE;
2058 BOOL groupsetfirstbyte = FALSE;
2059 const uschar *ptr = *ptrptr;
2060 const uschar *tempptr;
2061 uschar *previous = NULL;
2062 uschar *previous_callout = NULL;
2063 uschar *save_hwm = NULL;
2064 uschar classbits[32];
2065
2066 #ifdef SUPPORT_UTF8
2067 BOOL class_utf8;
2068 BOOL utf8 = (options & PCRE_UTF8) != 0;
2069 uschar *class_utf8data;
2070 uschar utf8_char[6];
2071 #else
2072 BOOL utf8 = FALSE;
2073 uschar *utf8_char = NULL;
2074 #endif
2075
2076 #ifdef DEBUG
2077 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2078 #endif
2079
2080 /* Set up the default and non-default settings for greediness */
2081
2082 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2083 greedy_non_default = greedy_default ^ 1;
2084
2085 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2086 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2087 matches a non-fixed first char; reqbyte just remains unset if we never
2088 find one.
2089
2090 When we hit a repeat whose minimum is zero, we may have to adjust these values
2091 to take the zero repeat into account. This is implemented by setting them to
2092 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2093 item types that can be repeated set these backoff variables appropriately. */
2094
2095 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2096
2097 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2098 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2099 value > 255. It is added into the firstbyte or reqbyte variables to record the
2100 case status of the value. This is used only for ASCII characters. */
2101
2102 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2103
2104 /* Switch on next character until the end of the branch */
2105
2106 for (;; ptr++)
2107 {
2108 BOOL negate_class;
2109 BOOL possessive_quantifier;
2110 BOOL is_quantifier;
2111 BOOL is_recurse;
2112 int class_charcount;
2113 int class_lastchar;
2114 int newoptions;
2115 int recno;
2116 int refsign;
2117 int skipbytes;
2118 int subreqbyte;
2119 int subfirstbyte;
2120 int terminator;
2121 int mclength;
2122 uschar mcbuffer[8];
2123
2124 /* Get next byte in the pattern */
2125
2126 c = *ptr;
2127
2128 /* If we are in the pre-compile phase, accumulate the length used for the
2129 previous cycle of this loop. */
2130
2131 if (lengthptr != NULL)
2132 {
2133 #ifdef DEBUG
2134 if (code > cd->hwm) cd->hwm = code; /* High water info */
2135 #endif
2136 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2137 {
2138 *errorcodeptr = ERR52;
2139 goto FAILED;
2140 }
2141
2142 /* There is at least one situation where code goes backwards: this is the
2143 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2144 the class is simply eliminated. However, it is created first, so we have to
2145 allow memory for it. Therefore, don't ever reduce the length at this point.
2146 */
2147
2148 if (code < last_code) code = last_code;
2149 *lengthptr += code - last_code;
2150 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2151
2152 /* If "previous" is set and it is not at the start of the work space, move
2153 it back to there, in order to avoid filling up the work space. Otherwise,
2154 if "previous" is NULL, reset the current code pointer to the start. */
2155
2156 if (previous != NULL)
2157 {
2158 if (previous > orig_code)
2159 {
2160 memmove(orig_code, previous, code - previous);
2161 code -= previous - orig_code;
2162 previous = orig_code;
2163 }
2164 }
2165 else code = orig_code;
2166
2167 /* Remember where this code item starts so we can pick up the length
2168 next time round. */
2169
2170 last_code = code;
2171 }
2172
2173 /* In the real compile phase, just check the workspace used by the forward
2174 reference list. */
2175
2176 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2177 {
2178 *errorcodeptr = ERR52;
2179 goto FAILED;
2180 }
2181
2182 /* If in \Q...\E, check for the end; if not, we have a literal */
2183
2184 if (inescq && c != 0)
2185 {
2186 if (c == '\\' && ptr[1] == 'E')
2187 {
2188 inescq = FALSE;
2189 ptr++;
2190 continue;
2191 }
2192 else
2193 {
2194 if (previous_callout != NULL)
2195 {
2196 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2197 complete_callout(previous_callout, ptr, cd);
2198 previous_callout = NULL;
2199 }
2200 if ((options & PCRE_AUTO_CALLOUT) != 0)
2201 {
2202 previous_callout = code;
2203 code = auto_callout(code, ptr, cd);
2204 }
2205 goto NORMAL_CHAR;
2206 }
2207 }
2208
2209 /* Fill in length of a previous callout, except when the next thing is
2210 a quantifier. */
2211
2212 is_quantifier = c == '*' || c == '+' || c == '?' ||
2213 (c == '{' && is_counted_repeat(ptr+1));
2214
2215 if (!is_quantifier && previous_callout != NULL &&
2216 after_manual_callout-- <= 0)
2217 {
2218 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2219 complete_callout(previous_callout, ptr, cd);
2220 previous_callout = NULL;
2221 }
2222
2223 /* In extended mode, skip white space and comments */
2224
2225 if ((options & PCRE_EXTENDED) != 0)
2226 {
2227 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2228 if (c == '#')
2229 {
2230 while (*(++ptr) != 0)
2231 {
2232 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2233 }
2234 if (*ptr != 0) continue;
2235
2236 /* Else fall through to handle end of string */
2237 c = 0;
2238 }
2239 }
2240
2241 /* No auto callout for quantifiers. */
2242
2243 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2244 {
2245 previous_callout = code;
2246 code = auto_callout(code, ptr, cd);
2247 }
2248
2249 switch(c)
2250 {
2251 /* ===================================================================*/
2252 case 0: /* The branch terminates at string end */
2253 case '|': /* or | or ) */
2254 case ')':
2255 *firstbyteptr = firstbyte;
2256 *reqbyteptr = reqbyte;
2257 *codeptr = code;
2258 *ptrptr = ptr;
2259 if (lengthptr != NULL)
2260 {
2261 *lengthptr += code - last_code; /* To include callout length */
2262 DPRINTF((">> end branch\n"));
2263 }
2264 return TRUE;
2265
2266
2267 /* ===================================================================*/
2268 /* Handle single-character metacharacters. In multiline mode, ^ disables
2269 the setting of any following char as a first character. */
2270
2271 case '^':
2272 if ((options & PCRE_MULTILINE) != 0)
2273 {
2274 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2275 }
2276 previous = NULL;
2277 *code++ = OP_CIRC;
2278 break;
2279
2280 case '$':
2281 previous = NULL;
2282 *code++ = OP_DOLL;
2283 break;
2284
2285 /* There can never be a first char if '.' is first, whatever happens about
2286 repeats. The value of reqbyte doesn't change either. */
2287
2288 case '.':
2289 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2290 zerofirstbyte = firstbyte;
2291 zeroreqbyte = reqbyte;
2292 previous = code;
2293 *code++ = OP_ANY;
2294 break;
2295
2296
2297 /* ===================================================================*/
2298 /* Character classes. If the included characters are all < 256, we build a
2299 32-byte bitmap of the permitted characters, except in the special case
2300 where there is only one such character. For negated classes, we build the
2301 map as usual, then invert it at the end. However, we use a different opcode
2302 so that data characters > 255 can be handled correctly.
2303
2304 If the class contains characters outside the 0-255 range, a different
2305 opcode is compiled. It may optionally have a bit map for characters < 256,
2306 but those above are explicitly listed afterwards. A flag byte tells
2307 whether the bitmap is present, and whether this is a negated class or not.
2308 */
2309
2310 case '[':
2311 previous = code;
2312
2313 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2314 they are encountered at the top level, so we'll do that too. */
2315
2316 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2317 check_posix_syntax(ptr, &tempptr, cd))
2318 {
2319 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2320 goto FAILED;
2321 }
2322
2323 /* If the first character is '^', set the negation flag and skip it. */
2324
2325 if ((c = *(++ptr)) == '^')
2326 {
2327 negate_class = TRUE;
2328 c = *(++ptr);
2329 }
2330 else
2331 {
2332 negate_class = FALSE;
2333 }
2334
2335 /* Keep a count of chars with values < 256 so that we can optimize the case
2336 of just a single character (as long as it's < 256). However, for higher
2337 valued UTF-8 characters, we don't yet do any optimization. */
2338
2339 class_charcount = 0;
2340 class_lastchar = -1;
2341
2342 /* Initialize the 32-char bit map to all zeros. We build the map in a
2343 temporary bit of memory, in case the class contains only 1 character (less
2344 than 256), because in that case the compiled code doesn't use the bit map.
2345 */
2346
2347 memset(classbits, 0, 32 * sizeof(uschar));
2348
2349 #ifdef SUPPORT_UTF8
2350 class_utf8 = FALSE; /* No chars >= 256 */
2351 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2352 #endif
2353
2354 /* Process characters until ] is reached. By writing this as a "do" it
2355 means that an initial ] is taken as a data character. At the start of the
2356 loop, c contains the first byte of the character. */
2357
2358 if (c != 0) do
2359 {
2360 const uschar *oldptr;
2361
2362 #ifdef SUPPORT_UTF8
2363 if (utf8 && c > 127)
2364 { /* Braces are required because the */
2365 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2366 }
2367 #endif
2368
2369 /* Inside \Q...\E everything is literal except \E */
2370
2371 if (inescq)
2372 {
2373 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2374 {
2375 inescq = FALSE; /* Reset literal state */
2376 ptr++; /* Skip the 'E' */
2377 continue; /* Carry on with next */
2378 }
2379 goto CHECK_RANGE; /* Could be range if \E follows */
2380 }
2381
2382 /* Handle POSIX class names. Perl allows a negation extension of the
2383 form [:^name:]. A square bracket that doesn't match the syntax is
2384 treated as a literal. We also recognize the POSIX constructions
2385 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2386 5.6 and 5.8 do. */
2387
2388 if (c == '[' &&
2389 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2390 check_posix_syntax(ptr, &tempptr, cd))
2391 {
2392 BOOL local_negate = FALSE;
2393 int posix_class, taboffset, tabopt;
2394 register const uschar *cbits = cd->cbits;
2395 uschar pbits[32];
2396
2397 if (ptr[1] != ':')
2398 {
2399 *errorcodeptr = ERR31;
2400 goto FAILED;
2401 }
2402
2403 ptr += 2;
2404 if (*ptr == '^')
2405 {
2406 local_negate = TRUE;
2407 ptr++;
2408 }
2409
2410 posix_class = check_posix_name(ptr, tempptr - ptr);
2411 if (posix_class < 0)
2412 {
2413 *errorcodeptr = ERR30;
2414 goto FAILED;
2415 }
2416
2417 /* If matching is caseless, upper and lower are converted to
2418 alpha. This relies on the fact that the class table starts with
2419 alpha, lower, upper as the first 3 entries. */
2420
2421 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2422 posix_class = 0;
2423
2424 /* We build the bit map for the POSIX class in a chunk of local store
2425 because we may be adding and subtracting from it, and we don't want to
2426 subtract bits that may be in the main map already. At the end we or the
2427 result into the bit map that is being built. */
2428
2429 posix_class *= 3;
2430
2431 /* Copy in the first table (always present) */
2432
2433 memcpy(pbits, cbits + posix_class_maps[posix_class],
2434 32 * sizeof(uschar));
2435
2436 /* If there is a second table, add or remove it as required. */
2437
2438 taboffset = posix_class_maps[posix_class + 1];
2439 tabopt = posix_class_maps[posix_class + 2];
2440
2441 if (taboffset >= 0)
2442 {
2443 if (tabopt >= 0)
2444 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2445 else
2446 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2447 }
2448
2449 /* Now see if we need to remove any special characters. An option
2450 value of 1 removes vertical space and 2 removes underscore. */
2451
2452 if (tabopt < 0) tabopt = -tabopt;
2453 if (tabopt == 1) pbits[1] &= ~0x3c;
2454 else if (tabopt == 2) pbits[11] &= 0x7f;
2455
2456 /* Add the POSIX table or its complement into the main table that is
2457 being built and we are done. */
2458
2459 if (local_negate)
2460 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2461 else
2462 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2463
2464 ptr = tempptr + 1;
2465 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2466 continue; /* End of POSIX syntax handling */
2467 }
2468
2469 /* Backslash may introduce a single character, or it may introduce one
2470 of the specials, which just set a flag. The sequence \b is a special
2471 case. Inside a class (and only there) it is treated as backspace.
2472 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2473 to or into the one we are building. We assume they have more than one
2474 character in them, so set class_charcount bigger than one. */
2475
2476 if (c == '\\')
2477 {
2478 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2479 if (*errorcodeptr != 0) goto FAILED;
2480
2481 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2482 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2483 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2484 else if (-c == ESC_Q) /* Handle start of quoted string */
2485 {
2486 if (ptr[1] == '\\' && ptr[2] == 'E')
2487 {
2488 ptr += 2; /* avoid empty string */
2489 }
2490 else inescq = TRUE;
2491 continue;
2492 }
2493
2494 if (c < 0)
2495 {
2496 register const uschar *cbits = cd->cbits;
2497 class_charcount += 2; /* Greater than 1 is what matters */
2498
2499 /* Save time by not doing this in the pre-compile phase. */
2500
2501 if (lengthptr == NULL) switch (-c)
2502 {
2503 case ESC_d:
2504 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2505 continue;
2506
2507 case ESC_D:
2508 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2509 continue;
2510
2511 case ESC_w:
2512 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2513 continue;
2514
2515 case ESC_W:
2516 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2517 continue;
2518
2519 case ESC_s:
2520 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2521 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2522 continue;
2523
2524 case ESC_S:
2525 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2526 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2527 continue;
2528
2529 case ESC_E: /* Perl ignores an orphan \E */
2530 continue;
2531
2532 default: /* Not recognized; fall through */
2533 break; /* Need "default" setting to stop compiler warning. */
2534 }
2535
2536 /* In the pre-compile phase, just do the recognition. */
2537
2538 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2539 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2540
2541 /* We need to deal with \P and \p in both phases. */
2542
2543 #ifdef SUPPORT_UCP
2544 if (-c == ESC_p || -c == ESC_P)
2545 {
2546 BOOL negated;
2547 int pdata;
2548 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2549 if (ptype < 0) goto FAILED;
2550 class_utf8 = TRUE;
2551 *class_utf8data++ = ((-c == ESC_p) != negated)?
2552 XCL_PROP : XCL_NOTPROP;
2553 *class_utf8data++ = ptype;
2554 *class_utf8data++ = pdata;
2555 class_charcount -= 2; /* Not a < 256 character */
2556 continue;
2557 }
2558 #endif
2559 /* Unrecognized escapes are faulted if PCRE is running in its
2560 strict mode. By default, for compatibility with Perl, they are
2561 treated as literals. */
2562
2563 if ((options & PCRE_EXTRA) != 0)
2564 {
2565 *errorcodeptr = ERR7;
2566 goto FAILED;
2567 }
2568
2569 class_charcount -= 2; /* Undo the default count from above */
2570 c = *ptr; /* Get the final character and fall through */
2571 }
2572
2573 /* Fall through if we have a single character (c >= 0). This may be
2574 greater than 256 in UTF-8 mode. */
2575
2576 } /* End of backslash handling */
2577
2578 /* A single character may be followed by '-' to form a range. However,
2579 Perl does not permit ']' to be the end of the range. A '-' character
2580 at the end is treated as a literal. Perl ignores orphaned \E sequences
2581 entirely. The code for handling \Q and \E is messy. */
2582
2583 CHECK_RANGE:
2584 while (ptr[1] == '\\' && ptr[2] == 'E')
2585 {
2586 inescq = FALSE;
2587 ptr += 2;
2588 }
2589
2590 oldptr = ptr;
2591
2592 if (!inescq && ptr[1] == '-')
2593 {
2594 int d;
2595 ptr += 2;
2596 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2597
2598 /* If we hit \Q (not followed by \E) at this point, go into escaped
2599 mode. */
2600
2601 while (*ptr == '\\' && ptr[1] == 'Q')
2602 {
2603 ptr += 2;
2604 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2605 inescq = TRUE;
2606 break;
2607 }
2608
2609 if (*ptr == 0 || (!inescq && *ptr == ']'))
2610 {
2611 ptr = oldptr;
2612 goto LONE_SINGLE_CHARACTER;
2613 }
2614
2615 #ifdef SUPPORT_UTF8
2616 if (utf8)
2617 { /* Braces are required because the */
2618 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2619 }
2620 else
2621 #endif
2622 d = *ptr; /* Not UTF-8 mode */
2623
2624 /* The second part of a range can be a single-character escape, but
2625 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2626 in such circumstances. */
2627
2628 if (!inescq && d == '\\')
2629 {
2630 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2631 if (*errorcodeptr != 0) goto FAILED;
2632
2633 /* \b is backspace; \X is literal X; \R is literal R; any other
2634 special means the '-' was literal */
2635
2636 if (d < 0)
2637 {
2638 if (d == -ESC_b) d = '\b';
2639 else if (d == -ESC_X) d = 'X';
2640 else if (d == -ESC_R) d = 'R'; else
2641 {
2642 ptr = oldptr;
2643 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2644 }
2645 }
2646 }
2647
2648 /* Check that the two values are in the correct order. Optimize
2649 one-character ranges */
2650
2651 if (d < c)
2652 {
2653 *errorcodeptr = ERR8;
2654 goto FAILED;
2655 }
2656
2657 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2658
2659 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2660 matching, we have to use an XCLASS with extra data items. Caseless
2661 matching for characters > 127 is available only if UCP support is
2662 available. */
2663
2664 #ifdef SUPPORT_UTF8
2665 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2666 {
2667 class_utf8 = TRUE;
2668
2669 /* With UCP support, we can find the other case equivalents of
2670 the relevant characters. There may be several ranges. Optimize how
2671 they fit with the basic range. */
2672
2673 #ifdef SUPPORT_UCP
2674 if ((options & PCRE_CASELESS) != 0)
2675 {
2676 unsigned int occ, ocd;
2677 unsigned int cc = c;
2678 unsigned int origd = d;
2679 while (get_othercase_range(&cc, origd, &occ, &ocd))
2680 {
2681 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2682
2683 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2684 { /* if there is overlap, */
2685 c = occ; /* noting that if occ < c */
2686 continue; /* we can't have ocd > d */
2687 } /* because a subrange is */
2688 if (ocd > d && occ <= d + 1) /* always shorter than */
2689 { /* the basic range. */
2690 d = ocd;
2691 continue;
2692 }
2693
2694 if (occ == ocd)
2695 {
2696 *class_utf8data++ = XCL_SINGLE;
2697 }
2698 else
2699 {
2700 *class_utf8data++ = XCL_RANGE;
2701 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2702 }
2703 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2704 }
2705 }
2706 #endif /* SUPPORT_UCP */
2707
2708 /* Now record the original range, possibly modified for UCP caseless
2709 overlapping ranges. */
2710
2711 *class_utf8data++ = XCL_RANGE;
2712 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2713 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2714
2715 /* With UCP support, we are done. Without UCP support, there is no
2716 caseless matching for UTF-8 characters > 127; we can use the bit map
2717 for the smaller ones. */
2718
2719 #ifdef SUPPORT_UCP
2720 continue; /* With next character in the class */
2721 #else
2722 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2723
2724 /* Adjust upper limit and fall through to set up the map */
2725
2726 d = 127;
2727
2728 #endif /* SUPPORT_UCP */
2729 }
2730 #endif /* SUPPORT_UTF8 */
2731
2732 /* We use the bit map for all cases when not in UTF-8 mode; else
2733 ranges that lie entirely within 0-127 when there is UCP support; else
2734 for partial ranges without UCP support. */
2735
2736 class_charcount += d - c + 1;
2737 class_lastchar = d;
2738
2739 /* We can save a bit of time by skipping this in the pre-compile. */
2740
2741 if (lengthptr == NULL) for (; c <= d; c++)
2742 {
2743 classbits[c/8] |= (1 << (c&7));
2744 if ((options & PCRE_CASELESS) != 0)
2745 {
2746 int uc = cd->fcc[c]; /* flip case */
2747 classbits[uc/8] |= (1 << (uc&7));
2748 }
2749 }
2750
2751 continue; /* Go get the next char in the class */
2752 }
2753
2754 /* Handle a lone single character - we can get here for a normal
2755 non-escape char, or after \ that introduces a single character or for an
2756 apparent range that isn't. */
2757
2758 LONE_SINGLE_CHARACTER:
2759
2760 /* Handle a character that cannot go in the bit map */
2761
2762 #ifdef SUPPORT_UTF8
2763 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2764 {
2765 class_utf8 = TRUE;
2766 *class_utf8data++ = XCL_SINGLE;
2767 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2768
2769 #ifdef SUPPORT_UCP
2770 if ((options & PCRE_CASELESS) != 0)
2771 {
2772 unsigned int othercase;
2773 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2774 {
2775 *class_utf8data++ = XCL_SINGLE;
2776 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2777 }
2778 }
2779 #endif /* SUPPORT_UCP */
2780
2781 }
2782 else
2783 #endif /* SUPPORT_UTF8 */
2784
2785 /* Handle a single-byte character */
2786 {
2787 classbits[c/8] |= (1 << (c&7));
2788 if ((options & PCRE_CASELESS) != 0)
2789 {
2790 c = cd->fcc[c]; /* flip case */
2791 classbits[c/8] |= (1 << (c&7));
2792 }
2793 class_charcount++;
2794 class_lastchar = c;
2795 }
2796 }
2797
2798 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2799
2800 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2801
2802 if (c == 0) /* Missing terminating ']' */
2803 {
2804 *errorcodeptr = ERR6;
2805 goto FAILED;
2806 }
2807
2808 /* If class_charcount is 1, we saw precisely one character whose value is
2809 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2810 can optimize the negative case only if there were no characters >= 128
2811 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2812 single-bytes only. This is an historical hangover. Maybe one day we can
2813 tidy these opcodes to handle multi-byte characters.
2814
2815 The optimization throws away the bit map. We turn the item into a
2816 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2817 that OP_NOT does not support multibyte characters. In the positive case, it
2818 can cause firstbyte to be set. Otherwise, there can be no first char if
2819 this item is first, whatever repeat count may follow. In the case of
2820 reqbyte, save the previous value for reinstating. */
2821
2822 #ifdef SUPPORT_UTF8
2823 if (class_charcount == 1 &&
2824 (!utf8 ||
2825 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2826
2827 #else
2828 if (class_charcount == 1)
2829 #endif
2830 {
2831 zeroreqbyte = reqbyte;
2832
2833 /* The OP_NOT opcode works on one-byte characters only. */
2834
2835 if (negate_class)
2836 {
2837 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2838 zerofirstbyte = firstbyte;
2839 *code++ = OP_NOT;
2840 *code++ = class_lastchar;
2841 break;
2842 }
2843
2844 /* For a single, positive character, get the value into mcbuffer, and
2845 then we can handle this with the normal one-character code. */
2846
2847 #ifdef SUPPORT_UTF8
2848 if (utf8 && class_lastchar > 127)
2849 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2850 else
2851 #endif
2852 {
2853 mcbuffer[0] = class_lastchar;
2854 mclength = 1;
2855 }
2856 goto ONE_CHAR;
2857 } /* End of 1-char optimization */
2858
2859 /* The general case - not the one-char optimization. If this is the first
2860 thing in the branch, there can be no first char setting, whatever the
2861 repeat count. Any reqbyte setting must remain unchanged after any kind of
2862 repeat. */
2863
2864 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2865 zerofirstbyte = firstbyte;
2866 zeroreqbyte = reqbyte;
2867
2868 /* If there are characters with values > 255, we have to compile an
2869 extended class, with its own opcode. If there are no characters < 256,
2870 we can omit the bitmap in the actual compiled code. */
2871
2872 #ifdef SUPPORT_UTF8
2873 if (class_utf8)
2874 {
2875 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2876 *code++ = OP_XCLASS;
2877 code += LINK_SIZE;
2878 *code = negate_class? XCL_NOT : 0;
2879
2880 /* If the map is required, move up the extra data to make room for it;
2881 otherwise just move the code pointer to the end of the extra data. */
2882
2883 if (class_charcount > 0)
2884 {
2885 *code++ |= XCL_MAP;
2886 memmove(code + 32, code, class_utf8data - code);
2887 memcpy(code, classbits, 32);
2888 code = class_utf8data + 32;
2889 }
2890 else code = class_utf8data;
2891
2892 /* Now fill in the complete length of the item */
2893
2894 PUT(previous, 1, code - previous);
2895 break; /* End of class handling */
2896 }
2897 #endif
2898
2899 /* If there are no characters > 255, negate the 32-byte map if necessary,
2900 and copy it into the code vector. If this is the first thing in the branch,
2901 there can be no first char setting, whatever the repeat count. Any reqbyte
2902 setting must remain unchanged after any kind of repeat. */
2903
2904 if (negate_class)
2905 {
2906 *code++ = OP_NCLASS;
2907 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2908 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2909 }
2910 else
2911 {
2912 *code++ = OP_CLASS;
2913 memcpy(code, classbits, 32);
2914 }
2915 code += 32;
2916 break;
2917
2918
2919 /* ===================================================================*/
2920 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2921 has been tested above. */
2922
2923 case '{':
2924 if (!is_quantifier) goto NORMAL_CHAR;
2925 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2926 if (*errorcodeptr != 0) goto FAILED;
2927 goto REPEAT;
2928
2929 case '*':
2930 repeat_min = 0;
2931 repeat_max = -1;
2932 goto REPEAT;
2933
2934 case '+':
2935 repeat_min = 1;
2936 repeat_max = -1;
2937 goto REPEAT;
2938
2939 case '?':
2940 repeat_min = 0;
2941 repeat_max = 1;
2942
2943 REPEAT:
2944 if (previous == NULL)
2945 {
2946 *errorcodeptr = ERR9;
2947 goto FAILED;
2948 }
2949
2950 if (repeat_min == 0)
2951 {
2952 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2953 reqbyte = zeroreqbyte; /* Ditto */
2954 }
2955
2956 /* Remember whether this is a variable length repeat */
2957
2958 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2959
2960 op_type = 0; /* Default single-char op codes */
2961 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2962
2963 /* Save start of previous item, in case we have to move it up to make space
2964 for an inserted OP_ONCE for the additional '+' extension. */
2965
2966 tempcode = previous;
2967
2968 /* If the next character is '+', we have a possessive quantifier. This
2969 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2970 If the next character is '?' this is a minimizing repeat, by default,
2971 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2972 repeat type to the non-default. */
2973
2974 if (ptr[1] == '+')
2975 {
2976 repeat_type = 0; /* Force greedy */
2977 possessive_quantifier = TRUE;
2978 ptr++;
2979 }
2980 else if (ptr[1] == '?')
2981 {
2982 repeat_type = greedy_non_default;
2983 ptr++;
2984 }
2985 else repeat_type = greedy_default;
2986
2987 /* If previous was a character match, abolish the item and generate a
2988 repeat item instead. If a char item has a minimum of more than one, ensure
2989 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2990 the first thing in a branch because the x will have gone into firstbyte
2991 instead. */
2992
2993 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2994 {
2995 /* Deal with UTF-8 characters that take up more than one byte. It's
2996 easier to write this out separately than try to macrify it. Use c to
2997 hold the length of the character in bytes, plus 0x80 to flag that it's a
2998 length rather than a small character. */
2999
3000 #ifdef SUPPORT_UTF8
3001 if (utf8 && (code[-1] & 0x80) != 0)
3002 {
3003 uschar *lastchar = code - 1;
3004 while((*lastchar & 0xc0) == 0x80) lastchar--;
3005 c = code - lastchar; /* Length of UTF-8 character */
3006 memcpy(utf8_char, lastchar, c); /* Save the char */
3007 c |= 0x80; /* Flag c as a length */
3008 }
3009 else
3010 #endif
3011
3012 /* Handle the case of a single byte - either with no UTF8 support, or
3013 with UTF-8 disabled, or for a UTF-8 character < 128. */
3014
3015 {
3016 c = code[-1];
3017 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3018 }
3019
3020 /* If the repetition is unlimited, it pays to see if the next thing on
3021 the line is something that cannot possibly match this character. If so,
3022 automatically possessifying this item gains some performance in the case
3023 where the match fails. */
3024
3025 if (!possessive_quantifier &&
3026 repeat_max < 0 &&
3027 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3028 options, cd))
3029 {
3030 repeat_type = 0; /* Force greedy */
3031 possessive_quantifier = TRUE;
3032 }
3033
3034 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3035 }
3036
3037 /* If previous was a single negated character ([^a] or similar), we use
3038 one of the special opcodes, replacing it. The code is shared with single-
3039 character repeats by setting op_type to add a suitable offset into
3040 repeat_type. We can also test for auto-possessification. OP_NOT is
3041 currently used only for single-byte chars. */
3042
3043 else if (*previous == OP_NOT)
3044 {
3045 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3046 c = previous[1];
3047 if (!possessive_quantifier &&
3048 repeat_max < 0 &&
3049 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3050 {
3051 repeat_type = 0; /* Force greedy */
3052 possessive_quantifier = TRUE;
3053 }
3054 goto OUTPUT_SINGLE_REPEAT;
3055 }
3056
3057 /* If previous was a character type match (\d or similar), abolish it and
3058 create a suitable repeat item. The code is shared with single-character
3059 repeats by setting op_type to add a suitable offset into repeat_type. Note
3060 that the Unicode property types will be present only when SUPPORT_UCP is
3061 defined, but we don't wrap the little bits of code here because it just
3062 makes it horribly messy. */
3063
3064 else if (*previous < OP_EODN)
3065 {
3066 uschar *oldcode;
3067 int prop_type, prop_value;
3068 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3069 c = *previous;
3070
3071 if (!possessive_quantifier &&
3072 repeat_max < 0 &&
3073 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3074 {
3075 repeat_type = 0; /* Force greedy */
3076 possessive_quantifier = TRUE;
3077 }
3078
3079 OUTPUT_SINGLE_REPEAT:
3080 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3081 {
3082 prop_type = previous[1];
3083 prop_value = previous[2];
3084 }
3085 else prop_type = prop_value = -1;
3086
3087 oldcode = code;
3088 code = previous; /* Usually overwrite previous item */
3089
3090 /* If the maximum is zero then the minimum must also be zero; Perl allows
3091 this case, so we do too - by simply omitting the item altogether. */
3092
3093 if (repeat_max == 0) goto END_REPEAT;
3094
3095 /* All real repeats make it impossible to handle partial matching (maybe
3096 one day we will be able to remove this restriction). */
3097
3098 if (repeat_max != 1) cd->nopartial = TRUE;
3099
3100 /* Combine the op_type with the repeat_type */
3101
3102 repeat_type += op_type;
3103
3104 /* A minimum of zero is handled either as the special case * or ?, or as
3105 an UPTO, with the maximum given. */
3106
3107 if (repeat_min == 0)
3108 {
3109 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3110 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3111 else
3112 {
3113 *code++ = OP_UPTO + repeat_type;
3114 PUT2INC(code, 0, repeat_max);
3115 }
3116 }
3117
3118 /* A repeat minimum of 1 is optimized into some special cases. If the
3119 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3120 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3121 one less than the maximum. */
3122
3123 else if (repeat_min == 1)
3124 {
3125 if (repeat_max == -1)
3126 *code++ = OP_PLUS + repeat_type;
3127 else
3128 {
3129 code = oldcode; /* leave previous item in place */
3130 if (repeat_max == 1) goto END_REPEAT;
3131 *code++ = OP_UPTO + repeat_type;
3132 PUT2INC(code, 0, repeat_max - 1);
3133 }
3134 }
3135
3136 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3137 handled as an EXACT followed by an UPTO. */
3138
3139 else
3140 {
3141 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3142 PUT2INC(code, 0, repeat_min);
3143
3144 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3145 we have to insert the character for the previous code. For a repeated
3146 Unicode property match, there are two extra bytes that define the
3147 required property. In UTF-8 mode, long characters have their length in
3148 c, with the 0x80 bit as a flag. */
3149
3150 if (repeat_max < 0)
3151 {
3152 #ifdef SUPPORT_UTF8
3153 if (utf8 && c >= 128)
3154 {
3155 memcpy(code, utf8_char, c & 7);
3156 code += c & 7;
3157 }
3158 else
3159 #endif
3160 {
3161 *code++ = c;
3162 if (prop_type >= 0)
3163 {
3164 *code++ = prop_type;
3165 *code++ = prop_value;
3166 }
3167 }
3168 *code++ = OP_STAR + repeat_type;
3169 }
3170
3171 /* Else insert an UPTO if the max is greater than the min, again
3172 preceded by the character, for the previously inserted code. If the
3173 UPTO is just for 1 instance, we can use QUERY instead. */
3174
3175 else if (repeat_max != repeat_min)
3176 {
3177 #ifdef SUPPORT_UTF8
3178 if (utf8 && c >= 128)
3179 {
3180 memcpy(code, utf8_char, c & 7);
3181 code += c & 7;
3182 }
3183 else
3184 #endif
3185 *code++ = c;
3186 if (prop_type >= 0)
3187 {
3188 *code++ = prop_type;
3189 *code++ = prop_value;
3190 }
3191 repeat_max -= repeat_min;
3192
3193 if (repeat_max == 1)
3194 {
3195 *code++ = OP_QUERY + repeat_type;
3196 }
3197 else
3198 {
3199 *code++ = OP_UPTO + repeat_type;
3200 PUT2INC(code, 0, repeat_max);
3201 }
3202 }
3203 }
3204
3205 /* The character or character type itself comes last in all cases. */
3206
3207 #ifdef SUPPORT_UTF8
3208 if (utf8 && c >= 128)
3209 {
3210 memcpy(code, utf8_char, c & 7);
3211 code += c & 7;
3212 }
3213 else
3214 #endif
3215 *code++ = c;
3216
3217 /* For a repeated Unicode property match, there are two extra bytes that
3218 define the required property. */
3219
3220 #ifdef SUPPORT_UCP
3221 if (prop_type >= 0)
3222 {
3223 *code++ = prop_type;
3224 *code++ = prop_value;
3225 }
3226 #endif
3227 }
3228
3229 /* If previous was a character class or a back reference, we put the repeat
3230 stuff after it, but just skip the item if the repeat was {0,0}. */
3231
3232 else if (*previous == OP_CLASS ||
3233 *previous == OP_NCLASS ||
3234 #ifdef SUPPORT_UTF8
3235 *previous == OP_XCLASS ||
3236 #endif
3237 *previous == OP_REF)
3238 {
3239 if (repeat_max == 0)
3240 {
3241 code = previous;
3242 goto END_REPEAT;
3243 }
3244
3245 /* All real repeats make it impossible to handle partial matching (maybe
3246 one day we will be able to remove this restriction). */
3247
3248 if (repeat_max != 1) cd->nopartial = TRUE;
3249
3250 if (repeat_min == 0 && repeat_max == -1)
3251 *code++ = OP_CRSTAR + repeat_type;
3252 else if (repeat_min == 1 && repeat_max == -1)
3253 *code++ = OP_CRPLUS + repeat_type;
3254 else if (repeat_min == 0 && repeat_max == 1)
3255 *code++ = OP_CRQUERY + repeat_type;
3256 else
3257 {
3258 *code++ = OP_CRRANGE + repeat_type;
3259 PUT2INC(code, 0, repeat_min);
3260 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3261 PUT2INC(code, 0, repeat_max);
3262 }
3263 }
3264
3265 /* If previous was a bracket group, we may have to replicate it in certain
3266 cases. */
3267
3268 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3269 *previous == OP_ONCE || *previous == OP_COND)
3270 {
3271 register int i;
3272 int ketoffset = 0;
3273 int len = code - previous;
3274 uschar *bralink = NULL;
3275
3276 /* Repeating a DEFINE group is pointless */
3277
3278 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3279 {
3280 *errorcodeptr = ERR55;
3281 goto FAILED;
3282 }
3283
3284 /* This is a paranoid check to stop integer overflow later on */
3285
3286 if (len > MAX_DUPLENGTH)
3287 {
3288 *errorcodeptr = ERR50;
3289 goto FAILED;
3290 }
3291
3292 /* If the maximum repeat count is unlimited, find the end of the bracket
3293 by scanning through from the start, and compute the offset back to it
3294 from the current code pointer. There may be an OP_OPT setting following
3295 the final KET, so we can't find the end just by going back from the code
3296 pointer. */
3297
3298 if (repeat_max == -1)
3299 {
3300 register uschar *ket = previous;
3301 do ket += GET(ket, 1); while (*ket != OP_KET);
3302 ketoffset = code - ket;
3303 }
3304
3305 /* The case of a zero minimum is special because of the need to stick
3306 OP_BRAZERO in front of it, and because the group appears once in the
3307 data, whereas in other cases it appears the minimum number of times. For
3308 this reason, it is simplest to treat this case separately, as otherwise
3309 the code gets far too messy. There are several special subcases when the
3310 minimum is zero. */
3311
3312 if (repeat_min == 0)
3313 {
3314 /* If the maximum is also zero, we just omit the group from the output
3315 altogether. */
3316
3317 if (repeat_max == 0)
3318 {
3319 code = previous;
3320 goto END_REPEAT;
3321 }
3322
3323 /* If the maximum is 1 or unlimited, we just have to stick in the
3324 BRAZERO and do no more at this point. However, we do need to adjust
3325 any OP_RECURSE calls inside the group that refer to the group itself or
3326 any internal or forward referenced group, because the offset is from
3327 the start of the whole regex. Temporarily terminate the pattern while
3328 doing this. */
3329
3330 if (repeat_max <= 1)
3331 {
3332 *code = OP_END;
3333 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3334 memmove(previous+1, previous, len);
3335 code++;
3336 *previous++ = OP_BRAZERO + repeat_type;
3337 }
3338
3339 /* If the maximum is greater than 1 and limited, we have to replicate
3340 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3341 The first one has to be handled carefully because it's the original
3342 copy, which has to be moved up. The remainder can be handled by code
3343 that is common with the non-zero minimum case below. We have to
3344 adjust the value of repeat_max, since one less copy is required. Once
3345 again, we may have to adjust any OP_RECURSE calls inside the group. */
3346
3347 else
3348 {
3349 int offset;
3350 *code = OP_END;
3351 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3352 memmove(previous + 2 + LINK_SIZE, previous, len);
3353 code += 2 + LINK_SIZE;
3354 *previous++ = OP_BRAZERO + repeat_type;
3355 *previous++ = OP_BRA;
3356
3357 /* We chain together the bracket offset fields that have to be
3358 filled in later when the ends of the brackets are reached. */
3359
3360 offset = (bralink == NULL)? 0 : previous - bralink;
3361 bralink = previous;
3362 PUTINC(previous, 0, offset);
3363 }
3364
3365 repeat_max--;
3366 }
3367
3368 /* If the minimum is greater than zero, replicate the group as many
3369 times as necessary, and adjust the maximum to the number of subsequent
3370 copies that we need. If we set a first char from the group, and didn't
3371 set a required char, copy the latter from the former. If there are any
3372 forward reference subroutine calls in the group, there will be entries on
3373 the workspace list; replicate these with an appropriate increment. */
3374
3375 else
3376 {
3377 if (repeat_min > 1)
3378 {
3379 /* In the pre-compile phase, we don't actually do the replication. We
3380 just adjust the length as if we had. */
3381
3382 if (lengthptr != NULL)
3383 *lengthptr += (repeat_min - 1)*length_prevgroup;
3384
3385 /* This is compiling for real */
3386
3387 else
3388 {
3389 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3390 for (i = 1; i < repeat_min; i++)
3391 {
3392 uschar *hc;
3393 uschar *this_hwm = cd->hwm;
3394 memcpy(code, previous, len);
3395 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3396 {
3397 PUT(cd->hwm, 0, GET(hc, 0) + len);
3398 cd->hwm += LINK_SIZE;
3399 }
3400 save_hwm = this_hwm;
3401 code += len;
3402 }
3403 }
3404 }
3405
3406 if (repeat_max > 0) repeat_max -= repeat_min;
3407 }
3408
3409 /* This code is common to both the zero and non-zero minimum cases. If
3410 the maximum is limited, it replicates the group in a nested fashion,
3411 remembering the bracket starts on a stack. In the case of a zero minimum,
3412 the first one was set up above. In all cases the repeat_max now specifies
3413 the number of additional copies needed. Again, we must remember to
3414 replicate entries on the forward reference list. */
3415
3416 if (repeat_max >= 0)
3417 {
3418 /* In the pre-compile phase, we don't actually do the replication. We
3419 just adjust the length as if we had. For each repetition we must add 1
3420 to the length for BRAZERO and for all but the last repetition we must
3421 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3422
3423 if (lengthptr != NULL && repeat_max > 0)
3424 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3425 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3426
3427 /* This is compiling for real */
3428
3429 else for (i = repeat_max - 1; i >= 0; i--)
3430 {
3431 uschar *hc;
3432 uschar *this_hwm = cd->hwm;
3433
3434 *code++ = OP_BRAZERO + repeat_type;
3435
3436 /* All but the final copy start a new nesting, maintaining the
3437 chain of brackets outstanding. */
3438
3439 if (i != 0)
3440 {
3441 int offset;
3442 *code++ = OP_BRA;
3443 offset = (bralink == NULL)? 0 : code - bralink;
3444 bralink = code;
3445 PUTINC(code, 0, offset);
3446 }
3447
3448 memcpy(code, previous, len);
3449 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3450 {
3451 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3452 cd->hwm += LINK_SIZE;
3453 }
3454 save_hwm = this_hwm;
3455 code += len;
3456 }
3457
3458 /* Now chain through the pending brackets, and fill in their length
3459 fields (which are holding the chain links pro tem). */
3460
3461 while (bralink != NULL)
3462 {
3463 int oldlinkoffset;
3464 int offset = code - bralink + 1;
3465 uschar *bra = code - offset;
3466 oldlinkoffset = GET(bra, 1);
3467 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3468 *code++ = OP_KET;
3469 PUTINC(code, 0, offset);
3470 PUT(bra, 1, offset);
3471 }
3472 }
3473
3474 /* If the maximum is unlimited, set a repeater in the final copy. We
3475 can't just offset backwards from the current code point, because we
3476 don't know if there's been an options resetting after the ket. The
3477 correct offset was computed above.
3478
3479 Then, when we are doing the actual compile phase, check to see whether
3480 this group is a non-atomic one that could match an empty string. If so,
3481 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3482 that runtime checking can be done. [This check is also applied to
3483 atomic groups at runtime, but in a different way.] */
3484
3485 else
3486 {
3487 uschar *ketcode = code - ketoffset;
3488 uschar *bracode = ketcode - GET(ketcode, 1);
3489 *ketcode = OP_KETRMAX + repeat_type;
3490 if (lengthptr == NULL && *bracode != OP_ONCE)
3491 {
3492 uschar *scode = bracode;
3493 do
3494 {
3495 if (could_be_empty_branch(scode, ketcode, utf8))
3496 {
3497 *bracode += OP_SBRA - OP_BRA;
3498 break;
3499 }
3500 scode += GET(scode, 1);
3501 }
3502 while (*scode == OP_ALT);
3503 }
3504 }
3505 }
3506
3507 /* Else there's some kind of shambles */
3508
3509 else
3510 {
3511 *errorcodeptr = ERR11;
3512 goto FAILED;
3513 }
3514
3515 /* If the character following a repeat is '+', or if certain optimization
3516 tests above succeeded, possessive_quantifier is TRUE. For some of the
3517 simpler opcodes, there is a special alternative opcode for this. For
3518 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3519 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3520 but the special opcodes can optimize it a bit. The repeated item starts at
3521 tempcode, not at previous, which might be the first part of a string whose
3522 (former) last char we repeated.
3523
3524 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3525 an 'upto' may follow. We skip over an 'exact' item, and then test the
3526 length of what remains before proceeding. */
3527
3528 if (possessive_quantifier)
3529 {
3530 int len;
3531 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3532 *tempcode == OP_NOTEXACT)
3533 tempcode += _pcre_OP_lengths[*tempcode];
3534 len = code - tempcode;
3535 if (len > 0) switch (*tempcode)
3536 {
3537 case OP_STAR: *tempcode = OP_POSSTAR; break;
3538 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3539 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3540 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3541
3542 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3543 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3544 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3545 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3546
3547 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3548 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3549 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3550 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3551
3552 default:
3553 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3554 code += 1 + LINK_SIZE;
3555 len += 1 + LINK_SIZE;
3556 tempcode[0] = OP_ONCE;
3557 *code++ = OP_KET;
3558 PUTINC(code, 0, len);
3559 PUT(tempcode, 1, len);
3560 break;
3561 }
3562 }
3563
3564 /* In all cases we no longer have a previous item. We also set the
3565 "follows varying string" flag for subsequently encountered reqbytes if
3566 it isn't already set and we have just passed a varying length item. */
3567
3568 END_REPEAT:
3569 previous = NULL;
3570 cd->req_varyopt |= reqvary;
3571 break;
3572
3573
3574 /* ===================================================================*/
3575 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3576 lookbehind or option setting or condition or all the other extended
3577 parenthesis forms. First deal with the specials; all are introduced by ?,
3578 and the appearance of any of them means that this is not a capturing
3579 group. */
3580
3581 case '(':
3582 newoptions = options;
3583 skipbytes = 0;
3584 bravalue = OP_CBRA;
3585 save_hwm = cd->hwm;
3586
3587 if (*(++ptr) == '?')
3588 {
3589 int i, set, unset, namelen;
3590 int *optset;
3591 const uschar *name;
3592 uschar *slot;
3593
3594 switch (*(++ptr))
3595 {
3596 case '#': /* Comment; skip to ket */
3597 ptr++;
3598 while (*ptr != 0 && *ptr != ')') ptr++;
3599 if (*ptr == 0)
3600 {
3601 *errorcodeptr = ERR18;
3602 goto FAILED;
3603 }
3604 continue;
3605
3606
3607 /* ------------------------------------------------------------ */
3608 case ':': /* Non-capturing bracket */
3609 bravalue = OP_BRA;
3610 ptr++;
3611 break;
3612
3613
3614 /* ------------------------------------------------------------ */
3615 case '(':
3616 bravalue = OP_COND; /* Conditional group */
3617
3618 /* A condition can be an assertion, a number (referring to a numbered
3619 group), a name (referring to a named group), or 'R', referring to
3620 recursion. R<digits> and R&name are also permitted for recursion tests.
3621
3622 There are several syntaxes for testing a named group: (?(name)) is used
3623 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3624
3625 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3626 be the recursive thing or the name 'R' (and similarly for 'R' followed
3627 by digits), and (b) a number could be a name that consists of digits.
3628 In both cases, we look for a name first; if not found, we try the other
3629 cases. */
3630
3631 /* For conditions that are assertions, check the syntax, and then exit
3632 the switch. This will take control down to where bracketed groups,
3633 including assertions, are processed. */
3634
3635 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3636 break;
3637
3638 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3639 below), and all need to skip 3 bytes at the start of the group. */
3640
3641 code[1+LINK_SIZE] = OP_CREF;
3642 skipbytes = 3;
3643 refsign = -1;
3644
3645 /* Check for a test for recursion in a named group. */
3646
3647 if (ptr[1] == 'R' && ptr[2] == '&')
3648 {
3649 terminator = -1;
3650 ptr += 2;
3651 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3652 }
3653
3654 /* Check for a test for a named group's having been set, using the Perl
3655 syntax (?(<name>) or (?('name') */
3656
3657 else if (ptr[1] == '<')
3658 {
3659 terminator = '>';
3660 ptr++;
3661 }
3662 else if (ptr[1] == '\'')
3663 {
3664 terminator = '\'';
3665 ptr++;
3666 }
3667 else
3668 {
3669 terminator = 0;
3670 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3671 }
3672
3673 /* We now expect to read a name; anything else is an error */
3674
3675 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3676 {
3677 ptr += 1; /* To get the right offset */
3678 *errorcodeptr = ERR28;
3679 goto FAILED;
3680 }
3681
3682 /* Read the name, but also get it as a number if it's all digits */
3683
3684 recno = 0;
3685 name = ++ptr;
3686 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3687 {
3688 if (recno >= 0)
3689 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3690 recno * 10 + *ptr - '0' : -1;
3691 ptr++;
3692 }
3693 namelen = ptr - name;
3694
3695 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3696 {
3697 ptr--; /* Error offset */
3698 *errorcodeptr = ERR26;
3699 goto FAILED;
3700 }
3701
3702 /* Do no further checking in the pre-compile phase. */
3703
3704 if (lengthptr != NULL) break;
3705
3706 /* In the real compile we do the work of looking for the actual
3707 reference. If the string started with "+" or "-" we require the rest to
3708 be digits, in which case recno will be set. */
3709
3710 if (refsign > 0)
3711 {
3712 if (recno <= 0)
3713 {
3714 *errorcodeptr = ERR58;
3715 goto FAILED;
3716 }
3717 if (refsign == '-')
3718 {
3719 recno = cd->bracount - recno + 1;
3720 if (recno <= 0)
3721 {
3722 *errorcodeptr = ERR15;
3723 goto FAILED;
3724 }
3725 }
3726 else recno += cd->bracount;
3727 PUT2(code, 2+LINK_SIZE, recno);
3728 break;
3729 }
3730
3731 /* Otherwise (did not start with "+" or "-"), start by looking for the
3732 name. */
3733
3734 slot = cd->name_table;
3735 for (i = 0; i < cd->names_found; i++)
3736 {
3737 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3738 slot += cd->name_entry_size;
3739 }
3740
3741 /* Found a previous named subpattern */
3742
3743 if (i < cd->names_found)
3744 {
3745 recno = GET2(slot, 0);
3746 PUT2(code, 2+LINK_SIZE, recno);
3747 }
3748
3749 /* Search the pattern for a forward reference */
3750
3751 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3752 (options & PCRE_EXTENDED) != 0)) > 0)
3753 {
3754 PUT2(code, 2+LINK_SIZE, i);
3755 }
3756
3757 /* If terminator == 0 it means that the name followed directly after
3758 the opening parenthesis [e.g. (?(abc)...] and in this case there are
3759 some further alternatives to try. For the cases where terminator != 0
3760 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3761 now checked all the possibilities, so give an error. */
3762
3763 else if (terminator != 0)
3764 {
3765 *errorcodeptr = ERR15;
3766 goto FAILED;
3767 }
3768
3769 /* Check for (?(R) for recursion. Allow digits after R to specify a
3770 specific group number. */
3771
3772 else if (*name == 'R')
3773 {
3774 recno = 0;
3775 for (i = 1; i < namelen; i++)
3776 {
3777 if ((digitab[name[i]] & ctype_digit) == 0)
3778 {
3779 *errorcodeptr = ERR15;
3780 goto FAILED;
3781 }
3782 recno = recno * 10 + name[i] - '0';
3783 }
3784 if (recno == 0) recno = RREF_ANY;
3785 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3786 PUT2(code, 2+LINK_SIZE, recno);
3787 }
3788
3789 /* Similarly, check for the (?(DEFINE) "condition", which is always
3790 false. */
3791
3792 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3793 {
3794 code[1+LINK_SIZE] = OP_DEF;
3795 skipbytes = 1;
3796 }
3797
3798 /* Check for the "name" actually being a subpattern number. */
3799
3800 else if (recno > 0)
3801 {
3802 PUT2(code, 2+LINK_SIZE, recno);
3803 }
3804
3805 /* Either an unidentified subpattern, or a reference to (?(0) */
3806
3807 else
3808 {
3809 *errorcodeptr = (recno == 0)? ERR35: ERR15;
3810 goto FAILED;
3811 }
3812 break;
3813
3814
3815 /* ------------------------------------------------------------ */
3816 case '=': /* Positive lookahead */
3817 bravalue = OP_ASSERT;
3818 ptr++;
3819 break;
3820
3821
3822 /* ------------------------------------------------------------ */
3823 case '!': /* Negative lookahead */
3824 bravalue = OP_ASSERT_NOT;
3825 ptr++;
3826 break;
3827
3828
3829 /* ------------------------------------------------------------ */
3830 case '<': /* Lookbehind or named define */
3831 switch (ptr[1])
3832 {
3833 case '=': /* Positive lookbehind */
3834 bravalue = OP_ASSERTBACK;
3835 ptr += 2;
3836 break;
3837
3838 case '!': /* Negative lookbehind */
3839 bravalue = OP_ASSERTBACK_NOT;
3840 ptr += 2;
3841 break;
3842
3843 default: /* Could be name define, else bad */
3844 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3845 ptr++; /* Correct offset for error */
3846 *errorcodeptr = ERR24;
3847 goto FAILED;
3848 }
3849 break;
3850
3851
3852 /* ------------------------------------------------------------ */
3853 case '>': /* One-time brackets */
3854 bravalue = OP_ONCE;
3855 ptr++;
3856 break;
3857
3858
3859 /* ------------------------------------------------------------ */
3860 case 'C': /* Callout - may be followed by digits; */
3861 previous_callout = code; /* Save for later completion */
3862 after_manual_callout = 1; /* Skip one item before completing */
3863 *code++ = OP_CALLOUT;
3864 {
3865 int n = 0;
3866 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3867 n = n * 10 + *ptr - '0';
3868 if (*ptr != ')')
3869 {
3870 *errorcodeptr = ERR39;
3871 goto FAILED;
3872 }
3873 if (n > 255)
3874 {
3875 *errorcodeptr = ERR38;
3876 goto FAILED;
3877 }
3878 *code++ = n;
3879 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3880 PUT(code, LINK_SIZE, 0); /* Default length */
3881 code += 2 * LINK_SIZE;
3882 }
3883 previous = NULL;
3884 continue;
3885
3886
3887 /* ------------------------------------------------------------ */
3888 case 'P': /* Python-style named subpattern handling */
3889 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3890 {
3891 is_recurse = *ptr == '>';
3892 terminator = ')';
3893 goto NAMED_REF_OR_RECURSE;
3894 }
3895 else if (*ptr != '<') /* Test for Python-style definition */
3896 {
3897 *errorcodeptr = ERR41;
3898 goto FAILED;
3899 }
3900 /* Fall through to handle (?P< as (?< is handled */
3901
3902
3903 /* ------------------------------------------------------------ */
3904 DEFINE_NAME: /* Come here from (?< handling */
3905 case '\'':
3906 {
3907 terminator = (*ptr == '<')? '>' : '\'';
3908 name = ++ptr;
3909
3910 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3911 namelen = ptr - name;
3912
3913 /* In the pre-compile phase, just do a syntax check. */
3914
3915 if (lengthptr != NULL)
3916 {
3917 if (*ptr != terminator)
3918 {
3919 *errorcodeptr = ERR42;
3920 goto FAILED;
3921 }
3922 if (cd->names_found >= MAX_NAME_COUNT)
3923 {
3924 *errorcodeptr = ERR49;
3925 goto FAILED;
3926 }
3927 if (namelen + 3 > cd->name_entry_size)
3928 {
3929 cd->name_entry_size = namelen + 3;
3930 if (namelen > MAX_NAME_SIZE)
3931 {
3932 *errorcodeptr = ERR48;
3933 goto FAILED;
3934 }
3935 }
3936 }
3937
3938 /* In the real compile, create the entry in the table */
3939
3940 else
3941 {
3942 slot = cd->name_table;
3943 for (i = 0; i < cd->names_found; i++)
3944 {
3945 int crc = memcmp(name, slot+2, namelen);
3946 if (crc == 0)
3947 {
3948 if (slot[2+namelen] == 0)
3949 {
3950 if ((options & PCRE_DUPNAMES) == 0)
3951 {
3952 *errorcodeptr = ERR43;
3953 goto FAILED;
3954 }
3955 }
3956 else crc = -1; /* Current name is substring */
3957 }
3958 if (crc < 0)
3959 {
3960 memmove(slot + cd->name_entry_size, slot,
3961 (cd->names_found - i) * cd->name_entry_size);
3962 break;
3963 }
3964 slot += cd->name_entry_size;
3965 }
3966
3967 PUT2(slot, 0, cd->bracount + 1);
3968 memcpy(slot + 2, name, namelen);
3969 slot[2+namelen] = 0;
3970 }
3971 }
3972
3973 /* In both cases, count the number of names we've encountered. */
3974
3975 ptr++; /* Move past > or ' */
3976 cd->names_found++;
3977 goto NUMBERED_GROUP;
3978
3979
3980 /* ------------------------------------------------------------ */
3981 case '&': /* Perl recursion/subroutine syntax */
3982 terminator = ')';
3983 is_recurse = TRUE;
3984 /* Fall through */
3985
3986 /* We come here from the Python syntax above that handles both
3987 references (?P=name) and recursion (?P>name), as well as falling
3988 through from the Perl recursion syntax (?&name). */
3989
3990 NAMED_REF_OR_RECURSE:
3991 name = ++ptr;
3992 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3993 namelen = ptr - name;
3994
3995 /* In the pre-compile phase, do a syntax check and set a dummy
3996 reference number. */
3997
3998 if (lengthptr != NULL)
3999 {
4000 if (*ptr != terminator)
4001 {
4002 *errorcodeptr = ERR42;
4003 goto FAILED;
4004 }
4005 if (namelen > MAX_NAME_SIZE)
4006 {
4007 *errorcodeptr = ERR48;
4008 goto FAILED;
4009 }
4010 recno = 0;
4011 }
4012
4013 /* In the real compile, seek the name in the table */
4014
4015 else
4016 {
4017 slot = cd->name_table;
4018 for (i = 0; i < cd->names_found; i++)
4019 {
4020 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4021 slot += cd->name_entry_size;
4022 }
4023
4024 if (i < cd->names_found) /* Back reference */
4025 {
4026 recno = GET2(slot, 0);
4027 }
4028 else if ((recno = /* Forward back reference */
4029 find_parens(ptr, cd->bracount, name, namelen,
4030 (options & PCRE_EXTENDED) != 0)) <= 0)
4031 {
4032 *errorcodeptr = ERR15;
4033 goto FAILED;
4034 }
4035 }
4036
4037 /* In both phases, we can now go to the code that handles numerical
4038 recursion or backreferences. */
4039
4040 if (is_recurse) goto HANDLE_RECURSION;
4041 else goto HANDLE_REFERENCE;
4042
4043
4044 /* ------------------------------------------------------------ */
4045 case 'R': /* Recursion */
4046 ptr++; /* Same as (?0) */
4047 /* Fall through */
4048
4049
4050 /* ------------------------------------------------------------ */
4051 case '-': case '+':
4052 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4053 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4054 {
4055 const uschar *called;
4056
4057 if ((refsign = *ptr) == '+') ptr++;
4058 else if (refsign == '-')
4059 {
4060 if ((digitab[ptr[1]] & ctype_digit) == 0)
4061 goto OTHER_CHAR_AFTER_QUERY;
4062 ptr++;
4063 }
4064
4065 recno = 0;
4066 while((digitab[*ptr] & ctype_digit) != 0)
4067 recno = recno * 10 + *ptr++ - '0';
4068
4069 if (*ptr != ')')
4070 {
4071 *errorcodeptr = ERR29;
4072 goto FAILED;
4073 }
4074
4075 if (refsign == '-')
4076 {
4077 if (recno == 0)
4078 {
4079 *errorcodeptr = ERR58;
4080 goto FAILED;
4081 }
4082 recno = cd->bracount - recno + 1;
4083 if (recno <= 0)
4084 {
4085 *errorcodeptr = ERR15;
4086 goto FAILED;
4087 }
4088 }
4089 else if (refsign == '+')
4090 {
4091 if (recno == 0)
4092 {
4093 *errorcodeptr = ERR58;
4094 goto FAILED;
4095 }
4096 recno += cd->bracount;
4097 }
4098
4099 /* Come here from code above that handles a named recursion */
4100
4101 HANDLE_RECURSION:
4102
4103 previous = code;
4104 called = cd->start_code;
4105
4106 /* When we are actually compiling, find the bracket that is being
4107 referenced. Temporarily end the regex in case it doesn't exist before
4108 this point. If we end up with a forward reference, first check that
4109 the bracket does occur later so we can give the error (and position)
4110 now. Then remember this forward reference in the workspace so it can
4111 be filled in at the end. */
4112
4113 if (lengthptr == NULL)
4114 {
4115 *code = OP_END;
4116 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4117
4118 /* Forward reference */
4119
4120 if (called == NULL)
4121 {
4122 if (find_parens(ptr, cd->bracount, NULL, recno,
4123 (options & PCRE_EXTENDED) != 0) < 0)
4124 {
4125 *errorcodeptr = ERR15;
4126 goto FAILED;
4127 }
4128 called = cd->start_code + recno;
4129 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4130 }
4131
4132 /* If not a forward reference, and the subpattern is still open,
4133 this is a recursive call. We check to see if this is a left
4134 recursion that could loop for ever, and diagnose that case. */
4135
4136 else if (GET(called, 1) == 0 &&
4137 could_be_empty(called, code, bcptr, utf8))
4138 {
4139 *errorcodeptr = ERR40;
4140 goto FAILED;
4141 }
4142 }
4143
4144 /* Insert the recursion/subroutine item, automatically wrapped inside
4145 "once" brackets. Set up a "previous group" length so that a
4146 subsequent quantifier will work. */
4147
4148 *code = OP_ONCE;
4149 PUT(code, 1, 2 + 2*LINK_SIZE);
4150 code += 1 + LINK_SIZE;
4151
4152 *code = OP_RECURSE;
4153 PUT(code, 1, called - cd->start_code);
4154 code += 1 + LINK_SIZE;
4155
4156 *code = OP_KET;
4157 PUT(code, 1, 2 + 2*LINK_SIZE);
4158 code += 1 + LINK_SIZE;
4159
4160 length_prevgroup = 3 + 3*LINK_SIZE;
4161 }
4162
4163 /* Can't determine a first byte now */
4164
4165 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4166 continue;
4167
4168
4169 /* ------------------------------------------------------------ */
4170 default: /* Other characters: check option setting */
4171 OTHER_CHAR_AFTER_QUERY:
4172 set = unset = 0;
4173 optset = &set;
4174
4175 while (*ptr != ')' && *ptr != ':')
4176 {
4177 switch (*ptr++)
4178 {
4179 case '-': optset = &unset; break;
4180
4181 case 'J': /* Record that it changed in the external options */
4182 *optset |= PCRE_DUPNAMES;
4183 cd->external_options |= PCRE_JCHANGED;
4184 break;
4185
4186 case 'i': *optset |= PCRE_CASELESS; break;
4187 case 'm': *optset |= PCRE_MULTILINE; break;
4188 case 's': *optset |= PCRE_DOTALL; break;
4189 case 'x': *optset |= PCRE_EXTENDED; break;
4190 case 'U': *optset |= PCRE_UNGREEDY; break;
4191 case 'X': *optset |= PCRE_EXTRA; break;
4192
4193 default: *errorcodeptr = ERR12;
4194 ptr--; /* Correct the offset */
4195 goto FAILED;
4196 }
4197 }
4198
4199 /* Set up the changed option bits, but don't change anything yet. */
4200
4201 newoptions = (options | set) & (~unset);
4202
4203 /* If the options ended with ')' this is not the start of a nested
4204 group with option changes, so the options change at this level. If this
4205 item is right at the start of the pattern, the options can be
4206 abstracted and made external in the pre-compile phase, and ignored in
4207 the compile phase. This can be helpful when matching -- for instance in
4208 caseless checking of required bytes.
4209
4210 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4211 definitely *not* at the start of the pattern because something has been
4212 compiled. In the pre-compile phase, however, the code pointer can have
4213 that value after the start, because it gets reset as code is discarded
4214 during the pre-compile. However, this can happen only at top level - if
4215 we are within parentheses, the starting BRA will still be present. At
4216 any parenthesis level, the length value can be used to test if anything
4217 has been compiled at that level. Thus, a test for both these conditions
4218 is necessary to ensure we correctly detect the start of the pattern in
4219 both phases.
4220
4221 If we are not at the pattern start, compile code to change the ims
4222 options if this setting actually changes any of them. We also pass the
4223 new setting back so that it can be put at the start of any following
4224 branches, and when this group ends (if we are in a group), a resetting
4225 item can be compiled. */
4226
4227 if (*ptr == ')')
4228 {
4229 if (code == cd->start_code + 1 + LINK_SIZE &&
4230 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4231 {
4232 cd->external_options = newoptions;
4233 options = newoptions;
4234 }
4235 else
4236 {
4237 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4238 {
4239 *code++ = OP_OPT;
4240 *code++ = newoptions & PCRE_IMS;
4241 }
4242
4243 /* Change options at this level, and pass them back for use
4244 in subsequent branches. Reset the greedy defaults and the case
4245 value for firstbyte and reqbyte. */
4246
4247 *optionsptr = options = newoptions;
4248 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4249 greedy_non_default = greedy_default ^ 1;
4250 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4251 }
4252
4253 previous = NULL; /* This item can't be repeated */
4254 continue; /* It is complete */
4255 }
4256
4257 /* If the options ended with ':' we are heading into a nested group
4258 with possible change of options. Such groups are non-capturing and are
4259 not assertions of any kind. All we need to do is skip over the ':';
4260 the newoptions value is handled below. */
4261
4262 bravalue = OP_BRA;
4263 ptr++;
4264 } /* End of switch for character following (? */
4265 } /* End of (? handling */
4266
4267 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4268 all unadorned brackets become non-capturing and behave like (?:...)
4269 brackets. */
4270
4271 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4272 {
4273 bravalue = OP_BRA;
4274 }
4275
4276 /* Else we have a capturing group. */
4277
4278 else
4279 {
4280 NUMBERED_GROUP:
4281 cd->bracount += 1;
4282 PUT2(code, 1+LINK_SIZE, cd->bracount);
4283 skipbytes = 2;
4284 }
4285
4286 /* Process nested bracketed regex. Assertions may not be repeated, but
4287 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4288 non-register variable in order to be able to pass its address because some
4289 compilers complain otherwise. Pass in a new setting for the ims options if
4290 they have changed. */
4291
4292 previous = (bravalue >= OP_ONCE)? code : NULL;
4293 *code = bravalue;
4294 tempcode = code;
4295 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4296 length_prevgroup = 0; /* Initialize for pre-compile phase */
4297
4298 if (!compile_regex(
4299 newoptions, /* The complete new option state */
4300 options & PCRE_IMS, /* The previous ims option state */
4301 &tempcode, /* Where to put code (updated) */
4302 &ptr, /* Input pointer (updated) */
4303 errorcodeptr, /* Where to put an error message */
4304 (bravalue == OP_ASSERTBACK ||
4305 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4306 skipbytes, /* Skip over bracket number */
4307 &subfirstbyte, /* For possible first char */
4308 &subreqbyte, /* For possible last char */
4309 bcptr, /* Current branch chain */
4310 cd, /* Tables block */
4311 (lengthptr == NULL)? NULL : /* Actual compile phase */
4312 &length_prevgroup /* Pre-compile phase */
4313 ))
4314 goto FAILED;
4315
4316 /* At the end of compiling, code is still pointing to the start of the
4317 group, while tempcode has been updated to point past the end of the group
4318 and any option resetting that may follow it. The pattern pointer (ptr)
4319 is on the bracket. */
4320
4321 /* If this is a conditional bracket, check that there are no more than
4322 two branches in the group, or just one if it's a DEFINE group. */
4323
4324 if (bravalue == OP_COND)
4325 {
4326 uschar *tc = code;
4327 int condcount = 0;
4328
4329 do {
4330 condcount++;
4331 tc += GET(tc,1);
4332 }
4333 while (*tc != OP_KET);
4334
4335 /* A DEFINE group is never obeyed inline (the "condition" is always
4336 false). It must have only one branch. */
4337
4338 if (code[LINK_SIZE+1] == OP_DEF)
4339 {
4340 if (condcount > 1)
4341 {
4342 *errorcodeptr = ERR54;
4343 goto FAILED;
4344 }
4345 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4346 }
4347
4348 /* A "normal" conditional group. If there is just one branch, we must not
4349 make use of its firstbyte or reqbyte, because this is equivalent to an
4350 empty second branch. */
4351
4352 else
4353 {
4354 if (condcount > 2)
4355 {
4356 *errorcodeptr = ERR27;
4357 goto FAILED;
4358 }
4359 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4360 }
4361 }
4362
4363 /* Error if hit end of pattern */
4364
4365 if (*ptr != ')')
4366 {
4367 *errorcodeptr = ERR14;
4368 goto FAILED;
4369 }
4370
4371 /* In the pre-compile phase, update the length by the length of the nested
4372 group, less the brackets at either end. Then reduce the compiled code to
4373 just the brackets so that it doesn't use much memory if it is duplicated by
4374 a quantifier. */
4375
4376 if (lengthptr != NULL)
4377 {
4378 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4379 code++;
4380 PUTINC(code, 0, 1 + LINK_SIZE);
4381 *code++ = OP_KET;
4382 PUTINC(code, 0, 1 + LINK_SIZE);
4383 }
4384
4385 /* Otherwise update the main code pointer to the end of the group. */
4386
4387 else code = tempcode;
4388
4389 /* For a DEFINE group, required and first character settings are not
4390 relevant. */
4391
4392 if (bravalue == OP_DEF) break;
4393
4394 /* Handle updating of the required and first characters for other types of
4395 group. Update for normal brackets of all kinds, and conditions with two
4396 branches (see code above). If the bracket is followed by a quantifier with
4397 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4398 zerofirstbyte outside the main loop so that they can be accessed for the
4399 back off. */
4400
4401 zeroreqbyte = reqbyte;
4402 zerofirstbyte = firstbyte;
4403 groupsetfirstbyte = FALSE;
4404
4405 if (bravalue >= OP_ONCE)
4406 {
4407 /* If we have not yet set a firstbyte in this branch, take it from the
4408 subpattern, remembering that it was set here so that a repeat of more
4409 than one can replicate it as reqbyte if necessary. If the subpattern has
4410 no firstbyte, set "none" for the whole branch. In both cases, a zero
4411 repeat forces firstbyte to "none". */
4412
4413 if (firstbyte == REQ_UNSET)
4414 {
4415 if (subfirstbyte >= 0)
4416 {
4417 firstbyte = subfirstbyte;
4418 groupsetfirstbyte = TRUE;
4419 }
4420 else firstbyte = REQ_NONE;
4421 zerofirstbyte = REQ_NONE;
4422 }
4423
4424 /* If firstbyte was previously set, convert the subpattern's firstbyte
4425 into reqbyte if there wasn't one, using the vary flag that was in
4426 existence beforehand. */
4427
4428 else if (subfirstbyte >= 0 && subreqbyte < 0)
4429 subreqbyte = subfirstbyte | tempreqvary;
4430
4431 /* If the subpattern set a required byte (or set a first byte that isn't
4432 really the first byte - see above), set it. */
4433
4434 if (subreqbyte >= 0) reqbyte = subreqbyte;
4435 }
4436
4437 /* For a forward assertion, we take the reqbyte, if set. This can be
4438 helpful if the pattern that follows the assertion doesn't set a different
4439 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4440 for an assertion, however because it leads to incorrect effect for patterns
4441 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4442 of a firstbyte. This is overcome by a scan at the end if there's no
4443 firstbyte, looking for an asserted first char. */
4444
4445 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4446 break; /* End of processing '(' */
4447
4448
4449 /* ===================================================================*/
4450 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4451 are arranged to be the negation of the corresponding OP_values. For the
4452 back references, the values are ESC_REF plus the reference number. Only
4453 back references and those types that consume a character may be repeated.
4454 We can test for values between ESC_b and ESC_Z for the latter; this may
4455 have to change if any new ones are ever created. */
4456
4457 case '\\':
4458 tempptr = ptr;
4459 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4460 if (*errorcodeptr != 0) goto FAILED;
4461
4462 if (c < 0)
4463 {
4464 if (-c == ESC_Q) /* Handle start of quoted string */
4465 {
4466 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4467 else inescq = TRUE;
4468 continue;
4469 }
4470
4471 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4472
4473 /* For metasequences that actually match a character, we disable the
4474 setting of a first character if it hasn't already been set. */
4475
4476 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4477 firstbyte = REQ_NONE;
4478
4479 /* Set values to reset to if this is followed by a zero repeat. */
4480
4481 zerofirstbyte = firstbyte;
4482 zeroreqbyte = reqbyte;
4483
4484 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4485 We also support \k{name} (.NET syntax) */
4486
4487 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4488 {
4489 is_recurse = FALSE;
4490 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4491 goto NAMED_REF_OR_RECURSE;
4492 }
4493
4494 /* Back references are handled specially; must disable firstbyte if
4495 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4496 ':' later. */
4497
4498 if (-c >= ESC_REF)
4499 {
4500 recno = -c - ESC_REF;
4501
4502 HANDLE_REFERENCE: /* Come here from named backref handling */
4503 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4504 previous = code;
4505 *code++ = OP_REF;
4506 PUT2INC(code, 0, recno);
4507 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4508 if (recno > cd->top_backref) cd->top_backref = recno;
4509 }
4510
4511 /* So are Unicode property matches, if supported. */
4512
4513 #ifdef SUPPORT_UCP
4514 else if (-c == ESC_P || -c == ESC_p)
4515 {
4516 BOOL negated;
4517 int pdata;
4518 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4519 if (ptype < 0) goto FAILED;
4520 previous = code;
4521 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4522 *code++ = ptype;
4523 *code++ = pdata;
4524 }
4525 #else
4526
4527 /* If Unicode properties are not supported, \X, \P, and \p are not
4528 allowed. */
4529
4530 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4531 {
4532 *errorcodeptr = ERR45;
4533 goto FAILED;
4534 }
4535 #endif
4536
4537 /* For the rest (including \X when Unicode properties are supported), we
4538 can obtain the OP value by negating the escape value. */
4539
4540 else
4541 {
4542 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4543 *code++ = -c;
4544 }
4545 continue;
4546 }
4547
4548 /* We have a data character whose value is in c. In UTF-8 mode it may have
4549 a value > 127. We set its representation in the length/buffer, and then
4550 handle it as a data character. */
4551
4552 #ifdef SUPPORT_UTF8
4553 if (utf8 && c > 127)
4554 mclength = _pcre_ord2utf8(c, mcbuffer);
4555 else
4556 #endif
4557
4558 {
4559 mcbuffer[0] = c;
4560 mclength = 1;
4561 }
4562 goto ONE_CHAR;
4563
4564
4565 /* ===================================================================*/
4566 /* Handle a literal character. It is guaranteed not to be whitespace or #
4567 when the extended flag is set. If we are in UTF-8 mode, it may be a
4568 multi-byte literal character. */
4569
4570 default:
4571 NORMAL_CHAR:
4572 mclength = 1;
4573 mcbuffer[0] = c;
4574
4575 #ifdef SUPPORT_UTF8
4576 if (utf8 && c >= 0xc0)
4577 {
4578 while ((ptr[1] & 0xc0) == 0x80)
4579 mcbuffer[mclength++] = *(++ptr);
4580 }
4581 #endif
4582
4583 /* At this point we have the character's bytes in mcbuffer, and the length
4584 in mclength. When not in UTF-8 mode, the length is always 1. */
4585
4586 ONE_CHAR:
4587 previous = code;
4588 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4589 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4590
4591 /* Set the first and required bytes appropriately. If no previous first
4592 byte, set it from this character, but revert to none on a zero repeat.
4593 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4594 repeat. */
4595
4596 if (firstbyte == REQ_UNSET)
4597 {
4598 zerofirstbyte = REQ_NONE;
4599 zeroreqbyte = reqbyte;
4600
4601 /* If the character is more than one byte long, we can set firstbyte
4602 only if it is not to be matched caselessly. */
4603
4604 if (mclength == 1 || req_caseopt == 0)
4605 {
4606 firstbyte = mcbuffer[0] | req_caseopt;
4607 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4608 }
4609 else firstbyte = reqbyte = REQ_NONE;
4610 }
4611
4612 /* firstbyte was previously set; we can set reqbyte only if the length is
4613 1 or the matching is caseful. */
4614
4615 else
4616 {
4617 zerofirstbyte = firstbyte;
4618 zeroreqbyte = reqbyte;
4619 if (mclength == 1 || req_caseopt == 0)
4620 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4621 }
4622
4623 break; /* End of literal character handling */
4624 }
4625 } /* end of big loop */
4626
4627
4628 /* Control never reaches here by falling through, only by a goto for all the
4629 error states. Pass back the position in the pattern so that it can be displayed
4630 to the user for diagnosing the error. */
4631
4632 FAILED:
4633 *ptrptr = ptr;
4634 return FALSE;
4635 }
4636
4637
4638
4639
4640 /*************************************************
4641 * Compile sequence of alternatives *
4642 *************************************************/
4643
4644 /* On entry, ptr is pointing past the bracket character, but on return it
4645 points to the closing bracket, or vertical bar, or end of string. The code
4646 variable is pointing at the byte into which the BRA operator has been stored.
4647 If the ims options are changed at the start (for a (?ims: group) or during any
4648 branch, we need to insert an OP_OPT item at the start of every following branch
4649 to ensure they get set correctly at run time, and also pass the new options
4650 into every subsequent branch compile.
4651
4652 This function is used during the pre-compile phase when we are trying to find
4653 out the amount of memory needed, as well as during the real compile phase. The
4654 value of lengthptr distinguishes the two phases.
4655
4656 Argument:
4657 options option bits, including any changes for this subpattern
4658 oldims previous settings of ims option bits
4659 codeptr -> the address of the current code pointer
4660 ptrptr -> the address of the current pattern pointer
4661 errorcodeptr -> pointer to error code variable
4662 lookbehind TRUE if this is a lookbehind assertion
4663 skipbytes skip this many bytes at start (for brackets and OP_COND)
4664 firstbyteptr place to put the first required character, or a negative number
4665 reqbyteptr place to put the last required character, or a negative number
4666 bcptr pointer to the chain of currently open branches
4667 cd points to the data block with tables pointers etc.
4668 lengthptr NULL during the real compile phase
4669 points to length accumulator during pre-compile phase
4670
4671 Returns: TRUE on success
4672 */
4673
4674 static BOOL
4675 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4676 int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4677 int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4678 {
4679 const uschar *ptr = *ptrptr;
4680 uschar *code = *codeptr;
4681 uschar *last_branch = code;
4682 uschar *start_bracket = code;
4683 uschar *reverse_count = NULL;
4684 int firstbyte, reqbyte;
4685 int branchfirstbyte, branchreqbyte;
4686 int length;
4687 branch_chain bc;
4688
4689 bc.outer = bcptr;
4690 bc.current = code;
4691
4692 firstbyte = reqbyte = REQ_UNSET;
4693
4694 /* Accumulate the length for use in the pre-compile phase. Start with the
4695 length of the BRA and KET and any extra bytes that are required at the
4696 beginning. We accumulate in a local variable to save frequent testing of
4697 lenthptr for NULL. We cannot do this by looking at the value of code at the
4698 start and end of each alternative, because compiled items are discarded during
4699 the pre-compile phase so that the work space is not exceeded. */
4700
4701 length = 2 + 2*LINK_SIZE + skipbytes;
4702
4703 /* WARNING: If the above line is changed for any reason, you must also change
4704 the code that abstracts option settings at the start of the pattern and makes
4705 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4706 pre-compile phase to find out whether anything has yet been compiled or not. */
4707
4708 /* Offset is set zero to mark that this bracket is still open */
4709
4710 PUT(code, 1, 0);
4711 code += 1 + LINK_SIZE + skipbytes;
4712
4713 /* Loop for each alternative branch */
4714
4715 for (;;)
4716 {
4717 /* Handle a change of ims options at the start of the branch */
4718
4719 if ((options & PCRE_IMS) != oldims)
4720 {
4721 *code++ = OP_OPT;
4722 *code++ = options & PCRE_IMS;
4723 length += 2;
4724 }
4725
4726 /* Set up dummy OP_REVERSE if lookbehind assertion */
4727
4728 if (lookbehind)
4729 {
4730 *code++ = OP_REVERSE;
4731 reverse_count = code;
4732 PUTINC(code, 0, 0);
4733 length += 1 + LINK_SIZE;
4734 }
4735
4736 /* Now compile the branch; in the pre-compile phase its length gets added
4737 into the length. */
4738
4739 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4740 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4741 {
4742 *ptrptr = ptr;
4743 return FALSE;
4744 }
4745
4746 /* In the real compile phase, there is some post-processing to be done. */
4747
4748 if (lengthptr == NULL)
4749 {
4750 /* If this is the first branch, the firstbyte and reqbyte values for the
4751 branch become the values for the regex. */
4752
4753 if (*last_branch != OP_ALT)
4754 {
4755 firstbyte = branchfirstbyte;
4756 reqbyte = branchreqbyte;
4757 }
4758
4759 /* If this is not the first branch, the first char and reqbyte have to
4760 match the values from all the previous branches, except that if the
4761 previous value for reqbyte didn't have REQ_VARY set, it can still match,
4762 and we set REQ_VARY for the regex. */
4763
4764 else
4765 {
4766 /* If we previously had a firstbyte, but it doesn't match the new branch,
4767 we have to abandon the firstbyte for the regex, but if there was
4768 previously no reqbyte, it takes on the value of the old firstbyte. */
4769
4770 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4771 {
4772 if (reqbyte < 0) reqbyte = firstbyte;
4773 firstbyte = REQ_NONE;
4774 }
4775
4776 /* If we (now or from before) have no firstbyte, a firstbyte from the
4777 branch becomes a reqbyte if there isn't a branch reqbyte. */
4778
4779 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4780 branchreqbyte = branchfirstbyte;
4781
4782 /* Now ensure that the reqbytes match */
4783
4784 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4785 reqbyte = REQ_NONE;
4786 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4787 }
4788
4789 /* If lookbehind, check that this branch matches a fixed-length string, and
4790 put the length into the OP_REVERSE item. Temporarily mark the end of the
4791 branch with OP_END. */
4792
4793 if (lookbehind)
4794 {
4795 int fixed_length;
4796 *code = OP_END;
4797 fixed_length = find_fixedlength(last_branch, options);
4798 DPRINTF(("fixed length = %d\n", fixed_length));
4799 if (fixed_length < 0)
4800 {
4801 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4802 *ptrptr = ptr;
4803 return FALSE;
4804 }
4805 PUT(reverse_count, 0, fixed_length);
4806 }
4807 }
4808
4809 /* Reached end of expression, either ')' or end of pattern. Go back through
4810 the alternative branches and reverse the chain of offsets, with the field in
4811 the BRA item now becoming an offset to the first alternative. If there are
4812 no alternatives, it points to the end of the group. The length in the
4813 terminating ket is always the length of the whole bracketed item. If any of
4814 the ims options were changed inside the group, compile a resetting op-code
4815 following, except at the very end of the pattern. Return leaving the pointer
4816 at the terminating char. */
4817
4818 if (*ptr != '|')
4819 {
4820 int branch_length = code - last_branch;
4821 do
4822 {
4823 int prev_length = GET(last_branch, 1);
4824 PUT(last_branch, 1, branch_length);
4825 branch_length = prev_length;
4826 last_branch -= branch_length;
4827 }
4828 while (branch_length > 0);
4829
4830 /* Fill in the ket */
4831
4832 *code = OP_KET;
4833 PUT(code, 1, code - start_bracket);
4834 code += 1 + LINK_SIZE;
4835
4836 /* Resetting option if needed */
4837
4838 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4839 {
4840 *code++ = OP_OPT;
4841 *code++ = oldims;
4842 length += 2;
4843 }
4844
4845 /* Set values to pass back */
4846
4847 *codeptr = code;
4848 *ptrptr = ptr;
4849 *firstbyteptr = firstbyte;
4850 *reqbyteptr = reqbyte;
4851 if (lengthptr != NULL) *lengthptr += length;
4852 return TRUE;
4853 }
4854
4855 /* Another branch follows; insert an "or" node. Its length field points back
4856 to the previous branch while the bracket remains open. At the end the chain
4857 is reversed. It's done like this so that the start of the bracket has a
4858 zero offset until it is closed, making it possible to detect recursion. */
4859
4860 *code = OP_ALT;
4861 PUT(code, 1, code - last_branch);
4862 bc.current = last_branch = code;
4863 code += 1 + LINK_SIZE;
4864 ptr++;
4865 length += 1 + LINK_SIZE;
4866 }
4867 /* Control never reaches here */
4868 }
4869
4870
4871
4872
4873 /*************************************************
4874 * Check for anchored expression *
4875 *************************************************/
4876
4877 /* Try to find out if this is an anchored regular expression. Consider each
4878 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4879 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4880 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4881 counts, since OP_CIRC can match in the middle.
4882
4883 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4884 This is the code for \G, which means "match at start of match position, taking
4885 into account the match offset".
4886
4887 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4888 because that will try the rest of the pattern at all possible matching points,
4889 so there is no point trying again.... er ....
4890
4891 .... except when the .* appears inside capturing parentheses, and there is a
4892 subsequent back reference to those parentheses. We haven't enough information
4893 to catch that case precisely.
4894
4895 At first, the best we could do was to detect when .* was in capturing brackets
4896 and the highest back reference was greater than or equal to that level.
4897 However, by keeping a bitmap of the first 31 back references, we can catch some
4898 of the more common cases more precisely.
4899
4900 Arguments:
4901 code points to start of expression (the bracket)
4902 options points to the options setting
4903 bracket_map a bitmap of which brackets we are inside while testing; this
4904 handles up to substring 31; after that we just have to take
4905 the less precise approach
4906 backref_map the back reference bitmap
4907
4908 Returns: TRUE or FALSE
4909 */
4910
4911 static BOOL
4912 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4913 unsigned int backref_map)
4914 {
4915 do {
4916 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4917 options, PCRE_MULTILINE, FALSE);
4918 register int op = *scode;
4919
4920 /* Non-capturing brackets */
4921
4922 if (op == OP_BRA)
4923 {
4924 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4925 }
4926
4927 /* Capturing brackets */
4928
4929 else if (op == OP_CBRA)
4930 {
4931 int n = GET2(scode, 1+LINK_SIZE);
4932 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4933 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4934 }
4935
4936 /* Other brackets */
4937
4938 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4939 {
4940 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4941 }
4942
4943 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4944 are or may be referenced. */
4945
4946 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4947 op == OP_TYPEPOSSTAR) &&
4948 (*options & PCRE_DOTALL) != 0)
4949 {
4950 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4951 }
4952
4953 /* Check for explicit anchoring */
4954
4955 else if (op != OP_SOD && op != OP_SOM &&
4956 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4957 return FALSE;
4958 code += GET(code, 1);
4959 }
4960 while (*code == OP_ALT); /* Loop for each alternative */
4961 return TRUE;
4962 }
4963
4964
4965
4966 /*************************************************
4967 * Check for starting with ^ or .* *
4968 *************************************************/
4969
4970 /* This is called to find out if every branch starts with ^ or .* so that
4971 "first char" processing can be done to speed things up in multiline
4972 matching and for non-DOTALL patterns that start with .* (which must start at
4973 the beginning or after \n). As in the case of is_anchored() (see above), we
4974 have to take account of back references to capturing brackets that contain .*
4975 because in that case we can't make the assumption.
4976
4977 Arguments:
4978 code points to start of expression (the bracket)
4979 bracket_map a bitmap of which brackets we are inside while testing; this
4980 handles up to substring 31; after that we just have to take
4981 the less precise approach
4982 backref_map the back reference bitmap
4983
4984 Returns: TRUE or FALSE
4985 */
4986
4987 static BOOL
4988 is_startline(const uschar *code, unsigned int bracket_map,
4989 unsigned int backref_map)
4990 {
4991 do {
4992 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4993 NULL, 0, FALSE);
4994 register int op = *scode;
4995
4996 /* Non-capturing brackets */
4997
4998 if (op == OP_BRA)
4999 {
5000 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5001 }
5002
5003 /* Capturing brackets */
5004
5005 else if (op == OP_CBRA)
5006 {
5007 int n = GET2(scode, 1+LINK_SIZE);
5008 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5009 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5010 }
5011
5012 /* Other brackets */
5013
5014 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5015 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5016
5017 /* .* means "start at start or after \n" if it isn't in brackets that
5018 may be referenced. */
5019
5020 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5021 {
5022 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5023 }
5024
5025 /* Check for explicit circumflex */
5026
5027 else if (op != OP_CIRC) return FALSE;
5028
5029 /* Move on to the next alternative */
5030
5031 code += GET(code, 1);
5032 }
5033 while (*code == OP_ALT); /* Loop for each alternative */
5034 return TRUE;
5035 }
5036
5037
5038
5039 /*************************************************
5040 * Check for asserted fixed first char *
5041 *************************************************/
5042
5043 /* During compilation, the "first char" settings from forward assertions are
5044 discarded, because they can cause conflicts with actual literals that follow.
5045 However, if we end up without a first char setting for an unanchored pattern,
5046 it is worth scanning the regex to see if there is an initial asserted first
5047 char. If all branches start with the same asserted char, or with a bracket all
5048 of whose alternatives start with the same asserted char (recurse ad lib), then
5049 we return that char, otherwise -1.
5050
5051 Arguments:
5052 code points to start of expression (the bracket)
5053 options pointer to the options (used to check casing changes)
5054 inassert TRUE if in an assertion
5055
5056 Returns: -1 or the fixed first char
5057 */
5058
5059 static int
5060 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5061 {
5062 register int c = -1;
5063 do {
5064 int d;
5065 const uschar *scode =
5066 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5067 register int op = *scode;
5068
5069 switch(op)
5070 {
5071 default:
5072 return -1;
5073
5074 case OP_BRA:
5075 case OP_CBRA:
5076 case OP_ASSERT:
5077 case OP_ONCE:
5078 case OP_COND:
5079 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5080 return -1;
5081 if (c < 0) c = d; else if (c != d) return -1;
5082 break;
5083
5084 case OP_EXACT: /* Fall through */
5085 scode += 2;
5086
5087 case OP_CHAR:
5088 case OP_CHARNC:
5089 case OP_PLUS:
5090 case OP_MINPLUS:
5091 case OP_POSPLUS:
5092 if (!inassert) return -1;
5093 if (c < 0)
5094 {
5095 c = scode[1];
5096 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5097 }
5098 else if (c != scode[1]) return -1;
5099 break;
5100 }
5101
5102 code += GET(code, 1);
5103 }
5104 while (*code == OP_ALT);
5105 return c;
5106 }
5107
5108
5109
5110 /*************************************************
5111 * Compile a Regular Expression *
5112 *************************************************/
5113
5114 /* This function takes a string and returns a pointer to a block of store
5115 holding a compiled version of the expression. The original API for this
5116 function had no error code return variable; it is retained for backwards
5117 compatibility. The new function is given a new name.
5118
5119 Arguments:
5120 pattern the regular expression
5121 options various option bits
5122 errorcodeptr pointer to error code variable (pcre_compile2() only)
5123 can be NULL if you don't want a code value
5124 errorptr pointer to pointer to error text
5125 erroroffset ptr offset in pattern where error was detected
5126 tables pointer to character tables or NULL
5127
5128 Returns: pointer to compiled data block, or NULL on error,
5129 with errorptr and erroroffset set
5130 */
5131
5132 PCRE_EXP_DEFN pcre *
5133 pcre_compile(const char *pattern, int options, const char **errorptr,
5134 int *erroroffset, const unsigned char *tables)
5135 {
5136 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5137 }
5138
5139
5140 PCRE_EXP_DEFN pcre *
5141 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5142 const char **errorptr, int *erroroffset, const unsigned char *tables)
5143 {
5144 real_pcre *re;
5145 int length = 1; /* For final END opcode */
5146 int firstbyte, reqbyte, newline;
5147 int errorcode = 0;
5148 #ifdef SUPPORT_UTF8
5149 BOOL utf8;
5150 #endif
5151 size_t size;
5152 uschar *code;
5153 const uschar *codestart;
5154 const uschar *ptr;
5155 compile_data compile_block;
5156 compile_data *cd = &compile_block;
5157
5158 /* This space is used for "compiling" into during the first phase, when we are
5159 computing the amount of memory that is needed. Compiled items are thrown away
5160 as soon as possible, so that a fairly large buffer should be sufficient for
5161 this purpose. The same space is used in the second phase for remembering where
5162 to fill in forward references to subpatterns. */
5163
5164 uschar cworkspace[COMPILE_WORK_SIZE];
5165
5166
5167 /* Set this early so that early errors get offset 0. */
5168
5169 ptr = (const uschar *)pattern;
5170
5171 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5172 can do is just return NULL, but we can set a code value if there is a code
5173 pointer. */
5174
5175 if (errorptr == NULL)
5176 {
5177 if (errorcodeptr != NULL) *errorcodeptr = 99;
5178 return NULL;
5179 }
5180
5181 *errorptr = NULL;
5182 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5183
5184 /* However, we can give a message for this error */
5185
5186 if (erroroffset == NULL)
5187 {
5188 errorcode = ERR16;
5189 goto PCRE_EARLY_ERROR_RETURN2;
5190 }
5191
5192 *erroroffset = 0;
5193
5194 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5195
5196 #ifdef SUPPORT_UTF8
5197 utf8 = (options & PCRE_UTF8) != 0;
5198 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5199 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5200 {
5201 errorcode = ERR44;
5202 goto PCRE_EARLY_ERROR_RETURN2;
5203 }
5204 #else
5205 if ((options & PCRE_UTF8) != 0)
5206 {
5207 errorcode = ERR32;
5208 goto PCRE_EARLY_ERROR_RETURN;
5209 }
5210 #endif
5211
5212 if ((options & ~PUBLIC_OPTIONS) != 0)
5213 {
5214 errorcode = ERR17;
5215 goto PCRE_EARLY_ERROR_RETURN;
5216 }
5217
5218 /* Set up pointers to the individual character tables */
5219
5220 if (tables == NULL) tables = _pcre_default_tables;
5221 cd->lcc = tables + lcc_offset;
5222 cd->fcc = tables + fcc_offset;
5223 cd->cbits = tables + cbits_offset;
5224 cd->ctypes = tables + ctypes_offset;
5225
5226 /* Handle different types of newline. The three bits give seven cases. The
5227 current code allows for fixed one- or two-byte sequences, plus "any" and
5228 "anycrlf". */
5229
5230 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5231 {
5232 case 0: newline = NEWLINE; break; /* Compile-time default */
5233 case PCRE_NEWLINE_CR: newline = '\r'; break;
5234 case PCRE_NEWLINE_LF: newline = '\n'; break;
5235 case PCRE_NEWLINE_CR+
5236 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5237 case PCRE_NEWLINE_ANY: newline = -1; break;
5238 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5239 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5240 }
5241
5242 if (newline == -2)
5243 {
5244 cd->nltype = NLTYPE_ANYCRLF;
5245 }
5246 else if (newline < 0)
5247 {
5248 cd->nltype = NLTYPE_ANY;
5249 }
5250 else
5251 {
5252 cd->nltype = NLTYPE_FIXED;
5253 if (newline > 255)
5254 {
5255 cd->nllen = 2;
5256 cd->nl[0] = (newline >> 8) & 255;
5257 cd->nl[1] = newline & 255;
5258 }
5259 else
5260 {
5261 cd->nllen = 1;
5262 cd->nl[0] = newline;
5263 }
5264 }
5265
5266 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5267 references to help in deciding whether (.*) can be treated as anchored or not.
5268 */
5269
5270 cd->top_backref = 0;
5271 cd->backref_map = 0;
5272
5273 /* Reflect pattern for debugging output */
5274
5275 DPRINTF(("------------------------------------------------------------------\n"));
5276 DPRINTF(("%s\n", pattern));
5277
5278 /* Pretend to compile the pattern while actually just accumulating the length
5279 of memory required. This behaviour is triggered by passing a non-NULL final
5280 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5281 to compile parts of the pattern into; the compiled code is discarded when it is
5282 no longer needed, so hopefully this workspace will never overflow, though there
5283 is a test for its doing so. */
5284
5285 cd->bracount = 0;
5286 cd->names_found = 0;
5287 cd->name_entry_size = 0;
5288 cd->name_table = NULL;
5289 cd->start_workspace = cworkspace;
5290 cd->start_code = cworkspace;
5291 cd->hwm = cworkspace;
5292 cd->start_pattern = (const uschar *)pattern;
5293 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5294 cd->req_varyopt = 0;
5295 cd->nopartial = FALSE;
5296 cd->external_options = options;
5297
5298 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5299 don't need to look at the result of the function here. The initial options have
5300 been put into the cd block so that they can be changed if an option setting is
5301 found within the regex right at the beginning. Bringing initial option settings
5302 outside can help speed up starting point checks. */
5303
5304 code = cworkspace;
5305 *code = OP_BRA;
5306 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5307 &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
5308 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5309
5310 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5311 cd->hwm - cworkspace));
5312
5313 if (length > MAX_PATTERN_SIZE)
5314 {
5315 errorcode = ERR20;
5316 goto PCRE_EARLY_ERROR_RETURN;
5317 }
5318
5319 /* Compute the size of data block needed and get it, either from malloc or
5320 externally provided function. Integer overflow should no longer be possible
5321 because nowadays we limit the maximum value of cd->names_found and
5322 cd->name_entry_size. */
5323
5324 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5325 re = (real_pcre *)(pcre_malloc)(size);
5326
5327 if (re == NULL)
5328 {
5329 errorcode = ERR21;
5330 goto PCRE_EARLY_ERROR_RETURN;
5331 }
5332
5333 /* Put in the magic number, and save the sizes, initial options, and character
5334 table pointer. NULL is used for the default character tables. The nullpad field
5335 is at the end; it's there to help in the case when a regex compiled on a system
5336 with 4-byte pointers is run on another with 8-byte pointers. */
5337
5338 re->magic_number = MAGIC_NUMBER;
5339 re->size = size;
5340 re->options = cd->external_options;
5341 re->dummy1 = 0;
5342 re->first_byte = 0;
5343 re->req_byte = 0;
5344 re->name_table_offset = sizeof(real_pcre);
5345 re->name_entry_size = cd->name_entry_size;
5346 re->name_count = cd->names_found;
5347 re->ref_count = 0;
5348 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5349 re->nullpad = NULL;
5350
5351 /* The starting points of the name/number translation table and of the code are
5352 passed around in the compile data block. The start/end pattern and initial
5353 options are already set from the pre-compile phase, as is the name_entry_size
5354 field. Reset the bracket count and the names_found field. Also reset the hwm
5355 field; this time it's used for remembering forward references to subpatterns.
5356 */
5357
5358 cd->bracount = 0;
5359 cd->names_found = 0;
5360 cd->name_table = (uschar *)re + re->name_table_offset;
5361 codestart = cd->name_table + re->name_entry_size * re->name_count;
5362 cd->start_code = codestart;
5363 cd->hwm = cworkspace;
5364 cd->req_varyopt = 0;
5365 cd->nopartial = FALSE;
5366
5367 /* Set up a starting, non-extracting bracket, then compile the expression. On
5368 error, errorcode will be set non-zero, so we don't need to look at the result
5369 of the function here. */
5370
5371 ptr = (const uschar *)pattern;
5372 code = (uschar *)codestart;
5373 *code = OP_BRA;
5374 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5375 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5376 re->top_bracket = cd->bracount;
5377 re->top_backref = cd->top_backref;
5378
5379 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5380
5381 /* If not reached end of pattern on success, there's an excess bracket. */
5382
5383 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5384
5385 /* Fill in the terminating state and check for disastrous overflow, but
5386 if debugging, leave the test till after things are printed out. */
5387
5388 *code++ = OP_END;
5389
5390 #ifndef DEBUG
5391 if (code - codestart > length) errorcode = ERR23;
5392 #endif
5393
5394 /* Fill in any forward references that are required. */
5395
5396 while (errorcode == 0 && cd->hwm > cworkspace)
5397 {
5398 int offset, recno;
5399 const uschar *groupptr;
5400 cd->hwm -= LINK_SIZE;
5401 offset = GET(cd->hwm, 0);
5402 recno = GET(codestart, offset);
5403 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5404 if (groupptr == NULL) errorcode = ERR53;
5405 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5406 }
5407
5408 /* Give an error if there's back reference to a non-existent capturing
5409 subpattern. */
5410
5411 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5412
5413 /* Failed to compile, or error while post-processing */
5414
5415 if (errorcode != 0)
5416 {
5417 (pcre_free)(re);
5418 PCRE_EARLY_ERROR_RETURN:
5419 *erroroffset = ptr - (const uschar *)pattern;
5420 PCRE_EARLY_ERROR_RETURN2:
5421 *errorptr = error_texts[errorcode];
5422 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5423 return NULL;
5424 }
5425
5426 /* If the anchored option was not passed, set the flag if we can determine that
5427 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5428 as starting with .* when DOTALL is set).
5429
5430 Otherwise, if we know what the first byte has to be, save it, because that
5431 speeds up unanchored matches no end. If not, see if we can set the
5432 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5433 start with ^. and also when all branches start with .* for non-DOTALL matches.
5434 */
5435
5436 if ((re->options & PCRE_ANCHORED) == 0)
5437 {
5438 int temp_options = re->options; /* May get changed during these scans */
5439 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5440 re->options |= PCRE_ANCHORED;
5441 else
5442 {
5443 if (firstbyte < 0)
5444 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5445 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5446 {
5447 int ch = firstbyte & 255;
5448 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5449 cd->fcc[ch] == ch)? ch : firstbyte;
5450 re->options |= PCRE_FIRSTSET;
5451 }
5452 else if (is_startline(codestart, 0, cd->backref_map))
5453 re->options |= PCRE_STARTLINE;
5454 }
5455 }
5456
5457 /* For an anchored pattern, we use the "required byte" only if it follows a
5458 variable length item in the regex. Remove the caseless flag for non-caseable
5459 bytes. */
5460
5461 if (reqbyte >= 0 &&
5462 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5463 {
5464 int ch = reqbyte & 255;
5465 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5466 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5467 re->options |= PCRE_REQCHSET;
5468 }
5469
5470 /* Print out the compiled data if debugging is enabled. This is never the
5471 case when building a production library. */
5472
5473 #ifdef DEBUG
5474
5475 printf("Length = %d top_bracket = %d top_backref = %d\n",
5476 length, re->top_bracket, re->top_backref);
5477
5478 if (re->options != 0)
5479 {
5480 printf("%s%s%s%s%s%s%s%s%s\n",
5481 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5482 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5483 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5484 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5485 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5486 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5487 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5488 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5489 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5490 }
5491
5492 if ((re->options & PCRE_FIRSTSET) != 0)
5493 {
5494 int ch = re->first_byte & 255;
5495 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5496 "" : " (caseless)";
5497 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5498 else printf("First char = \\x%02x%s\n", ch, caseless);
5499 }
5500
5501 if ((re->options & PCRE_REQCHSET) != 0)
5502 {
5503 int ch = re->req_byte & 255;
5504 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5505 "" : " (caseless)";
5506 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5507 else printf("Req char = \\x%02x%s\n", ch, caseless);
5508 }
5509
5510 pcre_printint(re, stdout, TRUE);
5511
5512 /* This check is done here in the debugging case so that the code that
5513 was compiled can be seen. */
5514
5515 if (code - codestart > length)
5516 {
5517 (pcre_free)(re);
5518 *errorptr = error_texts[ERR23];
5519 *erroroffset = ptr - (uschar *)pattern;
5520 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5521 return NULL;
5522 }
5523 #endif /* DEBUG */
5524
5525 return (pcre *)re;
5526 }
5527
5528 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12