/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 172 - (show annotations) (download)
Tue Jun 5 10:40:13 2007 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 175665 byte(s)
Drastically reduce workspace used for alternatives in groups; also some 
trailing space removals for a test release.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
/* NOTE(review): these three names are defined ahead of the #include of
pcre_internal.h; presumably macros declared in that header expand to them -
confirm against the header before moving or renaming them. */
45 #define NLBLOCK cd /* Block containing newline information */
46 #define PSSTART start_pattern /* Field containing processed string start */
47 #define PSEND end_pattern /* Field containing processed string end */
48
49
50 #include "pcre_internal.h"
51
52
53 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54 used by pcretest. DEBUG is not defined when building a production library. */
55
56 #ifdef DEBUG
57 #include "pcre_printint.src"
58 #endif
59
60
61 /*************************************************
62 * Code parameters and static tables *
63 *************************************************/
64
65 /* This value specifies the size of stack workspace that is used during the
66 first pre-compile phase that determines how much memory is required. The regex
67 is partly compiled into this space, but the compiled parts are discarded as
68 soon as they can be, so that hopefully there will never be an overrun. The code
69 does, however, check for an overrun. The largest amount I've seen used is 218,
70 so this number is very generous.
71
72 The same workspace is used during the second, actual compile phase for
73 remembering forward references to groups so that they can be filled in at the
74 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75 is 4 there is plenty of room. */
76
/* NOTE(review): the description above talks about entries of LINK_SIZE bytes,
so this is presumably a size in bytes - confirm at the point of use. */
77 #define COMPILE_WORK_SIZE (4096)
78
79
80 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81 are simple data values; negative values are for special things like \d and so
82 on. Zero means further processing is needed (for things like \x), or the escape
83 is invalid. */
84
85 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
/* ASCII version. check_escape() indexes this table with (c - '0') after it has
established that '0' <= c <= 'z', so the first entry is for '0' and the last
is for 'z'. The comment at the end of each row names the first and last
characters that the row covers. The plain 7 in the slot for 'a' is the data
value returned for \a. */
86 static const short int escapes[] = {
87 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90 0, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
91 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96 0, 0, -ESC_z /* x - z */
97 };
98
99 #else /* This is the "abnormal" table for EBCDIC systems */
/* EBCDIC version. check_escape() indexes this table with (c - 0x48), so the
first entry is for code point 0x48. The hexadecimal number in the comment at
the start of each row is the code point of that row's first entry. */
100 static const short int escapes[] = {
101 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
124 };
125 #endif
126
127
128 /* Tables of names of POSIX character classes and their lengths. The list is
129 terminated by a zero length entry. The first three must be alpha, lower, upper,
130 as this is assumed for handling case independence. */
131
/* The fourteen class names. This array has no terminating entry of its own;
the end of the list is marked only in posix_name_lengths[] below. */
132 static const char *const posix_names[] = {
133 "alpha", "lower", "upper",
134 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
135 "print", "punct", "space", "word", "xdigit" };
136
/* posix_name_lengths[n] is the length of posix_names[n]. The fifteenth value,
0, is the list terminator and has no counterpart in posix_names[]. */
137 static const uschar posix_name_lengths[] = {
138 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139
140 /* Table of class bit maps for each POSIX class. Each class is formed from a
141 base map, with an optional addition or removal of another map. Then, for some
142 classes, there is some additional tweaking: for [:blank:] the vertical space
143 characters are removed, and for [:alpha:] and [:alnum:] the underscore
144 character is removed. The triples in the table consist of the base map offset,
145 second map offset or -1 if no second map, and a non-negative value for map
146 addition or a negative value for map subtraction (if there are two maps). The
147 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148 remove vertical space characters, 2 => remove underscore. */
149
/* One row of three ints per class, in the same order as posix_names[]: base
map offset, second map offset (or -1), and the add/subtract-and-tweak value
described above. The comment at the end of each row names the class. */
150 static const int posix_class_maps[] = {
151 cbit_word, cbit_digit, -2, /* alpha */
152 cbit_lower, -1, 0, /* lower */
153 cbit_upper, -1, 0, /* upper */
154 cbit_word, -1, 2, /* alnum - word without underscore */
155 cbit_print, cbit_cntrl, 0, /* ascii */
156 cbit_space, -1, 1, /* blank - a GNU extension */
157 cbit_cntrl, -1, 0, /* cntrl */
158 cbit_digit, -1, 0, /* digit */
159 cbit_graph, -1, 0, /* graph */
160 cbit_print, -1, 0, /* print */
161 cbit_punct, -1, 0, /* punct */
162 cbit_space, -1, 0, /* space */
163 cbit_word, -1, 0, /* word - a Perl extension */
164 cbit_xdigit,-1, 0 /* xdigit */
165 };
166
167
/* Two-level stringification. STRING(a) turns its argument into a string
literal as written; XSTRING(s) lets s be macro-expanded first and then
stringifies the result. XSTRING is used in error_texts[] to embed the values of
MAX_NAME_SIZE and MAX_NAME_COUNT in messages. */
168 #define STRING(a) # a
169 #define XSTRING(s) STRING(s)
170
171 /* The texts of compile-time error messages. These are "char *" because they
172 are passed to the outside world. Do not ever re-use any error number, because
173 they are documented. Always add a new error instead. Messages marked DEAD below
174 are no longer used. */
175
/* The position of a message in this array is its error number: "no error" is
number 0, and the numeric comments (5, 10, 15, ...) give the number of the
message that immediately follows them. Because the numbers are documented,
entries must never be removed or reordered; new messages go at the end. */
176 static const char *error_texts[] = {
177 "no error",
178 "\\ at end of pattern",
179 "\\c at end of pattern",
180 "unrecognized character follows \\",
181 "numbers out of order in {} quantifier",
182 /* 5 */
183 "number too big in {} quantifier",
184 "missing terminating ] for character class",
185 "invalid escape sequence in character class",
186 "range out of order in character class",
187 "nothing to repeat",
188 /* 10 */
189 "operand of unlimited repeat could match the empty string", /** DEAD **/
190 "internal error: unexpected repeat",
191 "unrecognized character after (?",
192 "POSIX named classes are supported only within a class",
193 "missing )",
194 /* 15 */
195 "reference to non-existent subpattern",
196 "erroffset passed as NULL",
197 "unknown option bit(s) set",
198 "missing ) after comment",
199 "parentheses nested too deeply", /** DEAD **/
200 /* 20 */
201 "regular expression too large",
202 "failed to get memory",
203 "unmatched parentheses",
204 "internal error: code overflow",
205 "unrecognized character after (?<",
206 /* 25 */
207 "lookbehind assertion is not fixed length",
208 "malformed number or name after (?(",
209 "conditional group contains more than two branches",
210 "assertion expected after (?(",
211 "(?R or (?[+-]digits must be followed by )",
212 /* 30 */
213 "unknown POSIX class name",
214 "POSIX collating elements are not supported",
215 "this version of PCRE is not compiled with PCRE_UTF8 support",
216 "spare error", /** DEAD **/
217 "character value in \\x{...} sequence is too large",
218 /* 35 */
219 "invalid condition (?(0)",
220 "\\C not allowed in lookbehind assertion",
221 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
222 "number after (?C is > 255",
223 "closing ) for (?C expected",
224 /* 40 */
225 "recursive call could loop indefinitely",
226 "unrecognized character after (?P",
227 "syntax error in subpattern name (missing terminator)",
228 "two named subpatterns have the same name",
229 "invalid UTF-8 string",
230 /* 45 */
231 "support for \\P, \\p, and \\X has not been compiled",
232 "malformed \\P or \\p sequence",
233 "unknown property name after \\P or \\p",
234 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236 /* 50 */
237 "repeated subpattern is too long",
238 "octal value is greater than \\377 (not in UTF-8 mode)",
239 "internal error: overran compiling workspace",
240 "internal error: previously-checked referenced subpattern not found",
241 "DEFINE group contains more than one branch",
242 /* 55 */
243 "repeating a DEFINE group is not allowed",
244 "inconsistent NEWLINE options",
245 "\\g is not followed by a braced name or an optionally braced non-zero number",
246 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247 };
248
249
250 /* Table to identify digits and hex digits. This is used when compiling
251 patterns. Note that the tables in chartables are dependent on the locale, and
252 may mark arbitrary characters as digits - but the PCRE compiling code expects
253 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
254 a private table here. It costs 256 bytes, but it is a lot faster than doing
255 character value tests (at least in some simple cases I timed), and in some
256 applications one wants PCRE to compile efficiently as well as match
257 efficiently.
258
259 For convenience, we use the same bit definitions as in chartables:
260
261 0x04 decimal digit
262 0x08 hexadecimal digit
263
264 Then we can use ctype_digit and ctype_xdigit in the code. */
265
266 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* ASCII version, indexed by character code 0-255. Entries for '0'-'9' are 0x0c
(decimal digit + hexadecimal digit), entries for 'A'-'F' and 'a'-'f' are 0x08
(hexadecimal digit only), and every other entry is zero. The comment at the end
of each row gives the range of codes that the row covers. */
267 static const unsigned char digitab[] =
268 {
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
275 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
276 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
277 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
281 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
301
302 #else /* This is the "abnormal" case, for EBCDIC systems */
/* EBCDIC version, indexed by character code 0-255, using the same two bit
values as the ASCII table: 0x0c for the decimal digits (codes 0xF0-0xF9 here)
and 0x08 for the letters that are hexadecimal digits (0x81-0x86 and 0xC1-0xC6
here). The second column of the row comments, where present, is the hexadecimal
code of the row's first entry. */
303 static const unsigned char digitab[] =
304 {
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
315 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
316 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
317 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
321 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
329 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
335 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
336 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
337
/* EBCDIC-only table, indexed by character code 0-255. check_escape() tests
(ebcdic_chartab[c] & 0x0E) to decide whether the character after a backslash is
alphameric. NOTE(review): the individual bit values are presumably the ctype_xxx
bits used by the main character tables ("chartable partial dup") - confirm
against the chartables definitions. */
338 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
339 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
340 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
341 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
343 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
347 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
348 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
350 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
352 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
355 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
356 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
357 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
358 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
359 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
360 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
361 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
362 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
363 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
364 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
365 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
366 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
367 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
368 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
369 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
370 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
371 #endif
372
373
374 /* Definition to allow mutual recursion */
375
/* Prototype only; no function body accompanies it at this point. The
parameters are unnamed here. NOTE(review): their meanings are presumably
documented at the definition of compile_regex() elsewhere in this file -
confirm there. */
376 static BOOL
377 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
378 int *, branch_chain *, compile_data *, int *);
379
380
381
382 /*************************************************
383 * Handle escapes *
384 *************************************************/
385
386 /* This function is called when a \ has been encountered. It either returns a
387 positive value for a simple escape such as \n, or a negative value which
388 encodes one of the more complicated things such as \d. A backreference to group
389 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391 ptr is pointing at the \. On exit, it is on the final character of the escape
392 sequence.
393
394 Arguments:
395 ptrptr points to the pattern position pointer
396 errorcodeptr points to the errorcode variable
397 bracount number of previous extracting brackets
398 options the options bits
399 isclass TRUE if inside a character class
400
401 Returns: zero or positive => a data character
402 negative => a special escape sequence
403 on error, errorptr is set
404 */
405
406 static int
407 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
408 int options, BOOL isclass)
409 {
410 BOOL utf8 = (options & PCRE_UTF8) != 0;
411 const uschar *ptr = *ptrptr + 1;
412 int c, i;
413
414 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
415 ptr--; /* Set pointer back to the last byte */
416
417 /* If backslash is at the end of the pattern, it's an error. */
418
419 if (c == 0) *errorcodeptr = ERR1;
420
421 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
422 a table. A non-zero result is something that can be returned immediately.
423 Otherwise further processing may be required. */
424
425 #ifndef EBCDIC /* ASCII coding */
426 else if (c < '0' || c > 'z') {} /* Not alphameric */
427 else if ((i = escapes[c - '0']) != 0) c = i;
428
429 #else /* EBCDIC coding */
430 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
431 else if ((i = escapes[c - 0x48]) != 0) c = i;
432 #endif
433
434 /* Escapes that need further processing, or are illegal. */
435
436 else
437 {
438 const uschar *oldptr;
439 BOOL braced, negated;
440
441 switch (c)
442 {
443 /* A number of Perl escapes are not handled by PCRE. We give an explicit
444 error. */
445
446 case 'l':
447 case 'L':
448 case 'N':
449 case 'u':
450 case 'U':
451 *errorcodeptr = ERR37;
452 break;
453
454 /* \g must be followed by a number, either plain or braced. If positive, it
455 is an absolute backreference. If negative, it is a relative backreference.
456 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457 reference to a named group. This is part of Perl's movement towards a
458 unified syntax for back references. As this is synonymous with \k{name}, we
459 fudge it up by pretending it really was \k. */
460
461 case 'g':
462 if (ptr[1] == '{')
463 {
464 const uschar *p;
465 for (p = ptr+2; *p != 0 && *p != '}'; p++)
466 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467 if (*p != 0 && *p != '}')
468 {
469 c = -ESC_k;
470 break;
471 }
472 braced = TRUE;
473 ptr++;
474 }
475 else braced = FALSE;
476
477 if (ptr[1] == '-')
478 {
479 negated = TRUE;
480 ptr++;
481 }
482 else negated = FALSE;
483
484 c = 0;
485 while ((digitab[ptr[1]] & ctype_digit) != 0)
486 c = c * 10 + *(++ptr) - '0';
487
488 if (c == 0 || (braced && *(++ptr) != '}'))
489 {
490 *errorcodeptr = ERR57;
491 return 0;
492 }
493
494 if (negated)
495 {
496 if (c > bracount)
497 {
498 *errorcodeptr = ERR15;
499 return 0;
500 }
501 c = bracount - (c - 1);
502 }
503
504 c = -(ESC_REF + c);
505 break;
506
507 /* The handling of escape sequences consisting of a string of digits
508 starting with one that is not zero is not straightforward. By experiment,
509 the way Perl works seems to be as follows:
510
511 Outside a character class, the digits are read as a decimal number. If the
512 number is less than 10, or if there are that many previous extracting
513 left brackets, then it is a back reference. Otherwise, up to three octal
514 digits are read to form an escaped byte. Thus \123 is likely to be octal
515 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
516 value is greater than 377, the least significant 8 bits are taken. Inside a
517 character class, \ followed by a digit is always an octal number. */
518
519 case '1': case '2': case '3': case '4': case '5':
520 case '6': case '7': case '8': case '9':
521
522 if (!isclass)
523 {
524 oldptr = ptr;
525 c -= '0';
526 while ((digitab[ptr[1]] & ctype_digit) != 0)
527 c = c * 10 + *(++ptr) - '0';
528 if (c < 10 || c <= bracount)
529 {
530 c = -(ESC_REF + c);
531 break;
532 }
533 ptr = oldptr; /* Put the pointer back and fall through */
534 }
535
536 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
537 generates a binary zero byte and treats the digit as a following literal.
538 Thus we have to pull back the pointer by one. */
539
540 if ((c = *ptr) >= '8')
541 {
542 ptr--;
543 c = 0;
544 break;
545 }
546
547 /* \0 always starts an octal number, but we may drop through to here with a
548 larger first octal digit. The original code used just to take the least
549 significant 8 bits of octal numbers (I think this is what early Perls used
550 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
551 than 3 octal digits. */
552
553 case '0':
554 c -= '0';
555 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
556 c = c * 8 + *(++ptr) - '0';
557 if (!utf8 && c > 255) *errorcodeptr = ERR51;
558 break;
559
560 /* \x is complicated. \x{ddd} is a character number which can be greater
561 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
562 treated as a data character. */
563
564 case 'x':
565 if (ptr[1] == '{')
566 {
567 const uschar *pt = ptr + 2;
568 int count = 0;
569
570 c = 0;
571 while ((digitab[*pt] & ctype_xdigit) != 0)
572 {
573 register int cc = *pt++;
574 if (c == 0 && cc == '0') continue; /* Leading zeroes */
575 count++;
576
577 #ifndef EBCDIC /* ASCII coding */
578 if (cc >= 'a') cc -= 32; /* Convert to upper case */
579 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
580 #else /* EBCDIC coding */
581 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
582 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
583 #endif
584 }
585
586 if (*pt == '}')
587 {
588 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
589 ptr = pt;
590 break;
591 }
592
593 /* If the sequence of hex digits does not end with '}', then we don't
594 recognize this construct; fall through to the normal \x handling. */
595 }
596
597 /* Read just a single-byte hex-defined char */
598
599 c = 0;
600 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
601 {
602 int cc; /* Some compilers don't like ++ */
603 cc = *(++ptr); /* in initializers */
604 #ifndef EBCDIC /* ASCII coding */
605 if (cc >= 'a') cc -= 32; /* Convert to upper case */
606 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
607 #else /* EBCDIC coding */
608 if (cc <= 'z') cc += 64; /* Convert to upper case */
609 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
610 #endif
611 }
612 break;
613
614 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
615 This coding is ASCII-specific, but then the whole concept of \cx is
616 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
617
618 case 'c':
619 c = *(++ptr);
620 if (c == 0)
621 {
622 *errorcodeptr = ERR2;
623 return 0;
624 }
625
626 #ifndef EBCDIC /* ASCII coding */
627 if (c >= 'a' && c <= 'z') c -= 32;
628 c ^= 0x40;
629 #else /* EBCDIC coding */
630 if (c >= 'a' && c <= 'z') c += 64;
631 c ^= 0xC0;
632 #endif
633 break;
634
635 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
636 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
637 for Perl compatibility, it is a literal. This code looks a bit odd, but
638 there used to be some cases other than the default, and there may be again
639 in future, so I haven't "optimized" it. */
640
641 default:
642 if ((options & PCRE_EXTRA) != 0) switch(c)
643 {
644 default:
645 *errorcodeptr = ERR3;
646 break;
647 }
648 break;
649 }
650 }
651
652 *ptrptr = ptr;
653 return c;
654 }
655
656
657
658 #ifdef SUPPORT_UCP
659 /*************************************************
660 * Handle \P and \p *
661 *************************************************/
662
663 /* This function is called after \P or \p has been encountered, provided that
664 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
665 pointing at the P or p. On exit, it is pointing at the final character of the
666 escape sequence.
667
668 Argument:
669 ptrptr points to the pattern position pointer
670 negptr points to a boolean that is set TRUE for negation else FALSE
671 dptr points to an int that is set to the detailed property value
672 errorcodeptr points to the error code variable
673
674 Returns: type value from ucp_type_table, or -1 for an invalid type
675 */
676
677 static int
678 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
679 {
680 int c, i, bot, top;
681 const uschar *ptr = *ptrptr;
682 char name[32];
683
684 c = *(++ptr);
685 if (c == 0) goto ERROR_RETURN;
686
687 *negptr = FALSE;
688
689 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
690 negation. */
691
692 if (c == '{')
693 {
694 if (ptr[1] == '^')
695 {
696 *negptr = TRUE;
697 ptr++;
698 }
699 for (i = 0; i < sizeof(name) - 1; i++)
700 {
701 c = *(++ptr);
702 if (c == 0) goto ERROR_RETURN;
703 if (c == '}') break;
704 name[i] = c;
705 }
706 if (c !='}') goto ERROR_RETURN;
707 name[i] = 0;
708 }
709
710 /* Otherwise there is just one following character */
711
712 else
713 {
714 name[0] = c;
715 name[1] = 0;
716 }
717
718 *ptrptr = ptr;
719
720 /* Search for a recognized property name using binary chop */
721
722 bot = 0;
723 top = _pcre_utt_size;
724
725 while (bot < top)
726 {
727 i = (bot + top) >> 1;
728 c = strcmp(name, _pcre_utt[i].name);
729 if (c == 0)
730 {
731 *dptr = _pcre_utt[i].value;
732 return _pcre_utt[i].type;
733 }
734 if (c > 0) bot = i + 1; else top = i;
735 }
736
737 *errorcodeptr = ERR47;
738 *ptrptr = ptr;
739 return -1;
740
741 ERROR_RETURN:
742 *errorcodeptr = ERR46;
743 *ptrptr = ptr;
744 return -1;
745 }
746 #endif
747
748
749
750
751 /*************************************************
752 * Check for counted repeat *
753 *************************************************/
754
755 /* This function is called when a '{' is encountered in a place where it might
756 start a quantifier. It looks ahead to see if it really is a quantifier or not.
757 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
758 where the ddds are digits.
759
760 Arguments:
761 p pointer to the first char after '{'
762
763 Returns: TRUE or FALSE
764 */
765
766 static BOOL
767 is_counted_repeat(const uschar *p)
768 {
769 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
770 while ((digitab[*p] & ctype_digit) != 0) p++;
771 if (*p == '}') return TRUE;
772
773 if (*p++ != ',') return FALSE;
774 if (*p == '}') return TRUE;
775
776 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
777 while ((digitab[*p] & ctype_digit) != 0) p++;
778
779 return (*p == '}');
780 }
781
782
783
784 /*************************************************
785 * Read repeat counts *
786 *************************************************/
787
788 /* Read an item of the form {n,m} and return the values. This is called only
789 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
790 so the syntax is guaranteed to be correct, but we need to check the values.
791
792 Arguments:
793 p pointer to first char after '{'
794 minp pointer to int for min
795 maxp pointer to int for max
796 returned as -1 if no max
797 errorcodeptr points to error code variable
798
799 Returns: pointer to '}' on success;
800 current ptr on error, with errorcodeptr set non-zero
801 */
802
803 static const uschar *
804 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
805 {
806 int min = 0;
807 int max = -1;
808
809 /* Read the minimum value and do a paranoid check: a negative value indicates
810 an integer overflow. */
811
812 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
813 if (min < 0 || min > 65535)
814 {
815 *errorcodeptr = ERR5;
816 return p;
817 }
818
819 /* Read the maximum value if there is one, and again do a paranoid on its size.
820 Also, max must not be less than min. */
821
822 if (*p == '}') max = min; else
823 {
824 if (*(++p) != '}')
825 {
826 max = 0;
827 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
828 if (max < 0 || max > 65535)
829 {
830 *errorcodeptr = ERR5;
831 return p;
832 }
833 if (max < min)
834 {
835 *errorcodeptr = ERR4;
836 return p;
837 }
838 }
839 }
840
841 /* Fill in the required variables, and pass back the pointer to the terminating
842 '}'. */
843
844 *minp = min;
845 *maxp = max;
846 return p;
847 }
848
849
850
851 /*************************************************
852 * Find forward referenced subpattern *
853 *************************************************/
854
855 /* This function scans along a pattern's text looking for capturing
856 subpatterns, and counting them. If it finds a named pattern that matches the
857 name it is given, it returns its number. Alternatively, if the name is NULL, it
858 returns when it reaches a given numbered subpattern. This is used for forward
859 references to subpatterns. We know that if (?P< is encountered, the name will
860 be terminated by '>' because that is checked in the first pass.
861
862 Arguments:
863 ptr current position in the pattern
864 count current count of capturing parens so far encountered
865 name name to seek, or NULL if seeking a numbered subpattern
866 lorn name length, or subpattern number if name is NULL
867 xmode TRUE if we are in /x mode
868
869 Returns: the number of the named subpattern, or -1 if not found
870 */
871
872 static int
873 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
874 BOOL xmode)
875 {
876 const uschar *thisname;
877
878 for (; *ptr != 0; ptr++)
879 {
880 int term;
881
882 /* Skip over backslashed characters and also entire \Q...\E */
883
884 if (*ptr == '\\')
885 {
886 if (*(++ptr) == 0) return -1;
887 if (*ptr == 'Q') for (;;)
888 {
889 while (*(++ptr) != 0 && *ptr != '\\');
890 if (*ptr == 0) return -1;
891 if (*(++ptr) == 'E') break;
892 }
893 continue;
894 }
895
896 /* Skip over character classes */
897
898 if (*ptr == '[')
899 {
900 while (*(++ptr) != ']')
901 {
902 if (*ptr == '\\')
903 {
904 if (*(++ptr) == 0) return -1;
905 if (*ptr == 'Q') for (;;)
906 {
907 while (*(++ptr) != 0 && *ptr != '\\');
908 if (*ptr == 0) return -1;
909 if (*(++ptr) == 'E') break;
910 }
911 continue;
912 }
913 }
914 continue;
915 }
916
917 /* Skip comments in /x mode */
918
919 if (xmode && *ptr == '#')
920 {
921 while (*(++ptr) != 0 && *ptr != '\n');
922 if (*ptr == 0) return -1;
923 continue;
924 }
925
926 /* An opening parens must now be a real metacharacter */
927
928 if (*ptr != '(') continue;
929 if (ptr[1] != '?')
930 {
931 count++;
932 if (name == NULL && count == lorn) return count;
933 continue;
934 }
935
936 ptr += 2;
937 if (*ptr == 'P') ptr++; /* Allow optional P */
938
939 /* We have to disambiguate (?<! and (?<= from (?<name> */
940
941 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
942 *ptr != '\'')
943 continue;
944
945 count++;
946
947 if (name == NULL && count == lorn) return count;
948 term = *ptr++;
949 if (term == '<') term = '>';
950 thisname = ptr;
951 while (*ptr != term) ptr++;
952 if (name != NULL && lorn == ptr - thisname &&
953 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
954 return count;
955 }
956
957 return -1;
958 }
959
960
961
962 /*************************************************
963 * Find first significant op code *
964 *************************************************/
965
966 /* This is called by several functions that scan a compiled expression looking
967 for a fixed first character, or an anchoring op code etc. It skips over things
968 that do not influence this. For some calls, a change of option is important.
969 For some calls, it makes sense to skip negative forward and all backward
970 assertions, and also the \b assertion; for others it does not.
971
972 Arguments:
973 code pointer to the start of the group
974 options pointer to external options
975 optbit the option bit whose changing is significant, or
976 zero if none are
977 skipassert TRUE if certain assertions are to be skipped
978
979 Returns: pointer to the first significant opcode
980 */
981
982 static const uschar*
983 first_significant_code(const uschar *code, int *options, int optbit,
984 BOOL skipassert)
985 {
986 for (;;)
987 {
988 switch ((int)*code)
989 {
990 case OP_OPT:
991 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
992 *options = (int)code[1];
993 code += 2;
994 break;
995
996 case OP_ASSERT_NOT:
997 case OP_ASSERTBACK:
998 case OP_ASSERTBACK_NOT:
999 if (!skipassert) return code;
1000 do code += GET(code, 1); while (*code == OP_ALT);
1001 code += _pcre_OP_lengths[*code];
1002 break;
1003
1004 case OP_WORD_BOUNDARY:
1005 case OP_NOT_WORD_BOUNDARY:
1006 if (!skipassert) return code;
1007 /* Fall through */
1008
1009 case OP_CALLOUT:
1010 case OP_CREF:
1011 case OP_RREF:
1012 case OP_DEF:
1013 code += _pcre_OP_lengths[*code];
1014 break;
1015
1016 default:
1017 return code;
1018 }
1019 }
1020 /* Control never reaches here */
1021 }
1022
1023
1024
1025
1026 /*************************************************
1027 * Find the fixed length of a pattern *
1028 *************************************************/
1029
1030 /* Scan a pattern and compute the fixed length of subject that will match it,
1031 if the length is fixed. This is needed for dealing with backward assertions.
1032 In UTF8 mode, the result is in characters rather than bytes.
1033
1034 Arguments:
1035 code points to the start of the pattern (the bracket)
1036 options the compiling options
1037
1038 Returns: the fixed length, or -1 if there is no fixed length,
1039 or -2 if \C was encountered
1040 */
1041
1042 static int
1043 find_fixedlength(uschar *code, int options)
1044 {
1045 int length = -1;
1046
1047 register int branchlength = 0;
1048 register uschar *cc = code + 1 + LINK_SIZE;
1049
1050 /* Scan along the opcodes for this branch. If we get to the end of the
1051 branch, check the length against that of the other branches. */
1052
1053 for (;;)
1054 {
1055 int d;
1056 register int op = *cc;
1057
1058 switch (op)
1059 {
1060 case OP_CBRA:
1061 case OP_BRA:
1062 case OP_ONCE:
1063 case OP_COND:
1064 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1065 if (d < 0) return d;
1066 branchlength += d;
1067 do cc += GET(cc, 1); while (*cc == OP_ALT);
1068 cc += 1 + LINK_SIZE;
1069 break;
1070
1071 /* Reached end of a branch; if it's a ket it is the end of a nested
1072 call. If it's ALT it is an alternation in a nested call. If it is
1073 END it's the end of the outer call. All can be handled by the same code. */
1074
1075 case OP_ALT:
1076 case OP_KET:
1077 case OP_KETRMAX:
1078 case OP_KETRMIN:
1079 case OP_END:
1080 if (length < 0) length = branchlength;
1081 else if (length != branchlength) return -1;
1082 if (*cc != OP_ALT) return length;
1083 cc += 1 + LINK_SIZE;
1084 branchlength = 0;
1085 break;
1086
1087 /* Skip over assertive subpatterns */
1088
1089 case OP_ASSERT:
1090 case OP_ASSERT_NOT:
1091 case OP_ASSERTBACK:
1092 case OP_ASSERTBACK_NOT:
1093 do cc += GET(cc, 1); while (*cc == OP_ALT);
1094 /* Fall through */
1095
1096 /* Skip over things that don't match chars */
1097
1098 case OP_REVERSE:
1099 case OP_CREF:
1100 case OP_RREF:
1101 case OP_DEF:
1102 case OP_OPT:
1103 case OP_CALLOUT:
1104 case OP_SOD:
1105 case OP_SOM:
1106 case OP_EOD:
1107 case OP_EODN:
1108 case OP_CIRC:
1109 case OP_DOLL:
1110 case OP_NOT_WORD_BOUNDARY:
1111 case OP_WORD_BOUNDARY:
1112 cc += _pcre_OP_lengths[*cc];
1113 break;
1114
1115 /* Handle literal characters */
1116
1117 case OP_CHAR:
1118 case OP_CHARNC:
1119 case OP_NOT:
1120 branchlength++;
1121 cc += 2;
1122 #ifdef SUPPORT_UTF8
1123 if ((options & PCRE_UTF8) != 0)
1124 {
1125 while ((*cc & 0xc0) == 0x80) cc++;
1126 }
1127 #endif
1128 break;
1129
1130 /* Handle exact repetitions. The count is already in characters, but we
1131 need to skip over a multibyte character in UTF8 mode. */
1132
1133 case OP_EXACT:
1134 branchlength += GET2(cc,1);
1135 cc += 4;
1136 #ifdef SUPPORT_UTF8
1137 if ((options & PCRE_UTF8) != 0)
1138 {
1139 while((*cc & 0x80) == 0x80) cc++;
1140 }
1141 #endif
1142 break;
1143
1144 case OP_TYPEEXACT:
1145 branchlength += GET2(cc,1);
1146 cc += 4;
1147 break;
1148
1149 /* Handle single-char matchers */
1150
1151 case OP_PROP:
1152 case OP_NOTPROP:
1153 cc += 2;
1154 /* Fall through */
1155
1156 case OP_NOT_DIGIT:
1157 case OP_DIGIT:
1158 case OP_NOT_WHITESPACE:
1159 case OP_WHITESPACE:
1160 case OP_NOT_WORDCHAR:
1161 case OP_WORDCHAR:
1162 case OP_ANY:
1163 branchlength++;
1164 cc++;
1165 break;
1166
1167 /* The single-byte matcher isn't allowed */
1168
1169 case OP_ANYBYTE:
1170 return -2;
1171
1172 /* Check a class for variable quantification */
1173
1174 #ifdef SUPPORT_UTF8
1175 case OP_XCLASS:
1176 cc += GET(cc, 1) - 33;
1177 /* Fall through */
1178 #endif
1179
1180 case OP_CLASS:
1181 case OP_NCLASS:
1182 cc += 33;
1183
1184 switch (*cc)
1185 {
1186 case OP_CRSTAR:
1187 case OP_CRMINSTAR:
1188 case OP_CRQUERY:
1189 case OP_CRMINQUERY:
1190 return -1;
1191
1192 case OP_CRRANGE:
1193 case OP_CRMINRANGE:
1194 if (GET2(cc,1) != GET2(cc,3)) return -1;
1195 branchlength += GET2(cc,1);
1196 cc += 5;
1197 break;
1198
1199 default:
1200 branchlength++;
1201 }
1202 break;
1203
1204 /* Anything else is variable length */
1205
1206 default:
1207 return -1;
1208 }
1209 }
1210 /* Control never gets here */
1211 }
1212
1213
1214
1215
1216 /*************************************************
1217 * Scan compiled regex for numbered bracket *
1218 *************************************************/
1219
1220 /* This little function scans through a compiled pattern until it finds a
1221 capturing bracket with the given number.
1222
1223 Arguments:
1224 code points to start of expression
1225 utf8 TRUE in UTF-8 mode
1226 number the required bracket number
1227
1228 Returns: pointer to the opcode for the bracket, or NULL if not found
1229 */
1230
1231 static const uschar *
1232 find_bracket(const uschar *code, BOOL utf8, int number)
1233 {
1234 for (;;)
1235 {
1236 register int c = *code;
1237 if (c == OP_END) return NULL;
1238
1239 /* XCLASS is used for classes that cannot be represented just by a bit
1240 map. This includes negated single high-valued characters. The length in
1241 the table is zero; the actual length is stored in the compiled code. */
1242
1243 if (c == OP_XCLASS) code += GET(code, 1);
1244
1245 /* Handle capturing bracket */
1246
1247 else if (c == OP_CBRA)
1248 {
1249 int n = GET2(code, 1+LINK_SIZE);
1250 if (n == number) return (uschar *)code;
1251 code += _pcre_OP_lengths[c];
1252 }
1253
1254 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1255 a multi-byte character. The length in the table is a minimum, so we have to
1256 arrange to skip the extra bytes. */
1257
1258 else
1259 {
1260 code += _pcre_OP_lengths[c];
1261 #ifdef SUPPORT_UTF8
1262 if (utf8) switch(c)
1263 {
1264 case OP_CHAR:
1265 case OP_CHARNC:
1266 case OP_EXACT:
1267 case OP_UPTO:
1268 case OP_MINUPTO:
1269 case OP_POSUPTO:
1270 case OP_STAR:
1271 case OP_MINSTAR:
1272 case OP_POSSTAR:
1273 case OP_PLUS:
1274 case OP_MINPLUS:
1275 case OP_POSPLUS:
1276 case OP_QUERY:
1277 case OP_MINQUERY:
1278 case OP_POSQUERY:
1279 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1280 break;
1281 }
1282 #endif
1283 }
1284 }
1285 }
1286
1287
1288
1289 /*************************************************
1290 * Scan compiled regex for recursion reference *
1291 *************************************************/
1292
1293 /* This little function scans through a compiled pattern until it finds an
1294 instance of OP_RECURSE.
1295
1296 Arguments:
1297 code points to start of expression
1298 utf8 TRUE in UTF-8 mode
1299
1300 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1301 */
1302
1303 static const uschar *
1304 find_recurse(const uschar *code, BOOL utf8)
1305 {
1306 for (;;)
1307 {
1308 register int c = *code;
1309 if (c == OP_END) return NULL;
1310 if (c == OP_RECURSE) return code;
1311
1312 /* XCLASS is used for classes that cannot be represented just by a bit
1313 map. This includes negated single high-valued characters. The length in
1314 the table is zero; the actual length is stored in the compiled code. */
1315
1316 if (c == OP_XCLASS) code += GET(code, 1);
1317
1318 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1319 that are followed by a character may be followed by a multi-byte character.
1320 The length in the table is a minimum, so we have to arrange to skip the extra
1321 bytes. */
1322
1323 else
1324 {
1325 code += _pcre_OP_lengths[c];
1326 #ifdef SUPPORT_UTF8
1327 if (utf8) switch(c)
1328 {
1329 case OP_CHAR:
1330 case OP_CHARNC:
1331 case OP_EXACT:
1332 case OP_UPTO:
1333 case OP_MINUPTO:
1334 case OP_POSUPTO:
1335 case OP_STAR:
1336 case OP_MINSTAR:
1337 case OP_POSSTAR:
1338 case OP_PLUS:
1339 case OP_MINPLUS:
1340 case OP_POSPLUS:
1341 case OP_QUERY:
1342 case OP_MINQUERY:
1343 case OP_POSQUERY:
1344 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1345 break;
1346 }
1347 #endif
1348 }
1349 }
1350 }
1351
1352
1353
1354 /*************************************************
1355 * Scan compiled branch for non-emptiness *
1356 *************************************************/
1357
1358 /* This function scans through a branch of a compiled pattern to see whether it
1359 can match the empty string or not. It is called from could_be_empty()
1360 below and from compile_branch() when checking for an unlimited repeat of a
1361 group that can match nothing. Note that first_significant_code() skips over
1362 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1363 struck an inner bracket whose current branch will already have been scanned.
1364
1365 Arguments:
1366 code points to start of search
1367 endcode points to where to stop
1368 utf8 TRUE if in UTF8 mode
1369
1370 Returns: TRUE if what is matched could be empty
1371 */
1372
1373 static BOOL
1374 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1375 {
1376 register int c;
1377 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1378 code < endcode;
1379 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1380 {
1381 const uschar *ccode;
1382
1383 c = *code;
1384
1385 /* Groups with zero repeats can of course be empty; skip them. */
1386
1387 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388 {
1389 code += _pcre_OP_lengths[c];
1390 do code += GET(code, 1); while (*code == OP_ALT);
1391 c = *code;
1392 continue;
1393 }
1394
1395 /* For other groups, scan the branches. */
1396
1397 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1398 {
1399 BOOL empty_branch;
1400 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1401
1402 /* Scan a closed bracket */
1403
1404 empty_branch = FALSE;
1405 do
1406 {
1407 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1408 empty_branch = TRUE;
1409 code += GET(code, 1);
1410 }
1411 while (*code == OP_ALT);
1412 if (!empty_branch) return FALSE; /* All branches are non-empty */
1413 c = *code;
1414 continue;
1415 }
1416
1417 /* Handle the other opcodes */
1418
1419 switch (c)
1420 {
1421 /* Check for quantifiers after a class */
1422
1423 #ifdef SUPPORT_UTF8
1424 case OP_XCLASS:
1425 ccode = code + GET(code, 1);
1426 goto CHECK_CLASS_REPEAT;
1427 #endif
1428
1429 case OP_CLASS:
1430 case OP_NCLASS:
1431 ccode = code + 33;
1432
1433 #ifdef SUPPORT_UTF8
1434 CHECK_CLASS_REPEAT:
1435 #endif
1436
1437 switch (*ccode)
1438 {
1439 case OP_CRSTAR: /* These could be empty; continue */
1440 case OP_CRMINSTAR:
1441 case OP_CRQUERY:
1442 case OP_CRMINQUERY:
1443 break;
1444
1445 default: /* Non-repeat => class must match */
1446 case OP_CRPLUS: /* These repeats aren't empty */
1447 case OP_CRMINPLUS:
1448 return FALSE;
1449
1450 case OP_CRRANGE:
1451 case OP_CRMINRANGE:
1452 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1453 break;
1454 }
1455 break;
1456
1457 /* Opcodes that must match a character */
1458
1459 case OP_PROP:
1460 case OP_NOTPROP:
1461 case OP_EXTUNI:
1462 case OP_NOT_DIGIT:
1463 case OP_DIGIT:
1464 case OP_NOT_WHITESPACE:
1465 case OP_WHITESPACE:
1466 case OP_NOT_WORDCHAR:
1467 case OP_WORDCHAR:
1468 case OP_ANY:
1469 case OP_ANYBYTE:
1470 case OP_CHAR:
1471 case OP_CHARNC:
1472 case OP_NOT:
1473 case OP_PLUS:
1474 case OP_MINPLUS:
1475 case OP_POSPLUS:
1476 case OP_EXACT:
1477 case OP_NOTPLUS:
1478 case OP_NOTMINPLUS:
1479 case OP_NOTPOSPLUS:
1480 case OP_NOTEXACT:
1481 case OP_TYPEPLUS:
1482 case OP_TYPEMINPLUS:
1483 case OP_TYPEPOSPLUS:
1484 case OP_TYPEEXACT:
1485 return FALSE;
1486
1487 /* End of branch */
1488
1489 case OP_KET:
1490 case OP_KETRMAX:
1491 case OP_KETRMIN:
1492 case OP_ALT:
1493 return TRUE;
1494
1495 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1496 MINUPTO, and POSUPTO may be followed by a multibyte character */
1497
1498 #ifdef SUPPORT_UTF8
1499 case OP_STAR:
1500 case OP_MINSTAR:
1501 case OP_POSSTAR:
1502 case OP_QUERY:
1503 case OP_MINQUERY:
1504 case OP_POSQUERY:
1505 case OP_UPTO:
1506 case OP_MINUPTO:
1507 case OP_POSUPTO:
1508 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1509 break;
1510 #endif
1511 }
1512 }
1513
1514 return TRUE;
1515 }
1516
1517
1518
1519 /*************************************************
1520 * Scan compiled regex for non-emptiness *
1521 *************************************************/
1522
1523 /* This function is called to check for left recursive calls. We want to check
1524 the current branch of the current pattern to see if it could match the empty
1525 string. If it could, we must look outwards for branches at other levels,
1526 stopping when we pass beyond the bracket which is the subject of the recursion.
1527
1528 Arguments:
1529 code points to start of the recursion
1530 endcode points to where to stop (current RECURSE item)
1531 bcptr points to the chain of current (unclosed) branch starts
1532 utf8 TRUE if in UTF-8 mode
1533
1534 Returns: TRUE if what is matched could be empty
1535 */
1536
1537 static BOOL
1538 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1539 BOOL utf8)
1540 {
1541 while (bcptr != NULL && bcptr->current >= code)
1542 {
1543 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1544 bcptr = bcptr->outer;
1545 }
1546 return TRUE;
1547 }
1548
1549
1550
1551 /*************************************************
1552 * Check for POSIX class syntax *
1553 *************************************************/
1554
1555 /* This function is called when the sequence "[:" or "[." or "[=" is
1556 encountered in a character class. It checks whether this is followed by an
1557 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1558 ".]" or "=]".
1559
1560 Argument:
1561 ptr pointer to the initial [
1562 endptr where to return the end pointer
1563 cd pointer to compile data
1564
1565 Returns: TRUE or FALSE
1566 */
1567
1568 static BOOL
1569 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1570 {
1571 int terminator; /* Don't combine these lines; the Solaris cc */
1572 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1573 if (*(++ptr) == '^') ptr++;
1574 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1575 if (*ptr == terminator && ptr[1] == ']')
1576 {
1577 *endptr = ptr;
1578 return TRUE;
1579 }
1580 return FALSE;
1581 }
1582
1583
1584
1585
1586 /*************************************************
1587 * Check POSIX class name *
1588 *************************************************/
1589
1590 /* This function is called to check the name given in a POSIX-style class entry
1591 such as [:alnum:].
1592
1593 Arguments:
1594 ptr points to the first letter
1595 len the length of the name
1596
1597 Returns: a value representing the name, or -1 if unknown
1598 */
1599
1600 static int
1601 check_posix_name(const uschar *ptr, int len)
1602 {
1603 register int yield = 0;
1604 while (posix_name_lengths[yield] != 0)
1605 {
1606 if (len == posix_name_lengths[yield] &&
1607 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1608 yield++;
1609 }
1610 return -1;
1611 }
1612
1613
1614 /*************************************************
1615 * Adjust OP_RECURSE items in repeated group *
1616 *************************************************/
1617
1618 /* OP_RECURSE items contain an offset from the start of the regex to the group
1619 that is referenced. This means that groups can be replicated for fixed
1620 repetition simply by copying (because the recursion is allowed to refer to
1621 earlier groups that are outside the current group). However, when a group is
1622 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1623 it, after it has been compiled. This means that any OP_RECURSE items within it
1624 that refer to the group itself or any contained groups have to have their
1625 offsets adjusted. That is one of the jobs of this function. Before it is called,
1626 the partially compiled regex must be temporarily terminated with OP_END.
1627
1628 This function has been extended with the possibility of forward references for
1629 recursions and subroutine calls. It must also check the list of such references
1630 for the group we are dealing with. If it finds that one of the recursions in
1631 the current group is on this list, it adjusts the offset in the list, not the
1632 value in the reference (which is a group number).
1633
1634 Arguments:
1635 group points to the start of the group
1636 adjust the amount by which the group is to be moved
1637 utf8 TRUE in UTF-8 mode
1638 cd contains pointers to tables etc.
1639 save_hwm the hwm forward reference pointer at the start of the group
1640
1641 Returns: nothing
1642 */
1643
1644 static void
1645 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1646 uschar *save_hwm)
1647 {
1648 uschar *ptr = group;
1649 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1650 {
1651 int offset;
1652 uschar *hc;
1653
1654 /* See if this recursion is on the forward reference list. If so, adjust the
1655 reference. */
1656
1657 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1658 {
1659 offset = GET(hc, 0);
1660 if (cd->start_code + offset == ptr + 1)
1661 {
1662 PUT(hc, 0, offset + adjust);
1663 break;
1664 }
1665 }
1666
1667 /* Otherwise, adjust the recursion offset if it's after the start of this
1668 group. */
1669
1670 if (hc >= cd->hwm)
1671 {
1672 offset = GET(ptr, 1);
1673 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1674 }
1675
1676 ptr += 1 + LINK_SIZE;
1677 }
1678 }
1679
1680
1681
1682 /*************************************************
1683 * Insert an automatic callout point *
1684 *************************************************/
1685
1686 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1687 callout points before each pattern item.
1688
1689 Arguments:
1690 code current code pointer
1691 ptr current pattern pointer
1692 cd pointers to tables etc
1693
1694 Returns: new code pointer
1695 */
1696
1697 static uschar *
1698 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1699 {
1700 *code++ = OP_CALLOUT;
1701 *code++ = 255;
1702 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1703 PUT(code, LINK_SIZE, 0); /* Default length */
1704 return code + 2*LINK_SIZE;
1705 }
1706
1707
1708
1709 /*************************************************
1710 * Complete a callout item *
1711 *************************************************/
1712
1713 /* A callout item contains the length of the next item in the pattern, which
1714 we can't fill in till after we have reached the relevant point. This is used
1715 for both automatic and manual callouts.
1716
1717 Arguments:
1718 previous_callout points to previous callout item
1719 ptr current pattern pointer
1720 cd pointers to tables etc
1721
1722 Returns: nothing
1723 */
1724
1725 static void
1726 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1727 {
1728 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1729 PUT(previous_callout, 2 + LINK_SIZE, length);
1730 }
1731
1732
1733
1734 #ifdef SUPPORT_UCP
1735 /*************************************************
1736 * Get othercase range *
1737 *************************************************/
1738
1739 /* This function is passed the start and end of a class range, in UTF-8 mode
1740 with UCP support. It searches up the characters, looking for internal ranges of
1741 characters in the "other" case. Each call returns the next one, updating the
1742 start address.
1743
1744 Arguments:
1745 cptr points to starting character value; updated
1746 d end value
1747 ocptr where to put start of othercase range
1748 odptr where to put end of othercase range
1749
1750 Yield: TRUE when range returned; FALSE when no more
1751 */
1752
1753 static BOOL
1754 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1755 unsigned int *odptr)
1756 {
1757 unsigned int c, othercase, next;
1758
1759 for (c = *cptr; c <= d; c++)
1760 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1761
1762 if (c > d) return FALSE;
1763
1764 *ocptr = othercase;
1765 next = othercase + 1;
1766
1767 for (++c; c <= d; c++)
1768 {
1769 if (_pcre_ucp_othercase(c) != next) break;
1770 next++;
1771 }
1772
1773 *odptr = next - 1;
1774 *cptr = c;
1775
1776 return TRUE;
1777 }
1778 #endif /* SUPPORT_UCP */
1779
1780
1781
1782 /*************************************************
1783 * Check if auto-possessifying is possible *
1784 *************************************************/
1785
1786 /* This function is called for unlimited repeats of certain items, to see
1787 whether the next thing could possibly match the repeated item. If not, it makes
1788 sense to automatically possessify the repeated item.
1789
1790 Arguments:
1791 op_code the repeated op code
1792 item data for this item, depends on the opcode
1793 utf8 TRUE in UTF-8 mode
1794 utf8_char used for utf8 character bytes, NULL if not relevant
1795 ptr next character in pattern
1796 options options bits
1797 cd contains pointers to tables etc.
1798
1799 Returns: TRUE if possessifying is wanted
1800 */
1801
1802 static BOOL
1803 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1804 const uschar *ptr, int options, compile_data *cd)
1805 {
1806 int next;
1807
1808 /* Skip whitespace and comments in extended mode */
1809
1810 if ((options & PCRE_EXTENDED) != 0)
1811 {
1812 for (;;)
1813 {
1814 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1815 if (*ptr == '#')
1816 {
1817 while (*(++ptr) != 0)
1818 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1819 }
1820 else break;
1821 }
1822 }
1823
1824 /* If the next item is one that we can handle, get its value. A non-negative
1825 value is a character, a negative value is an escape value. */
1826
1827 if (*ptr == '\\')
1828 {
1829 int temperrorcode = 0;
1830 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1831 if (temperrorcode != 0) return FALSE;
1832 ptr++; /* Point after the escape sequence */
1833 }
1834
1835 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1836 {
1837 #ifdef SUPPORT_UTF8
1838 if (utf8) { GETCHARINC(next, ptr); } else
1839 #endif
1840 next = *ptr++;
1841 }
1842
1843 else return FALSE;
1844
1845 /* Skip whitespace and comments in extended mode */
1846
1847 if ((options & PCRE_EXTENDED) != 0)
1848 {
1849 for (;;)
1850 {
1851 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1852 if (*ptr == '#')
1853 {
1854 while (*(++ptr) != 0)
1855 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1856 }
1857 else break;
1858 }
1859 }
1860
1861 /* If the next thing is itself optional, we have to give up. */
1862
1863 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1864 return FALSE;
1865
1866 /* Now compare the next item with the previous opcode. If the previous is a
1867 positive single character match, "item" either contains the character or, if
1868 "item" is greater than 127 in utf8 mode, the character's bytes are in
1869 utf8_char. */
1870
1871
1872 /* Handle cases when the next item is a character. */
1873
1874 if (next >= 0) switch(op_code)
1875 {
1876 case OP_CHAR:
1877 #ifdef SUPPORT_UTF8
1878 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1879 #endif
1880 return item != next;
1881
1882 /* For CHARNC (caseless character) we must check the other case. If we have
1883 Unicode property support, we can use it to test the other case of
1884 high-valued characters. */
1885
1886 case OP_CHARNC:
1887 #ifdef SUPPORT_UTF8
1888 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1889 #endif
1890 if (item == next) return FALSE;
1891 #ifdef SUPPORT_UTF8
1892 if (utf8)
1893 {
1894 unsigned int othercase;
1895 if (next < 128) othercase = cd->fcc[next]; else
1896 #ifdef SUPPORT_UCP
1897 othercase = _pcre_ucp_othercase((unsigned int)next);
1898 #else
1899 othercase = NOTACHAR;
1900 #endif
1901 return (unsigned int)item != othercase;
1902 }
1903 else
1904 #endif /* SUPPORT_UTF8 */
1905 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1906
1907 /* For OP_NOT, "item" must be a single-byte character. */
1908
1909 case OP_NOT:
1910 if (next < 0) return FALSE; /* Not a character */
1911 if (item == next) return TRUE;
1912 if ((options & PCRE_CASELESS) == 0) return FALSE;
1913 #ifdef SUPPORT_UTF8
1914 if (utf8)
1915 {
1916 unsigned int othercase;
1917 if (next < 128) othercase = cd->fcc[next]; else
1918 #ifdef SUPPORT_UCP
1919 othercase = _pcre_ucp_othercase(next);
1920 #else
1921 othercase = NOTACHAR;
1922 #endif
1923 return (unsigned int)item == othercase;
1924 }
1925 else
1926 #endif /* SUPPORT_UTF8 */
1927 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1928
1929 case OP_DIGIT:
1930 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1931
1932 case OP_NOT_DIGIT:
1933 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1934
1935 case OP_WHITESPACE:
1936 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1937
1938 case OP_NOT_WHITESPACE:
1939 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1940
1941 case OP_WORDCHAR:
1942 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1943
1944 case OP_NOT_WORDCHAR:
1945 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1946
1947 default:
1948 return FALSE;
1949 }
1950
1951
1952 /* Handle the case when the next item is \d, \s, etc. */
1953
1954 switch(op_code)
1955 {
1956 case OP_CHAR:
1957 case OP_CHARNC:
1958 #ifdef SUPPORT_UTF8
1959 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1960 #endif
1961 switch(-next)
1962 {
1963 case ESC_d:
1964 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1965
1966 case ESC_D:
1967 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1968
1969 case ESC_s:
1970 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1971
1972 case ESC_S:
1973 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1974
1975 case ESC_w:
1976 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1977
1978 case ESC_W:
1979 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1980
1981 default:
1982 return FALSE;
1983 }
1984
1985 case OP_DIGIT:
1986 return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1987
1988 case OP_NOT_DIGIT:
1989 return next == -ESC_d;
1990
1991 case OP_WHITESPACE:
1992 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1993
1994 case OP_NOT_WHITESPACE:
1995 return next == -ESC_s;
1996
1997 case OP_WORDCHAR:
1998 return next == -ESC_W || next == -ESC_s;
1999
2000 case OP_NOT_WORDCHAR:
2001 return next == -ESC_w || next == -ESC_d;
2002
2003 default:
2004 return FALSE;
2005 }
2006
2007 /* Control does not reach here */
2008 }
2009
2010
2011
2012 /*************************************************
2013 * Compile one branch *
2014 *************************************************/
2015
2016 /* Scan the pattern, compiling it into a vector. If the options are
2017 changed during the branch, the pointer is used to change the external options
2018 bits. This function is used during the pre-compile phase when we are trying
2019 to find out the amount of memory needed, as well as during the real compile
2020 phase. The value of lengthptr distinguishes the two phases.
2021
2022 Arguments:
2023 optionsptr pointer to the option bits
2024 codeptr points to the pointer to the current code point
2025 ptrptr points to the current pattern pointer
2026 errorcodeptr points to error code variable
2027 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2028 reqbyteptr set to the last literal character required, else < 0
2029 bcptr points to current branch chain
2030 cd contains pointers to tables etc.
2031 lengthptr NULL during the real compile phase
2032 points to length accumulator during pre-compile phase
2033
2034 Returns: TRUE on success
2035 FALSE, with *errorcodeptr set non-zero on error
2036 */
2037
2038 static BOOL
2039 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2040 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2041 compile_data *cd, int *lengthptr)
2042 {
2043 int repeat_type, op_type;
2044 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2045 int bravalue = 0;
2046 int greedy_default, greedy_non_default;
2047 int firstbyte, reqbyte;
2048 int zeroreqbyte, zerofirstbyte;
2049 int req_caseopt, reqvary, tempreqvary;
2050 int options = *optionsptr;
2051 int after_manual_callout = 0;
2052 int length_prevgroup = 0;
2053 register int c;
2054 register uschar *code = *codeptr;
2055 uschar *last_code = code;
2056 uschar *orig_code = code;
2057 uschar *tempcode;
2058 BOOL inescq = FALSE;
2059 BOOL groupsetfirstbyte = FALSE;
2060 const uschar *ptr = *ptrptr;
2061 const uschar *tempptr;
2062 uschar *previous = NULL;
2063 uschar *previous_callout = NULL;
2064 uschar *save_hwm = NULL;
2065 uschar classbits[32];
2066
2067 #ifdef SUPPORT_UTF8
2068 BOOL class_utf8;
2069 BOOL utf8 = (options & PCRE_UTF8) != 0;
2070 uschar *class_utf8data;
2071 uschar utf8_char[6];
2072 #else
2073 BOOL utf8 = FALSE;
2074 uschar *utf8_char = NULL;
2075 #endif
2076
2077 #ifdef DEBUG
2078 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2079 #endif
2080
2081 /* Set up the default and non-default settings for greediness */
2082
2083 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2084 greedy_non_default = greedy_default ^ 1;
2085
2086 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2087 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2088 matches a non-fixed char first char; reqbyte just remains unset if we never
2089 find one.
2090
2091 When we hit a repeat whose minimum is zero, we may have to adjust these values
2092 to take the zero repeat into account. This is implemented by setting them to
2093 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2094 item types that can be repeated set these backoff variables appropriately. */
2095
2096 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2097
2098 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2099 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2100 value > 255. It is added into the firstbyte or reqbyte variables to record the
2101 case status of the value. This is used only for ASCII characters. */
2102
2103 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2104
2105 /* Switch on next character until the end of the branch */
2106
2107 for (;; ptr++)
2108 {
2109 BOOL negate_class;
2110 BOOL possessive_quantifier;
2111 BOOL is_quantifier;
2112 BOOL is_recurse;
2113 int class_charcount;
2114 int class_lastchar;
2115 int newoptions;
2116 int recno;
2117 int refsign;
2118 int skipbytes;
2119 int subreqbyte;
2120 int subfirstbyte;
2121 int terminator;
2122 int mclength;
2123 uschar mcbuffer[8];
2124
2125 /* Get next byte in the pattern */
2126
2127 c = *ptr;
2128
2129 /* If we are in the pre-compile phase, accumulate the length used for the
2130 previous cycle of this loop. */
2131
2132 if (lengthptr != NULL)
2133 {
2134 #ifdef DEBUG
2135 if (code > cd->hwm) cd->hwm = code; /* High water info */
2136 #endif
2137 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2138 {
2139 *errorcodeptr = ERR52;
2140 goto FAILED;
2141 }
2142
2143 /* There is at least one situation where code goes backwards: this is the
2144 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2145 the class is simply eliminated. However, it is created first, so we have to
2146 allow memory for it. Therefore, don't ever reduce the length at this point.
2147 */
2148
2149 if (code < last_code) code = last_code;
2150 *lengthptr += code - last_code;
2151 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2152
2153 /* If "previous" is set and it is not at the start of the work space, move
2154 it back to there, in order to avoid filling up the work space. Otherwise,
2155 if "previous" is NULL, reset the current code pointer to the start. */
2156
2157 if (previous != NULL)
2158 {
2159 if (previous > orig_code)
2160 {
2161 memmove(orig_code, previous, code - previous);
2162 code -= previous - orig_code;
2163 previous = orig_code;
2164 }
2165 }
2166 else code = orig_code;
2167
2168 /* Remember where this code item starts so we can pick up the length
2169 next time round. */
2170
2171 last_code = code;
2172 }
2173
2174 /* In the real compile phase, just check the workspace used by the forward
2175 reference list. */
2176
2177 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2178 {
2179 *errorcodeptr = ERR52;
2180 goto FAILED;
2181 }
2182
2183 /* If in \Q...\E, check for the end; if not, we have a literal */
2184
2185 if (inescq && c != 0)
2186 {
2187 if (c == '\\' && ptr[1] == 'E')
2188 {
2189 inescq = FALSE;
2190 ptr++;
2191 continue;
2192 }
2193 else
2194 {
2195 if (previous_callout != NULL)
2196 {
2197 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2198 complete_callout(previous_callout, ptr, cd);
2199 previous_callout = NULL;
2200 }
2201 if ((options & PCRE_AUTO_CALLOUT) != 0)
2202 {
2203 previous_callout = code;
2204 code = auto_callout(code, ptr, cd);
2205 }
2206 goto NORMAL_CHAR;
2207 }
2208 }
2209
2210 /* Fill in length of a previous callout, except when the next thing is
2211 a quantifier. */
2212
2213 is_quantifier = c == '*' || c == '+' || c == '?' ||
2214 (c == '{' && is_counted_repeat(ptr+1));
2215
2216 if (!is_quantifier && previous_callout != NULL &&
2217 after_manual_callout-- <= 0)
2218 {
2219 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2220 complete_callout(previous_callout, ptr, cd);
2221 previous_callout = NULL;
2222 }
2223
2224 /* In extended mode, skip white space and comments */
2225
2226 if ((options & PCRE_EXTENDED) != 0)
2227 {
2228 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2229 if (c == '#')
2230 {
2231 while (*(++ptr) != 0)
2232 {
2233 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2234 }
2235 if (*ptr != 0) continue;
2236
2237 /* Else fall through to handle end of string */
2238 c = 0;
2239 }
2240 }
2241
2242 /* No auto callout for quantifiers. */
2243
2244 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2245 {
2246 previous_callout = code;
2247 code = auto_callout(code, ptr, cd);
2248 }
2249
2250 switch(c)
2251 {
2252 /* ===================================================================*/
2253 case 0: /* The branch terminates at string end */
2254 case '|': /* or | or ) */
2255 case ')':
2256 *firstbyteptr = firstbyte;
2257 *reqbyteptr = reqbyte;
2258 *codeptr = code;
2259 *ptrptr = ptr;
2260 if (lengthptr != NULL)
2261 {
2262 *lengthptr += code - last_code; /* To include callout length */
2263 DPRINTF((">> end branch\n"));
2264 }
2265 return TRUE;
2266
2267
2268 /* ===================================================================*/
2269 /* Handle single-character metacharacters. In multiline mode, ^ disables
2270 the setting of any following char as a first character. */
2271
2272 case '^':
2273 if ((options & PCRE_MULTILINE) != 0)
2274 {
2275 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2276 }
2277 previous = NULL;
2278 *code++ = OP_CIRC;
2279 break;
2280
2281 case '$':
2282 previous = NULL;
2283 *code++ = OP_DOLL;
2284 break;
2285
2286 /* There can never be a first char if '.' is first, whatever happens about
2287 repeats. The value of reqbyte doesn't change either. */
2288
2289 case '.':
2290 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2291 zerofirstbyte = firstbyte;
2292 zeroreqbyte = reqbyte;
2293 previous = code;
2294 *code++ = OP_ANY;
2295 break;
2296
2297
2298 /* ===================================================================*/
2299 /* Character classes. If the included characters are all < 256, we build a
2300 32-byte bitmap of the permitted characters, except in the special case
2301 where there is only one such character. For negated classes, we build the
2302 map as usual, then invert it at the end. However, we use a different opcode
2303 so that data characters > 255 can be handled correctly.
2304
2305 If the class contains characters outside the 0-255 range, a different
2306 opcode is compiled. It may optionally have a bit map for characters < 256,
2307 but those above are explicitly listed afterwards. A flag byte tells
2308 whether the bitmap is present, and whether this is a negated class or not.
2309 */
2310
2311 case '[':
2312 previous = code;
2313
2314 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2315 they are encountered at the top level, so we'll do that too. */
2316
2317 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2318 check_posix_syntax(ptr, &tempptr, cd))
2319 {
2320 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2321 goto FAILED;
2322 }
2323
2324 /* If the first character is '^', set the negation flag and skip it. */
2325
2326 if ((c = *(++ptr)) == '^')
2327 {
2328 negate_class = TRUE;
2329 c = *(++ptr);
2330 }
2331 else
2332 {
2333 negate_class = FALSE;
2334 }
2335
2336 /* Keep a count of chars with values < 256 so that we can optimize the case
2337 of just a single character (as long as it's < 256). However, for higher
2338 valued UTF-8 characters, we don't yet do any optimization. */
2339
2340 class_charcount = 0;
2341 class_lastchar = -1;
2342
2343 /* Initialize the 32-char bit map to all zeros. We build the map in a
2344 temporary bit of memory, in case the class contains only 1 character (less
2345 than 256), because in that case the compiled code doesn't use the bit map.
2346 */
2347
2348 memset(classbits, 0, 32 * sizeof(uschar));
2349
2350 #ifdef SUPPORT_UTF8
2351 class_utf8 = FALSE; /* No chars >= 256 */
2352 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2353 #endif
2354
2355 /* Process characters until ] is reached. By writing this as a "do" it
2356 means that an initial ] is taken as a data character. At the start of the
2357 loop, c contains the first byte of the character. */
2358
2359 if (c != 0) do
2360 {
2361 const uschar *oldptr;
2362
2363 #ifdef SUPPORT_UTF8
2364 if (utf8 && c > 127)
2365 { /* Braces are required because the */
2366 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2367 }
2368 #endif
2369
2370 /* Inside \Q...\E everything is literal except \E */
2371
2372 if (inescq)
2373 {
2374 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2375 {
2376 inescq = FALSE; /* Reset literal state */
2377 ptr++; /* Skip the 'E' */
2378 continue; /* Carry on with next */
2379 }
2380 goto CHECK_RANGE; /* Could be range if \E follows */
2381 }
2382
2383 /* Handle POSIX class names. Perl allows a negation extension of the
2384 form [:^name:]. A square bracket that doesn't match the syntax is
2385 treated as a literal. We also recognize the POSIX constructions
2386 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2387 5.6 and 5.8 do. */
2388
2389 if (c == '[' &&
2390 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2391 check_posix_syntax(ptr, &tempptr, cd))
2392 {
2393 BOOL local_negate = FALSE;
2394 int posix_class, taboffset, tabopt;
2395 register const uschar *cbits = cd->cbits;
2396 uschar pbits[32];
2397
2398 if (ptr[1] != ':')
2399 {
2400 *errorcodeptr = ERR31;
2401 goto FAILED;
2402 }
2403
2404 ptr += 2;
2405 if (*ptr == '^')
2406 {
2407 local_negate = TRUE;
2408 ptr++;
2409 }
2410
2411 posix_class = check_posix_name(ptr, tempptr - ptr);
2412 if (posix_class < 0)
2413 {
2414 *errorcodeptr = ERR30;
2415 goto FAILED;
2416 }
2417
2418 /* If matching is caseless, upper and lower are converted to
2419 alpha. This relies on the fact that the class table starts with
2420 alpha, lower, upper as the first 3 entries. */
2421
2422 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2423 posix_class = 0;
2424
2425 /* We build the bit map for the POSIX class in a chunk of local store
2426 because we may be adding and subtracting from it, and we don't want to
2427 subtract bits that may be in the main map already. At the end we or the
2428 result into the bit map that is being built. */
2429
2430 posix_class *= 3;
2431
2432 /* Copy in the first table (always present) */
2433
2434 memcpy(pbits, cbits + posix_class_maps[posix_class],
2435 32 * sizeof(uschar));
2436
2437 /* If there is a second table, add or remove it as required. */
2438
2439 taboffset = posix_class_maps[posix_class + 1];
2440 tabopt = posix_class_maps[posix_class + 2];
2441
2442 if (taboffset >= 0)
2443 {
2444 if (tabopt >= 0)
2445 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2446 else
2447 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2448 }
2449
2450 /* Now see if we need to remove any special characters. An option
2451 value of 1 removes vertical space and 2 removes underscore. */
2452
2453 if (tabopt < 0) tabopt = -tabopt;
2454 if (tabopt == 1) pbits[1] &= ~0x3c;
2455 else if (tabopt == 2) pbits[11] &= 0x7f;
2456
2457 /* Add the POSIX table or its complement into the main table that is
2458 being built and we are done. */
2459
2460 if (local_negate)
2461 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2462 else
2463 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2464
2465 ptr = tempptr + 1;
2466 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2467 continue; /* End of POSIX syntax handling */
2468 }
2469
2470 /* Backslash may introduce a single character, or it may introduce one
2471 of the specials, which just set a flag. The sequence \b is a special
2472 case. Inside a class (and only there) it is treated as backspace.
2473 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2474 to or into the one we are building. We assume they have more than one
2475 character in them, so set class_charcount bigger than one. */
2476
2477 if (c == '\\')
2478 {
2479 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2480 if (*errorcodeptr != 0) goto FAILED;
2481
2482 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2483 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2484 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2485 else if (-c == ESC_Q) /* Handle start of quoted string */
2486 {
2487 if (ptr[1] == '\\' && ptr[2] == 'E')
2488 {
2489 ptr += 2; /* avoid empty string */
2490 }
2491 else inescq = TRUE;
2492 continue;
2493 }
2494
2495 if (c < 0)
2496 {
2497 register const uschar *cbits = cd->cbits;
2498 class_charcount += 2; /* Greater than 1 is what matters */
2499
2500 /* Save time by not doing this in the pre-compile phase. */
2501
2502 if (lengthptr == NULL) switch (-c)
2503 {
2504 case ESC_d:
2505 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2506 continue;
2507
2508 case ESC_D:
2509 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2510 continue;
2511
2512 case ESC_w:
2513 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2514 continue;
2515
2516 case ESC_W:
2517 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2518 continue;
2519
2520 case ESC_s:
2521 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2522 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2523 continue;
2524
2525 case ESC_S:
2526 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2527 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2528 continue;
2529
2530 case ESC_E: /* Perl ignores an orphan \E */
2531 continue;
2532
2533 default: /* Not recognized; fall through */
2534 break; /* Need "default" setting to stop compiler warning. */
2535 }
2536
2537 /* In the pre-compile phase, just do the recognition. */
2538
2539 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2540 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2541
2542 /* We need to deal with \P and \p in both phases. */
2543
2544 #ifdef SUPPORT_UCP
2545 if (-c == ESC_p || -c == ESC_P)
2546 {
2547 BOOL negated;
2548 int pdata;
2549 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2550 if (ptype < 0) goto FAILED;
2551 class_utf8 = TRUE;
2552 *class_utf8data++ = ((-c == ESC_p) != negated)?
2553 XCL_PROP : XCL_NOTPROP;
2554 *class_utf8data++ = ptype;
2555 *class_utf8data++ = pdata;
2556 class_charcount -= 2; /* Not a < 256 character */
2557 continue;
2558 }
2559 #endif
2560 /* Unrecognized escapes are faulted if PCRE is running in its
2561 strict mode. By default, for compatibility with Perl, they are
2562 treated as literals. */
2563
2564 if ((options & PCRE_EXTRA) != 0)
2565 {
2566 *errorcodeptr = ERR7;
2567 goto FAILED;
2568 }
2569
2570 class_charcount -= 2; /* Undo the default count from above */
2571 c = *ptr; /* Get the final character and fall through */
2572 }
2573
2574 /* Fall through if we have a single character (c >= 0). This may be
2575 greater than 256 in UTF-8 mode. */
2576
2577 } /* End of backslash handling */
2578
2579 /* A single character may be followed by '-' to form a range. However,
2580 Perl does not permit ']' to be the end of the range. A '-' character
2581 at the end is treated as a literal. Perl ignores orphaned \E sequences
2582 entirely. The code for handling \Q and \E is messy. */
2583
2584 CHECK_RANGE:
2585 while (ptr[1] == '\\' && ptr[2] == 'E')
2586 {
2587 inescq = FALSE;
2588 ptr += 2;
2589 }
2590
2591 oldptr = ptr;
2592
2593 if (!inescq && ptr[1] == '-')
2594 {
2595 int d;
2596 ptr += 2;
2597 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2598
2599 /* If we hit \Q (not followed by \E) at this point, go into escaped
2600 mode. */
2601
2602 while (*ptr == '\\' && ptr[1] == 'Q')
2603 {
2604 ptr += 2;
2605 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2606 inescq = TRUE;
2607 break;
2608 }
2609
2610 if (*ptr == 0 || (!inescq && *ptr == ']'))
2611 {
2612 ptr = oldptr;
2613 goto LONE_SINGLE_CHARACTER;
2614 }
2615
2616 #ifdef SUPPORT_UTF8
2617 if (utf8)
2618 { /* Braces are required because the */
2619 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2620 }
2621 else
2622 #endif
2623 d = *ptr; /* Not UTF-8 mode */
2624
2625 /* The second part of a range can be a single-character escape, but
2626 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2627 in such circumstances. */
2628
2629 if (!inescq && d == '\\')
2630 {
2631 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2632 if (*errorcodeptr != 0) goto FAILED;
2633
2634 /* \b is backspace; \X is literal X; \R is literal R; any other
2635 special means the '-' was literal */
2636
2637 if (d < 0)
2638 {
2639 if (d == -ESC_b) d = '\b';
2640 else if (d == -ESC_X) d = 'X';
2641 else if (d == -ESC_R) d = 'R'; else
2642 {
2643 ptr = oldptr;
2644 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2645 }
2646 }
2647 }
2648
2649 /* Check that the two values are in the correct order. Optimize
2650 one-character ranges */
2651
2652 if (d < c)
2653 {
2654 *errorcodeptr = ERR8;
2655 goto FAILED;
2656 }
2657
2658 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2659
2660 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2661 matching, we have to use an XCLASS with extra data items. Caseless
2662 matching for characters > 127 is available only if UCP support is
2663 available. */
2664
2665 #ifdef SUPPORT_UTF8
2666 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2667 {
2668 class_utf8 = TRUE;
2669
2670 /* With UCP support, we can find the other case equivalents of
2671 the relevant characters. There may be several ranges. Optimize how
2672 they fit with the basic range. */
2673
2674 #ifdef SUPPORT_UCP
2675 if ((options & PCRE_CASELESS) != 0)
2676 {
2677 unsigned int occ, ocd;
2678 unsigned int cc = c;
2679 unsigned int origd = d;
2680 while (get_othercase_range(&cc, origd, &occ, &ocd))
2681 {
2682 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2683
2684 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2685 { /* if there is overlap, */
2686 c = occ; /* noting that if occ < c */
2687 continue; /* we can't have ocd > d */
2688 } /* because a subrange is */
2689 if (ocd > d && occ <= d + 1) /* always shorter than */
2690 { /* the basic range. */
2691 d = ocd;
2692 continue;
2693 }
2694
2695 if (occ == ocd)
2696 {
2697 *class_utf8data++ = XCL_SINGLE;
2698 }
2699 else
2700 {
2701 *class_utf8data++ = XCL_RANGE;
2702 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2703 }
2704 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2705 }
2706 }
2707 #endif /* SUPPORT_UCP */
2708
2709 /* Now record the original range, possibly modified for UCP caseless
2710 overlapping ranges. */
2711
2712 *class_utf8data++ = XCL_RANGE;
2713 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2714 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2715
2716 /* With UCP support, we are done. Without UCP support, there is no
2717 caseless matching for UTF-8 characters > 127; we can use the bit map
2718 for the smaller ones. */
2719
2720 #ifdef SUPPORT_UCP
2721 continue; /* With next character in the class */
2722 #else
2723 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2724
2725 /* Adjust upper limit and fall through to set up the map */
2726
2727 d = 127;
2728
2729 #endif /* SUPPORT_UCP */
2730 }
2731 #endif /* SUPPORT_UTF8 */
2732
2733 /* We use the bit map for all cases when not in UTF-8 mode; else
2734 ranges that lie entirely within 0-127 when there is UCP support; else
2735 for partial ranges without UCP support. */
2736
2737 class_charcount += d - c + 1;
2738 class_lastchar = d;
2739
2740 /* We can save a bit of time by skipping this in the pre-compile. */
2741
2742 if (lengthptr == NULL) for (; c <= d; c++)
2743 {
2744 classbits[c/8] |= (1 << (c&7));
2745 if ((options & PCRE_CASELESS) != 0)
2746 {
2747 int uc = cd->fcc[c]; /* flip case */
2748 classbits[uc/8] |= (1 << (uc&7));
2749 }
2750 }
2751
2752 continue; /* Go get the next char in the class */
2753 }
2754
2755 /* Handle a lone single character - we can get here for a normal
2756 non-escape char, or after \ that introduces a single character or for an
2757 apparent range that isn't. */
2758
2759 LONE_SINGLE_CHARACTER:
2760
2761 /* Handle a character that cannot go in the bit map */
2762
2763 #ifdef SUPPORT_UTF8
2764 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2765 {
2766 class_utf8 = TRUE;
2767 *class_utf8data++ = XCL_SINGLE;
2768 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2769
2770 #ifdef SUPPORT_UCP
2771 if ((options & PCRE_CASELESS) != 0)
2772 {
2773 unsigned int othercase;
2774 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2775 {
2776 *class_utf8data++ = XCL_SINGLE;
2777 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2778 }
2779 }
2780 #endif /* SUPPORT_UCP */
2781
2782 }
2783 else
2784 #endif /* SUPPORT_UTF8 */
2785
2786 /* Handle a single-byte character */
2787 {
2788 classbits[c/8] |= (1 << (c&7));
2789 if ((options & PCRE_CASELESS) != 0)
2790 {
2791 c = cd->fcc[c]; /* flip case */
2792 classbits[c/8] |= (1 << (c&7));
2793 }
2794 class_charcount++;
2795 class_lastchar = c;
2796 }
2797 }
2798
2799 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2800
2801 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2802
2803 if (c == 0) /* Missing terminating ']' */
2804 {
2805 *errorcodeptr = ERR6;
2806 goto FAILED;
2807 }
2808
2809 /* If class_charcount is 1, we saw precisely one character whose value is
2810 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2811 can optimize the negative case only if there were no characters >= 128
2812 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2813 single-bytes only. This is an historical hangover. Maybe one day we can
2814 tidy these opcodes to handle multi-byte characters.
2815
2816 The optimization throws away the bit map. We turn the item into a
2817 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2818 that OP_NOT does not support multibyte characters. In the positive case, it
2819 can cause firstbyte to be set. Otherwise, there can be no first char if
2820 this item is first, whatever repeat count may follow. In the case of
2821 reqbyte, save the previous value for reinstating. */
2822
2823 #ifdef SUPPORT_UTF8
2824 if (class_charcount == 1 &&
2825 (!utf8 ||
2826 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2827
2828 #else
2829 if (class_charcount == 1)
2830 #endif
2831 {
2832 zeroreqbyte = reqbyte;
2833
2834 /* The OP_NOT opcode works on one-byte characters only. */
2835
2836 if (negate_class)
2837 {
2838 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2839 zerofirstbyte = firstbyte;
2840 *code++ = OP_NOT;
2841 *code++ = class_lastchar;
2842 break;
2843 }
2844
2845 /* For a single, positive character, get the value into mcbuffer, and
2846 then we can handle this with the normal one-character code. */
2847
2848 #ifdef SUPPORT_UTF8
2849 if (utf8 && class_lastchar > 127)
2850 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2851 else
2852 #endif
2853 {
2854 mcbuffer[0] = class_lastchar;
2855 mclength = 1;
2856 }
2857 goto ONE_CHAR;
2858 } /* End of 1-char optimization */
2859
2860 /* The general case - not the one-char optimization. If this is the first
2861 thing in the branch, there can be no first char setting, whatever the
2862 repeat count. Any reqbyte setting must remain unchanged after any kind of
2863 repeat. */
2864
2865 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2866 zerofirstbyte = firstbyte;
2867 zeroreqbyte = reqbyte;
2868
2869 /* If there are characters with values > 255, we have to compile an
2870 extended class, with its own opcode. If there are no characters < 256,
2871 we can omit the bitmap in the actual compiled code. */
2872
2873 #ifdef SUPPORT_UTF8
2874 if (class_utf8)
2875 {
2876 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2877 *code++ = OP_XCLASS;
2878 code += LINK_SIZE;
2879 *code = negate_class? XCL_NOT : 0;
2880
2881 /* If the map is required, move up the extra data to make room for it;
2882 otherwise just move the code pointer to the end of the extra data. */
2883
2884 if (class_charcount > 0)
2885 {
2886 *code++ |= XCL_MAP;
2887 memmove(code + 32, code, class_utf8data - code);
2888 memcpy(code, classbits, 32);
2889 code = class_utf8data + 32;
2890 }
2891 else code = class_utf8data;
2892
2893 /* Now fill in the complete length of the item */
2894
2895 PUT(previous, 1, code - previous);
2896 break; /* End of class handling */
2897 }
2898 #endif
2899
2900 /* If there are no characters > 255, negate the 32-byte map if necessary,
2901 and copy it into the code vector. If this is the first thing in the branch,
2902 there can be no first char setting, whatever the repeat count. Any reqbyte
2903 setting must remain unchanged after any kind of repeat. */
2904
2905 if (negate_class)
2906 {
2907 *code++ = OP_NCLASS;
2908 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2909 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2910 }
2911 else
2912 {
2913 *code++ = OP_CLASS;
2914 memcpy(code, classbits, 32);
2915 }
2916 code += 32;
2917 break;
2918
2919
2920 /* ===================================================================*/
2921 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2922 has been tested above. */
2923
2924 case '{':
2925 if (!is_quantifier) goto NORMAL_CHAR;
2926 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2927 if (*errorcodeptr != 0) goto FAILED;
2928 goto REPEAT;
2929
2930 case '*':
2931 repeat_min = 0;
2932 repeat_max = -1;
2933 goto REPEAT;
2934
2935 case '+':
2936 repeat_min = 1;
2937 repeat_max = -1;
2938 goto REPEAT;
2939
2940 case '?':
2941 repeat_min = 0;
2942 repeat_max = 1;
2943
2944 REPEAT:
2945 if (previous == NULL)
2946 {
2947 *errorcodeptr = ERR9;
2948 goto FAILED;
2949 }
2950
2951 if (repeat_min == 0)
2952 {
2953 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2954 reqbyte = zeroreqbyte; /* Ditto */
2955 }
2956
2957 /* Remember whether this is a variable length repeat */
2958
2959 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2960
2961 op_type = 0; /* Default single-char op codes */
2962 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2963
2964 /* Save start of previous item, in case we have to move it up to make space
2965 for an inserted OP_ONCE for the additional '+' extension. */
2966
2967 tempcode = previous;
2968
2969 /* If the next character is '+', we have a possessive quantifier. This
2970 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2971 If the next character is '?' this is a minimizing repeat, by default,
2972 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2973 repeat type to the non-default. */
2974
2975 if (ptr[1] == '+')
2976 {
2977 repeat_type = 0; /* Force greedy */
2978 possessive_quantifier = TRUE;
2979 ptr++;
2980 }
2981 else if (ptr[1] == '?')
2982 {
2983 repeat_type = greedy_non_default;
2984 ptr++;
2985 }
2986 else repeat_type = greedy_default;
2987
2988 /* If previous was a character match, abolish the item and generate a
2989 repeat item instead. If a char item has a minimum of more than one, ensure
2990 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2991 the first thing in a branch because the x will have gone into firstbyte
2992 instead. */
2993
2994 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2995 {
2996 /* Deal with UTF-8 characters that take up more than one byte. It's
2997 easier to write this out separately than try to macrify it. Use c to
2998 hold the length of the character in bytes, plus 0x80 to flag that it's a
2999 length rather than a small character. */
3000
3001 #ifdef SUPPORT_UTF8
3002 if (utf8 && (code[-1] & 0x80) != 0)
3003 {
3004 uschar *lastchar = code - 1;
3005 while((*lastchar & 0xc0) == 0x80) lastchar--;
3006 c = code - lastchar; /* Length of UTF-8 character */
3007 memcpy(utf8_char, lastchar, c); /* Save the char */
3008 c |= 0x80; /* Flag c as a length */
3009 }
3010 else
3011 #endif
3012
3013 /* Handle the case of a single byte - either with no UTF8 support, or
3014 with UTF-8 disabled, or for a UTF-8 character < 128. */
3015
3016 {
3017 c = code[-1];
3018 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3019 }
3020
3021 /* If the repetition is unlimited, it pays to see if the next thing on
3022 the line is something that cannot possibly match this character. If so,
3023 automatically possessifying this item gains some performance in the case
3024 where the match fails. */
3025
3026 if (!possessive_quantifier &&
3027 repeat_max < 0 &&
3028 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3029 options, cd))
3030 {
3031 repeat_type = 0; /* Force greedy */
3032 possessive_quantifier = TRUE;
3033 }
3034
3035 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3036 }
3037
3038 /* If previous was a single negated character ([^a] or similar), we use
3039 one of the special opcodes, replacing it. The code is shared with single-
3040 character repeats by setting op_type to add a suitable offset into
3041 repeat_type. We can also test for auto-possessification. OP_NOT is
3042 currently used only for single-byte chars. */
3043
3044 else if (*previous == OP_NOT)
3045 {
3046 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3047 c = previous[1];
3048 if (!possessive_quantifier &&
3049 repeat_max < 0 &&
3050 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3051 {
3052 repeat_type = 0; /* Force greedy */
3053 possessive_quantifier = TRUE;
3054 }
3055 goto OUTPUT_SINGLE_REPEAT;
3056 }
3057
3058 /* If previous was a character type match (\d or similar), abolish it and
3059 create a suitable repeat item. The code is shared with single-character
3060 repeats by setting op_type to add a suitable offset into repeat_type. Note
3061 that the Unicode property types will be present only when SUPPORT_UCP is
3062 defined, but we don't wrap the little bits of code here because it just
3063 makes it horribly messy. */
3064
3065 else if (*previous < OP_EODN)
3066 {
3067 uschar *oldcode;
3068 int prop_type, prop_value;
3069 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3070 c = *previous;
3071
3072 if (!possessive_quantifier &&
3073 repeat_max < 0 &&
3074 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3075 {
3076 repeat_type = 0; /* Force greedy */
3077 possessive_quantifier = TRUE;
3078 }
3079
3080 OUTPUT_SINGLE_REPEAT:
3081 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3082 {
3083 prop_type = previous[1];
3084 prop_value = previous[2];
3085 }
3086 else prop_type = prop_value = -1;
3087
3088 oldcode = code;
3089 code = previous; /* Usually overwrite previous item */
3090
3091 /* If the maximum is zero then the minimum must also be zero; Perl allows
3092 this case, so we do too - by simply omitting the item altogether. */
3093
3094 if (repeat_max == 0) goto END_REPEAT;
3095
3096 /* All real repeats make it impossible to handle partial matching (maybe
3097 one day we will be able to remove this restriction). */
3098
3099 if (repeat_max != 1) cd->nopartial = TRUE;
3100
3101 /* Combine the op_type with the repeat_type */
3102
3103 repeat_type += op_type;
3104
3105 /* A minimum of zero is handled either as the special case * or ?, or as
3106 an UPTO, with the maximum given. */
3107
3108 if (repeat_min == 0)
3109 {
3110 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3111 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3112 else
3113 {
3114 *code++ = OP_UPTO + repeat_type;
3115 PUT2INC(code, 0, repeat_max);
3116 }
3117 }
3118
3119 /* A repeat minimum of 1 is optimized into some special cases. If the
3120 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3121 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3122 one less than the maximum. */
3123
3124 else if (repeat_min == 1)
3125 {
3126 if (repeat_max == -1)
3127 *code++ = OP_PLUS + repeat_type;
3128 else
3129 {
3130 code = oldcode; /* leave previous item in place */
3131 if (repeat_max == 1) goto END_REPEAT;
3132 *code++ = OP_UPTO + repeat_type;
3133 PUT2INC(code, 0, repeat_max - 1);
3134 }
3135 }
3136
3137 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3138 handled as an EXACT followed by an UPTO. */
3139
3140 else
3141 {
3142 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3143 PUT2INC(code, 0, repeat_min);
3144
3145 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3146 we have to insert the character for the previous code. For a repeated
3147 Unicode property match, there are two extra bytes that define the
3148 required property. In UTF-8 mode, long characters have their length in
3149 c, with the 0x80 bit as a flag. */
3150
3151 if (repeat_max < 0)
3152 {
3153 #ifdef SUPPORT_UTF8
3154 if (utf8 && c >= 128)
3155 {
3156 memcpy(code, utf8_char, c & 7);
3157 code += c & 7;
3158 }
3159 else
3160 #endif
3161 {
3162 *code++ = c;
3163 if (prop_type >= 0)
3164 {
3165 *code++ = prop_type;
3166 *code++ = prop_value;
3167 }
3168 }
3169 *code++ = OP_STAR + repeat_type;
3170 }
3171
3172 /* Else insert an UPTO if the max is greater than the min, again
3173 preceded by the character, for the previously inserted code. If the
3174 UPTO is just for 1 instance, we can use QUERY instead. */
3175
3176 else if (repeat_max != repeat_min)
3177 {
3178 #ifdef SUPPORT_UTF8
3179 if (utf8 && c >= 128)
3180 {
3181 memcpy(code, utf8_char, c & 7);
3182 code += c & 7;
3183 }
3184 else
3185 #endif
3186 *code++ = c;
3187 if (prop_type >= 0)
3188 {
3189 *code++ = prop_type;
3190 *code++ = prop_value;
3191 }
3192 repeat_max -= repeat_min;
3193
3194 if (repeat_max == 1)
3195 {
3196 *code++ = OP_QUERY + repeat_type;
3197 }
3198 else
3199 {
3200 *code++ = OP_UPTO + repeat_type;
3201 PUT2INC(code, 0, repeat_max);
3202 }
3203 }
3204 }
3205
3206 /* The character or character type itself comes last in all cases. */
3207
3208 #ifdef SUPPORT_UTF8
3209 if (utf8 && c >= 128)
3210 {
3211 memcpy(code, utf8_char, c & 7);
3212 code += c & 7;
3213 }
3214 else
3215 #endif
3216 *code++ = c;
3217
3218 /* For a repeated Unicode property match, there are two extra bytes that
3219 define the required property. */
3220
3221 #ifdef SUPPORT_UCP
3222 if (prop_type >= 0)
3223 {
3224 *code++ = prop_type;
3225 *code++ = prop_value;
3226 }
3227 #endif
3228 }
3229
3230 /* If previous was a character class or a back reference, we put the repeat
3231 stuff after it, but just skip the item if the repeat was {0,0}. */
3232
3233 else if (*previous == OP_CLASS ||
3234 *previous == OP_NCLASS ||
3235 #ifdef SUPPORT_UTF8
3236 *previous == OP_XCLASS ||
3237 #endif
3238 *previous == OP_REF)
3239 {
3240 if (repeat_max == 0)
3241 {
3242 code = previous;
3243 goto END_REPEAT;
3244 }
3245
3246 /* All real repeats make it impossible to handle partial matching (maybe
3247 one day we will be able to remove this restriction). */
3248
3249 if (repeat_max != 1) cd->nopartial = TRUE;
3250
3251 if (repeat_min == 0 && repeat_max == -1)
3252 *code++ = OP_CRSTAR + repeat_type;
3253 else if (repeat_min == 1 && repeat_max == -1)
3254 *code++ = OP_CRPLUS + repeat_type;
3255 else if (repeat_min == 0 && repeat_max == 1)
3256 *code++ = OP_CRQUERY + repeat_type;
3257 else
3258 {
3259 *code++ = OP_CRRANGE + repeat_type;
3260 PUT2INC(code, 0, repeat_min);
3261 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3262 PUT2INC(code, 0, repeat_max);
3263 }
3264 }
3265
3266 /* If previous was a bracket group, we may have to replicate it in certain
3267 cases. */
3268
3269 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3270 *previous == OP_ONCE || *previous == OP_COND)
3271 {
3272 register int i;
3273 int ketoffset = 0;
3274 int len = code - previous;
3275 uschar *bralink = NULL;
3276
3277 /* Repeating a DEFINE group is pointless */
3278
3279 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3280 {
3281 *errorcodeptr = ERR55;
3282 goto FAILED;
3283 }
3284
3285 /* This is a paranoid check to stop integer overflow later on */
3286
3287 if (len > MAX_DUPLENGTH)
3288 {
3289 *errorcodeptr = ERR50;
3290 goto FAILED;
3291 }
3292
3293 /* If the maximum repeat count is unlimited, find the end of the bracket
3294 by scanning through from the start, and compute the offset back to it
3295 from the current code pointer. There may be an OP_OPT setting following
3296 the final KET, so we can't find the end just by going back from the code
3297 pointer. */
3298
3299 if (repeat_max == -1)
3300 {
3301 register uschar *ket = previous;
3302 do ket += GET(ket, 1); while (*ket != OP_KET);
3303 ketoffset = code - ket;
3304 }
3305
3306 /* The case of a zero minimum is special because of the need to stick
3307 OP_BRAZERO in front of it, and because the group appears once in the
3308 data, whereas in other cases it appears the minimum number of times. For
3309 this reason, it is simplest to treat this case separately, as otherwise
3310 the code gets far too messy. There are several special subcases when the
3311 minimum is zero. */
3312
3313 if (repeat_min == 0)
3314 {
3315 /* If the maximum is also zero, we just omit the group from the output
3316 altogether. */
3317
3318 if (repeat_max == 0)
3319 {
3320 code = previous;
3321 goto END_REPEAT;
3322 }
3323
3324 /* If the maximum is 1 or unlimited, we just have to stick in the
3325 BRAZERO and do no more at this point. However, we do need to adjust
3326 any OP_RECURSE calls inside the group that refer to the group itself or
3327 any internal or forward referenced group, because the offset is from
3328 the start of the whole regex. Temporarily terminate the pattern while
3329 doing this. */
3330
3331 if (repeat_max <= 1)
3332 {
3333 *code = OP_END;
3334 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3335 memmove(previous+1, previous, len);
3336 code++;
3337 *previous++ = OP_BRAZERO + repeat_type;
3338 }
3339
3340 /* If the maximum is greater than 1 and limited, we have to replicate
3341 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3342 The first one has to be handled carefully because it's the original
3343 copy, which has to be moved up. The remainder can be handled by code
3344 that is common with the non-zero minimum case below. We have to
3345 adjust the value of repeat_max, since one less copy is required. Once
3346 again, we may have to adjust any OP_RECURSE calls inside the group. */
3347
3348 else
3349 {
3350 int offset;
3351 *code = OP_END;
3352 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3353 memmove(previous + 2 + LINK_SIZE, previous, len);
3354 code += 2 + LINK_SIZE;
3355 *previous++ = OP_BRAZERO + repeat_type;
3356 *previous++ = OP_BRA;
3357
3358 /* We chain together the bracket offset fields that have to be
3359 filled in later when the ends of the brackets are reached. */
3360
3361 offset = (bralink == NULL)? 0 : previous - bralink;
3362 bralink = previous;
3363 PUTINC(previous, 0, offset);
3364 }
3365
3366 repeat_max--;
3367 }
3368
3369 /* If the minimum is greater than zero, replicate the group as many
3370 times as necessary, and adjust the maximum to the number of subsequent
3371 copies that we need. If we set a first char from the group, and didn't
3372 set a required char, copy the latter from the former. If there are any
3373 forward reference subroutine calls in the group, there will be entries on
3374 the workspace list; replicate these with an appropriate increment. */
3375
3376 else
3377 {
3378 if (repeat_min > 1)
3379 {
3380 /* In the pre-compile phase, we don't actually do the replication. We
3381 just adjust the length as if we had. */
3382
3383 if (lengthptr != NULL)
3384 *lengthptr += (repeat_min - 1)*length_prevgroup;
3385
3386 /* This is compiling for real */
3387
3388 else
3389 {
3390 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3391 for (i = 1; i < repeat_min; i++)
3392 {
3393 uschar *hc;
3394 uschar *this_hwm = cd->hwm;
3395 memcpy(code, previous, len);
3396 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3397 {
3398 PUT(cd->hwm, 0, GET(hc, 0) + len);
3399 cd->hwm += LINK_SIZE;
3400 }
3401 save_hwm = this_hwm;
3402 code += len;
3403 }
3404 }
3405 }
3406
3407 if (repeat_max > 0) repeat_max -= repeat_min;
3408 }
3409
3410 /* This code is common to both the zero and non-zero minimum cases. If
3411 the maximum is limited, it replicates the group in a nested fashion,
3412 remembering the bracket starts on a stack. In the case of a zero minimum,
3413 the first one was set up above. In all cases the repeat_max now specifies
3414 the number of additional copies needed. Again, we must remember to
3415 replicate entries on the forward reference list. */
3416
3417 if (repeat_max >= 0)
3418 {
3419 /* In the pre-compile phase, we don't actually do the replication. We
3420 just adjust the length as if we had. For each repetition we must add 1
3421 to the length for BRAZERO and for all but the last repetition we must
3422 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3423
3424 if (lengthptr != NULL && repeat_max > 0)
3425 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3426 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3427
3428 /* This is compiling for real */
3429
3430 else for (i = repeat_max - 1; i >= 0; i--)
3431 {
3432 uschar *hc;
3433 uschar *this_hwm = cd->hwm;
3434
3435 *code++ = OP_BRAZERO + repeat_type;
3436
3437 /* All but the final copy start a new nesting, maintaining the
3438 chain of brackets outstanding. */
3439
3440 if (i != 0)
3441 {
3442 int offset;
3443 *code++ = OP_BRA;
3444 offset = (bralink == NULL)? 0 : code - bralink;
3445 bralink = code;
3446 PUTINC(code, 0, offset);
3447 }
3448
3449 memcpy(code, previous, len);
3450 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3451 {
3452 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3453 cd->hwm += LINK_SIZE;
3454 }
3455 save_hwm = this_hwm;
3456 code += len;
3457 }
3458
3459 /* Now chain through the pending brackets, and fill in their length
3460 fields (which are holding the chain links pro tem). */
3461
3462 while (bralink != NULL)
3463 {
3464 int oldlinkoffset;
3465 int offset = code - bralink + 1;
3466 uschar *bra = code - offset;
3467 oldlinkoffset = GET(bra, 1);
3468 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3469 *code++ = OP_KET;
3470 PUTINC(code, 0, offset);
3471 PUT(bra, 1, offset);
3472 }
3473 }
3474
3475 /* If the maximum is unlimited, set a repeater in the final copy. We
3476 can't just offset backwards from the current code point, because we
3477 don't know if there's been an options resetting after the ket. The
3478 correct offset was computed above.
3479
3480 Then, when we are doing the actual compile phase, check to see whether
3481 this group is a non-atomic one that could match an empty string. If so,
3482 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3483 that runtime checking can be done. [This check is also applied to
3484 atomic groups at runtime, but in a different way.] */
3485
3486 else
3487 {
3488 uschar *ketcode = code - ketoffset;
3489 uschar *bracode = ketcode - GET(ketcode, 1);
3490 *ketcode = OP_KETRMAX + repeat_type;
3491 if (lengthptr == NULL && *bracode != OP_ONCE)
3492 {
3493 uschar *scode = bracode;
3494 do
3495 {
3496 if (could_be_empty_branch(scode, ketcode, utf8))
3497 {
3498 *bracode += OP_SBRA - OP_BRA;
3499 break;
3500 }
3501 scode += GET(scode, 1);
3502 }
3503 while (*scode == OP_ALT);
3504 }
3505 }
3506 }
3507
3508 /* Else there's some kind of shambles */
3509
3510 else
3511 {
3512 *errorcodeptr = ERR11;
3513 goto FAILED;
3514 }
3515
3516 /* If the character following a repeat is '+', or if certain optimization
3517 tests above succeeded, possessive_quantifier is TRUE. For some of the
3518 simpler opcodes, there is a special alternative opcode for this. For
3519 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3520 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3521 but the special opcodes can optimize it a bit. The repeated item starts at
3522 tempcode, not at previous, which might be the first part of a string whose
3523 (former) last char we repeated.
3524
3525 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3526 an 'upto' may follow. We skip over an 'exact' item, and then test the
3527 length of what remains before proceeding. */
3528
3529 if (possessive_quantifier)
3530 {
3531 int len;
3532 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3533 *tempcode == OP_NOTEXACT)
3534 tempcode += _pcre_OP_lengths[*tempcode];
3535 len = code - tempcode;
3536 if (len > 0) switch (*tempcode)
3537 {
3538 case OP_STAR: *tempcode = OP_POSSTAR; break;
3539 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3540 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3541 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3542
3543 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3544 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3545 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3546 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3547
3548 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3549 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3550 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3551 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3552
3553 default:
3554 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3555 code += 1 + LINK_SIZE;
3556 len += 1 + LINK_SIZE;
3557 tempcode[0] = OP_ONCE;
3558 *code++ = OP_KET;
3559 PUTINC(code, 0, len);
3560 PUT(tempcode, 1, len);
3561 break;
3562 }
3563 }
3564
3565 /* In all cases we no longer have a previous item. We also set the
3566 "follows varying string" flag for subsequently encountered reqbytes if
3567 it isn't already set and we have just passed a varying length item. */
3568
3569 END_REPEAT:
3570 previous = NULL;
3571 cd->req_varyopt |= reqvary;
3572 break;
3573
3574
3575 /* ===================================================================*/
3576 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3577 lookbehind or option setting or condition or all the other extended
3578 parenthesis forms. First deal with the specials; all are introduced by ?,
3579 and the appearance of any of them means that this is not a capturing
3580 group. */
3581
3582 case '(':
3583 newoptions = options;
3584 skipbytes = 0;
3585 bravalue = OP_CBRA;
3586 save_hwm = cd->hwm;
3587
3588 if (*(++ptr) == '?')
3589 {
3590 int i, set, unset, namelen;
3591 int *optset;
3592 const uschar *name;
3593 uschar *slot;
3594
3595 switch (*(++ptr))
3596 {
3597 case '#': /* Comment; skip to ket */
3598 ptr++;
3599 while (*ptr != 0 && *ptr != ')') ptr++;
3600 if (*ptr == 0)
3601 {
3602 *errorcodeptr = ERR18;
3603 goto FAILED;
3604 }
3605 continue;
3606
3607
3608 /* ------------------------------------------------------------ */
3609 case ':': /* Non-capturing bracket */
3610 bravalue = OP_BRA;
3611 ptr++;
3612 break;
3613
3614
3615 /* ------------------------------------------------------------ */
3616 case '(':
3617 bravalue = OP_COND; /* Conditional group */
3618
3619 /* A condition can be an assertion, a number (referring to a numbered
3620 group), a name (referring to a named group), or 'R', referring to
3621 recursion. R<digits> and R&name are also permitted for recursion tests.
3622
3623 There are several syntaxes for testing a named group: (?(name)) is used
3624 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3625
3626 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3627 be the recursive thing or the name 'R' (and similarly for 'R' followed
3628 by digits), and (b) a number could be a name that consists of digits.
3629 In both cases, we look for a name first; if not found, we try the other
3630 cases. */
3631
3632 /* For conditions that are assertions, check the syntax, and then exit
3633 the switch. This will take control down to where bracketed groups,
3634 including assertions, are processed. */
3635
3636 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3637 break;
3638
3639 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3640 below), and all need to skip 3 bytes at the start of the group. */
3641
3642 code[1+LINK_SIZE] = OP_CREF;
3643 skipbytes = 3;
3644 refsign = -1;
3645
3646 /* Check for a test for recursion in a named group. */
3647
3648 if (ptr[1] == 'R' && ptr[2] == '&')
3649 {
3650 terminator = -1;
3651 ptr += 2;
3652 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3653 }
3654
3655 /* Check for a test for a named group's having been set, using the Perl
3656 syntax (?(<name>) or (?('name') */
3657
3658 else if (ptr[1] == '<')
3659 {
3660 terminator = '>';
3661 ptr++;
3662 }
3663 else if (ptr[1] == '\'')
3664 {
3665 terminator = '\'';
3666 ptr++;
3667 }
3668 else
3669 {
3670 terminator = 0;
3671 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3672 }
3673
3674 /* We now expect to read a name; anything else is an error */
3675
3676 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3677 {
3678 ptr += 1; /* To get the right offset */
3679 *errorcodeptr = ERR28;
3680 goto FAILED;
3681 }
3682
3683 /* Read the name, but also get it as a number if it's all digits */
3684
3685 recno = 0;
3686 name = ++ptr;
3687 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3688 {
3689 if (recno >= 0)
3690 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3691 recno * 10 + *ptr - '0' : -1;
3692 ptr++;
3693 }
3694 namelen = ptr - name;
3695
3696 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3697 {
3698 ptr--; /* Error offset */
3699 *errorcodeptr = ERR26;
3700 goto FAILED;
3701 }
3702
3703 /* Do no further checking in the pre-compile phase. */
3704
3705 if (lengthptr != NULL) break;
3706
3707 /* In the real compile we do the work of looking for the actual
3708 reference. If the string started with "+" or "-" we require the rest to
3709 be digits, in which case recno will be set. */
3710
3711 if (refsign > 0)
3712 {
3713 if (recno <= 0)
3714 {
3715 *errorcodeptr = ERR58;
3716 goto FAILED;
3717 }
3718 if (refsign == '-')
3719 {
3720 recno = cd->bracount - recno + 1;
3721 if (recno <= 0)
3722 {
3723 *errorcodeptr = ERR15;
3724 goto FAILED;
3725 }
3726 }
3727 else recno += cd->bracount;
3728 PUT2(code, 2+LINK_SIZE, recno);
3729 break;
3730 }
3731
3732 /* Otherwise (did not start with "+" or "-"), start by looking for the
3733 name. */
3734
3735 slot = cd->name_table;
3736 for (i = 0; i < cd->names_found; i++)
3737 {
3738 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3739 slot += cd->name_entry_size;
3740 }
3741
3742 /* Found a previous named subpattern */
3743
3744 if (i < cd->names_found)
3745 {
3746 recno = GET2(slot, 0);
3747 PUT2(code, 2+LINK_SIZE, recno);
3748 }
3749
3750 /* Search the pattern for a forward reference */
3751
3752 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3753 (options & PCRE_EXTENDED) != 0)) > 0)
3754 {
3755 PUT2(code, 2+LINK_SIZE, i);
3756 }
3757
3758 /* If terminator == 0 it means that the name followed directly after
3759 the opening parenthesis [e.g. (?(abc)...] and in this case there are
3760 some further alternatives to try. For the cases where terminator != 0
3761 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3762 now checked all the possibilities, so give an error. */
3763
3764 else if (terminator != 0)
3765 {
3766 *errorcodeptr = ERR15;
3767 goto FAILED;
3768 }
3769
3770 /* Check for (?(R) for recursion. Allow digits after R to specify a
3771 specific group number. */
3772
3773 else if (*name == 'R')
3774 {
3775 recno = 0;
3776 for (i = 1; i < namelen; i++)
3777 {
3778 if ((digitab[name[i]] & ctype_digit) == 0)
3779 {
3780 *errorcodeptr = ERR15;
3781 goto FAILED;
3782 }
3783 recno = recno * 10 + name[i] - '0';
3784 }
3785 if (recno == 0) recno = RREF_ANY;
3786 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3787 PUT2(code, 2+LINK_SIZE, recno);
3788 }
3789
3790 /* Similarly, check for the (?(DEFINE) "condition", which is always
3791 false. */
3792
3793 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3794 {
3795 code[1+LINK_SIZE] = OP_DEF;
3796 skipbytes = 1;
3797 }
3798
3799 /* Check for the "name" actually being a subpattern number. */
3800
3801 else if (recno > 0)
3802 {
3803 PUT2(code, 2+LINK_SIZE, recno);
3804 }
3805
3806 /* Either an unidentified subpattern, or a reference to (?(0) */
3807
3808 else
3809 {
3810 *errorcodeptr = (recno == 0)? ERR35: ERR15;
3811 goto FAILED;
3812 }
3813 break;
3814
3815
3816 /* ------------------------------------------------------------ */
3817 case '=': /* Positive lookahead */
3818 bravalue = OP_ASSERT;
3819 ptr++;
3820 break;
3821
3822
3823 /* ------------------------------------------------------------ */
3824 case '!': /* Negative lookahead */
3825 bravalue = OP_ASSERT_NOT;
3826 ptr++;
3827 break;
3828
3829
3830 /* ------------------------------------------------------------ */
3831 case '<': /* Lookbehind or named define */
3832 switch (ptr[1])
3833 {
3834 case '=': /* Positive lookbehind */
3835 bravalue = OP_ASSERTBACK;
3836 ptr += 2;
3837 break;
3838
3839 case '!': /* Negative lookbehind */
3840 bravalue = OP_ASSERTBACK_NOT;
3841 ptr += 2;
3842 break;
3843
3844 default: /* Could be name define, else bad */
3845 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3846 ptr++; /* Correct offset for error */
3847 *errorcodeptr = ERR24;
3848 goto FAILED;
3849 }
3850 break;
3851
3852
3853 /* ------------------------------------------------------------ */
3854 case '>': /* One-time brackets */
3855 bravalue = OP_ONCE;
3856 ptr++;
3857 break;
3858
3859
3860 /* ------------------------------------------------------------ */
3861 case 'C': /* Callout - may be followed by digits; */
3862 previous_callout = code; /* Save for later completion */
3863 after_manual_callout = 1; /* Skip one item before completing */
3864 *code++ = OP_CALLOUT;
3865 {
3866 int n = 0;
3867 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3868 n = n * 10 + *ptr - '0';
3869 if (*ptr != ')')
3870 {
3871 *errorcodeptr = ERR39;
3872 goto FAILED;
3873 }
3874 if (n > 255)
3875 {
3876 *errorcodeptr = ERR38;
3877 goto FAILED;
3878 }
3879 *code++ = n;
3880 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3881 PUT(code, LINK_SIZE, 0); /* Default length */
3882 code += 2 * LINK_SIZE;
3883 }
3884 previous = NULL;
3885 continue;
3886
3887
3888 /* ------------------------------------------------------------ */
3889 case 'P': /* Python-style named subpattern handling */
3890 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3891 {
3892 is_recurse = *ptr == '>';
3893 terminator = ')';
3894 goto NAMED_REF_OR_RECURSE;
3895 }
3896 else if (*ptr != '<') /* Test for Python-style definition */
3897 {
3898 *errorcodeptr = ERR41;
3899 goto FAILED;
3900 }
3901 /* Fall through to handle (?P< as (?< is handled */
3902
3903
3904 /* ------------------------------------------------------------ */
3905 DEFINE_NAME: /* Come here from (?< handling */
3906 case '\'':
3907 {
3908 terminator = (*ptr == '<')? '>' : '\'';
3909 name = ++ptr;
3910
3911 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3912 namelen = ptr - name;
3913
3914 /* In the pre-compile phase, just do a syntax check. */
3915
3916 if (lengthptr != NULL)
3917 {
3918 if (*ptr != terminator)
3919 {
3920 *errorcodeptr = ERR42;
3921 goto FAILED;
3922 }
3923 if (cd->names_found >= MAX_NAME_COUNT)
3924 {
3925 *errorcodeptr = ERR49;
3926 goto FAILED;
3927 }
3928 if (namelen + 3 > cd->name_entry_size)
3929 {
3930 cd->name_entry_size = namelen + 3;
3931 if (namelen > MAX_NAME_SIZE)
3932 {
3933 *errorcodeptr = ERR48;
3934 goto FAILED;
3935 }
3936 }
3937 }
3938
3939 /* In the real compile, create the entry in the table */
3940
3941 else
3942 {
3943 slot = cd->name_table;
3944 for (i = 0; i < cd->names_found; i++)
3945 {
3946 int crc = memcmp(name, slot+2, namelen);
3947 if (crc == 0)
3948 {
3949 if (slot[2+namelen] == 0)
3950 {
3951 if ((options & PCRE_DUPNAMES) == 0)
3952 {
3953 *errorcodeptr = ERR43;
3954 goto FAILED;
3955 }
3956 }
3957 else crc = -1; /* Current name is substring */
3958 }
3959 if (crc < 0)
3960 {
3961 memmove(slot + cd->name_entry_size, slot,
3962 (cd->names_found - i) * cd->name_entry_size);
3963 break;
3964 }
3965 slot += cd->name_entry_size;
3966 }
3967
3968 PUT2(slot, 0, cd->bracount + 1);
3969 memcpy(slot + 2, name, namelen);
3970 slot[2+namelen] = 0;
3971 }
3972 }
3973
3974 /* In both cases, count the number of names we've encountered. */
3975
3976 ptr++; /* Move past > or ' */
3977 cd->names_found++;
3978 goto NUMBERED_GROUP;
3979
3980
3981 /* ------------------------------------------------------------ */
3982 case '&': /* Perl recursion/subroutine syntax */
3983 terminator = ')';
3984 is_recurse = TRUE;
3985 /* Fall through */
3986
3987 /* We come here from the Python syntax above that handles both
3988 references (?P=name) and recursion (?P>name), as well as falling
3989 through from the Perl recursion syntax (?&name). */
3990
3991 NAMED_REF_OR_RECURSE:
3992 name = ++ptr;
3993 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3994 namelen = ptr - name;
3995
3996 /* In the pre-compile phase, do a syntax check and set a dummy
3997 reference number. */
3998
3999 if (lengthptr != NULL)
4000 {
4001 if (*ptr != terminator)
4002 {
4003 *errorcodeptr = ERR42;
4004 goto FAILED;
4005 }
4006 if (namelen > MAX_NAME_SIZE)
4007 {
4008 *errorcodeptr = ERR48;
4009 goto FAILED;
4010 }
4011 recno = 0;
4012 }
4013
4014 /* In the real compile, seek the name in the table */
4015
4016 else
4017 {
4018 slot = cd->name_table;
4019 for (i = 0; i < cd->names_found; i++)
4020 {
4021 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4022 slot += cd->name_entry_size;
4023 }
4024
4025 if (i < cd->names_found) /* Back reference */
4026 {
4027 recno = GET2(slot, 0);
4028 }
4029 else if ((recno = /* Forward back reference */
4030 find_parens(ptr, cd->bracount, name, namelen,
4031 (options & PCRE_EXTENDED) != 0)) <= 0)
4032 {
4033 *errorcodeptr = ERR15;
4034 goto FAILED;
4035 }
4036 }
4037
4038 /* In both phases, we can now go to the code that handles numerical
4039 recursion or backreferences. */
4040
4041 if (is_recurse) goto HANDLE_RECURSION;
4042 else goto HANDLE_REFERENCE;
4043
4044
4045 /* ------------------------------------------------------------ */
4046 case 'R': /* Recursion */
4047 ptr++; /* Same as (?0) */
4048 /* Fall through */
4049
4050
4051 /* ------------------------------------------------------------ */
4052 case '-': case '+':
4053 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4054 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4055 {
4056 const uschar *called;
4057
4058 if ((refsign = *ptr) == '+') ptr++;
4059 else if (refsign == '-')
4060 {
4061 if ((digitab[ptr[1]] & ctype_digit) == 0)
4062 goto OTHER_CHAR_AFTER_QUERY;
4063 ptr++;
4064 }
4065
4066 recno = 0;
4067 while((digitab[*ptr] & ctype_digit) != 0)
4068 recno = recno * 10 + *ptr++ - '0';
4069
4070 if (*ptr != ')')
4071 {
4072 *errorcodeptr = ERR29;
4073 goto FAILED;
4074 }
4075
4076 if (refsign == '-')
4077 {
4078 if (recno == 0)
4079 {
4080 *errorcodeptr = ERR58;
4081 goto FAILED;
4082 }
4083 recno = cd->bracount - recno + 1;
4084 if (recno <= 0)
4085 {
4086 *errorcodeptr = ERR15;
4087 goto FAILED;
4088 }
4089 }
4090 else if (refsign == '+')
4091 {
4092 if (recno == 0)
4093 {
4094 *errorcodeptr = ERR58;
4095 goto FAILED;
4096 }
4097 recno += cd->bracount;
4098 }
4099
4100 /* Come here from code above that handles a named recursion */
4101
4102 HANDLE_RECURSION:
4103
4104 previous = code;
4105 called = cd->start_code;
4106
4107 /* When we are actually compiling, find the bracket that is being
4108 referenced. Temporarily end the regex in case it doesn't exist before
4109 this point. If we end up with a forward reference, first check that
4110 the bracket does occur later so we can give the error (and position)
4111 now. Then remember this forward reference in the workspace so it can
4112 be filled in at the end. */
4113
4114 if (lengthptr == NULL)
4115 {
4116 *code = OP_END;
4117 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4118
4119 /* Forward reference */
4120
4121 if (called == NULL)
4122 {
4123 if (find_parens(ptr, cd->bracount, NULL, recno,
4124 (options & PCRE_EXTENDED) != 0) < 0)
4125 {
4126 *errorcodeptr = ERR15;
4127 goto FAILED;
4128 }
4129 called = cd->start_code + recno;
4130 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4131 }
4132
4133 /* If not a forward reference, and the subpattern is still open,
4134 this is a recursive call. We check to see if this is a left
4135 recursion that could loop for ever, and diagnose that case. */
4136
4137 else if (GET(called, 1) == 0 &&
4138 could_be_empty(called, code, bcptr, utf8))
4139 {
4140 *errorcodeptr = ERR40;
4141 goto FAILED;
4142 }
4143 }
4144
4145 /* Insert the recursion/subroutine item, automatically wrapped inside
4146 "once" brackets. Set up a "previous group" length so that a
4147 subsequent quantifier will work. */
4148
4149 *code = OP_ONCE;
4150 PUT(code, 1, 2 + 2*LINK_SIZE);
4151 code += 1 + LINK_SIZE;
4152
4153 *code = OP_RECURSE;
4154 PUT(code, 1, called - cd->start_code);
4155 code += 1 + LINK_SIZE;
4156
4157 *code = OP_KET;
4158 PUT(code, 1, 2 + 2*LINK_SIZE);
4159 code += 1 + LINK_SIZE;
4160
4161 length_prevgroup = 3 + 3*LINK_SIZE;
4162 }
4163
4164 /* Can't determine a first byte now */
4165
4166 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4167 continue;
4168
4169
4170 /* ------------------------------------------------------------ */
4171 default: /* Other characters: check option setting */
4172 OTHER_CHAR_AFTER_QUERY:
4173 set = unset = 0;
4174 optset = &set;
4175
4176 while (*ptr != ')' && *ptr != ':')
4177 {
4178 switch (*ptr++)
4179 {
4180 case '-': optset = &unset; break;
4181
4182 case 'J': /* Record that it changed in the external options */
4183 *optset |= PCRE_DUPNAMES;
4184 cd->external_options |= PCRE_JCHANGED;
4185 break;
4186
4187 case 'i': *optset |= PCRE_CASELESS; break;
4188 case 'm': *optset |= PCRE_MULTILINE; break;
4189 case 's': *optset |= PCRE_DOTALL; break;
4190 case 'x': *optset |= PCRE_EXTENDED; break;
4191 case 'U': *optset |= PCRE_UNGREEDY; break;
4192 case 'X': *optset |= PCRE_EXTRA; break;
4193
4194 default: *errorcodeptr = ERR12;
4195 ptr--; /* Correct the offset */
4196 goto FAILED;
4197 }
4198 }
4199
4200 /* Set up the changed option bits, but don't change anything yet. */
4201
4202 newoptions = (options | set) & (~unset);
4203
4204 /* If the options ended with ')' this is not the start of a nested
4205 group with option changes, so the options change at this level. If this
4206 item is right at the start of the pattern, the options can be
4207 abstracted and made external in the pre-compile phase, and ignored in
4208 the compile phase. This can be helpful when matching -- for instance in
4209 caseless checking of required bytes.
4210
4211 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4212 definitely *not* at the start of the pattern because something has been
4213 compiled. In the pre-compile phase, however, the code pointer can have
4214 that value after the start, because it gets reset as code is discarded
4215 during the pre-compile. However, this can happen only at top level - if
4216 we are within parentheses, the starting BRA will still be present. At
4217 any parenthesis level, the length value can be used to test if anything
4218 has been compiled at that level. Thus, a test for both these conditions
4219 is necessary to ensure we correctly detect the start of the pattern in
4220 both phases.
4221
4222 If we are not at the pattern start, compile code to change the ims
4223 options if this setting actually changes any of them. We also pass the
4224 new setting back so that it can be put at the start of any following
4225 branches, and when this group ends (if we are in a group), a resetting
4226 item can be compiled. */
4227
4228 if (*ptr == ')')
4229 {
4230 if (code == cd->start_code + 1 + LINK_SIZE &&
4231 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4232 {
4233 cd->external_options = newoptions;
4234 options = newoptions;
4235 }
4236 else
4237 {
4238 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4239 {
4240 *code++ = OP_OPT;
4241 *code++ = newoptions & PCRE_IMS;
4242 }
4243
4244 /* Change options at this level, and pass them back for use
4245 in subsequent branches. Reset the greedy defaults and the case
4246 value for firstbyte and reqbyte. */
4247
4248 *optionsptr = options = newoptions;
4249 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4250 greedy_non_default = greedy_default ^ 1;
4251 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4252 }
4253
4254 previous = NULL; /* This item can't be repeated */
4255 continue; /* It is complete */
4256 }
4257
4258 /* If the options ended with ':' we are heading into a nested group
4259 with possible change of options. Such groups are non-capturing and are
4260 not assertions of any kind. All we need to do is skip over the ':';
4261 the newoptions value is handled below. */
4262
4263 bravalue = OP_BRA;
4264 ptr++;
4265 } /* End of switch for character following (? */
4266 } /* End of (? handling */
4267
4268 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4269 all unadorned brackets become non-capturing and behave like (?:...)
4270 brackets. */
4271
4272 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4273 {
4274 bravalue = OP_BRA;
4275 }
4276
4277 /* Else we have a capturing group. */
4278
4279 else
4280 {
4281 NUMBERED_GROUP:
4282 cd->bracount += 1;
4283 PUT2(code, 1+LINK_SIZE, cd->bracount);
4284 skipbytes = 2;
4285 }
4286
4287 /* Process nested bracketed regex. Assertions may not be repeated, but
4288 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4289 non-register variable in order to be able to pass its address because some
4290 compilers complain otherwise. Pass in a new setting for the ims options if
4291 they have changed. */
4292
4293 previous = (bravalue >= OP_ONCE)? code : NULL;
4294 *code = bravalue;
4295 tempcode = code;
4296 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4297 length_prevgroup = 0; /* Initialize for pre-compile phase */
4298
4299 if (!compile_regex(
4300 newoptions, /* The complete new option state */
4301 options & PCRE_IMS, /* The previous ims option state */
4302 &tempcode, /* Where to put code (updated) */
4303 &ptr, /* Input pointer (updated) */
4304 errorcodeptr, /* Where to put an error message */
4305 (bravalue == OP_ASSERTBACK ||
4306 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4307 skipbytes, /* Skip over bracket number */
4308 &subfirstbyte, /* For possible first char */
4309 &subreqbyte, /* For possible last char */
4310 bcptr, /* Current branch chain */
4311 cd, /* Tables block */
4312 (lengthptr == NULL)? NULL : /* Actual compile phase */
4313 &length_prevgroup /* Pre-compile phase */
4314 ))
4315 goto FAILED;
4316
4317 /* At the end of compiling, code is still pointing to the start of the
4318 group, while tempcode has been updated to point past the end of the group
4319 and any option resetting that may follow it. The pattern pointer (ptr)
4320 is on the bracket. */
4321
4322 /* If this is a conditional bracket, check that there are no more than
4323 two branches in the group, or just one if it's a DEFINE group. We do this
4324 in the real compile phase, not in the pre-pass, where the whole group may
4325 not be available. */
4326
4327 if (bravalue == OP_COND && lengthptr == NULL)
4328 {
4329 uschar *tc = code;
4330 int condcount = 0;
4331
4332 do {
4333 condcount++;
4334 tc += GET(tc,1);
4335 }
4336 while (*tc != OP_KET);
4337
4338 /* A DEFINE group is never obeyed inline (the "condition" is always
4339 false). It must have only one branch. */
4340
4341 if (code[LINK_SIZE+1] == OP_DEF)
4342 {
4343 if (condcount > 1)
4344 {
4345 *errorcodeptr = ERR54;
4346 goto FAILED;
4347 }
4348 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4349 }
4350
4351 /* A "normal" conditional group. If there is just one branch, we must not
4352 make use of its firstbyte or reqbyte, because this is equivalent to an
4353 empty second branch. */
4354
4355 else
4356 {
4357 if (condcount > 2)
4358 {
4359 *errorcodeptr = ERR27;
4360 goto FAILED;
4361 }
4362 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4363 }
4364 }
4365
4366 /* Error if hit end of pattern */
4367
4368 if (*ptr != ')')
4369 {
4370 *errorcodeptr = ERR14;
4371 goto FAILED;
4372 }
4373
4374 /* In the pre-compile phase, update the length by the length of the nested
4375 group, less the brackets at either end. Then reduce the compiled code to
4376 just the brackets so that it doesn't use much memory if it is duplicated by
4377 a quantifier. */
4378
4379 if (lengthptr != NULL)
4380 {
4381 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4382 code++;
4383 PUTINC(code, 0, 1 + LINK_SIZE);
4384 *code++ = OP_KET;
4385 PUTINC(code, 0, 1 + LINK_SIZE);
4386 }
4387
4388 /* Otherwise update the main code pointer to the end of the group. */
4389
4390 else code = tempcode;
4391
4392 /* For a DEFINE group, required and first character settings are not
4393 relevant. */
4394
4395 if (bravalue == OP_DEF) break;
4396
4397 /* Handle updating of the required and first characters for other types of
4398 group. Update for normal brackets of all kinds, and conditions with two
4399 branches (see code above). If the bracket is followed by a quantifier with
4400 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4401 zerofirstbyte outside the main loop so that they can be accessed for the
4402 back off. */
4403
4404 zeroreqbyte = reqbyte;
4405 zerofirstbyte = firstbyte;
4406 groupsetfirstbyte = FALSE;
4407
4408 if (bravalue >= OP_ONCE)
4409 {
4410 /* If we have not yet set a firstbyte in this branch, take it from the
4411 subpattern, remembering that it was set here so that a repeat of more
4412 than one can replicate it as reqbyte if necessary. If the subpattern has
4413 no firstbyte, set "none" for the whole branch. In both cases, a zero
4414 repeat forces firstbyte to "none". */
4415
4416 if (firstbyte == REQ_UNSET)
4417 {
4418 if (subfirstbyte >= 0)
4419 {
4420 firstbyte = subfirstbyte;
4421 groupsetfirstbyte = TRUE;
4422 }
4423 else firstbyte = REQ_NONE;
4424 zerofirstbyte = REQ_NONE;
4425 }
4426
4427 /* If firstbyte was previously set, convert the subpattern's firstbyte
4428 into reqbyte if there wasn't one, using the vary flag that was in
4429 existence beforehand. */
4430
4431 else if (subfirstbyte >= 0 && subreqbyte < 0)
4432 subreqbyte = subfirstbyte | tempreqvary;
4433
4434 /* If the subpattern set a required byte (or set a first byte that isn't
4435 really the first byte - see above), set it. */
4436
4437 if (subreqbyte >= 0) reqbyte = subreqbyte;
4438 }
4439
4440 /* For a forward assertion, we take the reqbyte, if set. This can be
4441 helpful if the pattern that follows the assertion doesn't set a different
4442 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4443 for an assertion, however because it leads to incorrect effect for patterns
4444 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4445 of a firstbyte. This is overcome by a scan at the end if there's no
4446 firstbyte, looking for an asserted first char. */
4447
4448 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4449 break; /* End of processing '(' */
4450
4451
4452 /* ===================================================================*/
4453 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4454 are arranged to be the negation of the corresponding OP_values. For the
4455 back references, the values are ESC_REF plus the reference number. Only
4456 back references and those types that consume a character may be repeated.
4457 We can test for values between ESC_b and ESC_Z for the latter; this may
4458 have to change if any new ones are ever created. */
4459
4460 case '\\':
4461 tempptr = ptr;
4462 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4463 if (*errorcodeptr != 0) goto FAILED;
4464
4465 if (c < 0)
4466 {
4467 if (-c == ESC_Q) /* Handle start of quoted string */
4468 {
4469 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4470 else inescq = TRUE;
4471 continue;
4472 }
4473
4474 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4475
4476 /* For metasequences that actually match a character, we disable the
4477 setting of a first character if it hasn't already been set. */
4478
4479 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4480 firstbyte = REQ_NONE;
4481
4482 /* Set values to reset to if this is followed by a zero repeat. */
4483
4484 zerofirstbyte = firstbyte;
4485 zeroreqbyte = reqbyte;
4486
4487 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4488 We also support \k{name} (.NET syntax) */
4489
4490 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4491 {
4492 is_recurse = FALSE;
4493 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4494 goto NAMED_REF_OR_RECURSE;
4495 }
4496
4497 /* Back references are handled specially; must disable firstbyte if
4498 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4499 ':' later. */
4500
4501 if (-c >= ESC_REF)
4502 {
4503 recno = -c - ESC_REF;
4504
4505 HANDLE_REFERENCE: /* Come here from named backref handling */
4506 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4507 previous = code;
4508 *code++ = OP_REF;
4509 PUT2INC(code, 0, recno);
4510 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4511 if (recno > cd->top_backref) cd->top_backref = recno;
4512 }
4513
4514 /* So are Unicode property matches, if supported. */
4515
4516 #ifdef SUPPORT_UCP
4517 else if (-c == ESC_P || -c == ESC_p)
4518 {
4519 BOOL negated;
4520 int pdata;
4521 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4522 if (ptype < 0) goto FAILED;
4523 previous = code;
4524 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4525 *code++ = ptype;
4526 *code++ = pdata;
4527 }
4528 #else
4529
4530 /* If Unicode properties are not supported, \X, \P, and \p are not
4531 allowed. */
4532
4533 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4534 {
4535 *errorcodeptr = ERR45;
4536 goto FAILED;
4537 }
4538 #endif
4539
4540 /* For the rest (including \X when Unicode properties are supported), we
4541 can obtain the OP value by negating the escape value. */
4542
4543 else
4544 {
4545 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4546 *code++ = -c;
4547 }
4548 continue;
4549 }
4550
4551 /* We have a data character whose value is in c. In UTF-8 mode it may have
4552 a value > 127. We set its representation in the length/buffer, and then
4553 handle it as a data character. */
4554
4555 #ifdef SUPPORT_UTF8
4556 if (utf8 && c > 127)
4557 mclength = _pcre_ord2utf8(c, mcbuffer);
4558 else
4559 #endif
4560
4561 {
4562 mcbuffer[0] = c;
4563 mclength = 1;
4564 }
4565 goto ONE_CHAR;
4566
4567
4568 /* ===================================================================*/
4569 /* Handle a literal character. It is guaranteed not to be whitespace or #
4570 when the extended flag is set. If we are in UTF-8 mode, it may be a
4571 multi-byte literal character. */
4572
4573 default:
4574 NORMAL_CHAR:
4575 mclength = 1;
4576 mcbuffer[0] = c;
4577
4578 #ifdef SUPPORT_UTF8
4579 if (utf8 && c >= 0xc0)
4580 {
4581 while ((ptr[1] & 0xc0) == 0x80)
4582 mcbuffer[mclength++] = *(++ptr);
4583 }
4584 #endif
4585
4586 /* At this point we have the character's bytes in mcbuffer, and the length
4587 in mclength. When not in UTF-8 mode, the length is always 1. */
4588
4589 ONE_CHAR:
4590 previous = code;
4591 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4592 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4593
4594 /* Set the first and required bytes appropriately. If no previous first
4595 byte, set it from this character, but revert to none on a zero repeat.
4596 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4597 repeat. */
4598
4599 if (firstbyte == REQ_UNSET)
4600 {
4601 zerofirstbyte = REQ_NONE;
4602 zeroreqbyte = reqbyte;
4603
4604 /* If the character is more than one byte long, we can set firstbyte
4605 only if it is not to be matched caselessly. */
4606
4607 if (mclength == 1 || req_caseopt == 0)
4608 {
4609 firstbyte = mcbuffer[0] | req_caseopt;
4610 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4611 }
4612 else firstbyte = reqbyte = REQ_NONE;
4613 }
4614
4615 /* firstbyte was previously set; we can set reqbyte only if the length is
4616 1 or the matching is caseful. */
4617
4618 else
4619 {
4620 zerofirstbyte = firstbyte;
4621 zeroreqbyte = reqbyte;
4622 if (mclength == 1 || req_caseopt == 0)
4623 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4624 }
4625
4626 break; /* End of literal character handling */
4627 }
4628 } /* end of big loop */
4629
4630
4631 /* Control never reaches here by falling through, only by a goto for all the
4632 error states. Pass back the position in the pattern so that it can be displayed
4633 to the user for diagnosing the error. */
4634
4635 FAILED:
4636 *ptrptr = ptr;
4637 return FALSE;
4638 }
4639
4640
4641
4642
4643 /*************************************************
4644 * Compile sequence of alternatives *
4645 *************************************************/
4646
4647 /* On entry, ptr is pointing past the bracket character, but on return it
4648 points to the closing bracket, or vertical bar, or end of string. The code
4649 variable is pointing at the byte into which the BRA operator has been stored.
4650 If the ims options are changed at the start (for a (?ims: group) or during any
4651 branch, we need to insert an OP_OPT item at the start of every following branch
4652 to ensure they get set correctly at run time, and also pass the new options
4653 into every subsequent branch compile.
4654
4655 This function is used during the pre-compile phase when we are trying to find
4656 out the amount of memory needed, as well as during the real compile phase. The
4657 value of lengthptr distinguishes the two phases.
4658
4659 Arguments:
4660 options option bits, including any changes for this subpattern
4661 oldims previous settings of ims option bits
4662 codeptr -> the address of the current code pointer
4663 ptrptr -> the address of the current pattern pointer
4664 errorcodeptr -> pointer to error code variable
4665 lookbehind TRUE if this is a lookbehind assertion
4666 skipbytes skip this many bytes at start (for brackets and OP_COND)
4667 firstbyteptr place to put the first required character, or a negative number
4668 reqbyteptr place to put the last required character, or a negative number
4669 bcptr pointer to the chain of currently open branches
4670 cd points to the data block with tables pointers etc.
4671 lengthptr NULL during the real compile phase
4672 points to length accumulator during pre-compile phase
4673
4674 Returns: TRUE on success
4675 */
4676
4677 static BOOL
4678 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4679 int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4680 int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4681 {
4682 const uschar *ptr = *ptrptr;
4683 uschar *code = *codeptr;
4684 uschar *last_branch = code;
4685 uschar *start_bracket = code;
4686 uschar *reverse_count = NULL;
4687 int firstbyte, reqbyte;
4688 int branchfirstbyte, branchreqbyte;
4689 int length;
4690 branch_chain bc;
4691
4692 bc.outer = bcptr;
4693 bc.current = code;
4694
4695 firstbyte = reqbyte = REQ_UNSET;
4696
4697 /* Accumulate the length for use in the pre-compile phase. Start with the
4698 length of the BRA and KET and any extra bytes that are required at the
4699 beginning. We accumulate in a local variable to save frequent testing of
4700 lenthptr for NULL. We cannot do this by looking at the value of code at the
4701 start and end of each alternative, because compiled items are discarded during
4702 the pre-compile phase so that the work space is not exceeded. */
4703
4704 length = 2 + 2*LINK_SIZE + skipbytes;
4705
4706 /* WARNING: If the above line is changed for any reason, you must also change
4707 the code that abstracts option settings at the start of the pattern and makes
4708 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4709 pre-compile phase to find out whether anything has yet been compiled or not. */
4710
4711 /* Offset is set zero to mark that this bracket is still open */
4712
4713 PUT(code, 1, 0);
4714 code += 1 + LINK_SIZE + skipbytes;
4715
4716 /* Loop for each alternative branch */
4717
4718 for (;;)
4719 {
4720 /* Handle a change of ims options at the start of the branch */
4721
4722 if ((options & PCRE_IMS) != oldims)
4723 {
4724 *code++ = OP_OPT;
4725 *code++ = options & PCRE_IMS;
4726 length += 2;
4727 }
4728
4729 /* Set up dummy OP_REVERSE if lookbehind assertion */
4730
4731 if (lookbehind)
4732 {
4733 *code++ = OP_REVERSE;
4734 reverse_count = code;
4735 PUTINC(code, 0, 0);
4736 length += 1 + LINK_SIZE;
4737 }
4738
4739 /* Now compile the branch; in the pre-compile phase its length gets added
4740 into the length. */
4741
4742 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4743 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4744 {
4745 *ptrptr = ptr;
4746 return FALSE;
4747 }
4748
4749 /* In the real compile phase, there is some post-processing to be done. */
4750
4751 if (lengthptr == NULL)
4752 {
4753 /* If this is the first branch, the firstbyte and reqbyte values for the
4754 branch become the values for the regex. */
4755
4756 if (*last_branch != OP_ALT)
4757 {
4758 firstbyte = branchfirstbyte;
4759 reqbyte = branchreqbyte;
4760 }
4761
4762 /* If this is not the first branch, the first char and reqbyte have to
4763 match the values from all the previous branches, except that if the
4764 previous value for reqbyte didn't have REQ_VARY set, it can still match,
4765 and we set REQ_VARY for the regex. */
4766
4767 else
4768 {
4769 /* If we previously had a firstbyte, but it doesn't match the new branch,
4770 we have to abandon the firstbyte for the regex, but if there was
4771 previously no reqbyte, it takes on the value of the old firstbyte. */
4772
4773 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4774 {
4775 if (reqbyte < 0) reqbyte = firstbyte;
4776 firstbyte = REQ_NONE;
4777 }
4778
4779 /* If we (now or from before) have no firstbyte, a firstbyte from the
4780 branch becomes a reqbyte if there isn't a branch reqbyte. */
4781
4782 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4783 branchreqbyte = branchfirstbyte;
4784
4785 /* Now ensure that the reqbytes match */
4786
4787 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4788 reqbyte = REQ_NONE;
4789 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4790 }
4791
4792 /* If lookbehind, check that this branch matches a fixed-length string, and
4793 put the length into the OP_REVERSE item. Temporarily mark the end of the
4794 branch with OP_END. */
4795
4796 if (lookbehind)
4797 {
4798 int fixed_length;
4799 *code = OP_END;
4800 fixed_length = find_fixedlength(last_branch, options);
4801 DPRINTF(("fixed length = %d\n", fixed_length));
4802 if (fixed_length < 0)
4803 {
4804 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4805 *ptrptr = ptr;
4806 return FALSE;
4807 }
4808 PUT(reverse_count, 0, fixed_length);
4809 }
4810 }
4811
4812 /* Reached end of expression, either ')' or end of pattern. In the real
4813 compile phase, go back through the alternative branches and reverse the chain
4814 of offsets, with the field in the BRA item now becoming an offset to the
4815 first alternative. If there are no alternatives, it points to the end of the
4816 group. The length in the terminating ket is always the length of the whole
4817 bracketed item. If any of the ims options were changed inside the group,
4818 compile a resetting op-code following, except at the very end of the pattern.
4819 Return leaving the pointer at the terminating char. */
4820
4821 if (*ptr != '|')
4822 {
4823 if (lengthptr == NULL)
4824 {
4825 int branch_length = code - last_branch;
4826 do
4827 {
4828 int prev_length = GET(last_branch, 1);
4829 PUT(last_branch, 1, branch_length);
4830 branch_length = prev_length;
4831 last_branch -= branch_length;
4832 }
4833 while (branch_length > 0);
4834 }
4835
4836 /* Fill in the ket */
4837
4838 *code = OP_KET;
4839 PUT(code, 1, code - start_bracket);
4840 code += 1 + LINK_SIZE;
4841
4842 /* Resetting option if needed */
4843
4844 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4845 {
4846 *code++ = OP_OPT;
4847 *code++ = oldims;
4848 length += 2;
4849 }
4850
4851 /* Set values to pass back */
4852
4853 *codeptr = code;
4854 *ptrptr = ptr;
4855 *firstbyteptr = firstbyte;
4856 *reqbyteptr = reqbyte;
4857 if (lengthptr != NULL) *lengthptr += length;
4858 return TRUE;
4859 }
4860
4861 /* Another branch follows. In the pre-compile phase, we can move the code
4862 pointer back to where it was for the start of the first branch. (That is,
4863 pretend that each branch is the only one.)
4864
4865 In the real compile phase, insert an ALT node. Its length field points back
4866 to the previous branch while the bracket remains open. At the end the chain
4867 is reversed. It's done like this so that the start of the bracket has a
4868 zero offset until it is closed, making it possible to detect recursion. */
4869
4870 if (lengthptr != NULL)
4871 {
4872 code = *codeptr + 1 + LINK_SIZE + skipbytes;
4873 length += 1 + LINK_SIZE;
4874 }
4875 else
4876 {
4877 *code = OP_ALT;
4878 PUT(code, 1, code - last_branch);
4879 bc.current = last_branch = code;
4880 code += 1 + LINK_SIZE;
4881 }
4882
4883 ptr++;
4884 }
4885 /* Control never reaches here */
4886 }
4887
4888
4889
4890
4891 /*************************************************
4892 * Check for anchored expression *
4893 *************************************************/
4894
4895 /* Try to find out if this is an anchored regular expression. Consider each
4896 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4897 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4898 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4899 counts, since OP_CIRC can match in the middle.
4900
4901 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4902 This is the code for \G, which means "match at start of match position, taking
4903 into account the match offset".
4904
4905 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4906 because that will try the rest of the pattern at all possible matching points,
4907 so there is no point trying again.... er ....
4908
4909 .... except when the .* appears inside capturing parentheses, and there is a
4910 subsequent back reference to those parentheses. We haven't enough information
4911 to catch that case precisely.
4912
4913 At first, the best we could do was to detect when .* was in capturing brackets
4914 and the highest back reference was greater than or equal to that level.
4915 However, by keeping a bitmap of the first 31 back references, we can catch some
4916 of the more common cases more precisely.
4917
4918 Arguments:
4919 code points to start of expression (the bracket)
4920 options points to the options setting
4921 bracket_map a bitmap of which brackets we are inside while testing; this
4922 handles up to substring 31; after that we just have to take
4923 the less precise approach
4924 backref_map the back reference bitmap
4925
4926 Returns: TRUE or FALSE
4927 */
4928
4929 static BOOL
4930 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4931 unsigned int backref_map)
4932 {
4933 do {
4934 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4935 options, PCRE_MULTILINE, FALSE);
4936 register int op = *scode;
4937
4938 /* Non-capturing brackets */
4939
4940 if (op == OP_BRA)
4941 {
4942 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4943 }
4944
4945 /* Capturing brackets */
4946
4947 else if (op == OP_CBRA)
4948 {
4949 int n = GET2(scode, 1+LINK_SIZE);
4950 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4951 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4952 }
4953
4954 /* Other brackets */
4955
4956 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4957 {
4958 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4959 }
4960
4961 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4962 are or may be referenced. */
4963
4964 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4965 op == OP_TYPEPOSSTAR) &&
4966 (*options & PCRE_DOTALL) != 0)
4967 {
4968 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4969 }
4970
4971 /* Check for explicit anchoring */
4972
4973 else if (op != OP_SOD && op != OP_SOM &&
4974 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4975 return FALSE;
4976 code += GET(code, 1);
4977 }
4978 while (*code == OP_ALT); /* Loop for each alternative */
4979 return TRUE;
4980 }
4981
4982
4983
4984 /*************************************************
4985 * Check for starting with ^ or .* *
4986 *************************************************/
4987
4988 /* This is called to find out if every branch starts with ^ or .* so that
4989 "first char" processing can be done to speed things up in multiline
4990 matching and for non-DOTALL patterns that start with .* (which must start at
4991 the beginning or after \n). As in the case of is_anchored() (see above), we
4992 have to take account of back references to capturing brackets that contain .*
4993 because in that case we can't make the assumption.
4994
4995 Arguments:
4996 code points to start of expression (the bracket)
4997 bracket_map a bitmap of which brackets we are inside while testing; this
4998 handles up to substring 31; after that we just have to take
4999 the less precise approach
5000 backref_map the back reference bitmap
5001
5002 Returns: TRUE or FALSE
5003 */
5004
5005 static BOOL
5006 is_startline(const uschar *code, unsigned int bracket_map,
5007 unsigned int backref_map)
5008 {
5009 do {
5010 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5011 NULL, 0, FALSE);
5012 register int op = *scode;
5013
5014 /* Non-capturing brackets */
5015
5016 if (op == OP_BRA)
5017 {
5018 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5019 }
5020
5021 /* Capturing brackets */
5022
5023 else if (op == OP_CBRA)
5024 {
5025 int n = GET2(scode, 1+LINK_SIZE);
5026 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5027 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5028 }
5029
5030 /* Other brackets */
5031
5032 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5033 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5034
5035 /* .* means "start at start or after \n" if it isn't in brackets that
5036 may be referenced. */
5037
5038 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5039 {
5040 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5041 }
5042
5043 /* Check for explicit circumflex */
5044
5045 else if (op != OP_CIRC) return FALSE;
5046
5047 /* Move on to the next alternative */
5048
5049 code += GET(code, 1);
5050 }
5051 while (*code == OP_ALT); /* Loop for each alternative */
5052 return TRUE;
5053 }
5054
5055
5056
5057 /*************************************************
5058 * Check for asserted fixed first char *
5059 *************************************************/
5060
5061 /* During compilation, the "first char" settings from forward assertions are
5062 discarded, because they can cause conflicts with actual literals that follow.
5063 However, if we end up without a first char setting for an unanchored pattern,
5064 it is worth scanning the regex to see if there is an initial asserted first
5065 char. If all branches start with the same asserted char, or with a bracket all
5066 of whose alternatives start with the same asserted char (recurse ad lib), then
5067 we return that char, otherwise -1.
5068
5069 Arguments:
5070 code points to start of expression (the bracket)
5071 options pointer to the options (used to check casing changes)
5072 inassert TRUE if in an assertion
5073
5074 Returns: -1 or the fixed first char
5075 */
5076
5077 static int
5078 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5079 {
5080 register int c = -1;
5081 do {
5082 int d;
5083 const uschar *scode =
5084 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5085 register int op = *scode;
5086
5087 switch(op)
5088 {
5089 default:
5090 return -1;
5091
5092 case OP_BRA:
5093 case OP_CBRA:
5094 case OP_ASSERT:
5095 case OP_ONCE:
5096 case OP_COND:
5097 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5098 return -1;
5099 if (c < 0) c = d; else if (c != d) return -1;
5100 break;
5101
5102 case OP_EXACT: /* Fall through */
5103 scode += 2;
5104
5105 case OP_CHAR:
5106 case OP_CHARNC:
5107 case OP_PLUS:
5108 case OP_MINPLUS:
5109 case OP_POSPLUS:
5110 if (!inassert) return -1;
5111 if (c < 0)
5112 {
5113 c = scode[1];
5114 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5115 }
5116 else if (c != scode[1]) return -1;
5117 break;
5118 }
5119
5120 code += GET(code, 1);
5121 }
5122 while (*code == OP_ALT);
5123 return c;
5124 }
5125
5126
5127
5128 /*************************************************
5129 * Compile a Regular Expression *
5130 *************************************************/
5131
5132 /* This function takes a string and returns a pointer to a block of store
5133 holding a compiled version of the expression. The original API for this
5134 function had no error code return variable; it is retained for backwards
5135 compatibility. The new function is given a new name.
5136
5137 Arguments:
5138 pattern the regular expression
5139 options various option bits
5140 errorcodeptr pointer to error code variable (pcre_compile2() only)
5141 can be NULL if you don't want a code value
5142 errorptr pointer to pointer to error text
5143 erroroffset ptr offset in pattern where error was detected
5144 tables pointer to character tables or NULL
5145
5146 Returns: pointer to compiled data block, or NULL on error,
5147 with errorptr and erroroffset set
5148 */
5149
5150 PCRE_EXP_DEFN pcre *
5151 pcre_compile(const char *pattern, int options, const char **errorptr,
5152 int *erroroffset, const unsigned char *tables)
5153 {
5154 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5155 }
5156
5157
5158 PCRE_EXP_DEFN pcre *
5159 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5160 const char **errorptr, int *erroroffset, const unsigned char *tables)
5161 {
5162 real_pcre *re;
5163 int length = 1; /* For final END opcode */
5164 int firstbyte, reqbyte, newline;
5165 int errorcode = 0;
5166 #ifdef SUPPORT_UTF8
5167 BOOL utf8;
5168 #endif
5169 size_t size;
5170 uschar *code;
5171 const uschar *codestart;
5172 const uschar *ptr;
5173 compile_data compile_block;
5174 compile_data *cd = &compile_block;
5175
5176 /* This space is used for "compiling" into during the first phase, when we are
5177 computing the amount of memory that is needed. Compiled items are thrown away
5178 as soon as possible, so that a fairly large buffer should be sufficient for
5179 this purpose. The same space is used in the second phase for remembering where
5180 to fill in forward references to subpatterns. */
5181
5182 uschar cworkspace[COMPILE_WORK_SIZE];
5183
5184
5185 /* Set this early so that early errors get offset 0. */
5186
5187 ptr = (const uschar *)pattern;
5188
5189 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5190 can do is just return NULL, but we can set a code value if there is a code
5191 pointer. */
5192
5193 if (errorptr == NULL)
5194 {
5195 if (errorcodeptr != NULL) *errorcodeptr = 99;
5196 return NULL;
5197 }
5198
5199 *errorptr = NULL;
5200 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5201
5202 /* However, we can give a message for this error */
5203
5204 if (erroroffset == NULL)
5205 {
5206 errorcode = ERR16;
5207 goto PCRE_EARLY_ERROR_RETURN2;
5208 }
5209
5210 *erroroffset = 0;
5211
5212 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5213
5214 #ifdef SUPPORT_UTF8
5215 utf8 = (options & PCRE_UTF8) != 0;
5216 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5217 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5218 {
5219 errorcode = ERR44;
5220 goto PCRE_EARLY_ERROR_RETURN2;
5221 }
5222 #else
5223 if ((options & PCRE_UTF8) != 0)
5224 {
5225 errorcode = ERR32;
5226 goto PCRE_EARLY_ERROR_RETURN;
5227 }
5228 #endif
5229
5230 if ((options & ~PUBLIC_OPTIONS) != 0)
5231 {
5232 errorcode = ERR17;
5233 goto PCRE_EARLY_ERROR_RETURN;
5234 }
5235
5236 /* Set up pointers to the individual character tables */
5237
5238 if (tables == NULL) tables = _pcre_default_tables;
5239 cd->lcc = tables + lcc_offset;
5240 cd->fcc = tables + fcc_offset;
5241 cd->cbits = tables + cbits_offset;
5242 cd->ctypes = tables + ctypes_offset;
5243
5244 /* Handle different types of newline. The three bits give seven cases. The
5245 current code allows for fixed one- or two-byte sequences, plus "any" and
5246 "anycrlf". */
5247
5248 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5249 {
5250 case 0: newline = NEWLINE; break; /* Compile-time default */
5251 case PCRE_NEWLINE_CR: newline = '\r'; break;
5252 case PCRE_NEWLINE_LF: newline = '\n'; break;
5253 case PCRE_NEWLINE_CR+
5254 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5255 case PCRE_NEWLINE_ANY: newline = -1; break;
5256 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5257 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5258 }
5259
5260 if (newline == -2)
5261 {
5262 cd->nltype = NLTYPE_ANYCRLF;
5263 }
5264 else if (newline < 0)
5265 {
5266 cd->nltype = NLTYPE_ANY;
5267 }
5268 else
5269 {
5270 cd->nltype = NLTYPE_FIXED;
5271 if (newline > 255)
5272 {
5273 cd->nllen = 2;
5274 cd->nl[0] = (newline >> 8) & 255;
5275 cd->nl[1] = newline & 255;
5276 }
5277 else
5278 {
5279 cd->nllen = 1;
5280 cd->nl[0] = newline;
5281 }
5282 }
5283
5284 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5285 references to help in deciding whether (.*) can be treated as anchored or not.
5286 */
5287
5288 cd->top_backref = 0;
5289 cd->backref_map = 0;
5290
5291 /* Reflect pattern for debugging output */
5292
5293 DPRINTF(("------------------------------------------------------------------\n"));
5294 DPRINTF(("%s\n", pattern));
5295
5296 /* Pretend to compile the pattern while actually just accumulating the length
5297 of memory required. This behaviour is triggered by passing a non-NULL final
5298 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5299 to compile parts of the pattern into; the compiled code is discarded when it is
5300 no longer needed, so hopefully this workspace will never overflow, though there
5301 is a test for its doing so. */
5302
5303 cd->bracount = 0;
5304 cd->names_found = 0;
5305 cd->name_entry_size = 0;
5306 cd->name_table = NULL;
5307 cd->start_workspace = cworkspace;
5308 cd->start_code = cworkspace;
5309 cd->hwm = cworkspace;
5310 cd->start_pattern = (const uschar *)pattern;
5311 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5312 cd->req_varyopt = 0;
5313 cd->nopartial = FALSE;
5314 cd->external_options = options;
5315
5316 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5317 don't need to look at the result of the function here. The initial options have
5318 been put into the cd block so that they can be changed if an option setting is
5319 found within the regex right at the beginning. Bringing initial option settings
5320 outside can help speed up starting point checks. */
5321
5322 code = cworkspace;
5323 *code = OP_BRA;
5324 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5325 &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
5326 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5327
5328 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5329 cd->hwm - cworkspace));
5330
5331 if (length > MAX_PATTERN_SIZE)
5332 {
5333 errorcode = ERR20;
5334 goto PCRE_EARLY_ERROR_RETURN;
5335 }
5336
5337 /* Compute the size of data block needed and get it, either from malloc or
5338 externally provided function. Integer overflow should no longer be possible
5339 because nowadays we limit the maximum value of cd->names_found and
5340 cd->name_entry_size. */
5341
5342 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5343 re = (real_pcre *)(pcre_malloc)(size);
5344
5345 if (re == NULL)
5346 {
5347 errorcode = ERR21;
5348 goto PCRE_EARLY_ERROR_RETURN;
5349 }
5350
5351 /* Put in the magic number, and save the sizes, initial options, and character
5352 table pointer. NULL is used for the default character tables. The nullpad field
5353 is at the end; it's there to help in the case when a regex compiled on a system
5354 with 4-byte pointers is run on another with 8-byte pointers. */
5355
5356 re->magic_number = MAGIC_NUMBER;
5357 re->size = size;
5358 re->options = cd->external_options;
5359 re->dummy1 = 0;
5360 re->first_byte = 0;
5361 re->req_byte = 0;
5362 re->name_table_offset = sizeof(real_pcre);
5363 re->name_entry_size = cd->name_entry_size;
5364 re->name_count = cd->names_found;
5365 re->ref_count = 0;
5366 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5367 re->nullpad = NULL;
5368
5369 /* The starting points of the name/number translation table and of the code are
5370 passed around in the compile data block. The start/end pattern and initial
5371 options are already set from the pre-compile phase, as is the name_entry_size
5372 field. Reset the bracket count and the names_found field. Also reset the hwm
5373 field; this time it's used for remembering forward references to subpatterns.
5374 */
5375
5376 cd->bracount = 0;
5377 cd->names_found = 0;
5378 cd->name_table = (uschar *)re + re->name_table_offset;
5379 codestart = cd->name_table + re->name_entry_size * re->name_count;
5380 cd->start_code = codestart;
5381 cd->hwm = cworkspace;
5382 cd->req_varyopt = 0;
5383 cd->nopartial = FALSE;
5384
5385 /* Set up a starting, non-extracting bracket, then compile the expression. On
5386 error, errorcode will be set non-zero, so we don't need to look at the result
5387 of the function here. */
5388
5389 ptr = (const uschar *)pattern;
5390 code = (uschar *)codestart;
5391 *code = OP_BRA;
5392 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5393 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5394 re->top_bracket = cd->bracount;
5395 re->top_backref = cd->top_backref;
5396
5397 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5398
5399 /* If not reached end of pattern on success, there's an excess bracket. */
5400
5401 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5402
5403 /* Fill in the terminating state and check for disastrous overflow, but
5404 if debugging, leave the test till after things are printed out. */
5405
5406 *code++ = OP_END;
5407
5408 #ifndef DEBUG
5409 if (code - codestart > length) errorcode = ERR23;
5410 #endif
5411
5412 /* Fill in any forward references that are required. */
5413
5414 while (errorcode == 0 && cd->hwm > cworkspace)
5415 {
5416 int offset, recno;
5417 const uschar *groupptr;
5418 cd->hwm -= LINK_SIZE;
5419 offset = GET(cd->hwm, 0);
5420 recno = GET(codestart, offset);
5421 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5422 if (groupptr == NULL) errorcode = ERR53;
5423 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5424 }
5425
5426 /* Give an error if there's back reference to a non-existent capturing
5427 subpattern. */
5428
5429 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5430
5431 /* Failed to compile, or error while post-processing */
5432
5433 if (errorcode != 0)
5434 {
5435 (pcre_free)(re);
5436 PCRE_EARLY_ERROR_RETURN:
5437 *erroroffset = ptr - (const uschar *)pattern;
5438 PCRE_EARLY_ERROR_RETURN2:
5439 *errorptr = error_texts[errorcode];
5440 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5441 return NULL;
5442 }
5443
5444 /* If the anchored option was not passed, set the flag if we can determine that
5445 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5446 as starting with .* when DOTALL is set).
5447
5448 Otherwise, if we know what the first byte has to be, save it, because that
5449 speeds up unanchored matches no end. If not, see if we can set the
5450 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5451 start with ^. and also when all branches start with .* for non-DOTALL matches.
5452 */
5453
5454 if ((re->options & PCRE_ANCHORED) == 0)
5455 {
5456 int temp_options = re->options; /* May get changed during these scans */
5457 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5458 re->options |= PCRE_ANCHORED;
5459 else
5460 {
5461 if (firstbyte < 0)
5462 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5463 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5464 {
5465 int ch = firstbyte & 255;
5466 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5467 cd->fcc[ch] == ch)? ch : firstbyte;
5468 re->options |= PCRE_FIRSTSET;
5469 }
5470 else if (is_startline(codestart, 0, cd->backref_map))
5471 re->options |= PCRE_STARTLINE;
5472 }
5473 }
5474
5475 /* For an anchored pattern, we use the "required byte" only if it follows a
5476 variable length item in the regex. Remove the caseless flag for non-caseable
5477 bytes. */
5478
5479 if (reqbyte >= 0 &&
5480 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5481 {
5482 int ch = reqbyte & 255;
5483 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5484 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5485 re->options |= PCRE_REQCHSET;
5486 }
5487
5488 /* Print out the compiled data if debugging is enabled. This is never the
5489 case when building a production library. */
5490
5491 #ifdef DEBUG
5492
5493 printf("Length = %d top_bracket = %d top_backref = %d\n",
5494 length, re->top_bracket, re->top_backref);
5495
5496 if (re->options != 0)
5497 {
5498 printf("%s%s%s%s%s%s%s%s%s\n",
5499 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5500 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5501 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5502 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5503 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5504 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5505 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5506 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5507 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5508 }
5509
5510 if ((re->options & PCRE_FIRSTSET) != 0)
5511 {
5512 int ch = re->first_byte & 255;
5513 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5514 "" : " (caseless)";
5515 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5516 else printf("First char = \\x%02x%s\n", ch, caseless);
5517 }
5518
5519 if ((re->options & PCRE_REQCHSET) != 0)
5520 {
5521 int ch = re->req_byte & 255;
5522 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5523 "" : " (caseless)";
5524 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5525 else printf("Req char = \\x%02x%s\n", ch, caseless);
5526 }
5527
5528 pcre_printint(re, stdout, TRUE);
5529
5530 /* This check is done here in the debugging case so that the code that
5531 was compiled can be seen. */
5532
5533 if (code - codestart > length)
5534 {
5535 (pcre_free)(re);
5536 *errorptr = error_texts[ERR23];
5537 *erroroffset = ptr - (uschar *)pattern;
5538 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5539 return NULL;
5540 }
5541 #endif /* DEBUG */
5542
5543 return (pcre *)re;
5544 }
5545
5546 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12