/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 180 - (show annotations) (download)
Wed Jun 13 10:59:18 2007 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 183681 byte(s)
Add auto-possessification for \h, \H, \v, \V.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #define NLBLOCK cd /* Block containing newline information */
46 #define PSSTART start_pattern /* Field containing processed string start */
47 #define PSEND end_pattern /* Field containing processed string end */
48
49
#include "pcre_internal.h"
#include <limits.h>
51
52
53 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54 used by pcretest. DEBUG is not defined when building a production library. */
55
56 #ifdef DEBUG
57 #include "pcre_printint.src"
58 #endif
59
60
/* Macro for setting individual bits in class bitmaps. "a" is a byte array (or
pointer to bytes) and "b" is the bit number: bit b lives in byte b/8 at
position b%8. Both arguments and the whole expansion are parenthesized so that
expression arguments such as SETBIT(map, base + 1) or SETBIT(p + 2, n) expand
correctly. Note that "b" is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
64
65
66 /*************************************************
67 * Code parameters and static tables *
68 *************************************************/
69
70 /* This value specifies the size of stack workspace that is used during the
71 first pre-compile phase that determines how much memory is required. The regex
72 is partly compiled into this space, but the compiled parts are discarded as
73 soon as they can be, so that hopefully there will never be an overrun. The code
74 does, however, check for an overrun. The largest amount I've seen used is 218,
75 so this number is very generous.
76
77 The same workspace is used during the second, actual compile phase for
78 remembering forward references to groups so that they can be filled in at the
79 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
80 is 4 there is plenty of room. */
81
82 #define COMPILE_WORK_SIZE (4096)
83
84
85 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
86 are simple data values; negative values are for special things like \d and so
87 on. Zero means further processing is needed (for things like \x), or the escape
88 is invalid. */
89
90 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
91 static const short int escapes[] = {
92 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
93 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
94 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
95 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
96 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
97 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
98 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
99 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
100 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
101 0, 0, -ESC_z /* x - z */
102 };
103
104 #else /* This is the "abnormal" table for EBCDIC systems */
105 static const short int escapes[] = {
106 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
107 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
108 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
109 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
110 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
111 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
112 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
113 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
114 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
115 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
116 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
117 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
118 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
119 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
120 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
121 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
122 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
123 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
124 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
125 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
126 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
127 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
128 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
129 };
130 #endif
131
132
133 /* Tables of names of POSIX character classes and their lengths. The list is
134 terminated by a zero length entry. The first three must be alpha, lower, upper,
135 as this is assumed for handling case independence. */
136
137 static const char *const posix_names[] = {
138 "alpha", "lower", "upper",
139 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
140 "print", "punct", "space", "word", "xdigit" };
141
142 static const uschar posix_name_lengths[] = {
143 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
144
145 /* Table of class bit maps for each POSIX class. Each class is formed from a
146 base map, with an optional addition or removal of another map. Then, for some
147 classes, there is some additional tweaking: for [:blank:] the vertical space
148 characters are removed, and for [:alpha:] and [:alnum:] the underscore
149 character is removed. The triples in the table consist of the base map offset,
150 second map offset or -1 if no second map, and a non-negative value for map
151 addition or a negative value for map subtraction (if there are two maps). The
152 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
153 remove vertical space characters, 2 => remove underscore. */
154
155 static const int posix_class_maps[] = {
156 cbit_word, cbit_digit, -2, /* alpha */
157 cbit_lower, -1, 0, /* lower */
158 cbit_upper, -1, 0, /* upper */
159 cbit_word, -1, 2, /* alnum - word without underscore */
160 cbit_print, cbit_cntrl, 0, /* ascii */
161 cbit_space, -1, 1, /* blank - a GNU extension */
162 cbit_cntrl, -1, 0, /* cntrl */
163 cbit_digit, -1, 0, /* digit */
164 cbit_graph, -1, 0, /* graph */
165 cbit_print, -1, 0, /* print */
166 cbit_punct, -1, 0, /* punct */
167 cbit_space, -1, 0, /* space */
168 cbit_word, -1, 0, /* word - a Perl extension */
169 cbit_xdigit,-1, 0 /* xdigit */
170 };
171
172
/* Two-level stringification: XSTRING(x) first macro-expands x and then turns
the result into a string literal, so that numeric limits can be embedded in the
message texts below. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used.

The table is indexed by error number. Both the strings and the array of
pointers are const: nothing in the table is meant to be modified at run
time. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
};
253
254
255 /* Table to identify digits and hex digits. This is used when compiling
256 patterns. Note that the tables in chartables are dependent on the locale, and
257 may mark arbitrary characters as digits - but the PCRE compiling code expects
258 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
259 a private table here. It costs 256 bytes, but it is a lot faster than doing
260 character value tests (at least in some simple cases I timed), and in some
261 applications one wants PCRE to compile efficiently as well as match
262 efficiently.
263
264 For convenience, we use the same bit definitions as in chartables:
265
266 0x04 decimal digit
267 0x08 hexadecimal digit
268
269 Then we can use ctype_digit and ctype_xdigit in the code. */
270
271 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
272 static const unsigned char digitab[] =
273 {
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
276 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
280 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
281 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
282 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
286 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
302 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
306
307 #else /* This is the "abnormal" case, for EBCDIC systems */
308 static const unsigned char digitab[] =
309 {
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
315 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
316 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
317 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
321 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
326 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
334 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
340 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
341 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
342
343 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
344 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
345 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
346 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
348 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
352 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
353 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
355 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
357 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
360 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
361 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
362 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
363 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
364 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
365 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
366 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
367 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
368 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
369 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
370 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
371 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
372 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
373 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
374 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
375 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
376 #endif
377
378
379 /* Definition to allow mutual recursion */
380
381 static BOOL
382 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
383 int *, int *, branch_chain *, compile_data *, int *);
384
385
386
387 /*************************************************
388 * Handle escapes *
389 *************************************************/
390
391 /* This function is called when a \ has been encountered. It either returns a
392 positive value for a simple escape such as \n, or a negative value which
393 encodes one of the more complicated things such as \d. A backreference to group
394 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
395 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
396 ptr is pointing at the \. On exit, it is on the final character of the escape
397 sequence.
398
399 Arguments:
400 ptrptr points to the pattern position pointer
401 errorcodeptr points to the errorcode variable
402 bracount number of previous extracting brackets
403 options the options bits
404 isclass TRUE if inside a character class
405
406 Returns: zero or positive => a data character
407 negative => a special escape sequence
408 on error, errorptr is set
409 */
410
411 static int
412 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
413 int options, BOOL isclass)
414 {
415 BOOL utf8 = (options & PCRE_UTF8) != 0;
416 const uschar *ptr = *ptrptr + 1;
417 int c, i;
418
419 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
420 ptr--; /* Set pointer back to the last byte */
421
422 /* If backslash is at the end of the pattern, it's an error. */
423
424 if (c == 0) *errorcodeptr = ERR1;
425
426 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
427 a table. A non-zero result is something that can be returned immediately.
428 Otherwise further processing may be required. */
429
430 #ifndef EBCDIC /* ASCII coding */
431 else if (c < '0' || c > 'z') {} /* Not alphameric */
432 else if ((i = escapes[c - '0']) != 0) c = i;
433
434 #else /* EBCDIC coding */
435 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
436 else if ((i = escapes[c - 0x48]) != 0) c = i;
437 #endif
438
439 /* Escapes that need further processing, or are illegal. */
440
441 else
442 {
443 const uschar *oldptr;
444 BOOL braced, negated;
445
446 switch (c)
447 {
448 /* A number of Perl escapes are not handled by PCRE. We give an explicit
449 error. */
450
451 case 'l':
452 case 'L':
453 case 'N':
454 case 'u':
455 case 'U':
456 *errorcodeptr = ERR37;
457 break;
458
459 /* \g must be followed by a number, either plain or braced. If positive, it
460 is an absolute backreference. If negative, it is a relative backreference.
461 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
462 reference to a named group. This is part of Perl's movement towards a
463 unified syntax for back references. As this is synonymous with \k{name}, we
464 fudge it up by pretending it really was \k. */
465
466 case 'g':
467 if (ptr[1] == '{')
468 {
469 const uschar *p;
470 for (p = ptr+2; *p != 0 && *p != '}'; p++)
471 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
472 if (*p != 0 && *p != '}')
473 {
474 c = -ESC_k;
475 break;
476 }
477 braced = TRUE;
478 ptr++;
479 }
480 else braced = FALSE;
481
482 if (ptr[1] == '-')
483 {
484 negated = TRUE;
485 ptr++;
486 }
487 else negated = FALSE;
488
489 c = 0;
490 while ((digitab[ptr[1]] & ctype_digit) != 0)
491 c = c * 10 + *(++ptr) - '0';
492
493 if (c == 0 || (braced && *(++ptr) != '}'))
494 {
495 *errorcodeptr = ERR57;
496 return 0;
497 }
498
499 if (negated)
500 {
501 if (c > bracount)
502 {
503 *errorcodeptr = ERR15;
504 return 0;
505 }
506 c = bracount - (c - 1);
507 }
508
509 c = -(ESC_REF + c);
510 break;
511
512 /* The handling of escape sequences consisting of a string of digits
513 starting with one that is not zero is not straightforward. By experiment,
514 the way Perl works seems to be as follows:
515
516 Outside a character class, the digits are read as a decimal number. If the
517 number is less than 10, or if there are that many previous extracting
518 left brackets, then it is a back reference. Otherwise, up to three octal
519 digits are read to form an escaped byte. Thus \123 is likely to be octal
520 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
521 value is greater than 377, the least significant 8 bits are taken. Inside a
522 character class, \ followed by a digit is always an octal number. */
523
524 case '1': case '2': case '3': case '4': case '5':
525 case '6': case '7': case '8': case '9':
526
527 if (!isclass)
528 {
529 oldptr = ptr;
530 c -= '0';
531 while ((digitab[ptr[1]] & ctype_digit) != 0)
532 c = c * 10 + *(++ptr) - '0';
533 if (c < 10 || c <= bracount)
534 {
535 c = -(ESC_REF + c);
536 break;
537 }
538 ptr = oldptr; /* Put the pointer back and fall through */
539 }
540
541 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
542 generates a binary zero byte and treats the digit as a following literal.
543 Thus we have to pull back the pointer by one. */
544
545 if ((c = *ptr) >= '8')
546 {
547 ptr--;
548 c = 0;
549 break;
550 }
551
552 /* \0 always starts an octal number, but we may drop through to here with a
553 larger first octal digit. The original code used just to take the least
554 significant 8 bits of octal numbers (I think this is what early Perls used
555 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
556 than 3 octal digits. */
557
558 case '0':
559 c -= '0';
560 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
561 c = c * 8 + *(++ptr) - '0';
562 if (!utf8 && c > 255) *errorcodeptr = ERR51;
563 break;
564
565 /* \x is complicated. \x{ddd} is a character number which can be greater
566 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
567 treated as a data character. */
568
569 case 'x':
570 if (ptr[1] == '{')
571 {
572 const uschar *pt = ptr + 2;
573 int count = 0;
574
575 c = 0;
576 while ((digitab[*pt] & ctype_xdigit) != 0)
577 {
578 register int cc = *pt++;
579 if (c == 0 && cc == '0') continue; /* Leading zeroes */
580 count++;
581
582 #ifndef EBCDIC /* ASCII coding */
583 if (cc >= 'a') cc -= 32; /* Convert to upper case */
584 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
585 #else /* EBCDIC coding */
586 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
587 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
588 #endif
589 }
590
591 if (*pt == '}')
592 {
593 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
594 ptr = pt;
595 break;
596 }
597
598 /* If the sequence of hex digits does not end with '}', then we don't
599 recognize this construct; fall through to the normal \x handling. */
600 }
601
602 /* Read just a single-byte hex-defined char */
603
604 c = 0;
605 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
606 {
607 int cc; /* Some compilers don't like ++ */
608 cc = *(++ptr); /* in initializers */
609 #ifndef EBCDIC /* ASCII coding */
610 if (cc >= 'a') cc -= 32; /* Convert to upper case */
611 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
612 #else /* EBCDIC coding */
613 if (cc <= 'z') cc += 64; /* Convert to upper case */
614 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
615 #endif
616 }
617 break;
618
619 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
620 This coding is ASCII-specific, but then the whole concept of \cx is
621 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
622
623 case 'c':
624 c = *(++ptr);
625 if (c == 0)
626 {
627 *errorcodeptr = ERR2;
628 return 0;
629 }
630
631 #ifndef EBCDIC /* ASCII coding */
632 if (c >= 'a' && c <= 'z') c -= 32;
633 c ^= 0x40;
634 #else /* EBCDIC coding */
635 if (c >= 'a' && c <= 'z') c += 64;
636 c ^= 0xC0;
637 #endif
638 break;
639
640 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
641 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
642 for Perl compatibility, it is a literal. This code looks a bit odd, but
643 there used to be some cases other than the default, and there may be again
644 in future, so I haven't "optimized" it. */
645
646 default:
647 if ((options & PCRE_EXTRA) != 0) switch(c)
648 {
649 default:
650 *errorcodeptr = ERR3;
651 break;
652 }
653 break;
654 }
655 }
656
657 *ptrptr = ptr;
658 return c;
659 }
660
661
662
663 #ifdef SUPPORT_UCP
664 /*************************************************
665 * Handle \P and \p *
666 *************************************************/
667
668 /* This function is called after \P or \p has been encountered, provided that
669 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
670 pointing at the P or p. On exit, it is pointing at the final character of the
671 escape sequence.
672
673 Argument:
674 ptrptr points to the pattern position pointer
675 negptr points to a boolean that is set TRUE for negation else FALSE
676 dptr points to an int that is set to the detailed property value
677 errorcodeptr points to the error code variable
678
679 Returns: type value from ucp_type_table, or -1 for an invalid type
680 */
681
682 static int
683 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
684 {
685 int c, i, bot, top;
686 const uschar *ptr = *ptrptr;
687 char name[32];
688
689 c = *(++ptr);
690 if (c == 0) goto ERROR_RETURN;
691
692 *negptr = FALSE;
693
694 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
695 negation. */
696
697 if (c == '{')
698 {
699 if (ptr[1] == '^')
700 {
701 *negptr = TRUE;
702 ptr++;
703 }
704 for (i = 0; i < sizeof(name) - 1; i++)
705 {
706 c = *(++ptr);
707 if (c == 0) goto ERROR_RETURN;
708 if (c == '}') break;
709 name[i] = c;
710 }
711 if (c !='}') goto ERROR_RETURN;
712 name[i] = 0;
713 }
714
715 /* Otherwise there is just one following character */
716
717 else
718 {
719 name[0] = c;
720 name[1] = 0;
721 }
722
723 *ptrptr = ptr;
724
725 /* Search for a recognized property name using binary chop */
726
727 bot = 0;
728 top = _pcre_utt_size;
729
730 while (bot < top)
731 {
732 i = (bot + top) >> 1;
733 c = strcmp(name, _pcre_utt[i].name);
734 if (c == 0)
735 {
736 *dptr = _pcre_utt[i].value;
737 return _pcre_utt[i].type;
738 }
739 if (c > 0) bot = i + 1; else top = i;
740 }
741
742 *errorcodeptr = ERR47;
743 *ptrptr = ptr;
744 return -1;
745
746 ERROR_RETURN:
747 *errorcodeptr = ERR46;
748 *ptrptr = ptr;
749 return -1;
750 }
751 #endif
752
753
754
755
756 /*************************************************
757 * Check for counted repeat *
758 *************************************************/
759
760 /* This function is called when a '{' is encountered in a place where it might
761 start a quantifier. It looks ahead to see if it really is a quantifier or not.
762 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
763 where the ddds are digits.
764
765 Arguments:
766 p pointer to the first char after '{'
767
768 Returns: TRUE or FALSE
769 */
770
771 static BOOL
772 is_counted_repeat(const uschar *p)
773 {
774 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
775 while ((digitab[*p] & ctype_digit) != 0) p++;
776 if (*p == '}') return TRUE;
777
778 if (*p++ != ',') return FALSE;
779 if (*p == '}') return TRUE;
780
781 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
782 while ((digitab[*p] & ctype_digit) != 0) p++;
783
784 return (*p == '}');
785 }
786
787
788
789 /*************************************************
790 * Read repeat counts *
791 *************************************************/
792
793 /* Read an item of the form {n,m} and return the values. This is called only
794 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
795 so the syntax is guaranteed to be correct, but we need to check the values.
796
797 Arguments:
798 p pointer to first char after '{'
799 minp pointer to int for min
800 maxp pointer to int for max
801 returned as -1 if no max
802 errorcodeptr points to error code variable
803
804 Returns: pointer to '}' on success;
805 current ptr on error, with errorcodeptr set non-zero
806 */
807
808 static const uschar *
809 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
810 {
811 int min = 0;
812 int max = -1;
813
814 /* Read the minimum value and do a paranoid check: a negative value indicates
815 an integer overflow. */
816
817 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
818 if (min < 0 || min > 65535)
819 {
820 *errorcodeptr = ERR5;
821 return p;
822 }
823
824 /* Read the maximum value if there is one, and again do a paranoid on its size.
825 Also, max must not be less than min. */
826
827 if (*p == '}') max = min; else
828 {
829 if (*(++p) != '}')
830 {
831 max = 0;
832 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
833 if (max < 0 || max > 65535)
834 {
835 *errorcodeptr = ERR5;
836 return p;
837 }
838 if (max < min)
839 {
840 *errorcodeptr = ERR4;
841 return p;
842 }
843 }
844 }
845
846 /* Fill in the required variables, and pass back the pointer to the terminating
847 '}'. */
848
849 *minp = min;
850 *maxp = max;
851 return p;
852 }
853
854
855
856 /*************************************************
857 * Find forward referenced subpattern *
858 *************************************************/
859
860 /* This function scans along a pattern's text looking for capturing
861 subpatterns, and counting them. If it finds a named pattern that matches the
862 name it is given, it returns its number. Alternatively, if the name is NULL, it
863 returns when it reaches a given numbered subpattern. This is used for forward
864 references to subpatterns. We know that if (?P< is encountered, the name will
865 be terminated by '>' because that is checked in the first pass.
866
867 Arguments:
868 ptr current position in the pattern
869 count current count of capturing parens so far encountered
870 name name to seek, or NULL if seeking a numbered subpattern
871 lorn name length, or subpattern number if name is NULL
872 xmode TRUE if we are in /x mode
873
874 Returns: the number of the named subpattern, or -1 if not found
875 */
876
877 static int
878 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
879 BOOL xmode)
880 {
881 const uschar *thisname;
882
883 for (; *ptr != 0; ptr++)
884 {
885 int term;
886
887 /* Skip over backslashed characters and also entire \Q...\E */
888
889 if (*ptr == '\\')
890 {
891 if (*(++ptr) == 0) return -1;
892 if (*ptr == 'Q') for (;;)
893 {
894 while (*(++ptr) != 0 && *ptr != '\\');
895 if (*ptr == 0) return -1;
896 if (*(++ptr) == 'E') break;
897 }
898 continue;
899 }
900
901 /* Skip over character classes */
902
903 if (*ptr == '[')
904 {
905 while (*(++ptr) != ']')
906 {
907 if (*ptr == '\\')
908 {
909 if (*(++ptr) == 0) return -1;
910 if (*ptr == 'Q') for (;;)
911 {
912 while (*(++ptr) != 0 && *ptr != '\\');
913 if (*ptr == 0) return -1;
914 if (*(++ptr) == 'E') break;
915 }
916 continue;
917 }
918 }
919 continue;
920 }
921
922 /* Skip comments in /x mode */
923
924 if (xmode && *ptr == '#')
925 {
926 while (*(++ptr) != 0 && *ptr != '\n');
927 if (*ptr == 0) return -1;
928 continue;
929 }
930
931 /* An opening parens must now be a real metacharacter */
932
933 if (*ptr != '(') continue;
934 if (ptr[1] != '?')
935 {
936 count++;
937 if (name == NULL && count == lorn) return count;
938 continue;
939 }
940
941 ptr += 2;
942 if (*ptr == 'P') ptr++; /* Allow optional P */
943
944 /* We have to disambiguate (?<! and (?<= from (?<name> */
945
946 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
947 *ptr != '\'')
948 continue;
949
950 count++;
951
952 if (name == NULL && count == lorn) return count;
953 term = *ptr++;
954 if (term == '<') term = '>';
955 thisname = ptr;
956 while (*ptr != term) ptr++;
957 if (name != NULL && lorn == ptr - thisname &&
958 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
959 return count;
960 }
961
962 return -1;
963 }
964
965
966
967 /*************************************************
968 * Find first significant op code *
969 *************************************************/
970
971 /* This is called by several functions that scan a compiled expression looking
972 for a fixed first character, or an anchoring op code etc. It skips over things
973 that do not influence this. For some calls, a change of option is important.
974 For some calls, it makes sense to skip negative forward and all backward
975 assertions, and also the \b assertion; for others it does not.
976
977 Arguments:
978 code pointer to the start of the group
979 options pointer to external options
980 optbit the option bit whose changing is significant, or
981 zero if none are
982 skipassert TRUE if certain assertions are to be skipped
983
984 Returns: pointer to the first significant opcode
985 */
986
987 static const uschar*
988 first_significant_code(const uschar *code, int *options, int optbit,
989 BOOL skipassert)
990 {
991 for (;;)
992 {
993 switch ((int)*code)
994 {
995 case OP_OPT:
996 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
997 *options = (int)code[1];
998 code += 2;
999 break;
1000
1001 case OP_ASSERT_NOT:
1002 case OP_ASSERTBACK:
1003 case OP_ASSERTBACK_NOT:
1004 if (!skipassert) return code;
1005 do code += GET(code, 1); while (*code == OP_ALT);
1006 code += _pcre_OP_lengths[*code];
1007 break;
1008
1009 case OP_WORD_BOUNDARY:
1010 case OP_NOT_WORD_BOUNDARY:
1011 if (!skipassert) return code;
1012 /* Fall through */
1013
1014 case OP_CALLOUT:
1015 case OP_CREF:
1016 case OP_RREF:
1017 case OP_DEF:
1018 code += _pcre_OP_lengths[*code];
1019 break;
1020
1021 default:
1022 return code;
1023 }
1024 }
1025 /* Control never reaches here */
1026 }
1027
1028
1029
1030
1031 /*************************************************
1032 * Find the fixed length of a pattern *
1033 *************************************************/
1034
1035 /* Scan a pattern and compute the fixed length of subject that will match it,
1036 if the length is fixed. This is needed for dealing with backward assertions.
1037 In UTF8 mode, the result is in characters rather than bytes.
1038
1039 Arguments:
1040 code points to the start of the pattern (the bracket)
1041 options the compiling options
1042
1043 Returns: the fixed length, or -1 if there is no fixed length,
1044 or -2 if \C was encountered
1045 */
1046
1047 static int
1048 find_fixedlength(uschar *code, int options)
1049 {
1050 int length = -1;
1051
1052 register int branchlength = 0;
1053 register uschar *cc = code + 1 + LINK_SIZE;
1054
1055 /* Scan along the opcodes for this branch. If we get to the end of the
1056 branch, check the length against that of the other branches. */
1057
1058 for (;;)
1059 {
1060 int d;
1061 register int op = *cc;
1062
1063 switch (op)
1064 {
1065 case OP_CBRA:
1066 case OP_BRA:
1067 case OP_ONCE:
1068 case OP_COND:
1069 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1070 if (d < 0) return d;
1071 branchlength += d;
1072 do cc += GET(cc, 1); while (*cc == OP_ALT);
1073 cc += 1 + LINK_SIZE;
1074 break;
1075
1076 /* Reached end of a branch; if it's a ket it is the end of a nested
1077 call. If it's ALT it is an alternation in a nested call. If it is
1078 END it's the end of the outer call. All can be handled by the same code. */
1079
1080 case OP_ALT:
1081 case OP_KET:
1082 case OP_KETRMAX:
1083 case OP_KETRMIN:
1084 case OP_END:
1085 if (length < 0) length = branchlength;
1086 else if (length != branchlength) return -1;
1087 if (*cc != OP_ALT) return length;
1088 cc += 1 + LINK_SIZE;
1089 branchlength = 0;
1090 break;
1091
1092 /* Skip over assertive subpatterns */
1093
1094 case OP_ASSERT:
1095 case OP_ASSERT_NOT:
1096 case OP_ASSERTBACK:
1097 case OP_ASSERTBACK_NOT:
1098 do cc += GET(cc, 1); while (*cc == OP_ALT);
1099 /* Fall through */
1100
1101 /* Skip over things that don't match chars */
1102
1103 case OP_REVERSE:
1104 case OP_CREF:
1105 case OP_RREF:
1106 case OP_DEF:
1107 case OP_OPT:
1108 case OP_CALLOUT:
1109 case OP_SOD:
1110 case OP_SOM:
1111 case OP_EOD:
1112 case OP_EODN:
1113 case OP_CIRC:
1114 case OP_DOLL:
1115 case OP_NOT_WORD_BOUNDARY:
1116 case OP_WORD_BOUNDARY:
1117 cc += _pcre_OP_lengths[*cc];
1118 break;
1119
1120 /* Handle literal characters */
1121
1122 case OP_CHAR:
1123 case OP_CHARNC:
1124 case OP_NOT:
1125 branchlength++;
1126 cc += 2;
1127 #ifdef SUPPORT_UTF8
1128 if ((options & PCRE_UTF8) != 0)
1129 {
1130 while ((*cc & 0xc0) == 0x80) cc++;
1131 }
1132 #endif
1133 break;
1134
1135 /* Handle exact repetitions. The count is already in characters, but we
1136 need to skip over a multibyte character in UTF8 mode. */
1137
1138 case OP_EXACT:
1139 branchlength += GET2(cc,1);
1140 cc += 4;
1141 #ifdef SUPPORT_UTF8
1142 if ((options & PCRE_UTF8) != 0)
1143 {
1144 while((*cc & 0x80) == 0x80) cc++;
1145 }
1146 #endif
1147 break;
1148
1149 case OP_TYPEEXACT:
1150 branchlength += GET2(cc,1);
1151 cc += 4;
1152 break;
1153
1154 /* Handle single-char matchers */
1155
1156 case OP_PROP:
1157 case OP_NOTPROP:
1158 cc += 2;
1159 /* Fall through */
1160
1161 case OP_NOT_DIGIT:
1162 case OP_DIGIT:
1163 case OP_NOT_WHITESPACE:
1164 case OP_WHITESPACE:
1165 case OP_NOT_WORDCHAR:
1166 case OP_WORDCHAR:
1167 case OP_ANY:
1168 branchlength++;
1169 cc++;
1170 break;
1171
1172 /* The single-byte matcher isn't allowed */
1173
1174 case OP_ANYBYTE:
1175 return -2;
1176
1177 /* Check a class for variable quantification */
1178
1179 #ifdef SUPPORT_UTF8
1180 case OP_XCLASS:
1181 cc += GET(cc, 1) - 33;
1182 /* Fall through */
1183 #endif
1184
1185 case OP_CLASS:
1186 case OP_NCLASS:
1187 cc += 33;
1188
1189 switch (*cc)
1190 {
1191 case OP_CRSTAR:
1192 case OP_CRMINSTAR:
1193 case OP_CRQUERY:
1194 case OP_CRMINQUERY:
1195 return -1;
1196
1197 case OP_CRRANGE:
1198 case OP_CRMINRANGE:
1199 if (GET2(cc,1) != GET2(cc,3)) return -1;
1200 branchlength += GET2(cc,1);
1201 cc += 5;
1202 break;
1203
1204 default:
1205 branchlength++;
1206 }
1207 break;
1208
1209 /* Anything else is variable length */
1210
1211 default:
1212 return -1;
1213 }
1214 }
1215 /* Control never gets here */
1216 }
1217
1218
1219
1220
1221 /*************************************************
1222 * Scan compiled regex for numbered bracket *
1223 *************************************************/
1224
1225 /* This little function scans through a compiled pattern until it finds a
1226 capturing bracket with the given number.
1227
1228 Arguments:
1229 code points to start of expression
1230 utf8 TRUE in UTF-8 mode
1231 number the required bracket number
1232
1233 Returns: pointer to the opcode for the bracket, or NULL if not found
1234 */
1235
1236 static const uschar *
1237 find_bracket(const uschar *code, BOOL utf8, int number)
1238 {
1239 for (;;)
1240 {
1241 register int c = *code;
1242 if (c == OP_END) return NULL;
1243
1244 /* XCLASS is used for classes that cannot be represented just by a bit
1245 map. This includes negated single high-valued characters. The length in
1246 the table is zero; the actual length is stored in the compiled code. */
1247
1248 if (c == OP_XCLASS) code += GET(code, 1);
1249
1250 /* Handle capturing bracket */
1251
1252 else if (c == OP_CBRA)
1253 {
1254 int n = GET2(code, 1+LINK_SIZE);
1255 if (n == number) return (uschar *)code;
1256 code += _pcre_OP_lengths[c];
1257 }
1258
1259 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1260 a multi-byte character. The length in the table is a minimum, so we have to
1261 arrange to skip the extra bytes. */
1262
1263 else
1264 {
1265 code += _pcre_OP_lengths[c];
1266 #ifdef SUPPORT_UTF8
1267 if (utf8) switch(c)
1268 {
1269 case OP_CHAR:
1270 case OP_CHARNC:
1271 case OP_EXACT:
1272 case OP_UPTO:
1273 case OP_MINUPTO:
1274 case OP_POSUPTO:
1275 case OP_STAR:
1276 case OP_MINSTAR:
1277 case OP_POSSTAR:
1278 case OP_PLUS:
1279 case OP_MINPLUS:
1280 case OP_POSPLUS:
1281 case OP_QUERY:
1282 case OP_MINQUERY:
1283 case OP_POSQUERY:
1284 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1285 break;
1286 }
1287 #endif
1288 }
1289 }
1290 }
1291
1292
1293
1294 /*************************************************
1295 * Scan compiled regex for recursion reference *
1296 *************************************************/
1297
1298 /* This little function scans through a compiled pattern until it finds an
1299 instance of OP_RECURSE.
1300
1301 Arguments:
1302 code points to start of expression
1303 utf8 TRUE in UTF-8 mode
1304
1305 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1306 */
1307
1308 static const uschar *
1309 find_recurse(const uschar *code, BOOL utf8)
1310 {
1311 for (;;)
1312 {
1313 register int c = *code;
1314 if (c == OP_END) return NULL;
1315 if (c == OP_RECURSE) return code;
1316
1317 /* XCLASS is used for classes that cannot be represented just by a bit
1318 map. This includes negated single high-valued characters. The length in
1319 the table is zero; the actual length is stored in the compiled code. */
1320
1321 if (c == OP_XCLASS) code += GET(code, 1);
1322
1323 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1324 that are followed by a character may be followed by a multi-byte character.
1325 The length in the table is a minimum, so we have to arrange to skip the extra
1326 bytes. */
1327
1328 else
1329 {
1330 code += _pcre_OP_lengths[c];
1331 #ifdef SUPPORT_UTF8
1332 if (utf8) switch(c)
1333 {
1334 case OP_CHAR:
1335 case OP_CHARNC:
1336 case OP_EXACT:
1337 case OP_UPTO:
1338 case OP_MINUPTO:
1339 case OP_POSUPTO:
1340 case OP_STAR:
1341 case OP_MINSTAR:
1342 case OP_POSSTAR:
1343 case OP_PLUS:
1344 case OP_MINPLUS:
1345 case OP_POSPLUS:
1346 case OP_QUERY:
1347 case OP_MINQUERY:
1348 case OP_POSQUERY:
1349 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1350 break;
1351 }
1352 #endif
1353 }
1354 }
1355 }
1356
1357
1358
1359 /*************************************************
1360 * Scan compiled branch for non-emptiness *
1361 *************************************************/
1362
1363 /* This function scans through a branch of a compiled pattern to see whether it
1364 can match the empty string or not. It is called from could_be_empty()
1365 below and from compile_branch() when checking for an unlimited repeat of a
1366 group that can match nothing. Note that first_significant_code() skips over
1367 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1368 struck an inner bracket whose current branch will already have been scanned.
1369
1370 Arguments:
1371 code points to start of search
1372 endcode points to where to stop
1373 utf8 TRUE if in UTF8 mode
1374
1375 Returns: TRUE if what is matched could be empty
1376 */
1377
1378 static BOOL
1379 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1380 {
1381 register int c;
1382 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1383 code < endcode;
1384 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1385 {
1386 const uschar *ccode;
1387
1388 c = *code;
1389
1390 /* Groups with zero repeats can of course be empty; skip them. */
1391
1392 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1393 {
1394 code += _pcre_OP_lengths[c];
1395 do code += GET(code, 1); while (*code == OP_ALT);
1396 c = *code;
1397 continue;
1398 }
1399
1400 /* For other groups, scan the branches. */
1401
1402 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1403 {
1404 BOOL empty_branch;
1405 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1406
1407 /* Scan a closed bracket */
1408
1409 empty_branch = FALSE;
1410 do
1411 {
1412 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1413 empty_branch = TRUE;
1414 code += GET(code, 1);
1415 }
1416 while (*code == OP_ALT);
1417 if (!empty_branch) return FALSE; /* All branches are non-empty */
1418 c = *code;
1419 continue;
1420 }
1421
1422 /* Handle the other opcodes */
1423
1424 switch (c)
1425 {
1426 /* Check for quantifiers after a class */
1427
1428 #ifdef SUPPORT_UTF8
1429 case OP_XCLASS:
1430 ccode = code + GET(code, 1);
1431 goto CHECK_CLASS_REPEAT;
1432 #endif
1433
1434 case OP_CLASS:
1435 case OP_NCLASS:
1436 ccode = code + 33;
1437
1438 #ifdef SUPPORT_UTF8
1439 CHECK_CLASS_REPEAT:
1440 #endif
1441
1442 switch (*ccode)
1443 {
1444 case OP_CRSTAR: /* These could be empty; continue */
1445 case OP_CRMINSTAR:
1446 case OP_CRQUERY:
1447 case OP_CRMINQUERY:
1448 break;
1449
1450 default: /* Non-repeat => class must match */
1451 case OP_CRPLUS: /* These repeats aren't empty */
1452 case OP_CRMINPLUS:
1453 return FALSE;
1454
1455 case OP_CRRANGE:
1456 case OP_CRMINRANGE:
1457 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1458 break;
1459 }
1460 break;
1461
1462 /* Opcodes that must match a character */
1463
1464 case OP_PROP:
1465 case OP_NOTPROP:
1466 case OP_EXTUNI:
1467 case OP_NOT_DIGIT:
1468 case OP_DIGIT:
1469 case OP_NOT_WHITESPACE:
1470 case OP_WHITESPACE:
1471 case OP_NOT_WORDCHAR:
1472 case OP_WORDCHAR:
1473 case OP_ANY:
1474 case OP_ANYBYTE:
1475 case OP_CHAR:
1476 case OP_CHARNC:
1477 case OP_NOT:
1478 case OP_PLUS:
1479 case OP_MINPLUS:
1480 case OP_POSPLUS:
1481 case OP_EXACT:
1482 case OP_NOTPLUS:
1483 case OP_NOTMINPLUS:
1484 case OP_NOTPOSPLUS:
1485 case OP_NOTEXACT:
1486 case OP_TYPEPLUS:
1487 case OP_TYPEMINPLUS:
1488 case OP_TYPEPOSPLUS:
1489 case OP_TYPEEXACT:
1490 return FALSE;
1491
1492 /* End of branch */
1493
1494 case OP_KET:
1495 case OP_KETRMAX:
1496 case OP_KETRMIN:
1497 case OP_ALT:
1498 return TRUE;
1499
1500 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1501 MINUPTO, and POSUPTO may be followed by a multibyte character */
1502
1503 #ifdef SUPPORT_UTF8
1504 case OP_STAR:
1505 case OP_MINSTAR:
1506 case OP_POSSTAR:
1507 case OP_QUERY:
1508 case OP_MINQUERY:
1509 case OP_POSQUERY:
1510 case OP_UPTO:
1511 case OP_MINUPTO:
1512 case OP_POSUPTO:
1513 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1514 break;
1515 #endif
1516 }
1517 }
1518
1519 return TRUE;
1520 }
1521
1522
1523
1524 /*************************************************
1525 * Scan compiled regex for non-emptiness *
1526 *************************************************/
1527
1528 /* This function is called to check for left recursive calls. We want to check
1529 the current branch of the current pattern to see if it could match the empty
1530 string. If it could, we must look outwards for branches at other levels,
1531 stopping when we pass beyond the bracket which is the subject of the recursion.
1532
1533 Arguments:
1534 code points to start of the recursion
1535 endcode points to where to stop (current RECURSE item)
1536 bcptr points to the chain of current (unclosed) branch starts
1537 utf8 TRUE if in UTF-8 mode
1538
1539 Returns: TRUE if what is matched could be empty
1540 */
1541
1542 static BOOL
1543 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1544 BOOL utf8)
1545 {
1546 while (bcptr != NULL && bcptr->current >= code)
1547 {
1548 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1549 bcptr = bcptr->outer;
1550 }
1551 return TRUE;
1552 }
1553
1554
1555
1556 /*************************************************
1557 * Check for POSIX class syntax *
1558 *************************************************/
1559
1560 /* This function is called when the sequence "[:" or "[." or "[=" is
1561 encountered in a character class. It checks whether this is followed by an
1562 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1563 ".]" or "=]".
1564
1565 Argument:
1566 ptr pointer to the initial [
1567 endptr where to return the end pointer
1568 cd pointer to compile data
1569
1570 Returns: TRUE or FALSE
1571 */
1572
1573 static BOOL
1574 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1575 {
1576 int terminator; /* Don't combine these lines; the Solaris cc */
1577 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1578 if (*(++ptr) == '^') ptr++;
1579 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1580 if (*ptr == terminator && ptr[1] == ']')
1581 {
1582 *endptr = ptr;
1583 return TRUE;
1584 }
1585 return FALSE;
1586 }
1587
1588
1589
1590
1591 /*************************************************
1592 * Check POSIX class name *
1593 *************************************************/
1594
1595 /* This function is called to check the name given in a POSIX-style class entry
1596 such as [:alnum:].
1597
1598 Arguments:
1599 ptr points to the first letter
1600 len the length of the name
1601
1602 Returns: a value representing the name, or -1 if unknown
1603 */
1604
1605 static int
1606 check_posix_name(const uschar *ptr, int len)
1607 {
1608 register int yield = 0;
1609 while (posix_name_lengths[yield] != 0)
1610 {
1611 if (len == posix_name_lengths[yield] &&
1612 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1613 yield++;
1614 }
1615 return -1;
1616 }
1617
1618
1619 /*************************************************
1620 * Adjust OP_RECURSE items in repeated group *
1621 *************************************************/
1622
1623 /* OP_RECURSE items contain an offset from the start of the regex to the group
1624 that is referenced. This means that groups can be replicated for fixed
1625 repetition simply by copying (because the recursion is allowed to refer to
1626 earlier groups that are outside the current group). However, when a group is
1627 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1628 it, after it has been compiled. This means that any OP_RECURSE items within it
1629 that refer to the group itself or any contained groups have to have their
1630 offsets adjusted. That is one of the jobs of this function. Before it is called,
1631 the partially compiled regex must be temporarily terminated with OP_END.
1632
1633 This function has been extended with the possibility of forward references for
1634 recursions and subroutine calls. It must also check the list of such references
1635 for the group we are dealing with. If it finds that one of the recursions in
1636 the current group is on this list, it adjusts the offset in the list, not the
1637 value in the reference (which is a group number).
1638
1639 Arguments:
1640 group points to the start of the group
1641 adjust the amount by which the group is to be moved
1642 utf8 TRUE in UTF-8 mode
1643 cd contains pointers to tables etc.
1644 save_hwm the hwm forward reference pointer at the start of the group
1645
1646 Returns: nothing
1647 */
1648
1649 static void
1650 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1651 uschar *save_hwm)
1652 {
1653 uschar *ptr = group;
1654 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1655 {
1656 int offset;
1657 uschar *hc;
1658
1659 /* See if this recursion is on the forward reference list. If so, adjust the
1660 reference. */
1661
1662 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1663 {
1664 offset = GET(hc, 0);
1665 if (cd->start_code + offset == ptr + 1)
1666 {
1667 PUT(hc, 0, offset + adjust);
1668 break;
1669 }
1670 }
1671
1672 /* Otherwise, adjust the recursion offset if it's after the start of this
1673 group. */
1674
1675 if (hc >= cd->hwm)
1676 {
1677 offset = GET(ptr, 1);
1678 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1679 }
1680
1681 ptr += 1 + LINK_SIZE;
1682 }
1683 }
1684
1685
1686
1687 /*************************************************
1688 * Insert an automatic callout point *
1689 *************************************************/
1690
1691 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1692 callout points before each pattern item.
1693
1694 Arguments:
1695 code current code pointer
1696 ptr current pattern pointer
1697 cd pointers to tables etc
1698
1699 Returns: new code pointer
1700 */
1701
1702 static uschar *
1703 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1704 {
1705 *code++ = OP_CALLOUT;
1706 *code++ = 255;
1707 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1708 PUT(code, LINK_SIZE, 0); /* Default length */
1709 return code + 2*LINK_SIZE;
1710 }
1711
1712
1713
1714 /*************************************************
1715 * Complete a callout item *
1716 *************************************************/
1717
1718 /* A callout item contains the length of the next item in the pattern, which
1719 we can't fill in till after we have reached the relevant point. This is used
1720 for both automatic and manual callouts.
1721
1722 Arguments:
1723 previous_callout points to previous callout item
1724 ptr current pattern pointer
1725 cd pointers to tables etc
1726
1727 Returns: nothing
1728 */
1729
1730 static void
1731 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1732 {
1733 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1734 PUT(previous_callout, 2 + LINK_SIZE, length);
1735 }
1736
1737
1738
1739 #ifdef SUPPORT_UCP
1740 /*************************************************
1741 * Get othercase range *
1742 *************************************************/
1743
1744 /* This function is passed the start and end of a class range, in UTF-8 mode
1745 with UCP support. It searches up the characters, looking for internal ranges of
1746 characters in the "other" case. Each call returns the next one, updating the
1747 start address.
1748
1749 Arguments:
1750 cptr points to starting character value; updated
1751 d end value
1752 ocptr where to put start of othercase range
1753 odptr where to put end of othercase range
1754
1755 Yield: TRUE when range returned; FALSE when no more
1756 */
1757
1758 static BOOL
1759 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1760 unsigned int *odptr)
1761 {
1762 unsigned int c, othercase, next;
1763
1764 for (c = *cptr; c <= d; c++)
1765 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1766
1767 if (c > d) return FALSE;
1768
1769 *ocptr = othercase;
1770 next = othercase + 1;
1771
1772 for (++c; c <= d; c++)
1773 {
1774 if (_pcre_ucp_othercase(c) != next) break;
1775 next++;
1776 }
1777
1778 *odptr = next - 1;
1779 *cptr = c;
1780
1781 return TRUE;
1782 }
1783 #endif /* SUPPORT_UCP */
1784
1785
1786
1787 /*************************************************
1788 * Check if auto-possessifying is possible *
1789 *************************************************/
1790
1791 /* This function is called for unlimited repeats of certain items, to see
1792 whether the next thing could possibly match the repeated item. If not, it makes
1793 sense to automatically possessify the repeated item.
1794
1795 Arguments:
1796 op_code the repeated op code
1797 this data for this item, depends on the opcode
1798 utf8 TRUE in UTF-8 mode
1799 utf8_char used for utf8 character bytes, NULL if not relevant
1800 ptr next character in pattern
1801 options options bits
1802 cd contains pointers to tables etc.
1803
1804 Returns: TRUE if possessifying is wanted
1805 */
1806
1807 static BOOL
1808 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1809 const uschar *ptr, int options, compile_data *cd)
1810 {
1811 int next;
1812
1813 /* Skip whitespace and comments in extended mode */
1814
1815 if ((options & PCRE_EXTENDED) != 0)
1816 {
1817 for (;;)
1818 {
1819 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1820 if (*ptr == '#')
1821 {
1822 while (*(++ptr) != 0)
1823 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1824 }
1825 else break;
1826 }
1827 }
1828
1829 /* If the next item is one that we can handle, get its value. A non-negative
1830 value is a character, a negative value is an escape value. */
1831
1832 if (*ptr == '\\')
1833 {
1834 int temperrorcode = 0;
1835 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1836 if (temperrorcode != 0) return FALSE;
1837 ptr++; /* Point after the escape sequence */
1838 }
1839
1840 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1841 {
1842 #ifdef SUPPORT_UTF8
1843 if (utf8) { GETCHARINC(next, ptr); } else
1844 #endif
1845 next = *ptr++;
1846 }
1847
1848 else return FALSE;
1849
1850 /* Skip whitespace and comments in extended mode */
1851
1852 if ((options & PCRE_EXTENDED) != 0)
1853 {
1854 for (;;)
1855 {
1856 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1857 if (*ptr == '#')
1858 {
1859 while (*(++ptr) != 0)
1860 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1861 }
1862 else break;
1863 }
1864 }
1865
1866 /* If the next thing is itself optional, we have to give up. */
1867
1868 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1869 return FALSE;
1870
1871 /* Now compare the next item with the previous opcode. If the previous is a
1872 positive single character match, "item" either contains the character or, if
1873 "item" is greater than 127 in utf8 mode, the character's bytes are in
1874 utf8_char. */
1875
1876
1877 /* Handle cases when the next item is a character. */
1878
1879 if (next >= 0) switch(op_code)
1880 {
1881 case OP_CHAR:
1882 #ifdef SUPPORT_UTF8
1883 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1884 #endif
1885 return item != next;
1886
1887 /* For CHARNC (caseless character) we must check the other case. If we have
1888 Unicode property support, we can use it to test the other case of
1889 high-valued characters. */
1890
1891 case OP_CHARNC:
1892 #ifdef SUPPORT_UTF8
1893 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894 #endif
1895 if (item == next) return FALSE;
1896 #ifdef SUPPORT_UTF8
1897 if (utf8)
1898 {
1899 unsigned int othercase;
1900 if (next < 128) othercase = cd->fcc[next]; else
1901 #ifdef SUPPORT_UCP
1902 othercase = _pcre_ucp_othercase((unsigned int)next);
1903 #else
1904 othercase = NOTACHAR;
1905 #endif
1906 return (unsigned int)item != othercase;
1907 }
1908 else
1909 #endif /* SUPPORT_UTF8 */
1910 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1911
1912 /* For OP_NOT, "item" must be a single-byte character. */
1913
1914 case OP_NOT:
1915 if (next < 0) return FALSE; /* Not a character */
1916 if (item == next) return TRUE;
1917 if ((options & PCRE_CASELESS) == 0) return FALSE;
1918 #ifdef SUPPORT_UTF8
1919 if (utf8)
1920 {
1921 unsigned int othercase;
1922 if (next < 128) othercase = cd->fcc[next]; else
1923 #ifdef SUPPORT_UCP
1924 othercase = _pcre_ucp_othercase(next);
1925 #else
1926 othercase = NOTACHAR;
1927 #endif
1928 return (unsigned int)item == othercase;
1929 }
1930 else
1931 #endif /* SUPPORT_UTF8 */
1932 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1933
1934 case OP_DIGIT:
1935 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1936
1937 case OP_NOT_DIGIT:
1938 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1939
1940 case OP_WHITESPACE:
1941 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1942
1943 case OP_NOT_WHITESPACE:
1944 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1945
1946 case OP_WORDCHAR:
1947 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1948
1949 case OP_NOT_WORDCHAR:
1950 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1951
1952 case OP_HSPACE:
1953 case OP_NOT_HSPACE:
1954 switch(next)
1955 {
1956 case 0x09:
1957 case 0x20:
1958 case 0xa0:
1959 case 0x1680:
1960 case 0x180e:
1961 case 0x2000:
1962 case 0x2001:
1963 case 0x2002:
1964 case 0x2003:
1965 case 0x2004:
1966 case 0x2005:
1967 case 0x2006:
1968 case 0x2007:
1969 case 0x2008:
1970 case 0x2009:
1971 case 0x200A:
1972 case 0x202f:
1973 case 0x205f:
1974 case 0x3000:
1975 return op_code != OP_HSPACE;
1976 default:
1977 return op_code == OP_HSPACE;
1978 }
1979
1980 case OP_VSPACE:
1981 case OP_NOT_VSPACE:
1982 switch(next)
1983 {
1984 case 0x0a:
1985 case 0x0b:
1986 case 0x0c:
1987 case 0x0d:
1988 case 0x85:
1989 case 0x2028:
1990 case 0x2029:
1991 return op_code != OP_VSPACE;
1992 default:
1993 return op_code == OP_VSPACE;
1994 }
1995
1996 default:
1997 return FALSE;
1998 }
1999
2000
2001 /* Handle the case when the next item is \d, \s, etc. */
2002
2003 switch(op_code)
2004 {
2005 case OP_CHAR:
2006 case OP_CHARNC:
2007 #ifdef SUPPORT_UTF8
2008 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2009 #endif
2010 switch(-next)
2011 {
2012 case ESC_d:
2013 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2014
2015 case ESC_D:
2016 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2017
2018 case ESC_s:
2019 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2020
2021 case ESC_S:
2022 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2023
2024 case ESC_w:
2025 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2026
2027 case ESC_W:
2028 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2029
2030 case ESC_h:
2031 case ESC_H:
2032 switch(item)
2033 {
2034 case 0x09:
2035 case 0x20:
2036 case 0xa0:
2037 case 0x1680:
2038 case 0x180e:
2039 case 0x2000:
2040 case 0x2001:
2041 case 0x2002:
2042 case 0x2003:
2043 case 0x2004:
2044 case 0x2005:
2045 case 0x2006:
2046 case 0x2007:
2047 case 0x2008:
2048 case 0x2009:
2049 case 0x200A:
2050 case 0x202f:
2051 case 0x205f:
2052 case 0x3000:
2053 return -next != ESC_h;
2054 default:
2055 return -next == ESC_h;
2056 }
2057
2058 case ESC_v:
2059 case ESC_V:
2060 switch(item)
2061 {
2062 case 0x0a:
2063 case 0x0b:
2064 case 0x0c:
2065 case 0x0d:
2066 case 0x85:
2067 case 0x2028:
2068 case 0x2029:
2069 return -next != ESC_v;
2070 default:
2071 return -next == ESC_v;
2072 }
2073
2074 default:
2075 return FALSE;
2076 }
2077
2078 case OP_DIGIT:
2079 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2080 next == -ESC_h || next == -ESC_v;
2081
2082 case OP_NOT_DIGIT:
2083 return next == -ESC_d;
2084
2085 case OP_WHITESPACE:
2086 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2087
2088 case OP_NOT_WHITESPACE:
2089 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2090
2091 case OP_HSPACE:
2092 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2093
2094 case OP_NOT_HSPACE:
2095 return next == -ESC_h;
2096
2097 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2098 case OP_VSPACE:
2099 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2100
2101 case OP_NOT_VSPACE:
2102 return next == -ESC_v;
2103
2104 case OP_WORDCHAR:
2105 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2106
2107 case OP_NOT_WORDCHAR:
2108 return next == -ESC_w || next == -ESC_d;
2109
2110 default:
2111 return FALSE;
2112 }
2113
2114 /* Control does not reach here */
2115 }
2116
2117
2118
2119 /*************************************************
2120 * Compile one branch *
2121 *************************************************/
2122
2123 /* Scan the pattern, compiling it into a vector. If the options are
2124 changed during the branch, the pointer is used to change the external options
2125 bits. This function is used during the pre-compile phase when we are trying
2126 to find out the amount of memory needed, as well as during the real compile
2127 phase. The value of lengthptr distinguishes the two phases.
2128
2129 Arguments:
2130 optionsptr pointer to the option bits
2131 codeptr points to the pointer to the current code point
2132 ptrptr points to the current pattern pointer
2133 errorcodeptr points to error code variable
2134 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2135 reqbyteptr set to the last literal character required, else < 0
2136 bcptr points to current branch chain
2137 cd contains pointers to tables etc.
2138 lengthptr NULL during the real compile phase
2139 points to length accumulator during pre-compile phase
2140
2141 Returns: TRUE on success
2142 FALSE, with *errorcodeptr set non-zero on error
2143 */
2144
2145 static BOOL
2146 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2147 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2148 compile_data *cd, int *lengthptr)
2149 {
2150 int repeat_type, op_type;
2151 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2152 int bravalue = 0;
2153 int greedy_default, greedy_non_default;
2154 int firstbyte, reqbyte;
2155 int zeroreqbyte, zerofirstbyte;
2156 int req_caseopt, reqvary, tempreqvary;
2157 int options = *optionsptr;
2158 int after_manual_callout = 0;
2159 int length_prevgroup = 0;
2160 register int c;
2161 register uschar *code = *codeptr;
2162 uschar *last_code = code;
2163 uschar *orig_code = code;
2164 uschar *tempcode;
2165 BOOL inescq = FALSE;
2166 BOOL groupsetfirstbyte = FALSE;
2167 const uschar *ptr = *ptrptr;
2168 const uschar *tempptr;
2169 uschar *previous = NULL;
2170 uschar *previous_callout = NULL;
2171 uschar *save_hwm = NULL;
2172 uschar classbits[32];
2173
2174 #ifdef SUPPORT_UTF8
2175 BOOL class_utf8;
2176 BOOL utf8 = (options & PCRE_UTF8) != 0;
2177 uschar *class_utf8data;
2178 uschar utf8_char[6];
2179 #else
2180 BOOL utf8 = FALSE;
2181 uschar *utf8_char = NULL;
2182 #endif
2183
2184 #ifdef DEBUG
2185 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2186 #endif
2187
2188 /* Set up the default and non-default settings for greediness */
2189
2190 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2191 greedy_non_default = greedy_default ^ 1;
2192
2193 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2194 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2195 matches a non-fixed first char; reqbyte just remains unset if we never
2196 find one.
2197
2198 When we hit a repeat whose minimum is zero, we may have to adjust these values
2199 to take the zero repeat into account. This is implemented by setting them to
2200 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2201 item types that can be repeated set these backoff variables appropriately. */
2202
2203 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2204
2205 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2206 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2207 value > 255. It is added into the firstbyte or reqbyte variables to record the
2208 case status of the value. This is used only for ASCII characters. */
2209
2210 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2211
2212 /* Switch on next character until the end of the branch */
2213
2214 for (;; ptr++)
2215 {
2216 BOOL negate_class;
2217 BOOL possessive_quantifier;
2218 BOOL is_quantifier;
2219 BOOL is_recurse;
2220 BOOL reset_bracount;
2221 int class_charcount;
2222 int class_lastchar;
2223 int newoptions;
2224 int recno;
2225 int refsign;
2226 int skipbytes;
2227 int subreqbyte;
2228 int subfirstbyte;
2229 int terminator;
2230 int mclength;
2231 uschar mcbuffer[8];
2232
2233 /* Get next byte in the pattern */
2234
2235 c = *ptr;
2236
2237 /* If we are in the pre-compile phase, accumulate the length used for the
2238 previous cycle of this loop. */
2239
2240 if (lengthptr != NULL)
2241 {
2242 #ifdef DEBUG
2243 if (code > cd->hwm) cd->hwm = code; /* High water info */
2244 #endif
2245 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2246 {
2247 *errorcodeptr = ERR52;
2248 goto FAILED;
2249 }
2250
2251 /* There is at least one situation where code goes backwards: this is the
2252 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2253 the class is simply eliminated. However, it is created first, so we have to
2254 allow memory for it. Therefore, don't ever reduce the length at this point.
2255 */
2256
2257 if (code < last_code) code = last_code;
2258 *lengthptr += code - last_code;
2259 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2260
2261 /* If "previous" is set and it is not at the start of the work space, move
2262 it back to there, in order to avoid filling up the work space. Otherwise,
2263 if "previous" is NULL, reset the current code pointer to the start. */
2264
2265 if (previous != NULL)
2266 {
2267 if (previous > orig_code)
2268 {
2269 memmove(orig_code, previous, code - previous);
2270 code -= previous - orig_code;
2271 previous = orig_code;
2272 }
2273 }
2274 else code = orig_code;
2275
2276 /* Remember where this code item starts so we can pick up the length
2277 next time round. */
2278
2279 last_code = code;
2280 }
2281
2282 /* In the real compile phase, just check the workspace used by the forward
2283 reference list. */
2284
2285 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2286 {
2287 *errorcodeptr = ERR52;
2288 goto FAILED;
2289 }
2290
2291 /* If in \Q...\E, check for the end; if not, we have a literal */
2292
2293 if (inescq && c != 0)
2294 {
2295 if (c == '\\' && ptr[1] == 'E')
2296 {
2297 inescq = FALSE;
2298 ptr++;
2299 continue;
2300 }
2301 else
2302 {
2303 if (previous_callout != NULL)
2304 {
2305 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2306 complete_callout(previous_callout, ptr, cd);
2307 previous_callout = NULL;
2308 }
2309 if ((options & PCRE_AUTO_CALLOUT) != 0)
2310 {
2311 previous_callout = code;
2312 code = auto_callout(code, ptr, cd);
2313 }
2314 goto NORMAL_CHAR;
2315 }
2316 }
2317
2318 /* Fill in length of a previous callout, except when the next thing is
2319 a quantifier. */
2320
2321 is_quantifier = c == '*' || c == '+' || c == '?' ||
2322 (c == '{' && is_counted_repeat(ptr+1));
2323
2324 if (!is_quantifier && previous_callout != NULL &&
2325 after_manual_callout-- <= 0)
2326 {
2327 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2328 complete_callout(previous_callout, ptr, cd);
2329 previous_callout = NULL;
2330 }
2331
2332 /* In extended mode, skip white space and comments */
2333
2334 if ((options & PCRE_EXTENDED) != 0)
2335 {
2336 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2337 if (c == '#')
2338 {
2339 while (*(++ptr) != 0)
2340 {
2341 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2342 }
2343 if (*ptr != 0) continue;
2344
2345 /* Else fall through to handle end of string */
2346 c = 0;
2347 }
2348 }
2349
2350 /* No auto callout for quantifiers. */
2351
2352 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2353 {
2354 previous_callout = code;
2355 code = auto_callout(code, ptr, cd);
2356 }
2357
2358 switch(c)
2359 {
2360 /* ===================================================================*/
2361 case 0: /* The branch terminates at string end */
2362 case '|': /* or | or ) */
2363 case ')':
2364 *firstbyteptr = firstbyte;
2365 *reqbyteptr = reqbyte;
2366 *codeptr = code;
2367 *ptrptr = ptr;
2368 if (lengthptr != NULL)
2369 {
2370 *lengthptr += code - last_code; /* To include callout length */
2371 DPRINTF((">> end branch\n"));
2372 }
2373 return TRUE;
2374
2375
2376 /* ===================================================================*/
2377 /* Handle single-character metacharacters. In multiline mode, ^ disables
2378 the setting of any following char as a first character. */
2379
2380 case '^':
2381 if ((options & PCRE_MULTILINE) != 0)
2382 {
2383 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2384 }
2385 previous = NULL;
2386 *code++ = OP_CIRC;
2387 break;
2388
2389 case '$':
2390 previous = NULL;
2391 *code++ = OP_DOLL;
2392 break;
2393
2394 /* There can never be a first char if '.' is first, whatever happens about
2395 repeats. The value of reqbyte doesn't change either. */
2396
2397 case '.':
2398 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2399 zerofirstbyte = firstbyte;
2400 zeroreqbyte = reqbyte;
2401 previous = code;
2402 *code++ = OP_ANY;
2403 break;
2404
2405
2406 /* ===================================================================*/
2407 /* Character classes. If the included characters are all < 256, we build a
2408 32-byte bitmap of the permitted characters, except in the special case
2409 where there is only one such character. For negated classes, we build the
2410 map as usual, then invert it at the end. However, we use a different opcode
2411 so that data characters > 255 can be handled correctly.
2412
2413 If the class contains characters outside the 0-255 range, a different
2414 opcode is compiled. It may optionally have a bit map for characters < 256,
2415 but those above are explicitly listed afterwards. A flag byte tells
2416 whether the bitmap is present, and whether this is a negated class or not.
2417 */
2418
2419 case '[':
2420 previous = code;
2421
2422 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2423 they are encountered at the top level, so we'll do that too. */
2424
2425 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2426 check_posix_syntax(ptr, &tempptr, cd))
2427 {
2428 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2429 goto FAILED;
2430 }
2431
2432 /* If the first character is '^', set the negation flag and skip it. */
2433
2434 if ((c = *(++ptr)) == '^')
2435 {
2436 negate_class = TRUE;
2437 c = *(++ptr);
2438 }
2439 else
2440 {
2441 negate_class = FALSE;
2442 }
2443
2444 /* Keep a count of chars with values < 256 so that we can optimize the case
2445 of just a single character (as long as it's < 256). However, for higher
2446 valued UTF-8 characters, we don't yet do any optimization. */
2447
2448 class_charcount = 0;
2449 class_lastchar = -1;
2450
2451 /* Initialize the 32-char bit map to all zeros. We build the map in a
2452 temporary bit of memory, in case the class contains only 1 character (less
2453 than 256), because in that case the compiled code doesn't use the bit map.
2454 */
2455
2456 memset(classbits, 0, 32 * sizeof(uschar));
2457
2458 #ifdef SUPPORT_UTF8
2459 class_utf8 = FALSE; /* No chars >= 256 */
2460 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2461 #endif
2462
2463 /* Process characters until ] is reached. By writing this as a "do" it
2464 means that an initial ] is taken as a data character. At the start of the
2465 loop, c contains the first byte of the character. */
2466
2467 if (c != 0) do
2468 {
2469 const uschar *oldptr;
2470
2471 #ifdef SUPPORT_UTF8
2472 if (utf8 && c > 127)
2473 { /* Braces are required because the */
2474 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2475 }
2476 #endif
2477
2478 /* Inside \Q...\E everything is literal except \E */
2479
2480 if (inescq)
2481 {
2482 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2483 {
2484 inescq = FALSE; /* Reset literal state */
2485 ptr++; /* Skip the 'E' */
2486 continue; /* Carry on with next */
2487 }
2488 goto CHECK_RANGE; /* Could be range if \E follows */
2489 }
2490
2491 /* Handle POSIX class names. Perl allows a negation extension of the
2492 form [:^name:]. A square bracket that doesn't match the syntax is
2493 treated as a literal. We also recognize the POSIX constructions
2494 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2495 5.6 and 5.8 do. */
2496
2497 if (c == '[' &&
2498 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2499 check_posix_syntax(ptr, &tempptr, cd))
2500 {
2501 BOOL local_negate = FALSE;
2502 int posix_class, taboffset, tabopt;
2503 register const uschar *cbits = cd->cbits;
2504 uschar pbits[32];
2505
2506 if (ptr[1] != ':')
2507 {
2508 *errorcodeptr = ERR31;
2509 goto FAILED;
2510 }
2511
2512 ptr += 2;
2513 if (*ptr == '^')
2514 {
2515 local_negate = TRUE;
2516 ptr++;
2517 }
2518
2519 posix_class = check_posix_name(ptr, tempptr - ptr);
2520 if (posix_class < 0)
2521 {
2522 *errorcodeptr = ERR30;
2523 goto FAILED;
2524 }
2525
2526 /* If matching is caseless, upper and lower are converted to
2527 alpha. This relies on the fact that the class table starts with
2528 alpha, lower, upper as the first 3 entries. */
2529
2530 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2531 posix_class = 0;
2532
2533 /* We build the bit map for the POSIX class in a chunk of local store
2534 because we may be adding and subtracting from it, and we don't want to
2535 subtract bits that may be in the main map already. At the end we or the
2536 result into the bit map that is being built. */
2537
2538 posix_class *= 3;
2539
2540 /* Copy in the first table (always present) */
2541
2542 memcpy(pbits, cbits + posix_class_maps[posix_class],
2543 32 * sizeof(uschar));
2544
2545 /* If there is a second table, add or remove it as required. */
2546
2547 taboffset = posix_class_maps[posix_class + 1];
2548 tabopt = posix_class_maps[posix_class + 2];
2549
2550 if (taboffset >= 0)
2551 {
2552 if (tabopt >= 0)
2553 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2554 else
2555 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2556 }
2557
2558 /* Now see if we need to remove any special characters. An option
2559 value of 1 removes vertical space and 2 removes underscore. */
2560
2561 if (tabopt < 0) tabopt = -tabopt;
2562 if (tabopt == 1) pbits[1] &= ~0x3c;
2563 else if (tabopt == 2) pbits[11] &= 0x7f;
2564
2565 /* Add the POSIX table or its complement into the main table that is
2566 being built and we are done. */
2567
2568 if (local_negate)
2569 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2570 else
2571 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2572
2573 ptr = tempptr + 1;
2574 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2575 continue; /* End of POSIX syntax handling */
2576 }
2577
2578 /* Backslash may introduce a single character, or it may introduce one
2579 of the specials, which just set a flag. The sequence \b is a special
2580 case. Inside a class (and only there) it is treated as backspace.
2581 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2582 to or into the one we are building. We assume they have more than one
2583 character in them, so set class_charcount bigger than one. */
2584
2585 if (c == '\\')
2586 {
2587 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2588 if (*errorcodeptr != 0) goto FAILED;
2589
2590 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2591 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2592 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2593 else if (-c == ESC_Q) /* Handle start of quoted string */
2594 {
2595 if (ptr[1] == '\\' && ptr[2] == 'E')
2596 {
2597 ptr += 2; /* avoid empty string */
2598 }
2599 else inescq = TRUE;
2600 continue;
2601 }
2602
2603 if (c < 0)
2604 {
2605 register const uschar *cbits = cd->cbits;
2606 class_charcount += 2; /* Greater than 1 is what matters */
2607
2608 /* Save time by not doing this in the pre-compile phase. */
2609
2610 if (lengthptr == NULL) switch (-c)
2611 {
2612 case ESC_d:
2613 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2614 continue;
2615
2616 case ESC_D:
2617 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2618 continue;
2619
2620 case ESC_w:
2621 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2622 continue;
2623
2624 case ESC_W:
2625 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2626 continue;
2627
2628 case ESC_s:
2629 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2630 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2631 continue;
2632
2633 case ESC_S:
2634 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2635 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2636 continue;
2637
2638 case ESC_E: /* Perl ignores an orphan \E */
2639 continue;
2640
2641 default: /* Not recognized; fall through */
2642 break; /* Need "default" setting to stop compiler warning. */
2643 }
2644
2645 /* In the pre-compile phase, just do the recognition. */
2646
2647 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2648 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2649
2650 /* We need to deal with \H, \h, \V, and \v in both phases because
2651 they use extra memory. */
2652
2653 if (-c == ESC_h)
2654 {
2655 SETBIT(classbits, 0x09); /* VT */
2656 SETBIT(classbits, 0x20); /* SPACE */
2657 SETBIT(classbits, 0xa0); /* NSBP */
2658 #ifdef SUPPORT_UTF8
2659 if (utf8)
2660 {
2661 class_utf8 = TRUE;
2662 *class_utf8data++ = XCL_SINGLE;
2663 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2664 *class_utf8data++ = XCL_SINGLE;
2665 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2666 *class_utf8data++ = XCL_RANGE;
2667 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2668 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2669 *class_utf8data++ = XCL_SINGLE;
2670 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2671 *class_utf8data++ = XCL_SINGLE;
2672 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2673 *class_utf8data++ = XCL_SINGLE;
2674 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2675 }
2676 #endif
2677 continue;
2678 }
2679
2680 if (-c == ESC_H)
2681 {
2682 for (c = 0; c < 32; c++)
2683 {
2684 int x = 0xff;
2685 switch (c)
2686 {
2687 case 0x09/8: x ^= 1 << (0x09%8); break;
2688 case 0x20/8: x ^= 1 << (0x20%8); break;
2689 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2690 default: break;
2691 }
2692 classbits[c] |= x;
2693 }
2694
2695 #ifdef SUPPORT_UTF8
2696 if (utf8)
2697 {
2698 class_utf8 = TRUE;
2699 *class_utf8data++ = XCL_RANGE;
2700 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2701 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2702 *class_utf8data++ = XCL_RANGE;
2703 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2704 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2705 *class_utf8data++ = XCL_RANGE;
2706 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2707 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2708 *class_utf8data++ = XCL_RANGE;
2709 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2710 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2711 *class_utf8data++ = XCL_RANGE;
2712 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2713 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2714 *class_utf8data++ = XCL_RANGE;
2715 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2716 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2717 *class_utf8data++ = XCL_RANGE;
2718 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2719 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2720 }
2721 #endif
2722 continue;
2723 }
2724
2725 if (-c == ESC_v)
2726 {
2727 SETBIT(classbits, 0x0a); /* LF */
2728 SETBIT(classbits, 0x0b); /* VT */
2729 SETBIT(classbits, 0x0c); /* FF */
2730 SETBIT(classbits, 0x0d); /* CR */
2731 SETBIT(classbits, 0x85); /* NEL */
2732 #ifdef SUPPORT_UTF8
2733 if (utf8)
2734 {
2735 class_utf8 = TRUE;
2736 *class_utf8data++ = XCL_RANGE;
2737 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2738 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2739 }
2740 #endif
2741 continue;
2742 }
2743
2744 if (-c == ESC_V)
2745 {
2746 for (c = 0; c < 32; c++)
2747 {
2748 int x = 0xff;
2749 switch (c)
2750 {
2751 case 0x0a/8: x ^= 1 << (0x0a%8);
2752 x ^= 1 << (0x0b%8);
2753 x ^= 1 << (0x0c%8);
2754 x ^= 1 << (0x0d%8);
2755 break;
2756 case 0x85/8: x ^= 1 << (0x85%8); break;
2757 default: break;
2758 }
2759 classbits[c] |= x;
2760 }
2761
2762 #ifdef SUPPORT_UTF8
2763 if (utf8)
2764 {
2765 class_utf8 = TRUE;
2766 *class_utf8data++ = XCL_RANGE;
2767 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2768 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2769 *class_utf8data++ = XCL_RANGE;
2770 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2771 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2772 }
2773 #endif
2774 continue;
2775 }
2776
2777 /* We need to deal with \P and \p in both phases. */
2778
2779 #ifdef SUPPORT_UCP
2780 if (-c == ESC_p || -c == ESC_P)
2781 {
2782 BOOL negated;
2783 int pdata;
2784 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2785 if (ptype < 0) goto FAILED;
2786 class_utf8 = TRUE;
2787 *class_utf8data++ = ((-c == ESC_p) != negated)?
2788 XCL_PROP : XCL_NOTPROP;
2789 *class_utf8data++ = ptype;
2790 *class_utf8data++ = pdata;
2791 class_charcount -= 2; /* Not a < 256 character */
2792 continue;
2793 }
2794 #endif
2795 /* Unrecognized escapes are faulted if PCRE is running in its
2796 strict mode. By default, for compatibility with Perl, they are
2797 treated as literals. */
2798
2799 if ((options & PCRE_EXTRA) != 0)
2800 {
2801 *errorcodeptr = ERR7;
2802 goto FAILED;
2803 }
2804
2805 class_charcount -= 2; /* Undo the default count from above */
2806 c = *ptr; /* Get the final character and fall through */
2807 }
2808
2809 /* Fall through if we have a single character (c >= 0). This may be
2810 greater than 256 in UTF-8 mode. */
2811
2812 } /* End of backslash handling */
2813
2814 /* A single character may be followed by '-' to form a range. However,
2815 Perl does not permit ']' to be the end of the range. A '-' character
2816 at the end is treated as a literal. Perl ignores orphaned \E sequences
2817 entirely. The code for handling \Q and \E is messy. */
2818
2819 CHECK_RANGE:
2820 while (ptr[1] == '\\' && ptr[2] == 'E')
2821 {
2822 inescq = FALSE;
2823 ptr += 2;
2824 }
2825
2826 oldptr = ptr;
2827
2828 if (!inescq && ptr[1] == '-')
2829 {
2830 int d;
2831 ptr += 2;
2832 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2833
2834 /* If we hit \Q (not followed by \E) at this point, go into escaped
2835 mode. */
2836
2837 while (*ptr == '\\' && ptr[1] == 'Q')
2838 {
2839 ptr += 2;
2840 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2841 inescq = TRUE;
2842 break;
2843 }
2844
2845 if (*ptr == 0 || (!inescq && *ptr == ']'))
2846 {
2847 ptr = oldptr;
2848 goto LONE_SINGLE_CHARACTER;
2849 }
2850
2851 #ifdef SUPPORT_UTF8
2852 if (utf8)
2853 { /* Braces are required because the */
2854 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2855 }
2856 else
2857 #endif
2858 d = *ptr; /* Not UTF-8 mode */
2859
2860 /* The second part of a range can be a single-character escape, but
2861 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2862 in such circumstances. */
2863
2864 if (!inescq && d == '\\')
2865 {
2866 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2867 if (*errorcodeptr != 0) goto FAILED;
2868
2869 /* \b is backspace; \X is literal X; \R is literal R; any other
2870 special means the '-' was literal */
2871
2872 if (d < 0)
2873 {
2874 if (d == -ESC_b) d = '\b';
2875 else if (d == -ESC_X) d = 'X';
2876 else if (d == -ESC_R) d = 'R'; else
2877 {
2878 ptr = oldptr;
2879 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2880 }
2881 }
2882 }
2883
2884 /* Check that the two values are in the correct order. Optimize
2885 one-character ranges */
2886
2887 if (d < c)
2888 {
2889 *errorcodeptr = ERR8;
2890 goto FAILED;
2891 }
2892
2893 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2894
2895 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2896 matching, we have to use an XCLASS with extra data items. Caseless
2897 matching for characters > 127 is available only if UCP support is
2898 available. */
2899
2900 #ifdef SUPPORT_UTF8
2901 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2902 {
2903 class_utf8 = TRUE;
2904
2905 /* With UCP support, we can find the other case equivalents of
2906 the relevant characters. There may be several ranges. Optimize how
2907 they fit with the basic range. */
2908
2909 #ifdef SUPPORT_UCP
2910 if ((options & PCRE_CASELESS) != 0)
2911 {
2912 unsigned int occ, ocd;
2913 unsigned int cc = c;
2914 unsigned int origd = d;
2915 while (get_othercase_range(&cc, origd, &occ, &ocd))
2916 {
2917 if (occ >= (unsigned int)c &&
2918 ocd <= (unsigned int)d)
2919 continue; /* Skip embedded ranges */
2920
2921 if (occ < (unsigned int)c &&
2922 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2923 { /* if there is overlap, */
2924 c = occ; /* noting that if occ < c */
2925 continue; /* we can't have ocd > d */
2926 } /* because a subrange is */
2927 if (ocd > (unsigned int)d &&
2928 occ <= (unsigned int)d + 1) /* always shorter than */
2929 { /* the basic range. */
2930 d = ocd;
2931 continue;
2932 }
2933
2934 if (occ == ocd)
2935 {
2936 *class_utf8data++ = XCL_SINGLE;
2937 }
2938 else
2939 {
2940 *class_utf8data++ = XCL_RANGE;
2941 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2942 }
2943 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2944 }
2945 }
2946 #endif /* SUPPORT_UCP */
2947
2948 /* Now record the original range, possibly modified for UCP caseless
2949 overlapping ranges. */
2950
2951 *class_utf8data++ = XCL_RANGE;
2952 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2953 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2954
2955 /* With UCP support, we are done. Without UCP support, there is no
2956 caseless matching for UTF-8 characters > 127; we can use the bit map
2957 for the smaller ones. */
2958
2959 #ifdef SUPPORT_UCP
2960 continue; /* With next character in the class */
2961 #else
2962 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2963
2964 /* Adjust upper limit and fall through to set up the map */
2965
2966 d = 127;
2967
2968 #endif /* SUPPORT_UCP */
2969 }
2970 #endif /* SUPPORT_UTF8 */
2971
2972 /* We use the bit map for all cases when not in UTF-8 mode; else
2973 ranges that lie entirely within 0-127 when there is UCP support; else
2974 for partial ranges without UCP support. */
2975
2976 class_charcount += d - c + 1;
2977 class_lastchar = d;
2978
2979 /* We can save a bit of time by skipping this in the pre-compile. */
2980
2981 if (lengthptr == NULL) for (; c <= d; c++)
2982 {
2983 classbits[c/8] |= (1 << (c&7));
2984 if ((options & PCRE_CASELESS) != 0)
2985 {
2986 int uc = cd->fcc[c]; /* flip case */
2987 classbits[uc/8] |= (1 << (uc&7));
2988 }
2989 }
2990
2991 continue; /* Go get the next char in the class */
2992 }
2993
2994 /* Handle a lone single character - we can get here for a normal
2995 non-escape char, or after \ that introduces a single character or for an
2996 apparent range that isn't. */
2997
2998 LONE_SINGLE_CHARACTER:
2999
3000 /* Handle a character that cannot go in the bit map */
3001
3002 #ifdef SUPPORT_UTF8
3003 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3004 {
3005 class_utf8 = TRUE;
3006 *class_utf8data++ = XCL_SINGLE;
3007 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3008
3009 #ifdef SUPPORT_UCP
3010 if ((options & PCRE_CASELESS) != 0)
3011 {
3012 unsigned int othercase;
3013 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3014 {
3015 *class_utf8data++ = XCL_SINGLE;
3016 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3017 }
3018 }
3019 #endif /* SUPPORT_UCP */
3020
3021 }
3022 else
3023 #endif /* SUPPORT_UTF8 */
3024
3025 /* Handle a single-byte character */
3026 {
3027 classbits[c/8] |= (1 << (c&7));
3028 if ((options & PCRE_CASELESS) != 0)
3029 {
3030 c = cd->fcc[c]; /* flip case */
3031 classbits[c/8] |= (1 << (c&7));
3032 }
3033 class_charcount++;
3034 class_lastchar = c;
3035 }
3036 }
3037
3038 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3039
3040 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3041
3042 if (c == 0) /* Missing terminating ']' */
3043 {
3044 *errorcodeptr = ERR6;
3045 goto FAILED;
3046 }
3047
3048 /* If class_charcount is 1, we saw precisely one character whose value is
3049 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3050 can optimize the negative case only if there were no characters >= 128
3051 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3052 single-bytes only. This is an historical hangover. Maybe one day we can
3053 tidy these opcodes to handle multi-byte characters.
3054
3055 The optimization throws away the bit map. We turn the item into a
3056 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3057 that OP_NOT does not support multibyte characters. In the positive case, it
3058 can cause firstbyte to be set. Otherwise, there can be no first char if
3059 this item is first, whatever repeat count may follow. In the case of
3060 reqbyte, save the previous value for reinstating. */
3061
3062 #ifdef SUPPORT_UTF8
3063 if (class_charcount == 1 &&
3064 (!utf8 ||
3065 (!class_utf8 && (!negate_class || class_lastchar < 128))))
3066
3067 #else
3068 if (class_charcount == 1)
3069 #endif
3070 {
3071 zeroreqbyte = reqbyte;
3072
3073 /* The OP_NOT opcode works on one-byte characters only. */
3074
3075 if (negate_class)
3076 {
3077 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3078 zerofirstbyte = firstbyte;
3079 *code++ = OP_NOT;
3080 *code++ = class_lastchar;
3081 break;
3082 }
3083
3084 /* For a single, positive character, get the value into mcbuffer, and
3085 then we can handle this with the normal one-character code. */
3086
3087 #ifdef SUPPORT_UTF8
3088 if (utf8 && class_lastchar > 127)
3089 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3090 else
3091 #endif
3092 {
3093 mcbuffer[0] = class_lastchar;
3094 mclength = 1;
3095 }
3096 goto ONE_CHAR;
3097 } /* End of 1-char optimization */
3098
3099 /* The general case - not the one-char optimization. If this is the first
3100 thing in the branch, there can be no first char setting, whatever the
3101 repeat count. Any reqbyte setting must remain unchanged after any kind of
3102 repeat. */
3103
3104 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3105 zerofirstbyte = firstbyte;
3106 zeroreqbyte = reqbyte;
3107
3108 /* If there are characters with values > 255, we have to compile an
3109 extended class, with its own opcode. If there are no characters < 256,
3110 we can omit the bitmap in the actual compiled code. */
3111
3112 #ifdef SUPPORT_UTF8
3113 if (class_utf8)
3114 {
3115 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3116 *code++ = OP_XCLASS;
3117 code += LINK_SIZE;
3118 *code = negate_class? XCL_NOT : 0;
3119
3120 /* If the map is required, move up the extra data to make room for it;
3121 otherwise just move the code pointer to the end of the extra data. */
3122
3123 if (class_charcount > 0)
3124 {
3125 *code++ |= XCL_MAP;
3126 memmove(code + 32, code, class_utf8data - code);
3127 memcpy(code, classbits, 32);
3128 code = class_utf8data + 32;
3129 }
3130 else code = class_utf8data;
3131
3132 /* Now fill in the complete length of the item */
3133
3134 PUT(previous, 1, code - previous);
3135 break; /* End of class handling */
3136 }
3137 #endif
3138
3139 /* If there are no characters > 255, negate the 32-byte map if necessary,
3140 and copy it into the code vector. If this is the first thing in the branch,
3141 there can be no first char setting, whatever the repeat count. Any reqbyte
3142 setting must remain unchanged after any kind of repeat. */
3143
3144 if (negate_class)
3145 {
3146 *code++ = OP_NCLASS;
3147 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3148 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3149 }
3150 else
3151 {
3152 *code++ = OP_CLASS;
3153 memcpy(code, classbits, 32);
3154 }
3155 code += 32;
3156 break;
3157
3158
3159 /* ===================================================================*/
3160 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3161 has been tested above. */
3162
3163 case '{':
3164 if (!is_quantifier) goto NORMAL_CHAR;
3165 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3166 if (*errorcodeptr != 0) goto FAILED;
3167 goto REPEAT;
3168
3169 case '*':
3170 repeat_min = 0;
3171 repeat_max = -1;
3172 goto REPEAT;
3173
3174 case '+':
3175 repeat_min = 1;
3176 repeat_max = -1;
3177 goto REPEAT;
3178
3179 case '?':
3180 repeat_min = 0;
3181 repeat_max = 1;
3182
3183 REPEAT:
3184 if (previous == NULL)
3185 {
3186 *errorcodeptr = ERR9;
3187 goto FAILED;
3188 }
3189
3190 if (repeat_min == 0)
3191 {
3192 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3193 reqbyte = zeroreqbyte; /* Ditto */
3194 }
3195
3196 /* Remember whether this is a variable length repeat */
3197
3198 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3199
3200 op_type = 0; /* Default single-char op codes */
3201 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3202
3203 /* Save start of previous item, in case we have to move it up to make space
3204 for an inserted OP_ONCE for the additional '+' extension. */
3205
3206 tempcode = previous;
3207
3208 /* If the next character is '+', we have a possessive quantifier. This
3209 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3210 If the next character is '?' this is a minimizing repeat, by default,
3211 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3212 repeat type to the non-default. */
3213
3214 if (ptr[1] == '+')
3215 {
3216 repeat_type = 0; /* Force greedy */
3217 possessive_quantifier = TRUE;
3218 ptr++;
3219 }
3220 else if (ptr[1] == '?')
3221 {
3222 repeat_type = greedy_non_default;
3223 ptr++;
3224 }
3225 else repeat_type = greedy_default;
3226
3227 /* If previous was a character match, abolish the item and generate a
3228 repeat item instead. If a char item has a minimum of more than one, ensure
3229 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3230 the first thing in a branch because the x will have gone into firstbyte
3231 instead. */
3232
3233 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3234 {
3235 /* Deal with UTF-8 characters that take up more than one byte. It's
3236 easier to write this out separately than try to macrify it. Use c to
3237 hold the length of the character in bytes, plus 0x80 to flag that it's a
3238 length rather than a small character. */
3239
3240 #ifdef SUPPORT_UTF8
3241 if (utf8 && (code[-1] & 0x80) != 0)
3242 {
3243 uschar *lastchar = code - 1;
3244 while((*lastchar & 0xc0) == 0x80) lastchar--;
3245 c = code - lastchar; /* Length of UTF-8 character */
3246 memcpy(utf8_char, lastchar, c); /* Save the char */
3247 c |= 0x80; /* Flag c as a length */
3248 }
3249 else
3250 #endif
3251
3252 /* Handle the case of a single byte - either with no UTF8 support, or
3253 with UTF-8 disabled, or for a UTF-8 character < 128. */
3254
3255 {
3256 c = code[-1];
3257 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3258 }
3259
3260 /* If the repetition is unlimited, it pays to see if the next thing on
3261 the line is something that cannot possibly match this character. If so,
3262 automatically possessifying this item gains some performance in the case
3263 where the match fails. */
3264
3265 if (!possessive_quantifier &&
3266 repeat_max < 0 &&
3267 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3268 options, cd))
3269 {
3270 repeat_type = 0; /* Force greedy */
3271 possessive_quantifier = TRUE;
3272 }
3273
3274 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3275 }
3276
3277 /* If previous was a single negated character ([^a] or similar), we use
3278 one of the special opcodes, replacing it. The code is shared with single-
3279 character repeats by setting op_type to add a suitable offset into
3280 repeat_type. We can also test for auto-possessification. OP_NOT is
3281 currently used only for single-byte chars. */
3282
3283 else if (*previous == OP_NOT)
3284 {
3285 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3286 c = previous[1];
3287 if (!possessive_quantifier &&
3288 repeat_max < 0 &&
3289 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3290 {
3291 repeat_type = 0; /* Force greedy */
3292 possessive_quantifier = TRUE;
3293 }
3294 goto OUTPUT_SINGLE_REPEAT;
3295 }
3296
3297 /* If previous was a character type match (\d or similar), abolish it and
3298 create a suitable repeat item. The code is shared with single-character
3299 repeats by setting op_type to add a suitable offset into repeat_type. Note
3300 that the Unicode property types will be present only when SUPPORT_UCP is
3301 defined, but we don't wrap the little bits of code here because it just
3302 makes it horribly messy. */
3303
3304 else if (*previous < OP_EODN)
3305 {
3306 uschar *oldcode;
3307 int prop_type, prop_value;
3308 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3309 c = *previous;
3310
3311 if (!possessive_quantifier &&
3312 repeat_max < 0 &&
3313 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3314 {
3315 repeat_type = 0; /* Force greedy */
3316 possessive_quantifier = TRUE;
3317 }
3318
3319 OUTPUT_SINGLE_REPEAT:
3320 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3321 {
3322 prop_type = previous[1];
3323 prop_value = previous[2];
3324 }
3325 else prop_type = prop_value = -1;
3326
3327 oldcode = code;
3328 code = previous; /* Usually overwrite previous item */
3329
3330 /* If the maximum is zero then the minimum must also be zero; Perl allows
3331 this case, so we do too - by simply omitting the item altogether. */
3332
3333 if (repeat_max == 0) goto END_REPEAT;
3334
3335 /* All real repeats make it impossible to handle partial matching (maybe
3336 one day we will be able to remove this restriction). */
3337
3338 if (repeat_max != 1) cd->nopartial = TRUE;
3339
3340 /* Combine the op_type with the repeat_type */
3341
3342 repeat_type += op_type;
3343
3344 /* A minimum of zero is handled either as the special case * or ?, or as
3345 an UPTO, with the maximum given. */
3346
3347 if (repeat_min == 0)
3348 {
3349 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3350 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3351 else
3352 {
3353 *code++ = OP_UPTO + repeat_type;
3354 PUT2INC(code, 0, repeat_max);
3355 }
3356 }
3357
3358 /* A repeat minimum of 1 is optimized into some special cases. If the
3359 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3360 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3361 one less than the maximum. */
3362
3363 else if (repeat_min == 1)
3364 {
3365 if (repeat_max == -1)
3366 *code++ = OP_PLUS + repeat_type;
3367 else
3368 {
3369 code = oldcode; /* leave previous item in place */
3370 if (repeat_max == 1) goto END_REPEAT;
3371 *code++ = OP_UPTO + repeat_type;
3372 PUT2INC(code, 0, repeat_max - 1);
3373 }
3374 }
3375
3376 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3377 handled as an EXACT followed by an UPTO. */
3378
3379 else
3380 {
3381 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3382 PUT2INC(code, 0, repeat_min);
3383
3384 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3385 we have to insert the character for the previous code. For a repeated
3386 Unicode property match, there are two extra bytes that define the
3387 required property. In UTF-8 mode, long characters have their length in
3388 c, with the 0x80 bit as a flag. */
3389
3390 if (repeat_max < 0)
3391 {
3392 #ifdef SUPPORT_UTF8
3393 if (utf8 && c >= 128)
3394 {
3395 memcpy(code, utf8_char, c & 7);
3396 code += c & 7;
3397 }
3398 else
3399 #endif
3400 {
3401 *code++ = c;
3402 if (prop_type >= 0)
3403 {
3404 *code++ = prop_type;
3405 *code++ = prop_value;
3406 }
3407 }
3408 *code++ = OP_STAR + repeat_type;
3409 }
3410
3411 /* Else insert an UPTO if the max is greater than the min, again
3412 preceded by the character, for the previously inserted code. If the
3413 UPTO is just for 1 instance, we can use QUERY instead. */
3414
3415 else if (repeat_max != repeat_min)
3416 {
3417 #ifdef SUPPORT_UTF8
3418 if (utf8 && c >= 128)
3419 {
3420 memcpy(code, utf8_char, c & 7);
3421 code += c & 7;
3422 }
3423 else
3424 #endif
3425 *code++ = c;
3426 if (prop_type >= 0)
3427 {
3428 *code++ = prop_type;
3429 *code++ = prop_value;
3430 }
3431 repeat_max -= repeat_min;
3432
3433 if (repeat_max == 1)
3434 {
3435 *code++ = OP_QUERY + repeat_type;
3436 }
3437 else
3438 {
3439 *code++ = OP_UPTO + repeat_type;
3440 PUT2INC(code, 0, repeat_max);
3441 }
3442 }
3443 }
3444
3445 /* The character or character type itself comes last in all cases. */
3446
3447 #ifdef SUPPORT_UTF8
3448 if (utf8 && c >= 128)
3449 {
3450 memcpy(code, utf8_char, c & 7);
3451 code += c & 7;
3452 }
3453 else
3454 #endif
3455 *code++ = c;
3456
3457 /* For a repeated Unicode property match, there are two extra bytes that
3458 define the required property. */
3459
3460 #ifdef SUPPORT_UCP
3461 if (prop_type >= 0)
3462 {
3463 *code++ = prop_type;
3464 *code++ = prop_value;
3465 }
3466 #endif
3467 }
3468
3469 /* If previous was a character class or a back reference, we put the repeat
3470 stuff after it, but just skip the item if the repeat was {0,0}. */
3471
3472 else if (*previous == OP_CLASS ||
3473 *previous == OP_NCLASS ||
3474 #ifdef SUPPORT_UTF8
3475 *previous == OP_XCLASS ||
3476 #endif
3477 *previous == OP_REF)
3478 {
3479 if (repeat_max == 0)
3480 {
3481 code = previous;
3482 goto END_REPEAT;
3483 }
3484
3485 /* All real repeats make it impossible to handle partial matching (maybe
3486 one day we will be able to remove this restriction). */
3487
3488 if (repeat_max != 1) cd->nopartial = TRUE;
3489
3490 if (repeat_min == 0 && repeat_max == -1)
3491 *code++ = OP_CRSTAR + repeat_type;
3492 else if (repeat_min == 1 && repeat_max == -1)
3493 *code++ = OP_CRPLUS + repeat_type;
3494 else if (repeat_min == 0 && repeat_max == 1)
3495 *code++ = OP_CRQUERY + repeat_type;
3496 else
3497 {
3498 *code++ = OP_CRRANGE + repeat_type;
3499 PUT2INC(code, 0, repeat_min);
3500 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3501 PUT2INC(code, 0, repeat_max);
3502 }
3503 }
3504
3505 /* If previous was a bracket group, we may have to replicate it in certain
3506 cases. */
3507
3508 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3509 *previous == OP_ONCE || *previous == OP_COND)
3510 {
3511 register int i;
3512 int ketoffset = 0;
3513 int len = code - previous;
3514 uschar *bralink = NULL;
3515
3516 /* Repeating a DEFINE group is pointless */
3517
3518 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3519 {
3520 *errorcodeptr = ERR55;
3521 goto FAILED;
3522 }
3523
3524 /* This is a paranoid check to stop integer overflow later on */
3525
3526 if (len > MAX_DUPLENGTH)
3527 {
3528 *errorcodeptr = ERR50;
3529 goto FAILED;
3530 }
3531
3532 /* If the maximum repeat count is unlimited, find the end of the bracket
3533 by scanning through from the start, and compute the offset back to it
3534 from the current code pointer. There may be an OP_OPT setting following
3535 the final KET, so we can't find the end just by going back from the code
3536 pointer. */
3537
3538 if (repeat_max == -1)
3539 {
3540 register uschar *ket = previous;
3541 do ket += GET(ket, 1); while (*ket != OP_KET);
3542 ketoffset = code - ket;
3543 }
3544
3545 /* The case of a zero minimum is special because of the need to stick
3546 OP_BRAZERO in front of it, and because the group appears once in the
3547 data, whereas in other cases it appears the minimum number of times. For
3548 this reason, it is simplest to treat this case separately, as otherwise
3549 the code gets far too messy. There are several special subcases when the
3550 minimum is zero. */
3551
3552 if (repeat_min == 0)
3553 {
3554 /* If the maximum is also zero, we just omit the group from the output
3555 altogether. */
3556
3557 if (repeat_max == 0)
3558 {
3559 code = previous;
3560 goto END_REPEAT;
3561 }
3562
3563 /* If the maximum is 1 or unlimited, we just have to stick in the
3564 BRAZERO and do no more at this point. However, we do need to adjust
3565 any OP_RECURSE calls inside the group that refer to the group itself or
3566 any internal or forward referenced group, because the offset is from
3567 the start of the whole regex. Temporarily terminate the pattern while
3568 doing this. */
3569
3570 if (repeat_max <= 1)
3571 {
3572 *code = OP_END;
3573 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3574 memmove(previous+1, previous, len);
3575 code++;
3576 *previous++ = OP_BRAZERO + repeat_type;
3577 }
3578
3579 /* If the maximum is greater than 1 and limited, we have to replicate
3580 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3581 The first one has to be handled carefully because it's the original
3582 copy, which has to be moved up. The remainder can be handled by code
3583 that is common with the non-zero minimum case below. We have to
3584 adjust the value of repeat_max, since one less copy is required. Once
3585 again, we may have to adjust any OP_RECURSE calls inside the group. */
3586
3587 else
3588 {
3589 int offset;
3590 *code = OP_END;
3591 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3592 memmove(previous + 2 + LINK_SIZE, previous, len);
3593 code += 2 + LINK_SIZE;
3594 *previous++ = OP_BRAZERO + repeat_type;
3595 *previous++ = OP_BRA;
3596
3597 /* We chain together the bracket offset fields that have to be
3598 filled in later when the ends of the brackets are reached. */
3599
3600 offset = (bralink == NULL)? 0 : previous - bralink;
3601 bralink = previous;
3602 PUTINC(previous, 0, offset);
3603 }
3604
3605 repeat_max--;
3606 }
3607
3608 /* If the minimum is greater than zero, replicate the group as many
3609 times as necessary, and adjust the maximum to the number of subsequent
3610 copies that we need. If we set a first char from the group, and didn't
3611 set a required char, copy the latter from the former. If there are any
3612 forward reference subroutine calls in the group, there will be entries on
3613 the workspace list; replicate these with an appropriate increment. */
3614
3615 else
3616 {
3617 if (repeat_min > 1)
3618 {
3619 /* In the pre-compile phase, we don't actually do the replication. We
3620 just adjust the length as if we had. */
3621
3622 if (lengthptr != NULL)
3623 *lengthptr += (repeat_min - 1)*length_prevgroup;
3624
3625 /* This is compiling for real */
3626
3627 else
3628 {
3629 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3630 for (i = 1; i < repeat_min; i++)
3631 {
3632 uschar *hc;
3633 uschar *this_hwm = cd->hwm;
3634 memcpy(code, previous, len);
3635 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3636 {
3637 PUT(cd->hwm, 0, GET(hc, 0) + len);
3638 cd->hwm += LINK_SIZE;
3639 }
3640 save_hwm = this_hwm;
3641 code += len;
3642 }
3643 }
3644 }
3645
3646 if (repeat_max > 0) repeat_max -= repeat_min;
3647 }
3648
3649 /* This code is common to both the zero and non-zero minimum cases. If
3650 the maximum is limited, it replicates the group in a nested fashion,
3651 remembering the bracket starts on a stack. In the case of a zero minimum,
3652 the first one was set up above. In all cases the repeat_max now specifies
3653 the number of additional copies needed. Again, we must remember to
3654 replicate entries on the forward reference list. */
3655
3656 if (repeat_max >= 0)
3657 {
3658 /* In the pre-compile phase, we don't actually do the replication. We
3659 just adjust the length as if we had. For each repetition we must add 1
3660 to the length for BRAZERO and for all but the last repetition we must
3661 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3662
3663 if (lengthptr != NULL && repeat_max > 0)
3664 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3665 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3666
3667 /* This is compiling for real */
3668
3669 else for (i = repeat_max - 1; i >= 0; i--)
3670 {
3671 uschar *hc;
3672 uschar *this_hwm = cd->hwm;
3673
3674 *code++ = OP_BRAZERO + repeat_type;
3675
3676 /* All but the final copy start a new nesting, maintaining the
3677 chain of brackets outstanding. */
3678
3679 if (i != 0)
3680 {
3681 int offset;
3682 *code++ = OP_BRA;
3683 offset = (bralink == NULL)? 0 : code - bralink;
3684 bralink = code;
3685 PUTINC(code, 0, offset);
3686 }
3687
3688 memcpy(code, previous, len);
3689 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3690 {
3691 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3692 cd->hwm += LINK_SIZE;
3693 }
3694 save_hwm = this_hwm;
3695 code += len;
3696 }
3697
3698 /* Now chain through the pending brackets, and fill in their length
3699 fields (which are holding the chain links pro tem). */
3700
3701 while (bralink != NULL)
3702 {
3703 int oldlinkoffset;
3704 int offset = code - bralink + 1;
3705 uschar *bra = code - offset;
3706 oldlinkoffset = GET(bra, 1);
3707 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3708 *code++ = OP_KET;
3709 PUTINC(code, 0, offset);
3710 PUT(bra, 1, offset);
3711 }
3712 }
3713
3714 /* If the maximum is unlimited, set a repeater in the final copy. We
3715 can't just offset backwards from the current code point, because we
3716 don't know if there's been an options resetting after the ket. The
3717 correct offset was computed above.
3718
3719 Then, when we are doing the actual compile phase, check to see whether
3720 this group is a non-atomic one that could match an empty string. If so,
3721 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3722 that runtime checking can be done. [This check is also applied to
3723 atomic groups at runtime, but in a different way.] */
3724
3725 else
3726 {
3727 uschar *ketcode = code - ketoffset;
3728 uschar *bracode = ketcode - GET(ketcode, 1);
3729 *ketcode = OP_KETRMAX + repeat_type;
3730 if (lengthptr == NULL && *bracode != OP_ONCE)
3731 {
3732 uschar *scode = bracode;
3733 do
3734 {
3735 if (could_be_empty_branch(scode, ketcode, utf8))
3736 {
3737 *bracode += OP_SBRA - OP_BRA;
3738 break;
3739 }
3740 scode += GET(scode, 1);
3741 }
3742 while (*scode == OP_ALT);
3743 }
3744 }
3745 }
3746
3747 /* Else there's some kind of shambles */
3748
3749 else
3750 {
3751 *errorcodeptr = ERR11;
3752 goto FAILED;
3753 }
3754
3755 /* If the character following a repeat is '+', or if certain optimization
3756 tests above succeeded, possessive_quantifier is TRUE. For some of the
3757 simpler opcodes, there is a special alternative opcode for this. For
3758 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3759 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3760 but the special opcodes can optimize it a bit. The repeated item starts at
3761 tempcode, not at previous, which might be the first part of a string whose
3762 (former) last char we repeated.
3763
3764 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3765 an 'upto' may follow. We skip over an 'exact' item, and then test the
3766 length of what remains before proceeding. */
3767
3768 if (possessive_quantifier)
3769 {
3770 int len;
3771 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3772 *tempcode == OP_NOTEXACT)
3773 tempcode += _pcre_OP_lengths[*tempcode];
3774 len = code - tempcode;
3775 if (len > 0) switch (*tempcode)
3776 {
3777 case OP_STAR: *tempcode = OP_POSSTAR; break;
3778 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3779 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3780 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3781
3782 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3783 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3784 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3785 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3786
3787 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3788 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3789 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3790 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3791
3792 default:
3793 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3794 code += 1 + LINK_SIZE;
3795 len += 1 + LINK_SIZE;
3796 tempcode[0] = OP_ONCE;
3797 *code++ = OP_KET;
3798 PUTINC(code, 0, len);
3799 PUT(tempcode, 1, len);
3800 break;
3801 }
3802 }
3803
3804 /* In all cases we no longer have a previous item. We also set the
3805 "follows varying string" flag for subsequently encountered reqbytes if
3806 it isn't already set and we have just passed a varying length item. */
3807
3808 END_REPEAT:
3809 previous = NULL;
3810 cd->req_varyopt |= reqvary;
3811 break;
3812
3813
3814 /* ===================================================================*/
3815 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3816 lookbehind or option setting or condition or all the other extended
3817 parenthesis forms. First deal with the specials; all are introduced by ?,
3818 and the appearance of any of them means that this is not a capturing
3819 group. */
3820
3821 case '(':
3822 newoptions = options;
3823 skipbytes = 0;
3824 bravalue = OP_CBRA;
3825 save_hwm = cd->hwm;
3826 reset_bracount = FALSE;
3827
3828 if (*(++ptr) == '?')
3829 {
3830 int i, set, unset, namelen;
3831 int *optset;
3832 const uschar *name;
3833 uschar *slot;
3834
3835 switch (*(++ptr))
3836 {
3837 case '#': /* Comment; skip to ket */
3838 ptr++;
3839 while (*ptr != 0 && *ptr != ')') ptr++;
3840 if (*ptr == 0)
3841 {
3842 *errorcodeptr = ERR18;
3843 goto FAILED;
3844 }
3845 continue;
3846
3847
3848 /* ------------------------------------------------------------ */
3849 case '|': /* Reset capture count for each branch */
3850 reset_bracount = TRUE;
3851 /* Fall through */
3852
3853 /* ------------------------------------------------------------ */
3854 case ':': /* Non-capturing bracket */
3855 bravalue = OP_BRA;
3856 ptr++;
3857 break;
3858
3859
3860 /* ------------------------------------------------------------ */
3861 case '(':
3862 bravalue = OP_COND; /* Conditional group */
3863
3864 /* A condition can be an assertion, a number (referring to a numbered
3865 group), a name (referring to a named group), or 'R', referring to
3866 recursion. R<digits> and R&name are also permitted for recursion tests.
3867
3868 There are several syntaxes for testing a named group: (?(name)) is used
3869 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3870
3871 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3872 be the recursive thing or the name 'R' (and similarly for 'R' followed
3873 by digits), and (b) a number could be a name that consists of digits.
3874 In both cases, we look for a name first; if not found, we try the other
3875 cases. */
3876
3877 /* For conditions that are assertions, check the syntax, and then exit
3878 the switch. This will take control down to where bracketed groups,
3879 including assertions, are processed. */
3880
3881 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3882 break;
3883
3884 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3885 below), and all need to skip 3 bytes at the start of the group. */
3886
3887 code[1+LINK_SIZE] = OP_CREF;
3888 skipbytes = 3;
3889 refsign = -1;
3890
3891 /* Check for a test for recursion in a named group. */
3892
3893 if (ptr[1] == 'R' && ptr[2] == '&')
3894 {
3895 terminator = -1;
3896 ptr += 2;
3897 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3898 }
3899
3900 /* Check for a test for a named group's having been set, using the Perl
3901 syntax (?(<name>) or (?('name') */
3902
3903 else if (ptr[1] == '<')
3904 {
3905 terminator = '>';
3906 ptr++;
3907 }
3908 else if (ptr[1] == '\'')
3909 {
3910 terminator = '\'';
3911 ptr++;
3912 }
3913 else
3914 {
3915 terminator = 0;
3916 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3917 }
3918
3919 /* We now expect to read a name; anything else is an error */
3920
3921 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3922 {
3923 ptr += 1; /* To get the right offset */
3924 *errorcodeptr = ERR28;
3925 goto FAILED;
3926 }
3927
3928 /* Read the name, but also get it as a number if it's all digits */
3929
3930 recno = 0;
3931 name = ++ptr;
3932 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3933 {
3934 if (recno >= 0)
3935 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3936 recno * 10 + *ptr - '0' : -1;
3937 ptr++;
3938 }
3939 namelen = ptr - name;
3940
3941 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3942 {
3943 ptr--; /* Error offset */
3944 *errorcodeptr = ERR26;
3945 goto FAILED;
3946 }
3947
3948 /* Do no further checking in the pre-compile phase. */
3949
3950 if (lengthptr != NULL) break;
3951
3952 /* In the real compile we do the work of looking for the actual
3953 reference. If the string started with "+" or "-" we require the rest to
3954 be digits, in which case recno will be set. */
3955
3956 if (refsign > 0)
3957 {
3958 if (recno <= 0)
3959 {
3960 *errorcodeptr = ERR58;
3961 goto FAILED;
3962 }
3963 if (refsign == '-')
3964 {
3965 recno = cd->bracount - recno + 1;
3966 if (recno <= 0)
3967 {
3968 *errorcodeptr = ERR15;
3969 goto FAILED;
3970 }
3971 }
3972 else recno += cd->bracount;
3973 PUT2(code, 2+LINK_SIZE, recno);
3974 break;
3975 }
3976
3977 /* Otherwise (did not start with "+" or "-"), start by looking for the
3978 name. */
3979
3980 slot = cd->name_table;
3981 for (i = 0; i < cd->names_found; i++)
3982 {
3983 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3984 slot += cd->name_entry_size;
3985 }
3986
3987 /* Found a previous named subpattern */
3988
3989 if (i < cd->names_found)
3990 {
3991 recno = GET2(slot, 0);
3992 PUT2(code, 2+LINK_SIZE, recno);
3993 }
3994
3995 /* Search the pattern for a forward reference */
3996
3997 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3998 (options & PCRE_EXTENDED) != 0)) > 0)
3999 {
4000 PUT2(code, 2+LINK_SIZE, i);
4001 }
4002
4003 /* If terminator == 0 it means that the name followed directly after
4004 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4005 some further alternatives to try. For the cases where terminator != 0
4006 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4007 now checked all the possibilities, so give an error. */
4008
4009 else if (terminator != 0)
4010 {
4011 *errorcodeptr = ERR15;
4012 goto FAILED;
4013 }
4014
4015 /* Check for (?(R) for recursion. Allow digits after R to specify a
4016 specific group number. */
4017
4018 else if (*name == 'R')
4019 {
4020 recno = 0;
4021 for (i = 1; i < namelen; i++)
4022 {
4023 if ((digitab[name[i]] & ctype_digit) == 0)
4024 {
4025 *errorcodeptr = ERR15;
4026 goto FAILED;
4027 }
4028 recno = recno * 10 + name[i] - '0';
4029 }
4030 if (recno == 0) recno = RREF_ANY;
4031 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4032 PUT2(code, 2+LINK_SIZE, recno);
4033 }
4034
4035 /* Similarly, check for the (?(DEFINE) "condition", which is always
4036 false. */
4037
4038 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4039 {
4040 code[1+LINK_SIZE] = OP_DEF;
4041 skipbytes = 1;
4042 }
4043
4044 /* Check for the "name" actually being a subpattern number. */
4045
4046 else if (recno > 0)
4047 {
4048 PUT2(code, 2+LINK_SIZE, recno);
4049 }
4050
4051 /* Either an unidentified subpattern, or a reference to (?(0) */
4052
4053 else
4054 {
4055 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4056 goto FAILED;
4057 }
4058 break;
4059
4060
4061 /* ------------------------------------------------------------ */
4062 case '=': /* Positive lookahead */
4063 bravalue = OP_ASSERT;
4064 ptr++;
4065 break;
4066
4067
4068 /* ------------------------------------------------------------ */
4069 case '!': /* Negative lookahead */
4070 bravalue = OP_ASSERT_NOT;
4071 ptr++;
4072 break;
4073
4074
4075 /* ------------------------------------------------------------ */
4076 case '<': /* Lookbehind or named define */
4077 switch (ptr[1])
4078 {
4079 case '=': /* Positive lookbehind */
4080 bravalue = OP_ASSERTBACK;
4081 ptr += 2;
4082 break;
4083
4084 case '!': /* Negative lookbehind */
4085 bravalue = OP_ASSERTBACK_NOT;
4086 ptr += 2;
4087 break;
4088
4089 default: /* Could be name define, else bad */
4090 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4091 ptr++; /* Correct offset for error */
4092 *errorcodeptr = ERR24;
4093 goto FAILED;
4094 }
4095 break;
4096
4097
4098 /* ------------------------------------------------------------ */
4099 case '>': /* One-time brackets */
4100 bravalue = OP_ONCE;
4101 ptr++;
4102 break;
4103
4104
4105 /* ------------------------------------------------------------ */
4106 case 'C': /* Callout - may be followed by digits; */
4107 previous_callout = code; /* Save for later completion */
4108 after_manual_callout = 1; /* Skip one item before completing */
4109 *code++ = OP_CALLOUT;
4110 {
4111 int n = 0;
4112 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4113 n = n * 10 + *ptr - '0';
4114 if (*ptr != ')')
4115 {
4116 *errorcodeptr = ERR39;
4117 goto FAILED;
4118 }
4119 if (n > 255)
4120 {
4121 *errorcodeptr = ERR38;
4122 goto FAILED;
4123 }
4124 *code++ = n;
4125 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4126 PUT(code, LINK_SIZE, 0); /* Default length */
4127 code += 2 * LINK_SIZE;
4128 }
4129 previous = NULL;
4130 continue;
4131
4132
4133 /* ------------------------------------------------------------ */
4134 case 'P': /* Python-style named subpattern handling */
4135 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4136 {
4137 is_recurse = *ptr == '>';
4138 terminator = ')';
4139 goto NAMED_REF_OR_RECURSE;
4140 }
4141 else if (*ptr != '<') /* Test for Python-style definition */
4142 {
4143 *errorcodeptr = ERR41;
4144 goto FAILED;
4145 }
4146 /* Fall through to handle (?P< as (?< is handled */
4147
4148
4149 /* ------------------------------------------------------------ */
4150 DEFINE_NAME: /* Come here from (?< handling */
4151 case '\'':
4152 {
4153 terminator = (*ptr == '<')? '>' : '\'';
4154 name = ++ptr;
4155
4156 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4157 namelen = ptr - name;
4158
4159 /* In the pre-compile phase, just do a syntax check. */
4160
4161 if (lengthptr != NULL)
4162 {
4163 if (*ptr != terminator)
4164 {
4165 *errorcodeptr = ERR42;
4166 goto FAILED;
4167 }
4168 if (cd->names_found >= MAX_NAME_COUNT)
4169 {
4170 *errorcodeptr = ERR49;
4171 goto FAILED;
4172 }
4173 if (namelen + 3 > cd->name_entry_size)
4174 {
4175 cd->name_entry_size = namelen + 3;
4176 if (namelen > MAX_NAME_SIZE)
4177 {
4178 *errorcodeptr = ERR48;
4179 goto FAILED;
4180 }
4181 }
4182 }
4183
4184 /* In the real compile, create the entry in the table */
4185
4186 else
4187 {
4188 slot = cd->name_table;
4189 for (i = 0; i < cd->names_found; i++)
4190 {
4191 int crc = memcmp(name, slot+2, namelen);
4192 if (crc == 0)
4193 {
4194 if (slot[2+namelen] == 0)
4195 {
4196 if ((options & PCRE_DUPNAMES) == 0)
4197 {
4198 *errorcodeptr = ERR43;
4199 goto FAILED;
4200 }
4201 }
4202 else crc = -1; /* Current name is substring */
4203 }
4204 if (crc < 0)
4205 {
4206 memmove(slot + cd->name_entry_size, slot,
4207 (cd->names_found - i) * cd->name_entry_size);
4208 break;
4209 }
4210 slot += cd->name_entry_size;
4211 }
4212
4213 PUT2(slot, 0, cd->bracount + 1);
4214 memcpy(slot + 2, name, namelen);
4215 slot[2+namelen] = 0;
4216 }
4217 }
4218
4219 /* In both cases, count the number of names we've encountered. */
4220
4221 ptr++; /* Move past > or ' */
4222 cd->names_found++;
4223 goto NUMBERED_GROUP;
4224
4225
4226 /* ------------------------------------------------------------ */
4227 case '&': /* Perl recursion/subroutine syntax */
4228 terminator = ')';
4229 is_recurse = TRUE;
4230 /* Fall through */
4231
4232 /* We come here from the Python syntax above that handles both
4233 references (?P=name) and recursion (?P>name), as well as falling
4234 through from the Perl recursion syntax (?&name). */
4235
4236 NAMED_REF_OR_RECURSE:
4237 name = ++ptr;
4238 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4239 namelen = ptr - name;
4240
4241 /* In the pre-compile phase, do a syntax check and set a dummy
4242 reference number. */
4243
4244 if (lengthptr != NULL)
4245 {
4246 if (*ptr != terminator)
4247 {
4248 *errorcodeptr = ERR42;
4249 goto FAILED;
4250 }
4251 if (namelen > MAX_NAME_SIZE)
4252 {
4253 *errorcodeptr = ERR48;
4254 goto FAILED;
4255 }
4256 recno = 0;
4257 }
4258
4259 /* In the real compile, seek the name in the table */
4260
4261 else
4262 {
4263 slot = cd->name_table;
4264 for (i = 0; i < cd->names_found; i++)
4265 {
4266 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4267 slot += cd->name_entry_size;
4268 }
4269
4270 if (i < cd->names_found) /* Back reference */
4271 {
4272 recno = GET2(slot, 0);
4273 }
4274 else if ((recno = /* Forward back reference */
4275 find_parens(ptr, cd->bracount, name, namelen,
4276 (options & PCRE_EXTENDED) != 0)) <= 0)
4277 {
4278 *errorcodeptr = ERR15;
4279 goto FAILED;
4280 }
4281 }
4282
4283 /* In both phases, we can now go to the code that handles numerical
4284 recursion or backreferences. */
4285
4286 if (is_recurse) goto HANDLE_RECURSION;
4287 else goto HANDLE_REFERENCE;
4288
4289
4290 /* ------------------------------------------------------------ */
4291 case 'R': /* Recursion */
4292 ptr++; /* Same as (?0) */
4293 /* Fall through */
4294
4295
4296 /* ------------------------------------------------------------ */
4297 case '-': case '+':
4298 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4299 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4300 {
4301 const uschar *called;
4302
4303 if ((refsign = *ptr) == '+') ptr++;
4304 else if (refsign == '-')
4305 {
4306 if ((digitab[ptr[1]] & ctype_digit) == 0)
4307 goto OTHER_CHAR_AFTER_QUERY;
4308 ptr++;
4309 }
4310
4311 recno = 0;
4312 while((digitab[*ptr] & ctype_digit) != 0)
4313 recno = recno * 10 + *ptr++ - '0';
4314
4315 if (*ptr != ')')
4316 {
4317 *errorcodeptr = ERR29;
4318 goto FAILED;
4319 }
4320
4321 if (refsign == '-')
4322 {
4323 if (recno == 0)
4324 {
4325 *errorcodeptr = ERR58;
4326 goto FAILED;
4327 }
4328 recno = cd->bracount - recno + 1;
4329 if (recno <= 0)
4330 {
4331 *errorcodeptr = ERR15;
4332 goto FAILED;
4333 }
4334 }
4335 else if (refsign == '+')
4336 {
4337 if (recno == 0)
4338 {
4339 *errorcodeptr = ERR58;
4340 goto FAILED;
4341 }
4342 recno += cd->bracount;
4343 }
4344
4345 /* Come here from code above that handles a named recursion */
4346
4347 HANDLE_RECURSION:
4348
4349 previous = code;
4350 called = cd->start_code;
4351
4352 /* When we are actually compiling, find the bracket that is being
4353 referenced. Temporarily end the regex in case it doesn't exist before
4354 this point. If we end up with a forward reference, first check that
4355 the bracket does occur later so we can give the error (and position)
4356 now. Then remember this forward reference in the workspace so it can
4357 be filled in at the end. */
4358
4359 if (lengthptr == NULL)
4360 {
4361 *code = OP_END;
4362 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4363
4364 /* Forward reference */
4365
4366 if (called == NULL)
4367 {
4368 if (find_parens(ptr, cd->bracount, NULL, recno,
4369 (options & PCRE_EXTENDED) != 0) < 0)
4370 {
4371 *errorcodeptr = ERR15;
4372 goto FAILED;
4373 }
4374 called = cd->start_code + recno;
4375 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4376 }
4377
4378 /* If not a forward reference, and the subpattern is still open,
4379 this is a recursive call. We check to see if this is a left
4380 recursion that could loop for ever, and diagnose that case. */
4381
4382 else if (GET(called, 1) == 0 &&
4383 could_be_empty(called, code, bcptr, utf8))
4384 {
4385 *errorcodeptr = ERR40;
4386 goto FAILED;
4387 }
4388 }
4389
4390 /* Insert the recursion/subroutine item, automatically wrapped inside
4391 "once" brackets. Set up a "previous group" length so that a
4392 subsequent quantifier will work. */
4393
4394 *code = OP_ONCE;
4395 PUT(code, 1, 2 + 2*LINK_SIZE);
4396 code += 1 + LINK_SIZE;
4397
4398 *code = OP_RECURSE;
4399 PUT(code, 1, called - cd->start_code);
4400 code += 1 + LINK_SIZE;
4401
4402 *code = OP_KET;
4403 PUT(code, 1, 2 + 2*LINK_SIZE);
4404 code += 1 + LINK_SIZE;
4405
4406 length_prevgroup = 3 + 3*LINK_SIZE;
4407 }
4408
4409 /* Can't determine a first byte now */
4410
4411 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4412 continue;
4413
4414
4415 /* ------------------------------------------------------------ */
4416 default: /* Other characters: check option setting */
4417 OTHER_CHAR_AFTER_QUERY:
4418 set = unset = 0;
4419 optset = &set;
4420
4421 while (*ptr != ')' && *ptr != ':')
4422 {
4423 switch (*ptr++)
4424 {
4425 case '-': optset = &unset; break;
4426
4427 case 'J': /* Record that it changed in the external options */
4428 *optset |= PCRE_DUPNAMES;
4429 cd->external_options |= PCRE_JCHANGED;
4430 break;
4431
4432 case 'i': *optset |= PCRE_CASELESS; break;
4433 case 'm': *optset |= PCRE_MULTILINE; break;
4434 case 's': *optset |= PCRE_DOTALL; break;
4435 case 'x': *optset |= PCRE_EXTENDED; break;
4436 case 'U': *optset |= PCRE_UNGREEDY; break;
4437 case 'X': *optset |= PCRE_EXTRA; break;
4438
4439 default: *errorcodeptr = ERR12;
4440 ptr--; /* Correct the offset */
4441 goto FAILED;
4442 }
4443 }
4444
4445 /* Set up the changed option bits, but don't change anything yet. */
4446
4447 newoptions = (options | set) & (~unset);
4448
4449 /* If the options ended with ')' this is not the start of a nested
4450 group with option changes, so the options change at this level. If this
4451 item is right at the start of the pattern, the options can be
4452 abstracted and made external in the pre-compile phase, and ignored in
4453 the compile phase. This can be helpful when matching -- for instance in
4454 caseless checking of required bytes.
4455
4456 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4457 definitely *not* at the start of the pattern because something has been
4458 compiled. In the pre-compile phase, however, the code pointer can have
4459 that value after the start, because it gets reset as code is discarded
4460 during the pre-compile. However, this can happen only at top level - if
4461 we are within parentheses, the starting BRA will still be present. At
4462 any parenthesis level, the length value can be used to test if anything
4463 has been compiled at that level. Thus, a test for both these conditions
4464 is necessary to ensure we correctly detect the start of the pattern in
4465 both phases.
4466
4467 If we are not at the pattern start, compile code to change the ims
4468 options if this setting actually changes any of them. We also pass the
4469 new setting back so that it can be put at the start of any following
4470 branches, and when this group ends (if we are in a group), a resetting
4471 item can be compiled. */
4472
4473 if (*ptr == ')')
4474 {
4475 if (code == cd->start_code + 1 + LINK_SIZE &&
4476 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4477 {
4478 cd->external_options = newoptions;
4479 options = newoptions;
4480 }
4481 else
4482 {
4483 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4484 {
4485 *code++ = OP_OPT;
4486 *code++ = newoptions & PCRE_IMS;
4487 }
4488
4489 /* Change options at this level, and pass them back for use
4490 in subsequent branches. Reset the greedy defaults and the case
4491 value for firstbyte and reqbyte. */
4492
4493 *optionsptr = options = newoptions;
4494 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4495 greedy_non_default = greedy_default ^ 1;
4496 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4497 }
4498
4499 previous = NULL; /* This item can't be repeated */
4500 continue; /* It is complete */
4501 }
4502
4503 /* If the options ended with ':' we are heading into a nested group
4504 with possible change of options. Such groups are non-capturing and are
4505 not assertions of any kind. All we need to do is skip over the ':';
4506 the newoptions value is handled below. */
4507
4508 bravalue = OP_BRA;
4509 ptr++;
4510 } /* End of switch for character following (? */
4511 } /* End of (? handling */
4512
4513 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4514 all unadorned brackets become non-capturing and behave like (?:...)
4515 brackets. */
4516
4517 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4518 {
4519 bravalue = OP_BRA;
4520 }
4521
4522 /* Else we have a capturing group. */
4523
4524 else
4525 {
4526 NUMBERED_GROUP:
4527 cd->bracount += 1;
4528 PUT2(code, 1+LINK_SIZE, cd->bracount);
4529 skipbytes = 2;
4530 }
4531
4532 /* Process nested bracketed regex. Assertions may not be repeated, but
4533 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4534 non-register variable in order to be able to pass its address because some
4535 compilers complain otherwise. Pass in a new setting for the ims options if
4536 they have changed. */
4537
4538 previous = (bravalue >= OP_ONCE)? code : NULL;
4539 *code = bravalue;
4540 tempcode = code;
4541 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4542 length_prevgroup = 0; /* Initialize for pre-compile phase */
4543
4544 if (!compile_regex(
4545 newoptions, /* The complete new option state */
4546 options & PCRE_IMS, /* The previous ims option state */
4547 &tempcode, /* Where to put code (updated) */
4548 &ptr, /* Input pointer (updated) */
4549 errorcodeptr, /* Where to put an error message */
4550 (bravalue == OP_ASSERTBACK ||
4551 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4552 reset_bracount, /* True if (?| group */
4553 skipbytes, /* Skip over bracket number */
4554 &subfirstbyte, /* For possible first char */
4555 &subreqbyte, /* For possible last char */
4556 bcptr, /* Current branch chain */
4557 cd, /* Tables block */
4558 (lengthptr == NULL)? NULL : /* Actual compile phase */
4559 &length_prevgroup /* Pre-compile phase */
4560 ))
4561 goto FAILED;
4562
4563 /* At the end of compiling, code is still pointing to the start of the
4564 group, while tempcode has been updated to point past the end of the group
4565 and any option resetting that may follow it. The pattern pointer (ptr)
4566 is on the bracket. */
4567
4568 /* If this is a conditional bracket, check that there are no more than
4569 two branches in the group, or just one if it's a DEFINE group. We do this
4570 in the real compile phase, not in the pre-pass, where the whole group may
4571 not be available. */
4572
4573 if (bravalue == OP_COND && lengthptr == NULL)
4574 {
4575 uschar *tc = code;
4576 int condcount = 0;
4577
4578 do {
4579 condcount++;
4580 tc += GET(tc,1);
4581 }
4582 while (*tc != OP_KET);
4583
4584 /* A DEFINE group is never obeyed inline (the "condition" is always
4585 false). It must have only one branch. */
4586
4587 if (code[LINK_SIZE+1] == OP_DEF)
4588 {
4589 if (condcount > 1)
4590 {
4591 *errorcodeptr = ERR54;
4592 goto FAILED;
4593 }
4594 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4595 }
4596
4597 /* A "normal" conditional group. If there is just one branch, we must not
4598 make use of its firstbyte or reqbyte, because this is equivalent to an
4599 empty second branch. */
4600
4601 else
4602 {
4603 if (condcount > 2)
4604 {
4605 *errorcodeptr = ERR27;
4606 goto FAILED;
4607 }
4608 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4609 }
4610 }
4611
4612 /* Error if hit end of pattern */
4613
4614 if (*ptr != ')')
4615 {
4616 *errorcodeptr = ERR14;
4617 goto FAILED;
4618 }
4619
4620 /* In the pre-compile phase, update the length by the length of the nested
4621 group, less the brackets at either end. Then reduce the compiled code to
4622 just the brackets so that it doesn't use much memory if it is duplicated by
4623 a quantifier. */
4624
4625 if (lengthptr != NULL)
4626 {
4627 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4628 code++;
4629 PUTINC(code, 0, 1 + LINK_SIZE);
4630 *code++ = OP_KET;
4631 PUTINC(code, 0, 1 + LINK_SIZE);
4632 }
4633
4634 /* Otherwise update the main code pointer to the end of the group. */
4635
4636 else code = tempcode;
4637
4638 /* For a DEFINE group, required and first character settings are not
4639 relevant. */
4640
4641 if (bravalue == OP_DEF) break;
4642
4643 /* Handle updating of the required and first characters for other types of
4644 group. Update for normal brackets of all kinds, and conditions with two
4645 branches (see code above). If the bracket is followed by a quantifier with
4646 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4647 zerofirstbyte outside the main loop so that they can be accessed for the
4648 back off. */
4649
4650 zeroreqbyte = reqbyte;
4651 zerofirstbyte = firstbyte;
4652 groupsetfirstbyte = FALSE;
4653
4654 if (bravalue >= OP_ONCE)
4655 {
4656 /* If we have not yet set a firstbyte in this branch, take it from the
4657 subpattern, remembering that it was set here so that a repeat of more
4658 than one can replicate it as reqbyte if necessary. If the subpattern has
4659 no firstbyte, set "none" for the whole branch. In both cases, a zero
4660 repeat forces firstbyte to "none". */
4661
4662 if (firstbyte == REQ_UNSET)
4663 {
4664 if (subfirstbyte >= 0)
4665 {
4666 firstbyte = subfirstbyte;
4667 groupsetfirstbyte = TRUE;
4668 }
4669 else firstbyte = REQ_NONE;
4670 zerofirstbyte = REQ_NONE;
4671 }
4672
4673 /* If firstbyte was previously set, convert the subpattern's firstbyte
4674 into reqbyte if there wasn't one, using the vary flag that was in
4675 existence beforehand. */
4676
4677 else if (subfirstbyte >= 0 && subreqbyte < 0)
4678 subreqbyte = subfirstbyte | tempreqvary;
4679
4680 /* If the subpattern set a required byte (or set a first byte that isn't
4681 really the first byte - see above), set it. */
4682
4683 if (subreqbyte >= 0) reqbyte = subreqbyte;
4684 }
4685
4686 /* For a forward assertion, we take the reqbyte, if set. This can be
4687 helpful if the pattern that follows the assertion doesn't set a different
4688 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4689 for an assertion, however because it leads to incorrect effect for patterns
4690 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4691 of a firstbyte. This is overcome by a scan at the end if there's no
4692 firstbyte, looking for an asserted first char. */
4693
4694 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4695 break; /* End of processing '(' */
4696
4697
4698 /* ===================================================================*/
4699 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4700 are arranged to be the negation of the corresponding OP_values. For the
4701 back references, the values are ESC_REF plus the reference number. Only
4702 back references and those types that consume a character may be repeated.
4703 We can test for values between ESC_b and ESC_Z for the latter; this may
4704 have to change if any new ones are ever created. */
4705
4706 case '\\':
4707 tempptr = ptr;
4708 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4709 if (*errorcodeptr != 0) goto FAILED;
4710
4711 if (c < 0)
4712 {
4713 if (-c == ESC_Q) /* Handle start of quoted string */
4714 {
4715 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4716 else inescq = TRUE;
4717 continue;
4718 }
4719
4720 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4721
4722 /* For metasequences that actually match a character, we disable the
4723 setting of a first character if it hasn't already been set. */
4724
4725 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4726 firstbyte = REQ_NONE;
4727
4728 /* Set values to reset to if this is followed by a zero repeat. */
4729
4730 zerofirstbyte = firstbyte;
4731 zeroreqbyte = reqbyte;
4732
4733 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4734 We also support \k{name} (.NET syntax) */
4735
4736 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4737 {
4738 is_recurse = FALSE;
4739 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4740 goto NAMED_REF_OR_RECURSE;
4741 }
4742
4743 /* Back references are handled specially; must disable firstbyte if
4744 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4745 ':' later. */
4746
4747 if (-c >= ESC_REF)
4748 {
4749 recno = -c - ESC_REF;
4750
4751 HANDLE_REFERENCE: /* Come here from named backref handling */
4752 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4753 previous = code;
4754 *code++ = OP_REF;
4755 PUT2INC(code, 0, recno);
4756 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4757 if (recno > cd->top_backref) cd->top_backref = recno;
4758 }
4759
4760 /* So are Unicode property matches, if supported. */
4761
4762 #ifdef SUPPORT_UCP
4763 else if (-c == ESC_P || -c == ESC_p)
4764 {
4765 BOOL negated;
4766 int pdata;
4767 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4768 if (ptype < 0) goto FAILED;
4769 previous = code;
4770 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4771 *code++ = ptype;
4772 *code++ = pdata;
4773 }
4774 #else
4775
4776 /* If Unicode properties are not supported, \X, \P, and \p are not
4777 allowed. */
4778
4779 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4780 {
4781 *errorcodeptr = ERR45;
4782 goto FAILED;
4783 }
4784 #endif
4785
4786 /* For the rest (including \X when Unicode properties are supported), we
4787 can obtain the OP value by negating the escape value. */
4788
4789 else
4790 {
4791 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4792 *code++ = -c;
4793 }
4794 continue;
4795 }
4796
4797 /* We have a data character whose value is in c. In UTF-8 mode it may have
4798 a value > 127. We set its representation in the length/buffer, and then
4799 handle it as a data character. */
4800
4801 #ifdef SUPPORT_UTF8
4802 if (utf8 && c > 127)
4803 mclength = _pcre_ord2utf8(c, mcbuffer);
4804 else
4805 #endif
4806
4807 {
4808 mcbuffer[0] = c;
4809 mclength = 1;
4810 }
4811 goto ONE_CHAR;
4812
4813
4814 /* ===================================================================*/
4815 /* Handle a literal character. It is guaranteed not to be whitespace or #
4816 when the extended flag is set. If we are in UTF-8 mode, it may be a
4817 multi-byte literal character. */
4818
4819 default:
4820 NORMAL_CHAR:
4821 mclength = 1;
4822 mcbuffer[0] = c;
4823
4824 #ifdef SUPPORT_UTF8
4825 if (utf8 && c >= 0xc0)
4826 {
4827 while ((ptr[1] & 0xc0) == 0x80)
4828 mcbuffer[mclength++] = *(++ptr);
4829 }
4830 #endif
4831
4832 /* At this point we have the character's bytes in mcbuffer, and the length
4833 in mclength. When not in UTF-8 mode, the length is always 1. */
4834
4835 ONE_CHAR:
4836 previous = code;
4837 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4838 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4839
4840 /* Set the first and required bytes appropriately. If no previous first
4841 byte, set it from this character, but revert to none on a zero repeat.
4842 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4843 repeat. */
4844
4845 if (firstbyte == REQ_UNSET)
4846 {
4847 zerofirstbyte = REQ_NONE;
4848 zeroreqbyte = reqbyte;
4849
4850 /* If the character is more than one byte long, we can set firstbyte
4851 only if it is not to be matched caselessly. */
4852
4853 if (mclength == 1 || req_caseopt == 0)
4854 {
4855 firstbyte = mcbuffer[0] | req_caseopt;
4856 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4857 }
4858 else firstbyte = reqbyte = REQ_NONE;
4859 }
4860
4861 /* firstbyte was previously set; we can set reqbyte only if the length is
4862 1 or the matching is caseful. */
4863
4864 else
4865 {
4866 zerofirstbyte = firstbyte;
4867 zeroreqbyte = reqbyte;
4868 if (mclength == 1 || req_caseopt == 0)
4869 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4870 }
4871
4872 break; /* End of literal character handling */
4873 }
4874 } /* end of big loop */
4875
4876
4877 /* Control never reaches here by falling through, only by a goto for all the
4878 error states. Pass back the position in the pattern so that it can be displayed
4879 to the user for diagnosing the error. */
4880
4881 FAILED:
4882 *ptrptr = ptr;
4883 return FALSE;
4884 }
4885
4886
4887
4888
4889 /*************************************************
4890 * Compile sequence of alternatives *
4891 *************************************************/
4892
4893 /* On entry, ptr is pointing past the bracket character; on successful return it
4894 points to the closing bracket or end of string (a '|' is consumed here). The code
4895 variable is pointing at the byte into which the BRA operator has been stored.
4896 If the ims options are changed at the start (for a (?ims: group) or during any
4897 branch, we need to insert an OP_OPT item at the start of every following branch
4898 to ensure they get set correctly at run time, and also pass the new options
4899 into every subsequent branch compile.
4900
4901 This function is used during the pre-compile phase when we are trying to find
4902 out the amount of memory needed, as well as during the real compile phase. The
4903 value of lengthptr distinguishes the two phases.
4904
4905 Arguments:
4906 options option bits, including any changes for this subpattern
4907 oldims previous settings of ims option bits
4908 codeptr -> the address of the current code pointer (updated on success)
4909 ptrptr -> the address of the current pattern pointer (updated on return)
4910 errorcodeptr -> pointer to error code variable
4911 lookbehind TRUE if this is a lookbehind assertion
4912 reset_bracount TRUE to reset the count for each branch
4913 skipbytes skip this many bytes at start (for brackets and OP_COND)
4914 firstbyteptr place to put the first required character, or a negative number
4915 reqbyteptr place to put the last required character, or a negative number
4916 bcptr pointer to the chain of currently open branches
4917 cd points to the data block with tables pointers etc.
4918 lengthptr NULL during the real compile phase
4919 points to length accumulator during pre-compile phase
4920
4921 Returns: TRUE on success; FALSE on error, with *ptrptr at the error position
4922 */
4923
4924 static BOOL
4925 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4926 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4927 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4928 int *lengthptr)
4929 {
4930 const uschar *ptr = *ptrptr;
4931 uschar *code = *codeptr;
4932 uschar *last_branch = code; /* Start of the most recent branch (BRA or ALT) */
4933 uschar *start_bracket = code; /* The BRA; used to compute the KET's length */
4934 uschar *reverse_count = NULL; /* -> count field of the current OP_REVERSE */
4935 int firstbyte, reqbyte;
4936 int branchfirstbyte, branchreqbyte;
4937 int length;
4938 int orig_bracount; /* Bracket count on entry, for (?| resetting */
4939 int max_bracount; /* Highest bracket count seen in any branch */
4940 branch_chain bc;
4941
4942 bc.outer = bcptr; /* Link this group onto the chain of open branches */
4943 bc.current = code;
4944
4945 firstbyte = reqbyte = REQ_UNSET;
4946
4947 /* Accumulate the length for use in the pre-compile phase. Start with the
4948 length of the BRA and KET and any extra bytes that are required at the
4949 beginning. We accumulate in a local variable to save frequent testing of
4950 lengthptr for NULL. We cannot do this by looking at the value of code at the
4951 start and end of each alternative, because compiled items are discarded during
4952 the pre-compile phase so that the work space is not exceeded. */
4953
4954 length = 2 + 2*LINK_SIZE + skipbytes;
4955
4956 /* WARNING: If the above line is changed for any reason, you must also change
4957 the code that abstracts option settings at the start of the pattern and makes
4958 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4959 pre-compile phase to find out whether anything has yet been compiled or not. */
4960
4961 /* Offset is set zero to mark that this bracket is still open */
4962
4963 PUT(code, 1, 0);
4964 code += 1 + LINK_SIZE + skipbytes;
4965
4966 /* Loop for each alternative branch */
4967
4968 orig_bracount = max_bracount = cd->bracount;
4969 for (;;)
4970 {
4971 /* For a (?| group, reset the capturing bracket count so that each branch
4972 uses the same numbers. */
4973
4974 if (reset_bracount) cd->bracount = orig_bracount;
4975
4976 /* Handle a change of ims options at the start of the branch */
4977
4978 if ((options & PCRE_IMS) != oldims)
4979 {
4980 *code++ = OP_OPT;
4981 *code++ = options & PCRE_IMS;
4982 length += 2;
4983 }
4984
4985 /* Set up dummy OP_REVERSE if lookbehind assertion */
4986
4987 if (lookbehind)
4988 {
4989 *code++ = OP_REVERSE;
4990 reverse_count = code; /* Filled in below once the branch is compiled */
4991 PUTINC(code, 0, 0);
4992 length += 1 + LINK_SIZE;
4993 }
4994
4995 /* Now compile the branch; in the pre-compile phase its length gets added
4996 into the length. Options are passed by address so later branches see changes. */
4997
4998 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4999 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5000 {
5001 *ptrptr = ptr;
5002 return FALSE;
5003 }
5004
5005 /* Keep the highest bracket count in case (?| was used and some branch
5006 has fewer than the rest. */
5007
5008 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5009
5010 /* In the real compile phase, there is some post-processing to be done. */
5011
5012 if (lengthptr == NULL)
5013 {
5014 /* If this is the first branch, the firstbyte and reqbyte values for the
5015 branch become the values for the regex. */
5016
5017 if (*last_branch != OP_ALT)
5018 {
5019 firstbyte = branchfirstbyte;
5020 reqbyte = branchreqbyte;
5021 }
5022
5023 /* If this is not the first branch, the first char and reqbyte have to
5024 match the values from all the previous branches, except that if the
5025 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5026 and we set REQ_VARY for the regex. */
5027
5028 else
5029 {
5030 /* If we previously had a firstbyte, but it doesn't match the new branch,
5031 we have to abandon the firstbyte for the regex, but if there was
5032 previously no reqbyte, it takes on the value of the old firstbyte. */
5033
5034 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5035 {
5036 if (reqbyte < 0) reqbyte = firstbyte;
5037 firstbyte = REQ_NONE;
5038 }
5039
5040 /* If we (now or from before) have no firstbyte, a firstbyte from the
5041 branch becomes a reqbyte if there isn't a branch reqbyte. */
5042
5043 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5044 branchreqbyte = branchfirstbyte;
5045
5046 /* Now ensure that the reqbytes match */
5047
5048 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5049 reqbyte = REQ_NONE;
5050 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5051 }
5052
5053 /* If lookbehind, check that this branch matches a fixed-length string, and
5054 put the length into the OP_REVERSE item. Temporarily mark the end of the
5055 branch with OP_END. */
5056
5057 if (lookbehind)
5058 {
5059 int fixed_length;
5060 *code = OP_END;
5061 fixed_length = find_fixedlength(last_branch, options);
5062 DPRINTF(("fixed length = %d\n", fixed_length));
5063 if (fixed_length < 0)
5064 {
5065 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5066 *ptrptr = ptr;
5067 return FALSE;
5068 }
5069 PUT(reverse_count, 0, fixed_length);
5070 }
5071 }
5072
5073 /* Reached end of expression, either ')' or end of pattern. In the real
5074 compile phase, go back through the alternative branches and reverse the chain
5075 of offsets, with the field in the BRA item now becoming an offset to the
5076 first alternative. If there are no alternatives, it points to the end of the
5077 group. The length in the terminating ket is always the length of the whole
5078 bracketed item. If any of the ims options were changed inside the group,
5079 compile a resetting op-code following, except at the very end of the pattern.
5080 Return leaving the pointer at the terminating char. */
5081
5082 if (*ptr != '|')
5083 {
5084 if (lengthptr == NULL)
5085 {
5086 int branch_length = code - last_branch; /* Length of the final branch */
5087 do
5088 {
5089 int prev_length = GET(last_branch, 1); /* Backward offset; 0 at the BRA */
5090 PUT(last_branch, 1, branch_length); /* Replace with forward offset */
5091 branch_length = prev_length;
5092 last_branch -= branch_length; /* Step back to the previous branch */
5093 }
5094 while (branch_length > 0);
5095 }
5096
5097 /* Fill in the ket */
5098
5099 *code = OP_KET;
5100 PUT(code, 1, code - start_bracket);
5101 code += 1 + LINK_SIZE;
5102
5103 /* Resetting option if needed */
5104
5105 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5106 {
5107 *code++ = OP_OPT;
5108 *code++ = oldims;
5109 length += 2;
5110 }
5111
5112 /* Retain the highest bracket number, in case resetting was used. */
5113
5114 cd->bracount = max_bracount;
5115
5116 /* Set values to pass back */
5117
5118 *codeptr = code;
5119 *ptrptr = ptr;
5120 *firstbyteptr = firstbyte;
5121 *reqbyteptr = reqbyte;
5122 if (lengthptr != NULL) *lengthptr += length; /* Pre-compile phase only */
5123 return TRUE;
5124 }
5125
5126 /* Another branch follows. In the pre-compile phase, we can move the code
5127 pointer back to where it was for the start of the first branch. (That is,
5128 pretend that each branch is the only one.)
5129
5130 In the real compile phase, insert an ALT node. Its length field points back
5131 to the previous branch while the bracket remains open. At the end the chain
5132 is reversed. It's done like this so that the start of the bracket has a
5133 zero offset until it is closed, making it possible to detect recursion. */
5134
5135 if (lengthptr != NULL)
5136 {
5137 code = *codeptr + 1 + LINK_SIZE + skipbytes; /* Back to start of first branch */
5138 length += 1 + LINK_SIZE; /* But still count the ALT item */
5139 }
5140 else
5141 {
5142 *code = OP_ALT;
5143 PUT(code, 1, code - last_branch);
5144 bc.current = last_branch = code;
5145 code += 1 + LINK_SIZE;
5146 }
5147
5148 ptr++; /* Move past the vertical bar */
5149 }
5150 /* Control never reaches here */
5151 }
5152
5153
5154
5155
5156 /*************************************************
5157 * Check for anchored expression *
5158 *************************************************/
5159
5160 /* Try to find out if this is an anchored regular expression. Consider each
5161 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5162 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5163 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5164 counts, since OP_CIRC can match in the middle.
5165
5166 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5167 This is the code for \G, which means "match at start of match position, taking
5168 into account the match offset".
5169
5170 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5171 because that will try the rest of the pattern at all possible matching points,
5172 so there is no point trying again.... er ....
5173
5174 .... except when the .* appears inside capturing parentheses, and there is a
5175 subsequent back reference to those parentheses. We haven't enough information
5176 to catch that case precisely.
5177
5178 At first, the best we could do was to detect when .* was in capturing brackets
5179 and the highest back reference was greater than or equal to that level.
5180 However, by keeping a bitmap of the first 31 back references, we can catch some
5181 of the more common cases more precisely.
5182
5183 Arguments:
5184 code points to start of expression (the bracket)
5185 options points to the options setting
5186 bracket_map a bitmap of which brackets we are inside while testing; this
5187 handles up to substring 31; after that we just have to take
5188 the less precise approach
5189 backref_map the back reference bitmap
5190
5191 Returns: TRUE or FALSE
5192 */
5193
5194 static BOOL
5195 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5196 unsigned int backref_map)
5197 {
5198 do {
5199 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5200 options, PCRE_MULTILINE, FALSE);
5201 register int op = *scode;
5202
5203 /* Non-capturing brackets */
5204
5205 if (op == OP_BRA)
5206 {
5207 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5208 }
5209
5210 /* Capturing brackets */
5211
5212 else if (op == OP_CBRA)
5213 {
5214 int n = GET2(scode, 1+LINK_SIZE);
5215 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5216 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5217 }
5218
5219 /* Other brackets */
5220
5221 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5222 {
5223 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5224 }
5225
5226 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5227 are or may be referenced. */
5228
5229 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5230 op == OP_TYPEPOSSTAR) &&
5231 (*options & PCRE_DOTALL) != 0)
5232 {
5233 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5234 }
5235
5236 /* Check for explicit anchoring */
5237
5238 else if (op != OP_SOD && op != OP_SOM &&
5239 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5240 return FALSE;
5241 code += GET(code, 1);
5242 }
5243 while (*code == OP_ALT); /* Loop for each alternative */
5244 return TRUE;
5245 }
5246
5247
5248
5249 /*************************************************
5250 * Check for starting with ^ or .* *
5251 *************************************************/
5252
5253 /* This is called to find out if every branch starts with ^ or .* so that
5254 "first char" processing can be done to speed things up in multiline
5255 matching and for non-DOTALL patterns that start with .* (which must start at
5256 the beginning or after \n). As in the case of is_anchored() (see above), we
5257 have to take account of back references to capturing brackets that contain .*
5258 because in that case we can't make the assumption.
5259
5260 Arguments:
5261 code points to start of expression (the bracket)
5262 bracket_map a bitmap of which brackets we are inside while testing; this
5263 handles up to substring 31; after that we just have to take
5264 the less precise approach
5265 backref_map the back reference bitmap
5266
5267 Returns: TRUE or FALSE
5268 */
5269
5270 static BOOL
5271 is_startline(const uschar *code, unsigned int bracket_map,
5272 unsigned int backref_map)
5273 {
5274 do {
5275 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5276 NULL, 0, FALSE);
5277 register int op = *scode;
5278
5279 /* Non-capturing brackets */
5280
5281 if (op == OP_BRA)
5282 {
5283 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5284 }
5285
5286 /* Capturing brackets */
5287
5288 else if (op == OP_CBRA)
5289 {
5290 int n = GET2(scode, 1+LINK_SIZE);
5291 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5292 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5293 }
5294
5295 /* Other brackets */
5296
5297 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5298 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5299
5300 /* .* means "start at start or after \n" if it isn't in brackets that
5301 may be referenced. */
5302
5303 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5304 {
5305 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5306 }
5307
5308 /* Check for explicit circumflex */
5309
5310 else if (op != OP_CIRC) return FALSE;
5311
5312 /* Move on to the next alternative */
5313
5314 code += GET(code, 1);
5315 }
5316 while (*code == OP_ALT); /* Loop for each alternative */
5317 return TRUE;
5318 }
5319
5320
5321
5322 /*************************************************
5323 * Check for asserted fixed first char *
5324 *************************************************/
5325
5326 /* During compilation, the "first char" settings from forward assertions are
5327 discarded, because they can cause conflicts with actual literals that follow.
5328 However, if we end up without a first char setting for an unanchored pattern,
5329 it is worth scanning the regex to see if there is an initial asserted first
5330 char. If all branches start with the same asserted char, or with a bracket all
5331 of whose alternatives start with the same asserted char (recurse ad lib), then
5332 we return that char, otherwise -1.
5333
5334 Arguments:
5335 code points to start of expression (the bracket)
5336 options pointer to the options (used to check casing changes)
5337 inassert TRUE if in an assertion
5338
5339 Returns: -1 or the fixed first char
5340 */
5341
5342 static int
5343 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5344 {
5345 register int c = -1;
5346 do {
5347 int d;
5348 const uschar *scode =
5349 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5350 register int op = *scode;
5351
5352 switch(op)
5353 {
5354 default:
5355 return -1;
5356
5357 case OP_BRA:
5358 case OP_CBRA:
5359 case OP_ASSERT:
5360 case OP_ONCE:
5361 case OP_COND:
5362 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5363 return -1;
5364 if (c < 0) c = d; else if (c != d) return -1;
5365 break;
5366
5367 case OP_EXACT: /* Fall through */
5368 scode += 2;
5369
5370 case OP_CHAR:
5371 case OP_CHARNC:
5372 case OP_PLUS:
5373 case OP_MINPLUS:
5374 case OP_POSPLUS:
5375 if (!inassert) return -1;
5376 if (c < 0)
5377 {
5378 c = scode[1];
5379 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5380 }
5381 else if (c != scode[1]) return -1;
5382 break;
5383 }
5384
5385 code += GET(code, 1);
5386 }
5387 while (*code == OP_ALT);
5388 return c;
5389 }
5390
5391
5392
5393 /*************************************************
5394 * Compile a Regular Expression *
5395 *************************************************/
5396
5397 /* This function takes a string and returns a pointer to a block of store
5398 holding a compiled version of the expression. The original API for this
5399 function had no error code return variable; it is retained for backwards
5400 compatibility. The new function is given a new name.
5401
5402 Arguments:
5403 pattern the regular expression
5404 options various option bits
5405 errorcodeptr pointer to error code variable (pcre_compile2() only)
5406 can be NULL if you don't want a code value
5407 errorptr pointer to pointer to error text
5408 erroroffset ptr offset in pattern where error was detected
5409 tables pointer to character tables or NULL
5410
5411 Returns: pointer to compiled data block, or NULL on error,
5412 with errorptr and erroroffset set
5413 */
5414
5415 PCRE_EXP_DEFN pcre *
5416 pcre_compile(const char *pattern, int options, const char **errorptr,
5417 int *erroroffset, const unsigned char *tables)
5418 {
5419 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5420 }
5421
5422
5423 PCRE_EXP_DEFN pcre *
5424 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5425 const char **errorptr, int *erroroffset, const unsigned char *tables)
5426 {
5427 real_pcre *re;
5428 int length = 1; /* For final END opcode */
5429 int firstbyte, reqbyte, newline;
5430 int errorcode = 0;
5431 #ifdef SUPPORT_UTF8
5432 BOOL utf8;
5433 #endif
5434 size_t size;
5435 uschar *code;
5436 const uschar *codestart;
5437 const uschar *ptr;
5438 compile_data compile_block;
5439 compile_data *cd = &compile_block;
5440
5441 /* This space is used for "compiling" into during the first phase, when we are
5442 computing the amount of memory that is needed. Compiled items are thrown away
5443 as soon as possible, so that a fairly large buffer should be sufficient for
5444 this purpose. The same space is used in the second phase for remembering where
5445 to fill in forward references to subpatterns. */
5446
5447 uschar cworkspace[COMPILE_WORK_SIZE];
5448
5449
5450 /* Set this early so that early errors get offset 0. */
5451
5452 ptr = (const uschar *)pattern;
5453
5454 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5455 can do is just return NULL, but we can set a code value if there is a code
5456 pointer. */
5457
5458 if (errorptr == NULL)
5459 {
5460 if (errorcodeptr != NULL) *errorcodeptr = 99;
5461 return NULL;
5462 }
5463
5464 *errorptr = NULL;
5465 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5466
5467 /* However, we can give a message for this error */
5468
5469 if (erroroffset == NULL)
5470 {
5471 errorcode = ERR16;
5472 goto PCRE_EARLY_ERROR_RETURN2;
5473 }
5474
5475 *erroroffset = 0;
5476
5477 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5478
5479 #ifdef SUPPORT_UTF8
5480 utf8 = (options & PCRE_UTF8) != 0;
5481 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5482 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5483 {
5484 errorcode = ERR44;
5485 goto PCRE_EARLY_ERROR_RETURN2;
5486 }
5487 #else
5488 if ((options & PCRE_UTF8) != 0)
5489 {
5490 errorcode = ERR32;
5491 goto PCRE_EARLY_ERROR_RETURN;
5492 }
5493 #endif
5494
5495 if ((options & ~PUBLIC_OPTIONS) != 0)
5496 {
5497 errorcode = ERR17;
5498 goto PCRE_EARLY_ERROR_RETURN;
5499 }
5500
5501 /* Set up pointers to the individual character tables */
5502
5503 if (tables == NULL) tables = _pcre_default_tables;
5504 cd->lcc = tables + lcc_offset;
5505 cd->fcc = tables + fcc_offset;
5506 cd->cbits = tables + cbits_offset;
5507 cd->ctypes = tables + ctypes_offset;
5508
5509 /* Handle different types of newline. The three bits give seven cases. The
5510 current code allows for fixed one- or two-byte sequences, plus "any" and
5511 "anycrlf". */
5512
5513 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5514 {
5515 case 0: newline = NEWLINE; break; /* Compile-time default */
5516 case PCRE_NEWLINE_CR: newline = '\r'; break;
5517 case PCRE_NEWLINE_LF: newline = '\n'; break;
5518 case PCRE_NEWLINE_CR+
5519 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5520 case PCRE_NEWLINE_ANY: newline = -1; break;
5521 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5522 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5523 }
5524
5525 if (newline == -2)
5526 {
5527 cd->nltype = NLTYPE_ANYCRLF;
5528 }
5529 else if (newline < 0)
5530 {
5531 cd->nltype = NLTYPE_ANY;
5532 }
5533 else
5534 {
5535 cd->nltype = NLTYPE_FIXED;
5536 if (newline > 255)
5537 {
5538 cd->nllen = 2;
5539 cd->nl[0] = (newline >> 8) & 255;
5540 cd->nl[1] = newline & 255;
5541 }
5542 else
5543 {
5544 cd->nllen = 1;
5545 cd->nl[0] = newline;
5546 }
5547 }
5548
5549 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5550 references to help in deciding whether (.*) can be treated as anchored or not.
5551 */
5552
5553 cd->top_backref = 0;
5554 cd->backref_map = 0;
5555
5556 /* Reflect pattern for debugging output */
5557
5558 DPRINTF(("------------------------------------------------------------------\n"));
5559 DPRINTF(("%s\n", pattern));
5560
5561 /* Pretend to compile the pattern while actually just accumulating the length
5562 of memory required. This behaviour is triggered by passing a non-NULL final
5563 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5564 to compile parts of the pattern into; the compiled code is discarded when it is
5565 no longer needed, so hopefully this workspace will never overflow, though there
5566 is a test for its doing so. */
5567
5568 cd->bracount = 0;
5569 cd->names_found = 0;
5570 cd->name_entry_size = 0;
5571 cd->name_table = NULL;
5572 cd->start_workspace = cworkspace;
5573 cd->start_code = cworkspace;
5574 cd->hwm = cworkspace;
5575 cd->start_pattern = (const uschar *)pattern;
5576 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5577 cd->req_varyopt = 0;
5578 cd->nopartial = FALSE;
5579 cd->external_options = options;
5580
5581 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5582 don't need to look at the result of the function here. The initial options have
5583 been put into the cd block so that they can be changed if an option setting is
5584 found within the regex right at the beginning. Bringing initial option settings
5585 outside can help speed up starting point checks. */
5586
5587 code = cworkspace;
5588 *code = OP_BRA;
5589 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5590 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5591 &length);
5592 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5593
5594 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5595 cd->hwm - cworkspace));
5596
5597 if (length > MAX_PATTERN_SIZE)
5598 {
5599 errorcode = ERR20;
5600 goto PCRE_EARLY_ERROR_RETURN;
5601 }
5602
5603 /* Compute the size of data block needed and get it, either from malloc or
5604 externally provided function. Integer overflow should no longer be possible
5605 because nowadays we limit the maximum value of cd->names_found and
5606 cd->name_entry_size. */
5607
5608 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5609 re = (real_pcre *)(pcre_malloc)(size);
5610
5611 if (re == NULL)
5612 {
5613 errorcode = ERR21;
5614 goto PCRE_EARLY_ERROR_RETURN;
5615 }
5616
5617 /* Put in the magic number, and save the sizes, initial options, and character
5618 table pointer. NULL is used for the default character tables. The nullpad field
5619 is at the end; it's there to help in the case when a regex compiled on a system
5620 with 4-byte pointers is run on another with 8-byte pointers. */
5621
5622 re->magic_number = MAGIC_NUMBER;
5623 re->size = size;
5624 re->options = cd->external_options;
5625 re->dummy1 = 0;
5626 re->first_byte = 0;
5627 re->req_byte = 0;
5628 re->name_table_offset = sizeof(real_pcre);
5629 re->name_entry_size = cd->name_entry_size;
5630 re->name_count = cd->names_found;
5631 re->ref_count = 0;
5632 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5633 re->nullpad = NULL;
5634
5635 /* The starting points of the name/number translation table and of the code are
5636 passed around in the compile data block. The start/end pattern and initial
5637 options are already set from the pre-compile phase, as is the name_entry_size
5638 field. Reset the bracket count and the names_found field. Also reset the hwm
5639 field; this time it's used for remembering forward references to subpatterns.
5640 */
5641
5642 cd->bracount = 0;
5643 cd->names_found = 0;
5644 cd->name_table = (uschar *)re + re->name_table_offset;
5645 codestart = cd->name_table + re->name_entry_size * re->name_count;
5646 cd->start_code = codestart;
5647 cd->hwm = cworkspace;
5648 cd->req_varyopt = 0;
5649 cd->nopartial = FALSE;
5650
5651 /* Set up a starting, non-extracting bracket, then compile the expression. On
5652 error, errorcode will be set non-zero, so we don't need to look at the result
5653 of the function here. */
5654
5655 ptr = (const uschar *)pattern;
5656 code = (uschar *)codestart;
5657 *code = OP_BRA;
5658 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5659 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5660 re->top_bracket = cd->bracount;
5661 re->top_backref = cd->top_backref;
5662
5663 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5664
5665 /* If not reached end of pattern on success, there's an excess bracket. */
5666
5667 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5668
5669 /* Fill in the terminating state and check for disastrous overflow, but
5670 if debugging, leave the test till after things are printed out. */
5671
5672 *code++ = OP_END;
5673
5674 #ifndef DEBUG
5675 if (code - codestart > length) errorcode = ERR23;
5676 #endif
5677
5678 /* Fill in any forward references that are required. */
5679
5680 while (errorcode == 0 && cd->hwm > cworkspace)
5681 {
5682 int offset, recno;
5683 const uschar *groupptr;
5684 cd->hwm -= LINK_SIZE;
5685 offset = GET(cd->hwm, 0);
5686 recno = GET(codestart, offset);
5687 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5688 if (groupptr == NULL) errorcode = ERR53;
5689 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5690 }
5691
5692 /* Give an error if there's back reference to a non-existent capturing
5693 subpattern. */
5694
5695 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5696
5697 /* Failed to compile, or error while post-processing */
5698
5699 if (errorcode != 0)
5700 {
5701 (pcre_free)(re);
5702 PCRE_EARLY_ERROR_RETURN:
5703 *erroroffset = ptr - (const uschar *)pattern;
5704 PCRE_EARLY_ERROR_RETURN2:
5705 *errorptr = error_texts[errorcode];
5706 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5707 return NULL;
5708 }
5709
5710 /* If the anchored option was not passed, set the flag if we can determine that
5711 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5712 as starting with .* when DOTALL is set).
5713
5714 Otherwise, if we know what the first byte has to be, save it, because that
5715 speeds up unanchored matches no end. If not, see if we can set the
5716 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5717 start with ^. and also when all branches start with .* for non-DOTALL matches.
5718 */
5719
5720 if ((re->options & PCRE_ANCHORED) == 0)
5721 {
5722 int temp_options = re->options; /* May get changed during these scans */
5723 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5724 re->options |= PCRE_ANCHORED;
5725 else
5726 {
5727 if (firstbyte < 0)
5728 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5729 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5730 {
5731 int ch = firstbyte & 255;
5732 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5733 cd->fcc[ch] == ch)? ch : firstbyte;
5734 re->options |= PCRE_FIRSTSET;
5735 }
5736 else if (is_startline(codestart, 0, cd->backref_map))
5737 re->options |= PCRE_STARTLINE;
5738 }
5739 }
5740
5741 /* For an anchored pattern, we use the "required byte" only if it follows a
5742 variable length item in the regex. Remove the caseless flag for non-caseable
5743 bytes. */
5744
5745 if (reqbyte >= 0 &&
5746 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5747 {
5748 int ch = reqbyte & 255;
5749 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5750 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5751 re->options |= PCRE_REQCHSET;
5752 }
5753
5754 /* Print out the compiled data if debugging is enabled. This is never the
5755 case when building a production library. */
5756
5757 #ifdef DEBUG
5758
5759 printf("Length = %d top_bracket = %d top_backref = %d\n",
5760 length, re->top_bracket, re->top_backref);
5761
5762 if (re->options != 0)
5763 {
5764 printf("%s%s%s%s%s%s%s%s%s\n",
5765 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5766 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5767 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5768 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5769 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5770 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5771 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5772 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5773 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5774 }
5775
5776 if ((re->options & PCRE_FIRSTSET) != 0)
5777 {
5778 int ch = re->first_byte & 255;
5779 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5780 "" : " (caseless)";
5781 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5782 else printf("First char = \\x%02x%s\n", ch, caseless);
5783 }
5784
5785 if ((re->options & PCRE_REQCHSET) != 0)
5786 {
5787 int ch = re->req_byte & 255;
5788 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5789 "" : " (caseless)";
5790 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5791 else printf("Req char = \\x%02x%s\n", ch, caseless);
5792 }
5793
5794 pcre_printint(re, stdout, TRUE);
5795
5796 /* This check is done here in the debugging case so that the code that
5797 was compiled can be seen. */
5798
5799 if (code - codestart > length)
5800 {
5801 (pcre_free)(re);
5802 *errorptr = error_texts[ERR23];
5803 *erroroffset = ptr - (uschar *)pattern;
5804 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5805 return NULL;
5806 }
5807 #endif /* DEBUG */
5808
5809 return (pcre *)re;
5810 }
5811
5812 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12