/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 236 - (show annotations) (download)
Tue Sep 11 12:57:06 2007 UTC (7 years ago) by ph10
File MIME type: text/plain
File size: 191944 byte(s)
<config.h> => "config.h" and also some cases of <pcre.h>.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a. Each argument and the whole expansion are parenthesized so
that expression arguments, for example SETBIT(map, c + 1), expand with the
intended precedence. Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))

/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)


/*************************************************
*      Code parameters and static tables         *
*************************************************/

/* This value specifies the size of stack workspace that is used during the
first pre-compile phase that determines how much memory is required. The regex
is partly compiled into this space, but the compiled parts are discarded as
soon as they can be, so that hopefully there will never be an overrun. The code
does, however, check for an overrun. The largest amount I've seen used is 218,
so this number is very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room. */

#define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE) */
144
145 typedef struct verbitem {
146 const char *name;
147 int len;
148 int op;
149 } verbitem;
150
151 static verbitem verbs[] = {
152 { "ACCEPT", 6, OP_ACCEPT },
153 { "COMMIT", 6, OP_COMMIT },
154 { "F", 1, OP_FAIL },
155 { "FAIL", 4, OP_FAIL },
156 { "PRUNE", 5, OP_PRUNE },
157 { "SKIP", 4, OP_SKIP },
158 { "THEN", 4, OP_THEN }
159 };
160
161 static int verbcount = sizeof(verbs)/sizeof(verbitem);
162
163
164 /* Tables of names of POSIX character classes and their lengths. The list is
165 terminated by a zero length entry. The first three must be alpha, lower, upper,
166 as this is assumed for handling case independence. */
167
168 static const char *const posix_names[] = {
169 "alpha", "lower", "upper",
170 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
171 "print", "punct", "space", "word", "xdigit" };
172
173 static const uschar posix_name_lengths[] = {
174 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175
176 /* Table of class bit maps for each POSIX class. Each class is formed from a
177 base map, with an optional addition or removal of another map. Then, for some
178 classes, there is some additional tweaking: for [:blank:] the vertical space
179 characters are removed, and for [:alpha:] and [:alnum:] the underscore
180 character is removed. The triples in the table consist of the base map offset,
181 second map offset or -1 if no second map, and a non-negative value for map
182 addition or a negative value for map subtraction (if there are two maps). The
183 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184 remove vertical space characters, 2 => remove underscore. */
185
186 static const int posix_class_maps[] = {
187 cbit_word, cbit_digit, -2, /* alpha */
188 cbit_lower, -1, 0, /* lower */
189 cbit_upper, -1, 0, /* upper */
190 cbit_word, -1, 2, /* alnum - word without underscore */
191 cbit_print, cbit_cntrl, 0, /* ascii */
192 cbit_space, -1, 1, /* blank - a GNU extension */
193 cbit_cntrl, -1, 0, /* cntrl */
194 cbit_digit, -1, 0, /* digit */
195 cbit_graph, -1, 0, /* graph */
196 cbit_print, -1, 0, /* print */
197 cbit_punct, -1, 0, /* punct */
198 cbit_space, -1, 0, /* space */
199 cbit_word, -1, 0, /* word - a Perl extension */
200 cbit_xdigit,-1, 0 /* xdigit */
201 };
202
203
/* STRING() turns its argument into a string literal as written; XSTRING()
macro-expands its argument first, so it can be used to embed the value of a
numeric macro in a message. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used. The index of a message in this table is its error number.
The table itself is never written to, so the array of pointers is const as
well as the characters. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  "(*VERB) with an argument is not supported",
  /* 60 */
  "(*VERB) not recognized",
  "number is too big"
};
288
289
/* Private table for recognizing decimal and hexadecimal digits while a
pattern is being compiled. The tables in chartables depend on the locale and
may mark arbitrary characters as digits, but the compiling code expects to
handle only 0-9, a-z, and A-Z as digits, hence this separate table. It costs
256 bytes, but it is a lot faster than doing character value tests (at least
in some simple cases that were timed), and some applications want PCRE to
compile efficiently as well as match efficiently.

The bit values are the same as those used in chartables, so that ctype_digit
and ctype_xdigit can be used in the code:

  0x04   decimal digit
  0x08   hexadecimal digit

Each row below holds the 16 entries starting at the code value shown. */

#ifndef EBCDIC  /* The "normal" case, for ASCII systems */
static const unsigned char digitab[] = {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,  /* 0 - ? */
  /* 40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* @ - O */
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* P - _ */
  /* 60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* ` - o */
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* p - 127 */
  /* 80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};

#else           /* The "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] = {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* a - f */
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* A - F */
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00   /* 0 - 9 */
};

/* Partial duplicate of the character-type table, for EBCDIC systems */

static const unsigned char ebcdic_chartab[] = {
  /* 00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
};
#endif
412
413
414 /* Definition to allow mutual recursion */
415
416 static BOOL
417 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 int *, int *, branch_chain *, compile_data *, int *);
419
420
421
422 /*************************************************
423 * Handle escapes *
424 *************************************************/
425
426 /* This function is called when a \ has been encountered. It either returns a
427 positive value for a simple escape such as \n, or a negative value which
428 encodes one of the more complicated things such as \d. A backreference to group
429 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431 ptr is pointing at the \. On exit, it is on the final character of the escape
432 sequence.
433
434 Arguments:
435 ptrptr points to the pattern position pointer
436 errorcodeptr points to the errorcode variable
437 bracount number of previous extracting brackets
438 options the options bits
439 isclass TRUE if inside a character class
440
441 Returns: zero or positive => a data character
442 negative => a special escape sequence
443 on error, errorcodeptr is set
444 */
445
446 static int
447 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448 int options, BOOL isclass)
449 {
450 BOOL utf8 = (options & PCRE_UTF8) != 0;
451 const uschar *ptr = *ptrptr + 1;
452 int c, i;
453
454 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455 ptr--; /* Set pointer back to the last byte */
456
457 /* If backslash is at the end of the pattern, it's an error. */
458
459 if (c == 0) *errorcodeptr = ERR1;
460
461 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462 a table. A non-zero result is something that can be returned immediately.
463 Otherwise further processing may be required. */
464
465 #ifndef EBCDIC /* ASCII coding */
466 else if (c < '0' || c > 'z') {} /* Not alphameric */
467 else if ((i = escapes[c - '0']) != 0) c = i;
468
469 #else /* EBCDIC coding */
470 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471 else if ((i = escapes[c - 0x48]) != 0) c = i;
472 #endif
473
474 /* Escapes that need further processing, or are illegal. */
475
476 else
477 {
478 const uschar *oldptr;
479 BOOL braced, negated;
480
481 switch (c)
482 {
483 /* A number of Perl escapes are not handled by PCRE. We give an explicit
484 error. */
485
486 case 'l':
487 case 'L':
488 case 'N':
489 case 'u':
490 case 'U':
491 *errorcodeptr = ERR37;
492 break;
493
494 /* \g must be followed by a number, either plain or braced. If positive, it
495 is an absolute backreference. If negative, it is a relative backreference.
496 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497 reference to a named group. This is part of Perl's movement towards a
498 unified syntax for back references. As this is synonymous with \k{name}, we
499 fudge it up by pretending it really was \k. */
500
501 case 'g':
502 if (ptr[1] == '{')
503 {
504 const uschar *p;
505 for (p = ptr+2; *p != 0 && *p != '}'; p++)
506 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 if (*p != 0 && *p != '}')
508 {
509 c = -ESC_k;
510 break;
511 }
512 braced = TRUE;
513 ptr++;
514 }
515 else braced = FALSE;
516
517 if (ptr[1] == '-')
518 {
519 negated = TRUE;
520 ptr++;
521 }
522 else negated = FALSE;
523
524 c = 0;
525 while ((digitab[ptr[1]] & ctype_digit) != 0)
526 c = c * 10 + *(++ptr) - '0';
527
528 if (c < 0)
529 {
530 *errorcodeptr = ERR61;
531 break;
532 }
533
534 if (c == 0 || (braced && *(++ptr) != '}'))
535 {
536 *errorcodeptr = ERR57;
537 break;
538 }
539
540 if (negated)
541 {
542 if (c > bracount)
543 {
544 *errorcodeptr = ERR15;
545 break;
546 }
547 c = bracount - (c - 1);
548 }
549
550 c = -(ESC_REF + c);
551 break;
552
553 /* The handling of escape sequences consisting of a string of digits
554 starting with one that is not zero is not straightforward. By experiment,
555 the way Perl works seems to be as follows:
556
557 Outside a character class, the digits are read as a decimal number. If the
558 number is less than 10, or if there are that many previous extracting
559 left brackets, then it is a back reference. Otherwise, up to three octal
560 digits are read to form an escaped byte. Thus \123 is likely to be octal
561 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562 value is greater than 377, the least significant 8 bits are taken. Inside a
563 character class, \ followed by a digit is always an octal number. */
564
565 case '1': case '2': case '3': case '4': case '5':
566 case '6': case '7': case '8': case '9':
567
568 if (!isclass)
569 {
570 oldptr = ptr;
571 c -= '0';
572 while ((digitab[ptr[1]] & ctype_digit) != 0)
573 c = c * 10 + *(++ptr) - '0';
574 if (c < 0)
575 {
576 *errorcodeptr = ERR61;
577 break;
578 }
579 if (c < 10 || c <= bracount)
580 {
581 c = -(ESC_REF + c);
582 break;
583 }
584 ptr = oldptr; /* Put the pointer back and fall through */
585 }
586
587 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588 generates a binary zero byte and treats the digit as a following literal.
589 Thus we have to pull back the pointer by one. */
590
591 if ((c = *ptr) >= '8')
592 {
593 ptr--;
594 c = 0;
595 break;
596 }
597
598 /* \0 always starts an octal number, but we may drop through to here with a
599 larger first octal digit. The original code used just to take the least
600 significant 8 bits of octal numbers (I think this is what early Perls used
601 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602 than 3 octal digits. */
603
604 case '0':
605 c -= '0';
606 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607 c = c * 8 + *(++ptr) - '0';
608 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 break;
610
611 /* \x is complicated. \x{ddd} is a character number which can be greater
612 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613 treated as a data character. */
614
615 case 'x':
616 if (ptr[1] == '{')
617 {
618 const uschar *pt = ptr + 2;
619 int count = 0;
620
621 c = 0;
622 while ((digitab[*pt] & ctype_xdigit) != 0)
623 {
624 register int cc = *pt++;
625 if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 count++;
627
628 #ifndef EBCDIC /* ASCII coding */
629 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 #else /* EBCDIC coding */
632 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 #endif
635 }
636
637 if (*pt == '}')
638 {
639 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 ptr = pt;
641 break;
642 }
643
644 /* If the sequence of hex digits does not end with '}', then we don't
645 recognize this construct; fall through to the normal \x handling. */
646 }
647
648 /* Read just a single-byte hex-defined char */
649
650 c = 0;
651 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652 {
653 int cc; /* Some compilers don't like ++ */
654 cc = *(++ptr); /* in initializers */
655 #ifndef EBCDIC /* ASCII coding */
656 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 #else /* EBCDIC coding */
659 if (cc <= 'z') cc += 64; /* Convert to upper case */
660 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661 #endif
662 }
663 break;
664
665 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666 This coding is ASCII-specific, but then the whole concept of \cx is
667 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668
669 case 'c':
670 c = *(++ptr);
671 if (c == 0)
672 {
673 *errorcodeptr = ERR2;
674 break;
675 }
676
677 #ifndef EBCDIC /* ASCII coding */
678 if (c >= 'a' && c <= 'z') c -= 32;
679 c ^= 0x40;
680 #else /* EBCDIC coding */
681 if (c >= 'a' && c <= 'z') c += 64;
682 c ^= 0xC0;
683 #endif
684 break;
685
686 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688 for Perl compatibility, it is a literal. This code looks a bit odd, but
689 there used to be some cases other than the default, and there may be again
690 in future, so I haven't "optimized" it. */
691
692 default:
693 if ((options & PCRE_EXTRA) != 0) switch(c)
694 {
695 default:
696 *errorcodeptr = ERR3;
697 break;
698 }
699 break;
700 }
701 }
702
703 *ptrptr = ptr;
704 return c;
705 }
706
707
708
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence. The property name is either the single character that follows,
or a name in {}, optionally preceded by ^ for negation. A braced name can be at
most 30 characters long, because of the size of the local buffer. The name is
looked up in _pcre_utt[] with a binary chop.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from ucp_type_table, or -1 for an invalid type
                 (ERR46 is set for a malformed sequence, ERR47 for an
                 unknown property name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

if (ch != '{')
  {
  /* Just one following character forms the name */
  name[0] = ch;
  name[1] = 0;
  }
else
  {
  /* A name in {}, optionally preceded by ^ for negation */
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  while (len < (int)sizeof(name) - 1)
    {
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == '}') break;
    name[len++] = ch;
    }

  /* Leaving the loop without seeing } means the name was too long */
  if (ch != '}') goto ERROR_RETURN;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) / 2;
  int cmp = strcmp(name, _pcre_utt[mid].name);
  if (cmp == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  if (cmp > 0) lo = mid + 1;
    else hi = mid;
  }

/* The name is well formed, but is not in the table */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
798
799
800
801
802 /*************************************************
803 * Check for counted repeat *
804 *************************************************/
805
806 /* This function is called when a '{' is encountered in a place where it might
807 start a quantifier. It looks ahead to see if it really is a quantifier or not.
808 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809 where the ddds are digits.
810
811 Arguments:
812 p pointer to the first char after '{'
813
814 Returns: TRUE or FALSE
815 */
816
817 static BOOL
818 is_counted_repeat(const uschar *p)
819 {
820 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821 while ((digitab[*p] & ctype_digit) != 0) p++;
822 if (*p == '}') return TRUE;
823
824 if (*p++ != ',') return FALSE;
825 if (*p == '}') return TRUE;
826
827 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828 while ((digitab[*p] & ctype_digit) != 0) p++;
829
830 return (*p == '}');
831 }
832
833
834
835 /*************************************************
836 * Read repeat counts *
837 *************************************************/
838
839 /* Read an item of the form {n,m} and return the values. This is called only
840 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841 so the syntax is guaranteed to be correct, but we need to check the values.
842
843 Arguments:
844 p pointer to first char after '{'
845 minp pointer to int for min
846 maxp pointer to int for max
847 returned as -1 if no max
848 errorcodeptr points to error code variable
849
850 Returns: pointer to '}' on success;
851 current ptr on error, with errorcodeptr set non-zero
852 */
853
854 static const uschar *
855 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856 {
857 int min = 0;
858 int max = -1;
859
860 /* Read the minimum value and do a paranoid check: a negative value indicates
861 an integer overflow. */
862
863 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 if (min < 0 || min > 65535)
865 {
866 *errorcodeptr = ERR5;
867 return p;
868 }
869
870 /* Read the maximum value if there is one, and again do a paranoid on its size.
871 Also, max must not be less than min. */
872
873 if (*p == '}') max = min; else
874 {
875 if (*(++p) != '}')
876 {
877 max = 0;
878 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 if (max < 0 || max > 65535)
880 {
881 *errorcodeptr = ERR5;
882 return p;
883 }
884 if (max < min)
885 {
886 *errorcodeptr = ERR4;
887 return p;
888 }
889 }
890 }
891
892 /* Fill in the required variables, and pass back the pointer to the terminating
893 '}'. */
894
895 *minp = min;
896 *maxp = max;
897 return p;
898 }
899
900
901
902 /*************************************************
903 * Find forward referenced subpattern *
904 *************************************************/
905
906 /* This function scans along a pattern's text looking for capturing
907 subpatterns, and counting them. If it finds a named pattern that matches the
908 name it is given, it returns its number. Alternatively, if the name is NULL, it
909 returns when it reaches a given numbered subpattern. This is used for forward
910 references to subpatterns. We know that if (?P< is encountered, the name will
911 be terminated by '>' because that is checked in the first pass.
912
913 Arguments:
914 ptr current position in the pattern
915 count current count of capturing parens so far encountered
916 name name to seek, or NULL if seeking a numbered subpattern
917 lorn name length, or subpattern number if name is NULL
918 xmode TRUE if we are in /x mode
919
920 Returns: the number of the named subpattern, or -1 if not found
921 */
922
923 static int
924 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925 BOOL xmode)
926 {
927 const uschar *thisname;
928
929 for (; *ptr != 0; ptr++)
930 {
931 int term;
932
933 /* Skip over backslashed characters and also entire \Q...\E */
934
935 if (*ptr == '\\')
936 {
937 if (*(++ptr) == 0) return -1;
938 if (*ptr == 'Q') for (;;)
939 {
940 while (*(++ptr) != 0 && *ptr != '\\');
941 if (*ptr == 0) return -1;
942 if (*(++ptr) == 'E') break;
943 }
944 continue;
945 }
946
947 /* Skip over character classes */
948
949 if (*ptr == '[')
950 {
951 while (*(++ptr) != ']')
952 {
953 if (*ptr == 0) return -1;
954 if (*ptr == '\\')
955 {
956 if (*(++ptr) == 0) return -1;
957 if (*ptr == 'Q') for (;;)
958 {
959 while (*(++ptr) != 0 && *ptr != '\\');
960 if (*ptr == 0) return -1;
961 if (*(++ptr) == 'E') break;
962 }
963 continue;
964 }
965 }
966 continue;
967 }
968
969 /* Skip comments in /x mode */
970
971 if (xmode && *ptr == '#')
972 {
973 while (*(++ptr) != 0 && *ptr != '\n');
974 if (*ptr == 0) return -1;
975 continue;
976 }
977
978 /* An opening parens must now be a real metacharacter */
979
980 if (*ptr != '(') continue;
981 if (ptr[1] != '?' && ptr[1] != '*')
982 {
983 count++;
984 if (name == NULL && count == lorn) return count;
985 continue;
986 }
987
988 ptr += 2;
989 if (*ptr == 'P') ptr++; /* Allow optional P */
990
991 /* We have to disambiguate (?<! and (?<= from (?<name> */
992
993 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994 *ptr != '\'')
995 continue;
996
997 count++;
998
999 if (name == NULL && count == lorn) return count;
1000 term = *ptr++;
1001 if (term == '<') term = '>';
1002 thisname = ptr;
1003 while (*ptr != term) ptr++;
1004 if (name != NULL && lorn == ptr - thisname &&
1005 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006 return count;
1007 }
1008
1009 return -1;
1010 }
1011
1012
1013
1014 /*************************************************
1015 * Find first significant op code *
1016 *************************************************/
1017
1018 /* This is called by several functions that scan a compiled expression looking
1019 for a fixed first character, or an anchoring op code etc. It skips over things
1020 that do not influence this. For some calls, a change of option is important.
1021 For some calls, it makes sense to skip negative forward and all backward
1022 assertions, and also the \b assertion; for others it does not.
1023
1024 Arguments:
1025 code pointer to the start of the group
1026 options pointer to external options
1027 optbit the option bit whose changing is significant, or
1028 zero if none are
1029 skipassert TRUE if certain assertions are to be skipped
1030
1031 Returns: pointer to the first significant opcode
1032 */
1033
1034 static const uschar*
1035 first_significant_code(const uschar *code, int *options, int optbit,
1036 BOOL skipassert)
1037 {
1038 for (;;)
1039 {
1040 switch ((int)*code)
1041 {
1042 case OP_OPT:
1043 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1044 *options = (int)code[1];
1045 code += 2;
1046 break;
1047
1048 case OP_ASSERT_NOT:
1049 case OP_ASSERTBACK:
1050 case OP_ASSERTBACK_NOT:
1051 if (!skipassert) return code;
1052 do code += GET(code, 1); while (*code == OP_ALT);
1053 code += _pcre_OP_lengths[*code];
1054 break;
1055
1056 case OP_WORD_BOUNDARY:
1057 case OP_NOT_WORD_BOUNDARY:
1058 if (!skipassert) return code;
1059 /* Fall through */
1060
1061 case OP_CALLOUT:
1062 case OP_CREF:
1063 case OP_RREF:
1064 case OP_DEF:
1065 code += _pcre_OP_lengths[*code];
1066 break;
1067
1068 default:
1069 return code;
1070 }
1071 }
1072 /* Control never reaches here */
1073 }
1074
1075
1076
1077
1078 /*************************************************
1079 * Find the fixed length of a pattern *
1080 *************************************************/
1081
1082 /* Scan a pattern and compute the fixed length of subject that will match it,
1083 if the length is fixed. This is needed for dealing with backward assertions.
1084 In UTF8 mode, the result is in characters rather than bytes.
1085
1086 Arguments:
1087 code points to the start of the pattern (the bracket)
1088 options the compiling options
1089
1090 Returns: the fixed length, or -1 if there is no fixed length,
1091 or -2 if \C was encountered
1092 */
1093
1094 static int
1095 find_fixedlength(uschar *code, int options)
1096 {
1097 int length = -1;
1098
1099 register int branchlength = 0;
1100 register uschar *cc = code + 1 + LINK_SIZE;
1101
1102 /* Scan along the opcodes for this branch. If we get to the end of the
1103 branch, check the length against that of the other branches. */
1104
1105 for (;;)
1106 {
1107 int d;
1108 register int op = *cc;
1109 switch (op)
1110 {
1111 case OP_CBRA:
1112 case OP_BRA:
1113 case OP_ONCE:
1114 case OP_COND:
1115 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1116 if (d < 0) return d;
1117 branchlength += d;
1118 do cc += GET(cc, 1); while (*cc == OP_ALT);
1119 cc += 1 + LINK_SIZE;
1120 break;
1121
1122 /* Reached end of a branch; if it's a ket it is the end of a nested
1123 call. If it's ALT it is an alternation in a nested call. If it is
1124 END it's the end of the outer call. All can be handled by the same code. */
1125
1126 case OP_ALT:
1127 case OP_KET:
1128 case OP_KETRMAX:
1129 case OP_KETRMIN:
1130 case OP_END:
1131 if (length < 0) length = branchlength;
1132 else if (length != branchlength) return -1;
1133 if (*cc != OP_ALT) return length;
1134 cc += 1 + LINK_SIZE;
1135 branchlength = 0;
1136 break;
1137
1138 /* Skip over assertive subpatterns */
1139
1140 case OP_ASSERT:
1141 case OP_ASSERT_NOT:
1142 case OP_ASSERTBACK:
1143 case OP_ASSERTBACK_NOT:
1144 do cc += GET(cc, 1); while (*cc == OP_ALT);
1145 /* Fall through */
1146
1147 /* Skip over things that don't match chars */
1148
1149 case OP_REVERSE:
1150 case OP_CREF:
1151 case OP_RREF:
1152 case OP_DEF:
1153 case OP_OPT:
1154 case OP_CALLOUT:
1155 case OP_SOD:
1156 case OP_SOM:
1157 case OP_EOD:
1158 case OP_EODN:
1159 case OP_CIRC:
1160 case OP_DOLL:
1161 case OP_NOT_WORD_BOUNDARY:
1162 case OP_WORD_BOUNDARY:
1163 cc += _pcre_OP_lengths[*cc];
1164 break;
1165
1166 /* Handle literal characters */
1167
1168 case OP_CHAR:
1169 case OP_CHARNC:
1170 case OP_NOT:
1171 branchlength++;
1172 cc += 2;
1173 #ifdef SUPPORT_UTF8
1174 if ((options & PCRE_UTF8) != 0)
1175 {
1176 while ((*cc & 0xc0) == 0x80) cc++;
1177 }
1178 #endif
1179 break;
1180
1181 /* Handle exact repetitions. The count is already in characters, but we
1182 need to skip over a multibyte character in UTF8 mode. */
1183
1184 case OP_EXACT:
1185 branchlength += GET2(cc,1);
1186 cc += 4;
1187 #ifdef SUPPORT_UTF8
1188 if ((options & PCRE_UTF8) != 0)
1189 {
1190 while((*cc & 0x80) == 0x80) cc++;
1191 }
1192 #endif
1193 break;
1194
1195 case OP_TYPEEXACT:
1196 branchlength += GET2(cc,1);
1197 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1198 cc += 4;
1199 break;
1200
1201 /* Handle single-char matchers */
1202
1203 case OP_PROP:
1204 case OP_NOTPROP:
1205 cc += 2;
1206 /* Fall through */
1207
1208 case OP_NOT_DIGIT:
1209 case OP_DIGIT:
1210 case OP_NOT_WHITESPACE:
1211 case OP_WHITESPACE:
1212 case OP_NOT_WORDCHAR:
1213 case OP_WORDCHAR:
1214 case OP_ANY:
1215 branchlength++;
1216 cc++;
1217 break;
1218
1219 /* The single-byte matcher isn't allowed */
1220
1221 case OP_ANYBYTE:
1222 return -2;
1223
1224 /* Check a class for variable quantification */
1225
1226 #ifdef SUPPORT_UTF8
1227 case OP_XCLASS:
1228 cc += GET(cc, 1) - 33;
1229 /* Fall through */
1230 #endif
1231
1232 case OP_CLASS:
1233 case OP_NCLASS:
1234 cc += 33;
1235
1236 switch (*cc)
1237 {
1238 case OP_CRSTAR:
1239 case OP_CRMINSTAR:
1240 case OP_CRQUERY:
1241 case OP_CRMINQUERY:
1242 return -1;
1243
1244 case OP_CRRANGE:
1245 case OP_CRMINRANGE:
1246 if (GET2(cc,1) != GET2(cc,3)) return -1;
1247 branchlength += GET2(cc,1);
1248 cc += 5;
1249 break;
1250
1251 default:
1252 branchlength++;
1253 }
1254 break;
1255
1256 /* Anything else is variable length */
1257
1258 default:
1259 return -1;
1260 }
1261 }
1262 /* Control never gets here */
1263 }
1264
1265
1266
1267
1268 /*************************************************
1269 * Scan compiled regex for numbered bracket *
1270 *************************************************/
1271
1272 /* This little function scans through a compiled pattern until it finds a
1273 capturing bracket with the given number.
1274
1275 Arguments:
1276 code points to start of expression
1277 utf8 TRUE in UTF-8 mode
1278 number the required bracket number
1279
1280 Returns: pointer to the opcode for the bracket, or NULL if not found
1281 */
1282
1283 static const uschar *
1284 find_bracket(const uschar *code, BOOL utf8, int number)
1285 {
1286 for (;;)
1287 {
1288 register int c = *code;
1289 if (c == OP_END) return NULL;
1290
1291 /* XCLASS is used for classes that cannot be represented just by a bit
1292 map. This includes negated single high-valued characters. The length in
1293 the table is zero; the actual length is stored in the compiled code. */
1294
1295 if (c == OP_XCLASS) code += GET(code, 1);
1296
1297 /* Handle capturing bracket */
1298
1299 else if (c == OP_CBRA)
1300 {
1301 int n = GET2(code, 1+LINK_SIZE);
1302 if (n == number) return (uschar *)code;
1303 code += _pcre_OP_lengths[c];
1304 }
1305
1306 /* Otherwise, we can get the item's length from the table, except that for
1307 repeated character types, we have to test for \p and \P, which have an extra
1308 two bytes of parameters. */
1309
1310 else
1311 {
1312 switch(c)
1313 {
1314 case OP_TYPESTAR:
1315 case OP_TYPEMINSTAR:
1316 case OP_TYPEPLUS:
1317 case OP_TYPEMINPLUS:
1318 case OP_TYPEQUERY:
1319 case OP_TYPEMINQUERY:
1320 case OP_TYPEPOSSTAR:
1321 case OP_TYPEPOSPLUS:
1322 case OP_TYPEPOSQUERY:
1323 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1324 break;
1325
1326 case OP_TYPEUPTO:
1327 case OP_TYPEMINUPTO:
1328 case OP_TYPEEXACT:
1329 case OP_TYPEPOSUPTO:
1330 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1331 break;
1332 }
1333
1334 /* Add in the fixed length from the table */
1335
1336 code += _pcre_OP_lengths[c];
1337
1338 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1339 a multi-byte character. The length in the table is a minimum, so we have to
1340 arrange to skip the extra bytes. */
1341
1342 #ifdef SUPPORT_UTF8
1343 if (utf8) switch(c)
1344 {
1345 case OP_CHAR:
1346 case OP_CHARNC:
1347 case OP_EXACT:
1348 case OP_UPTO:
1349 case OP_MINUPTO:
1350 case OP_POSUPTO:
1351 case OP_STAR:
1352 case OP_MINSTAR:
1353 case OP_POSSTAR:
1354 case OP_PLUS:
1355 case OP_MINPLUS:
1356 case OP_POSPLUS:
1357 case OP_QUERY:
1358 case OP_MINQUERY:
1359 case OP_POSQUERY:
1360 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1361 break;
1362 }
1363 #endif
1364 }
1365 }
1366 }
1367
1368
1369
1370 /*************************************************
1371 * Scan compiled regex for recursion reference *
1372 *************************************************/
1373
1374 /* This little function scans through a compiled pattern until it finds an
1375 instance of OP_RECURSE.
1376
1377 Arguments:
1378 code points to start of expression
1379 utf8 TRUE in UTF-8 mode
1380
1381 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1382 */
1383
1384 static const uschar *
1385 find_recurse(const uschar *code, BOOL utf8)
1386 {
1387 for (;;)
1388 {
1389 register int c = *code;
1390 if (c == OP_END) return NULL;
1391 if (c == OP_RECURSE) return code;
1392
1393 /* XCLASS is used for classes that cannot be represented just by a bit
1394 map. This includes negated single high-valued characters. The length in
1395 the table is zero; the actual length is stored in the compiled code. */
1396
1397 if (c == OP_XCLASS) code += GET(code, 1);
1398
1399 /* Otherwise, we can get the item's length from the table, except that for
1400 repeated character types, we have to test for \p and \P, which have an extra
1401 two bytes of parameters. */
1402
1403 else
1404 {
1405 switch(c)
1406 {
1407 case OP_TYPESTAR:
1408 case OP_TYPEMINSTAR:
1409 case OP_TYPEPLUS:
1410 case OP_TYPEMINPLUS:
1411 case OP_TYPEQUERY:
1412 case OP_TYPEMINQUERY:
1413 case OP_TYPEPOSSTAR:
1414 case OP_TYPEPOSPLUS:
1415 case OP_TYPEPOSQUERY:
1416 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1417 break;
1418
1419 case OP_TYPEPOSUPTO:
1420 case OP_TYPEUPTO:
1421 case OP_TYPEMINUPTO:
1422 case OP_TYPEEXACT:
1423 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1424 break;
1425 }
1426
1427 /* Add in the fixed length from the table */
1428
1429 code += _pcre_OP_lengths[c];
1430
1431 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1432 by a multi-byte character. The length in the table is a minimum, so we have
1433 to arrange to skip the extra bytes. */
1434
1435 #ifdef SUPPORT_UTF8
1436 if (utf8) switch(c)
1437 {
1438 case OP_CHAR:
1439 case OP_CHARNC:
1440 case OP_EXACT:
1441 case OP_UPTO:
1442 case OP_MINUPTO:
1443 case OP_POSUPTO:
1444 case OP_STAR:
1445 case OP_MINSTAR:
1446 case OP_POSSTAR:
1447 case OP_PLUS:
1448 case OP_MINPLUS:
1449 case OP_POSPLUS:
1450 case OP_QUERY:
1451 case OP_MINQUERY:
1452 case OP_POSQUERY:
1453 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1454 break;
1455 }
1456 #endif
1457 }
1458 }
1459 }
1460
1461
1462
1463 /*************************************************
1464 * Scan compiled branch for non-emptiness *
1465 *************************************************/
1466
1467 /* This function scans through a branch of a compiled pattern to see whether it
1468 can match the empty string or not. It is called from could_be_empty()
1469 below and from compile_branch() when checking for an unlimited repeat of a
1470 group that can match nothing. Note that first_significant_code() skips over
1471 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1472 struck an inner bracket whose current branch will already have been scanned.
1473
1474 Arguments:
1475 code points to start of search
1476 endcode points to where to stop
1477 utf8 TRUE if in UTF8 mode
1478
1479 Returns: TRUE if what is matched could be empty
1480 */
1481
1482 static BOOL
1483 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1484 {
1485 register int c;
1486 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1487 code < endcode;
1488 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1489 {
1490 const uschar *ccode;
1491
1492 c = *code;
1493
1494 /* Groups with zero repeats can of course be empty; skip them. */
1495
1496 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1497 {
1498 code += _pcre_OP_lengths[c];
1499 do code += GET(code, 1); while (*code == OP_ALT);
1500 c = *code;
1501 continue;
1502 }
1503
1504 /* For other groups, scan the branches. */
1505
1506 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1507 {
1508 BOOL empty_branch;
1509 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1510
1511 /* Scan a closed bracket */
1512
1513 empty_branch = FALSE;
1514 do
1515 {
1516 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1517 empty_branch = TRUE;
1518 code += GET(code, 1);
1519 }
1520 while (*code == OP_ALT);
1521 if (!empty_branch) return FALSE; /* All branches are non-empty */
1522 c = *code;
1523 continue;
1524 }
1525
1526 /* Handle the other opcodes */
1527
1528 switch (c)
1529 {
1530 /* Check for quantifiers after a class. XCLASS is used for classes that
1531 cannot be represented just by a bit map. This includes negated single
1532 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1533 actual length is stored in the compiled code, so we must update "code"
1534 here. */
1535
1536 #ifdef SUPPORT_UTF8
1537 case OP_XCLASS:
1538 ccode = code += GET(code, 1);
1539 goto CHECK_CLASS_REPEAT;
1540 #endif
1541
1542 case OP_CLASS:
1543 case OP_NCLASS:
1544 ccode = code + 33;
1545
1546 #ifdef SUPPORT_UTF8
1547 CHECK_CLASS_REPEAT:
1548 #endif
1549
1550 switch (*ccode)
1551 {
1552 case OP_CRSTAR: /* These could be empty; continue */
1553 case OP_CRMINSTAR:
1554 case OP_CRQUERY:
1555 case OP_CRMINQUERY:
1556 break;
1557
1558 default: /* Non-repeat => class must match */
1559 case OP_CRPLUS: /* These repeats aren't empty */
1560 case OP_CRMINPLUS:
1561 return FALSE;
1562
1563 case OP_CRRANGE:
1564 case OP_CRMINRANGE:
1565 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1566 break;
1567 }
1568 break;
1569
1570 /* Opcodes that must match a character */
1571
1572 case OP_PROP:
1573 case OP_NOTPROP:
1574 case OP_EXTUNI:
1575 case OP_NOT_DIGIT:
1576 case OP_DIGIT:
1577 case OP_NOT_WHITESPACE:
1578 case OP_WHITESPACE:
1579 case OP_NOT_WORDCHAR:
1580 case OP_WORDCHAR:
1581 case OP_ANY:
1582 case OP_ANYBYTE:
1583 case OP_CHAR:
1584 case OP_CHARNC:
1585 case OP_NOT:
1586 case OP_PLUS:
1587 case OP_MINPLUS:
1588 case OP_POSPLUS:
1589 case OP_EXACT:
1590 case OP_NOTPLUS:
1591 case OP_NOTMINPLUS:
1592 case OP_NOTPOSPLUS:
1593 case OP_NOTEXACT:
1594 case OP_TYPEPLUS:
1595 case OP_TYPEMINPLUS:
1596 case OP_TYPEPOSPLUS:
1597 case OP_TYPEEXACT:
1598 return FALSE;
1599
1600 /* These are going to continue, as they may be empty, but we have to
1601 fudge the length for the \p and \P cases. */
1602
1603 case OP_TYPESTAR:
1604 case OP_TYPEMINSTAR:
1605 case OP_TYPEPOSSTAR:
1606 case OP_TYPEQUERY:
1607 case OP_TYPEMINQUERY:
1608 case OP_TYPEPOSQUERY:
1609 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1610 break;
1611
1612 /* Same for these */
1613
1614 case OP_TYPEUPTO:
1615 case OP_TYPEMINUPTO:
1616 case OP_TYPEPOSUPTO:
1617 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1618 break;
1619
1620 /* End of branch */
1621
1622 case OP_KET:
1623 case OP_KETRMAX:
1624 case OP_KETRMIN:
1625 case OP_ALT:
1626 return TRUE;
1627
1628 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1629 MINUPTO, and POSUPTO may be followed by a multibyte character */
1630
1631 #ifdef SUPPORT_UTF8
1632 case OP_STAR:
1633 case OP_MINSTAR:
1634 case OP_POSSTAR:
1635 case OP_QUERY:
1636 case OP_MINQUERY:
1637 case OP_POSQUERY:
1638 case OP_UPTO:
1639 case OP_MINUPTO:
1640 case OP_POSUPTO:
1641 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1642 break;
1643 #endif
1644 }
1645 }
1646
1647 return TRUE;
1648 }
1649
1650
1651
1652 /*************************************************
1653 * Scan compiled regex for non-emptiness *
1654 *************************************************/
1655
1656 /* This function is called to check for left recursive calls. We want to check
1657 the current branch of the current pattern to see if it could match the empty
1658 string. If it could, we must look outwards for branches at other levels,
1659 stopping when we pass beyond the bracket which is the subject of the recursion.
1660
1661 Arguments:
1662 code points to start of the recursion
1663 endcode points to where to stop (current RECURSE item)
1664 bcptr points to the chain of current (unclosed) branch starts
1665 utf8 TRUE if in UTF-8 mode
1666
1667 Returns: TRUE if what is matched could be empty
1668 */
1669
1670 static BOOL
1671 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1672 BOOL utf8)
1673 {
1674 while (bcptr != NULL && bcptr->current >= code)
1675 {
1676 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1677 bcptr = bcptr->outer;
1678 }
1679 return TRUE;
1680 }
1681
1682
1683
1684 /*************************************************
1685 * Check for POSIX class syntax *
1686 *************************************************/
1687
1688 /* This function is called when the sequence "[:" or "[." or "[=" is
1689 encountered in a character class. It checks whether this is followed by an
1690 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1691 ".]" or "=]".
1692
1693 Argument:
1694 ptr pointer to the initial [
1695 endptr where to return the end pointer
1696 cd pointer to compile data
1697
1698 Returns: TRUE or FALSE
1699 */
1700
1701 static BOOL
1702 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1703 {
1704 int terminator; /* Don't combine these lines; the Solaris cc */
1705 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1706 if (*(++ptr) == '^') ptr++;
1707 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1708 if (*ptr == terminator && ptr[1] == ']')
1709 {
1710 *endptr = ptr;
1711 return TRUE;
1712 }
1713 return FALSE;
1714 }
1715
1716
1717
1718
1719 /*************************************************
1720 * Check POSIX class name *
1721 *************************************************/
1722
1723 /* This function is called to check the name given in a POSIX-style class entry
1724 such as [:alnum:].
1725
1726 Arguments:
1727 ptr points to the first letter
1728 len the length of the name
1729
1730 Returns: a value representing the name, or -1 if unknown
1731 */
1732
1733 static int
1734 check_posix_name(const uschar *ptr, int len)
1735 {
1736 register int yield = 0;
1737 while (posix_name_lengths[yield] != 0)
1738 {
1739 if (len == posix_name_lengths[yield] &&
1740 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1741 yield++;
1742 }
1743 return -1;
1744 }
1745
1746
1747 /*************************************************
1748 * Adjust OP_RECURSE items in repeated group *
1749 *************************************************/
1750
1751 /* OP_RECURSE items contain an offset from the start of the regex to the group
1752 that is referenced. This means that groups can be replicated for fixed
1753 repetition simply by copying (because the recursion is allowed to refer to
1754 earlier groups that are outside the current group). However, when a group is
1755 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1756 it, after it has been compiled. This means that any OP_RECURSE items within it
1757 that refer to the group itself or any contained groups have to have their
1758 offsets adjusted. That one of the jobs of this function. Before it is called,
1759 the partially compiled regex must be temporarily terminated with OP_END.
1760
1761 This function has been extended with the possibility of forward references for
1762 recursions and subroutine calls. It must also check the list of such references
1763 for the group we are dealing with. If it finds that one of the recursions in
1764 the current group is on this list, it adjusts the offset in the list, not the
1765 value in the reference (which is a group number).
1766
1767 Arguments:
1768 group points to the start of the group
1769 adjust the amount by which the group is to be moved
1770 utf8 TRUE in UTF-8 mode
1771 cd contains pointers to tables etc.
1772 save_hwm the hwm forward reference pointer at the start of the group
1773
1774 Returns: nothing
1775 */
1776
1777 static void
1778 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1779 uschar *save_hwm)
1780 {
1781 uschar *ptr = group;
1782
1783 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1784 {
1785 int offset;
1786 uschar *hc;
1787
1788 /* See if this recursion is on the forward reference list. If so, adjust the
1789 reference. */
1790
1791 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1792 {
1793 offset = GET(hc, 0);
1794 if (cd->start_code + offset == ptr + 1)
1795 {
1796 PUT(hc, 0, offset + adjust);
1797 break;
1798 }
1799 }
1800
1801 /* Otherwise, adjust the recursion offset if it's after the start of this
1802 group. */
1803
1804 if (hc >= cd->hwm)
1805 {
1806 offset = GET(ptr, 1);
1807 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1808 }
1809
1810 ptr += 1 + LINK_SIZE;
1811 }
1812 }
1813
1814
1815
1816 /*************************************************
1817 * Insert an automatic callout point *
1818 *************************************************/
1819
1820 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1821 callout points before each pattern item.
1822
1823 Arguments:
1824 code current code pointer
1825 ptr current pattern pointer
1826 cd pointers to tables etc
1827
1828 Returns: new code pointer
1829 */
1830
1831 static uschar *
1832 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1833 {
1834 *code++ = OP_CALLOUT;
1835 *code++ = 255;
1836 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1837 PUT(code, LINK_SIZE, 0); /* Default length */
1838 return code + 2*LINK_SIZE;
1839 }
1840
1841
1842
1843 /*************************************************
1844 * Complete a callout item *
1845 *************************************************/
1846
1847 /* A callout item contains the length of the next item in the pattern, which
1848 we can't fill in till after we have reached the relevant point. This is used
1849 for both automatic and manual callouts.
1850
1851 Arguments:
1852 previous_callout points to previous callout item
1853 ptr current pattern pointer
1854 cd pointers to tables etc
1855
1856 Returns: nothing
1857 */
1858
1859 static void
1860 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1861 {
1862 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1863 PUT(previous_callout, 2 + LINK_SIZE, length);
1864 }
1865
1866
1867
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address, so that a further call carries on from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
              (nothing is written through the pointers when FALSE)
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c, oc, expected;

/* Find the first character at or after the start that has an other case. */

c = *cptr;
for (;;)
  {
  if (c > d) return FALSE;
  oc = _pcre_ucp_othercase(c);
  if (oc != NOTACHAR) break;
  c++;
  }

/* The range continues for as long as the other cases of the following
characters go up in steps of one. */

*ocptr = oc;
expected = oc + 1;
c++;

while (c <= d && _pcre_ucp_othercase(c) == expected)
  {
  expected++;
  c++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1913
1914
1915
1916 /*************************************************
1917 * Check if auto-possessifying is possible *
1918 *************************************************/
1919
1920 /* This function is called for unlimited repeats of certain items, to see
1921 whether the next thing could possibly match the repeated item. If not, it makes
1922 sense to automatically possessify the repeated item.
1923
1924 Arguments:
1925 op_code the repeated op code
1926 item data for this item, depends on the opcode
1927 utf8 TRUE in UTF-8 mode
1928 utf8_char used for utf8 character bytes, NULL if not relevant
1929 ptr next character in pattern
1930 options options bits
1931 cd contains pointers to tables etc.
1932
1933 Returns: TRUE if possessifying is wanted
1934 */
1935
1936 static BOOL
1937 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1938 const uschar *ptr, int options, compile_data *cd)
1939 {
1940 int next;
1941
1942 /* Skip whitespace and comments in extended mode */
1943
1944 if ((options & PCRE_EXTENDED) != 0)
1945 {
1946 for (;;)
1947 {
1948 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1949 if (*ptr == '#')
1950 {
1951 while (*(++ptr) != 0)
1952 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1953 }
1954 else break;
1955 }
1956 }
1957
1958 /* If the next item is one that we can handle, get its value. A non-negative
1959 value is a character, a negative value is an escape value. */
1960
1961 if (*ptr == '\\')
1962 {
1963 int temperrorcode = 0;
1964 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1965 if (temperrorcode != 0) return FALSE;
1966 ptr++; /* Point after the escape sequence */
1967 }
1968
1969 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1970 {
1971 #ifdef SUPPORT_UTF8
1972 if (utf8) { GETCHARINC(next, ptr); } else
1973 #endif
1974 next = *ptr++;
1975 }
1976
1977 else return FALSE;
1978
1979 /* Skip whitespace and comments in extended mode */
1980
1981 if ((options & PCRE_EXTENDED) != 0)
1982 {
1983 for (;;)
1984 {
1985 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1986 if (*ptr == '#')
1987 {
1988 while (*(++ptr) != 0)
1989 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1990 }
1991 else break;
1992 }
1993 }
1994
1995 /* If the next thing is itself optional, we have to give up. */
1996
1997 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1998 return FALSE;
1999
2000 /* Now compare the next item with the previous opcode. If the previous is a
2001 positive single character match, "item" either contains the character or, if
2002 "item" is greater than 127 in utf8 mode, the character's bytes are in
2003 utf8_char. */
2004
2005
2006 /* Handle cases when the next item is a character. */
2007
2008 if (next >= 0) switch(op_code)
2009 {
2010 case OP_CHAR:
2011 #ifdef SUPPORT_UTF8
2012 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2013 #endif
2014 return item != next;
2015
2016 /* For CHARNC (caseless character) we must check the other case. If we have
2017 Unicode property support, we can use it to test the other case of
2018 high-valued characters. */
2019
2020 case OP_CHARNC:
2021 #ifdef SUPPORT_UTF8
2022 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2023 #endif
2024 if (item == next) return FALSE;
2025 #ifdef SUPPORT_UTF8
2026 if (utf8)
2027 {
2028 unsigned int othercase;
2029 if (next < 128) othercase = cd->fcc[next]; else
2030 #ifdef SUPPORT_UCP
2031 othercase = _pcre_ucp_othercase((unsigned int)next);
2032 #else
2033 othercase = NOTACHAR;
2034 #endif
2035 return (unsigned int)item != othercase;
2036 }
2037 else
2038 #endif /* SUPPORT_UTF8 */
2039 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2040
2041 /* For OP_NOT, "item" must be a single-byte character. */
2042
2043 case OP_NOT:
2044 if (next < 0) return FALSE; /* Not a character */
2045 if (item == next) return TRUE;
2046 if ((options & PCRE_CASELESS) == 0) return FALSE;
2047 #ifdef SUPPORT_UTF8
2048 if (utf8)
2049 {
2050 unsigned int othercase;
2051 if (next < 128) othercase = cd->fcc[next]; else
2052 #ifdef SUPPORT_UCP
2053 othercase = _pcre_ucp_othercase(next);
2054 #else
2055 othercase = NOTACHAR;
2056 #endif
2057 return (unsigned int)item == othercase;
2058 }
2059 else
2060 #endif /* SUPPORT_UTF8 */
2061 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2062
2063 case OP_DIGIT:
2064 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2065
2066 case OP_NOT_DIGIT:
2067 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2068
2069 case OP_WHITESPACE:
2070 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2071
2072 case OP_NOT_WHITESPACE:
2073 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2074
2075 case OP_WORDCHAR:
2076 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2077
2078 case OP_NOT_WORDCHAR:
2079 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2080
2081 case OP_HSPACE:
2082 case OP_NOT_HSPACE:
2083 switch(next)
2084 {
2085 case 0x09:
2086 case 0x20:
2087 case 0xa0:
2088 case 0x1680:
2089 case 0x180e:
2090 case 0x2000:
2091 case 0x2001:
2092 case 0x2002:
2093 case 0x2003:
2094 case 0x2004:
2095 case 0x2005:
2096 case 0x2006:
2097 case 0x2007:
2098 case 0x2008:
2099 case 0x2009:
2100 case 0x200A:
2101 case 0x202f:
2102 case 0x205f:
2103 case 0x3000:
2104 return op_code != OP_HSPACE;
2105 default:
2106 return op_code == OP_HSPACE;
2107 }
2108
2109 case OP_VSPACE:
2110 case OP_NOT_VSPACE:
2111 switch(next)
2112 {
2113 case 0x0a:
2114 case 0x0b:
2115 case 0x0c:
2116 case 0x0d:
2117 case 0x85:
2118 case 0x2028:
2119 case 0x2029:
2120 return op_code != OP_VSPACE;
2121 default:
2122 return op_code == OP_VSPACE;
2123 }
2124
2125 default:
2126 return FALSE;
2127 }
2128
2129
2130 /* Handle the case when the next item is \d, \s, etc. */
2131
2132 switch(op_code)
2133 {
2134 case OP_CHAR:
2135 case OP_CHARNC:
2136 #ifdef SUPPORT_UTF8
2137 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2138 #endif
2139 switch(-next)
2140 {
2141 case ESC_d:
2142 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2143
2144 case ESC_D:
2145 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2146
2147 case ESC_s:
2148 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2149
2150 case ESC_S:
2151 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2152
2153 case ESC_w:
2154 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2155
2156 case ESC_W:
2157 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2158
2159 case ESC_h:
2160 case ESC_H:
2161 switch(item)
2162 {
2163 case 0x09:
2164 case 0x20:
2165 case 0xa0:
2166 case 0x1680:
2167 case 0x180e:
2168 case 0x2000:
2169 case 0x2001:
2170 case 0x2002:
2171 case 0x2003:
2172 case 0x2004:
2173 case 0x2005:
2174 case 0x2006:
2175 case 0x2007:
2176 case 0x2008:
2177 case 0x2009:
2178 case 0x200A:
2179 case 0x202f:
2180 case 0x205f:
2181 case 0x3000:
2182 return -next != ESC_h;
2183 default:
2184 return -next == ESC_h;
2185 }
2186
2187 case ESC_v:
2188 case ESC_V:
2189 switch(item)
2190 {
2191 case 0x0a:
2192 case 0x0b:
2193 case 0x0c:
2194 case 0x0d:
2195 case 0x85:
2196 case 0x2028:
2197 case 0x2029:
2198 return -next != ESC_v;
2199 default:
2200 return -next == ESC_v;
2201 }
2202
2203 default:
2204 return FALSE;
2205 }
2206
2207 case OP_DIGIT:
2208 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2209 next == -ESC_h || next == -ESC_v;
2210
2211 case OP_NOT_DIGIT:
2212 return next == -ESC_d;
2213
2214 case OP_WHITESPACE:
2215 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2216
2217 case OP_NOT_WHITESPACE:
2218 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2219
2220 case OP_HSPACE:
2221 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2222
2223 case OP_NOT_HSPACE:
2224 return next == -ESC_h;
2225
2226 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2227 case OP_VSPACE:
2228 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2229
2230 case OP_NOT_VSPACE:
2231 return next == -ESC_v;
2232
2233 case OP_WORDCHAR:
2234 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2235
2236 case OP_NOT_WORDCHAR:
2237 return next == -ESC_w || next == -ESC_d;
2238
2239 default:
2240 return FALSE;
2241 }
2242
2243 /* Control does not reach here */
2244 }
2245
2246
2247
2248 /*************************************************
2249 * Compile one branch *
2250 *************************************************/
2251
2252 /* Scan the pattern, compiling it into a vector. If the options are
2253 changed during the branch, the pointer is used to change the external options
2254 bits. This function is used during the pre-compile phase when we are trying
2255 to find out the amount of memory needed, as well as during the real compile
2256 phase. The value of lengthptr distinguishes the two phases.
2257
2258 Arguments:
2259 optionsptr pointer to the option bits
2260 codeptr points to the pointer to the current code point
2261 ptrptr points to the current pattern pointer
2262 errorcodeptr points to error code variable
2263 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2264 reqbyteptr set to the last literal character required, else < 0
2265 bcptr points to current branch chain
2266 cd contains pointers to tables etc.
2267 lengthptr NULL during the real compile phase
2268 points to length accumulator during pre-compile phase
2269
2270 Returns: TRUE on success
2271 FALSE, with *errorcodeptr set non-zero on error
2272 */
2273
2274 static BOOL
2275 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2276 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2277 compile_data *cd, int *lengthptr)
2278 {
2279 int repeat_type, op_type;
2280 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2281 int bravalue = 0;
2282 int greedy_default, greedy_non_default;
2283 int firstbyte, reqbyte;
2284 int zeroreqbyte, zerofirstbyte;
2285 int req_caseopt, reqvary, tempreqvary;
2286 int options = *optionsptr;
2287 int after_manual_callout = 0;
2288 int length_prevgroup = 0;
2289 register int c;
2290 register uschar *code = *codeptr;
2291 uschar *last_code = code;
2292 uschar *orig_code = code;
2293 uschar *tempcode;
2294 BOOL inescq = FALSE;
2295 BOOL groupsetfirstbyte = FALSE;
2296 const uschar *ptr = *ptrptr;
2297 const uschar *tempptr;
2298 uschar *previous = NULL;
2299 uschar *previous_callout = NULL;
2300 uschar *save_hwm = NULL;
2301 uschar classbits[32];
2302
2303 #ifdef SUPPORT_UTF8
2304 BOOL class_utf8;
2305 BOOL utf8 = (options & PCRE_UTF8) != 0;
2306 uschar *class_utf8data;
2307 uschar utf8_char[6];
2308 #else
2309 BOOL utf8 = FALSE;
2310 uschar *utf8_char = NULL;
2311 #endif
2312
2313 #ifdef DEBUG
2314 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2315 #endif
2316
2317 /* Set up the default and non-default settings for greediness */
2318
2319 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2320 greedy_non_default = greedy_default ^ 1;
2321
2322 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2323 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2324 matches a non-fixed char first char; reqbyte just remains unset if we never
2325 find one.
2326
2327 When we hit a repeat whose minimum is zero, we may have to adjust these values
2328 to take the zero repeat into account. This is implemented by setting them to
2329 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2330 item types that can be repeated set these backoff variables appropriately. */
2331
2332 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2333
2334 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2335 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2336 value > 255. It is added into the firstbyte or reqbyte variables to record the
2337 case status of the value. This is used only for ASCII characters. */
2338
2339 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2340
2341 /* Switch on next character until the end of the branch */
2342
2343 for (;; ptr++)
2344 {
2345 BOOL negate_class;
2346 BOOL possessive_quantifier;
2347 BOOL is_quantifier;
2348 BOOL is_recurse;
2349 BOOL reset_bracount;
2350 int class_charcount;
2351 int class_lastchar;
2352 int newoptions;
2353 int recno;
2354 int refsign;
2355 int skipbytes;
2356 int subreqbyte;
2357 int subfirstbyte;
2358 int terminator;
2359 int mclength;
2360 uschar mcbuffer[8];
2361
2362 /* Get next byte in the pattern */
2363
2364 c = *ptr;
2365
2366 /* If we are in the pre-compile phase, accumulate the length used for the
2367 previous cycle of this loop. */
2368
2369 if (lengthptr != NULL)
2370 {
2371 #ifdef DEBUG
2372 if (code > cd->hwm) cd->hwm = code; /* High water info */
2373 #endif
2374 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2375 {
2376 *errorcodeptr = ERR52;
2377 goto FAILED;
2378 }
2379
2380 /* There is at least one situation where code goes backwards: this is the
2381 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2382 the class is simply eliminated. However, it is created first, so we have to
2383 allow memory for it. Therefore, don't ever reduce the length at this point.
2384 */
2385
2386 if (code < last_code) code = last_code;
2387
2388 /* Paranoid check for integer overflow */
2389
2390 if (OFLOW_MAX - *lengthptr < code - last_code)
2391 {
2392 *errorcodeptr = ERR20;
2393 goto FAILED;
2394 }
2395
2396 *lengthptr += code - last_code;
2397 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2398
2399 /* If "previous" is set and it is not at the start of the work space, move
2400 it back to there, in order to avoid filling up the work space. Otherwise,
2401 if "previous" is NULL, reset the current code pointer to the start. */
2402
2403 if (previous != NULL)
2404 {
2405 if (previous > orig_code)
2406 {
2407 memmove(orig_code, previous, code - previous);
2408 code -= previous - orig_code;
2409 previous = orig_code;
2410 }
2411 }
2412 else code = orig_code;
2413
2414 /* Remember where this code item starts so we can pick up the length
2415 next time round. */
2416
2417 last_code = code;
2418 }
2419
2420 /* In the real compile phase, just check the workspace used by the forward
2421 reference list. */
2422
2423 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2424 {
2425 *errorcodeptr = ERR52;
2426 goto FAILED;
2427 }
2428
2429 /* If in \Q...\E, check for the end; if not, we have a literal */
2430
2431 if (inescq && c != 0)
2432 {
2433 if (c == '\\' && ptr[1] == 'E')
2434 {
2435 inescq = FALSE;
2436 ptr++;
2437 continue;
2438 }
2439 else
2440 {
2441 if (previous_callout != NULL)
2442 {
2443 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2444 complete_callout(previous_callout, ptr, cd);
2445 previous_callout = NULL;
2446 }
2447 if ((options & PCRE_AUTO_CALLOUT) != 0)
2448 {
2449 previous_callout = code;
2450 code = auto_callout(code, ptr, cd);
2451 }
2452 goto NORMAL_CHAR;
2453 }
2454 }
2455
2456 /* Fill in length of a previous callout, except when the next thing is
2457 a quantifier. */
2458
2459 is_quantifier = c == '*' || c == '+' || c == '?' ||
2460 (c == '{' && is_counted_repeat(ptr+1));
2461
2462 if (!is_quantifier && previous_callout != NULL &&
2463 after_manual_callout-- <= 0)
2464 {
2465 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2466 complete_callout(previous_callout, ptr, cd);
2467 previous_callout = NULL;
2468 }
2469
2470 /* In extended mode, skip white space and comments */
2471
2472 if ((options & PCRE_EXTENDED) != 0)
2473 {
2474 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2475 if (c == '#')
2476 {
2477 while (*(++ptr) != 0)
2478 {
2479 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2480 }
2481 if (*ptr != 0) continue;
2482
2483 /* Else fall through to handle end of string */
2484 c = 0;
2485 }
2486 }
2487
2488 /* No auto callout for quantifiers. */
2489
2490 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2491 {
2492 previous_callout = code;
2493 code = auto_callout(code, ptr, cd);
2494 }
2495
2496 switch(c)
2497 {
2498 /* ===================================================================*/
2499 case 0: /* The branch terminates at string end */
2500 case '|': /* or | or ) */
2501 case ')':
2502 *firstbyteptr = firstbyte;
2503 *reqbyteptr = reqbyte;
2504 *codeptr = code;
2505 *ptrptr = ptr;
2506 if (lengthptr != NULL)
2507 {
2508 if (OFLOW_MAX - *lengthptr < code - last_code)
2509 {
2510 *errorcodeptr = ERR20;
2511 goto FAILED;
2512 }
2513 *lengthptr += code - last_code; /* To include callout length */
2514 DPRINTF((">> end branch\n"));
2515 }
2516 return TRUE;
2517
2518
2519 /* ===================================================================*/
2520 /* Handle single-character metacharacters. In multiline mode, ^ disables
2521 the setting of any following char as a first character. */
2522
2523 case '^':
2524 if ((options & PCRE_MULTILINE) != 0)
2525 {
2526 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2527 }
2528 previous = NULL;
2529 *code++ = OP_CIRC;
2530 break;
2531
2532 case '$':
2533 previous = NULL;
2534 *code++ = OP_DOLL;
2535 break;
2536
2537 /* There can never be a first char if '.' is first, whatever happens about
2538 repeats. The value of reqbyte doesn't change either. */
2539
2540 case '.':
2541 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2542 zerofirstbyte = firstbyte;
2543 zeroreqbyte = reqbyte;
2544 previous = code;
2545 *code++ = OP_ANY;
2546 break;
2547
2548
2549 /* ===================================================================*/
2550 /* Character classes. If the included characters are all < 256, we build a
2551 32-byte bitmap of the permitted characters, except in the special case
2552 where there is only one such character. For negated classes, we build the
2553 map as usual, then invert it at the end. However, we use a different opcode
2554 so that data characters > 255 can be handled correctly.
2555
2556 If the class contains characters outside the 0-255 range, a different
2557 opcode is compiled. It may optionally have a bit map for characters < 256,
2558 but those above are explicitly listed afterwards. A flag byte tells
2559 whether the bitmap is present, and whether this is a negated class or not.
2560 */
2561
2562 case '[':
2563 previous = code;
2564
2565 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2566 they are encountered at the top level, so we'll do that too. */
2567
2568 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2569 check_posix_syntax(ptr, &tempptr, cd))
2570 {
2571 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2572 goto FAILED;
2573 }
2574
2575 /* If the first character is '^', set the negation flag and skip it. Also,
2576 if the first few characters (either before or after ^) are \Q\E or \E we
2577 skip them too. This makes for compatibility with Perl. */
2578
2579 negate_class = FALSE;
2580 for (;;)
2581 {
2582 c = *(++ptr);
2583 if (c == '\\')
2584 {
2585 if (ptr[1] == 'E') ptr++;
2586 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2587 else break;
2588 }
2589 else if (!negate_class && c == '^')
2590 negate_class = TRUE;
2591 else break;
2592 }
2593
2594 /* Keep a count of chars with values < 256 so that we can optimize the case
2595 of just a single character (as long as it's < 256). However, for higher
2596 valued UTF-8 characters, we don't yet do any optimization. */
2597
2598 class_charcount = 0;
2599 class_lastchar = -1;
2600
2601 /* Initialize the 32-char bit map to all zeros. We build the map in a
2602 temporary bit of memory, in case the class contains only 1 character (less
2603 than 256), because in that case the compiled code doesn't use the bit map.
2604 */
2605
2606 memset(classbits, 0, 32 * sizeof(uschar));
2607
2608 #ifdef SUPPORT_UTF8
2609 class_utf8 = FALSE; /* No chars >= 256 */
2610 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2611 #endif
2612
2613 /* Process characters until ] is reached. By writing this as a "do" it
2614 means that an initial ] is taken as a data character. At the start of the
2615 loop, c contains the first byte of the character. */
2616
2617 if (c != 0) do
2618 {
2619 const uschar *oldptr;
2620
2621 #ifdef SUPPORT_UTF8
2622 if (utf8 && c > 127)
2623 { /* Braces are required because the */
2624 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2625 }
2626 #endif
2627
2628 /* Inside \Q...\E everything is literal except \E */
2629
2630 if (inescq)
2631 {
2632 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2633 {
2634 inescq = FALSE; /* Reset literal state */
2635 ptr++; /* Skip the 'E' */
2636 continue; /* Carry on with next */
2637 }
2638 goto CHECK_RANGE; /* Could be range if \E follows */
2639 }
2640
2641 /* Handle POSIX class names. Perl allows a negation extension of the
2642 form [:^name:]. A square bracket that doesn't match the syntax is
2643 treated as a literal. We also recognize the POSIX constructions
2644 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2645 5.6 and 5.8 do. */
2646
2647 if (c == '[' &&
2648 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2649 check_posix_syntax(ptr, &tempptr, cd))
2650 {
2651 BOOL local_negate = FALSE;
2652 int posix_class, taboffset, tabopt;
2653 register const uschar *cbits = cd->cbits;
2654 uschar pbits[32];
2655
2656 if (ptr[1] != ':')
2657 {
2658 *errorcodeptr = ERR31;
2659 goto FAILED;
2660 }
2661
2662 ptr += 2;
2663 if (*ptr == '^')
2664 {
2665 local_negate = TRUE;
2666 ptr++;
2667 }
2668
2669 posix_class = check_posix_name(ptr, tempptr - ptr);
2670 if (posix_class < 0)
2671 {
2672 *errorcodeptr = ERR30;
2673 goto FAILED;
2674 }
2675
2676 /* If matching is caseless, upper and lower are converted to
2677 alpha. This relies on the fact that the class table starts with
2678 alpha, lower, upper as the first 3 entries. */
2679
2680 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2681 posix_class = 0;
2682
2683 /* We build the bit map for the POSIX class in a chunk of local store
2684 because we may be adding and subtracting from it, and we don't want to
2685 subtract bits that may be in the main map already. At the end we or the
2686 result into the bit map that is being built. */
2687
2688 posix_class *= 3;
2689
2690 /* Copy in the first table (always present) */
2691
2692 memcpy(pbits, cbits + posix_class_maps[posix_class],
2693 32 * sizeof(uschar));
2694
2695 /* If there is a second table, add or remove it as required. */
2696
2697 taboffset = posix_class_maps[posix_class + 1];
2698 tabopt = posix_class_maps[posix_class + 2];
2699
2700 if (taboffset >= 0)
2701 {
2702 if (tabopt >= 0)
2703 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2704 else
2705 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2706 }
2707
2708 /* Now see if we need to remove any special characters. An option
2709 value of 1 removes vertical space and 2 removes underscore. */
2710
2711 if (tabopt < 0) tabopt = -tabopt;
2712 if (tabopt == 1) pbits[1] &= ~0x3c;
2713 else if (tabopt == 2) pbits[11] &= 0x7f;
2714
2715 /* Add the POSIX table or its complement into the main table that is
2716 being built and we are done. */
2717
2718 if (local_negate)
2719 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2720 else
2721 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2722
2723 ptr = tempptr + 1;
2724 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2725 continue; /* End of POSIX syntax handling */
2726 }
2727
2728 /* Backslash may introduce a single character, or it may introduce one
2729 of the specials, which just set a flag. The sequence \b is a special
2730 case. Inside a class (and only there) it is treated as backspace.
2731 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2732 to 'or' into the one we are building. We assume they have more than one
2733 character in them, so set class_charcount bigger than one. */
2734
2735 if (c == '\\')
2736 {
2737 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2738 if (*errorcodeptr != 0) goto FAILED;
2739
2740 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2741 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2742 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2743 else if (-c == ESC_Q) /* Handle start of quoted string */
2744 {
2745 if (ptr[1] == '\\' && ptr[2] == 'E')
2746 {
2747 ptr += 2; /* avoid empty string */
2748 }
2749 else inescq = TRUE;
2750 continue;
2751 }
2752 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2753
2754 if (c < 0)
2755 {
2756 register const uschar *cbits = cd->cbits;
2757 class_charcount += 2; /* Greater than 1 is what matters */
2758
2759 /* Save time by not doing this in the pre-compile phase. */
2760
2761 if (lengthptr == NULL) switch (-c)
2762 {
2763 case ESC_d:
2764 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2765 continue;
2766
2767 case ESC_D:
2768 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2769 continue;
2770
2771 case ESC_w:
2772 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2773 continue;
2774
2775 case ESC_W:
2776 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2777 continue;
2778
2779 case ESC_s:
2780 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2781 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2782 continue;
2783
2784 case ESC_S:
2785 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2786 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2787 continue;
2788
2789 case ESC_E: /* Perl ignores an orphan \E */
2790 continue;
2791
2792 default: /* Not recognized; fall through */
2793 break; /* Need "default" setting to stop compiler warning. */
2794 }
2795
2796 /* In the pre-compile phase, just do the recognition. */
2797
2798 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2799 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2800
2801 /* We need to deal with \H, \h, \V, and \v in both phases because
2802 they use extra memory. */
2803
2804 if (-c == ESC_h)
2805 {
2806 SETBIT(classbits, 0x09); /* VT */
2807 SETBIT(classbits, 0x20); /* SPACE */
2808 SETBIT(classbits, 0xa0); /* NSBP */
2809 #ifdef SUPPORT_UTF8
2810 if (utf8)
2811 {
2812 class_utf8 = TRUE;
2813 *class_utf8data++ = XCL_SINGLE;
2814 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2815 *class_utf8data++ = XCL_SINGLE;
2816 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2817 *class_utf8data++ = XCL_RANGE;
2818 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2819 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2820 *class_utf8data++ = XCL_SINGLE;
2821 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2822 *class_utf8data++ = XCL_SINGLE;
2823 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2824 *class_utf8data++ = XCL_SINGLE;
2825 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2826 }
2827 #endif
2828 continue;
2829 }
2830
2831 if (-c == ESC_H)
2832 {
2833 for (c = 0; c < 32; c++)
2834 {
2835 int x = 0xff;
2836 switch (c)
2837 {
2838 case 0x09/8: x ^= 1 << (0x09%8); break;
2839 case 0x20/8: x ^= 1 << (0x20%8); break;
2840 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2841 default: break;
2842 }
2843 classbits[c] |= x;
2844 }
2845
2846 #ifdef SUPPORT_UTF8
2847 if (utf8)
2848 {
2849 class_utf8 = TRUE;
2850 *class_utf8data++ = XCL_RANGE;
2851 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2852 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2853 *class_utf8data++ = XCL_RANGE;
2854 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2855 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2856 *class_utf8data++ = XCL_RANGE;
2857 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2858 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2859 *class_utf8data++ = XCL_RANGE;
2860 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2861 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2862 *class_utf8data++ = XCL_RANGE;
2863 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2864 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2865 *class_utf8data++ = XCL_RANGE;
2866 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2867 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2868 *class_utf8data++ = XCL_RANGE;
2869 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2870 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2871 }
2872 #endif
2873 continue;
2874 }
2875
2876 if (-c == ESC_v)
2877 {
2878 SETBIT(classbits, 0x0a); /* LF */
2879 SETBIT(classbits, 0x0b); /* VT */
2880 SETBIT(classbits, 0x0c); /* FF */
2881 SETBIT(classbits, 0x0d); /* CR */
2882 SETBIT(classbits, 0x85); /* NEL */
2883 #ifdef SUPPORT_UTF8
2884 if (utf8)
2885 {
2886 class_utf8 = TRUE;
2887 *class_utf8data++ = XCL_RANGE;
2888 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2889 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2890 }
2891 #endif
2892 continue;
2893 }
2894
2895 if (-c == ESC_V)
2896 {
2897 for (c = 0; c < 32; c++)
2898 {
2899 int x = 0xff;
2900 switch (c)
2901 {
2902 case 0x0a/8: x ^= 1 << (0x0a%8);
2903 x ^= 1 << (0x0b%8);
2904 x ^= 1 << (0x0c%8);
2905 x ^= 1 << (0x0d%8);
2906 break;
2907 case 0x85/8: x ^= 1 << (0x85%8); break;
2908 default: break;
2909 }
2910 classbits[c] |= x;
2911 }
2912
2913 #ifdef SUPPORT_UTF8
2914 if (utf8)
2915 {
2916 class_utf8 = TRUE;
2917 *class_utf8data++ = XCL_RANGE;
2918 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2919 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2920 *class_utf8data++ = XCL_RANGE;
2921 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2922 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2923 }
2924 #endif
2925 continue;
2926 }
2927
2928 /* We need to deal with \P and \p in both phases. */
2929
2930 #ifdef SUPPORT_UCP
2931 if (-c == ESC_p || -c == ESC_P)
2932 {
2933 BOOL negated;
2934 int pdata;
2935 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2936 if (ptype < 0) goto FAILED;
2937 class_utf8 = TRUE;
2938 *class_utf8data++ = ((-c == ESC_p) != negated)?
2939 XCL_PROP : XCL_NOTPROP;
2940 *class_utf8data++ = ptype;
2941 *class_utf8data++ = pdata;
2942 class_charcount -= 2; /* Not a < 256 character */
2943 continue;
2944 }
2945 #endif
2946 /* Unrecognized escapes are faulted if PCRE is running in its
2947 strict mode. By default, for compatibility with Perl, they are
2948 treated as literals. */
2949
2950 if ((options & PCRE_EXTRA) != 0)
2951 {
2952 *errorcodeptr = ERR7;
2953 goto FAILED;
2954 }
2955
2956 class_charcount -= 2; /* Undo the default count from above */
2957 c = *ptr; /* Get the final character and fall through */
2958 }
2959
2960 /* Fall through if we have a single character (c >= 0). This may be
2961 greater than 256 in UTF-8 mode. */
2962
2963 } /* End of backslash handling */
2964
2965 /* A single character may be followed by '-' to form a range. However,
2966 Perl does not permit ']' to be the end of the range. A '-' character
2967 at the end is treated as a literal. Perl ignores orphaned \E sequences
2968 entirely. The code for handling \Q and \E is messy. */
2969
2970 CHECK_RANGE:
2971 while (ptr[1] == '\\' && ptr[2] == 'E')
2972 {
2973 inescq = FALSE;
2974 ptr += 2;
2975 }
2976
2977 oldptr = ptr;
2978
2979 /* Remember \r or \n */
2980
2981 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
2982
2983 /* Check for range */
2984
2985 if (!inescq && ptr[1] == '-')
2986 {
2987 int d;
2988 ptr += 2;
2989 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2990
2991 /* If we hit \Q (not followed by \E) at this point, go into escaped
2992 mode. */
2993
2994 while (*ptr == '\\' && ptr[1] == 'Q')
2995 {
2996 ptr += 2;
2997 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2998 inescq = TRUE;
2999 break;
3000 }
3001
3002 if (*ptr == 0 || (!inescq && *ptr == ']'))
3003 {
3004 ptr = oldptr;
3005 goto LONE_SINGLE_CHARACTER;
3006 }
3007
3008 #ifdef SUPPORT_UTF8
3009 if (utf8)
3010 { /* Braces are required because the */
3011 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3012 }
3013 else
3014 #endif
3015 d = *ptr; /* Not UTF-8 mode */
3016
3017 /* The second part of a range can be a single-character escape, but
3018 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3019 in such circumstances. */
3020
3021 if (!inescq && d == '\\')
3022 {
3023 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3024 if (*errorcodeptr != 0) goto FAILED;
3025
3026 /* \b is backspace; \X is literal X; \R is literal R; any other
3027 special means the '-' was literal */
3028
3029 if (d < 0)
3030 {
3031 if (d == -ESC_b) d = '\b';
3032 else if (d == -ESC_X) d = 'X';
3033 else if (d == -ESC_R) d = 'R'; else
3034 {
3035 ptr = oldptr;
3036 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3037 }
3038 }
3039 }
3040
3041 /* Check that the two values are in the correct order. Optimize
3042 one-character ranges */
3043
3044 if (d < c)
3045 {
3046 *errorcodeptr = ERR8;
3047 goto FAILED;
3048 }
3049
3050 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3051
3052 /* Remember \r or \n */
3053
3054 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3055
3056 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3057 matching, we have to use an XCLASS with extra data items. Caseless
3058 matching for characters > 127 is available only if UCP support is
3059 available. */
3060
3061 #ifdef SUPPORT_UTF8
3062 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3063 {
3064 class_utf8 = TRUE;
3065
3066 /* With UCP support, we can find the other case equivalents of
3067 the relevant characters. There may be several ranges. Optimize how
3068 they fit with the basic range. */
3069
3070 #ifdef SUPPORT_UCP
3071 if ((options & PCRE_CASELESS) != 0)
3072 {
3073 unsigned int occ, ocd;
3074 unsigned int cc = c;
3075 unsigned int origd = d;
3076 while (get_othercase_range(&cc, origd, &occ, &ocd))
3077 {
3078 if (occ >= (unsigned int)c &&
3079 ocd <= (unsigned int)d)
3080 continue; /* Skip embedded ranges */
3081
3082 if (occ < (unsigned int)c &&
3083 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3084 { /* if there is overlap, */
3085 c = occ; /* noting that if occ < c */
3086 continue; /* we can't have ocd > d */
3087 } /* because a subrange is */
3088 if (ocd > (unsigned int)d &&
3089 occ <= (unsigned int)d + 1) /* always shorter than */
3090 { /* the basic range. */
3091 d = ocd;
3092 continue;
3093 }
3094
3095 if (occ == ocd)
3096 {
3097 *class_utf8data++ = XCL_SINGLE;
3098 }
3099 else
3100 {
3101 *class_utf8data++ = XCL_RANGE;
3102 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3103 }
3104 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3105 }
3106 }
3107 #endif /* SUPPORT_UCP */
3108
3109 /* Now record the original range, possibly modified for UCP caseless
3110 overlapping ranges. */
3111
3112 *class_utf8data++ = XCL_RANGE;
3113 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3114 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3115
3116 /* With UCP support, we are done. Without UCP support, there is no
3117 caseless matching for UTF-8 characters > 127; we can use the bit map
3118 for the smaller ones. */
3119
3120 #ifdef SUPPORT_UCP
3121 continue; /* With next character in the class */
3122 #else
3123 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3124
3125 /* Adjust upper limit and fall through to set up the map */
3126
3127 d = 127;
3128
3129 #endif /* SUPPORT_UCP */
3130 }
3131 #endif /* SUPPORT_UTF8 */
3132
3133 /* We use the bit map for all cases when not in UTF-8 mode; else
3134 ranges that lie entirely within 0-127 when there is UCP support; else
3135 for partial ranges without UCP support. */
3136
3137 class_charcount += d - c + 1;
3138 class_lastchar = d;
3139
3140 /* We can save a bit of time by skipping this in the pre-compile. */
3141
3142 if (lengthptr == NULL) for (; c <= d; c++)
3143 {
3144 classbits[c/8] |= (1 << (c&7));
3145 if ((options & PCRE_CASELESS) != 0)
3146 {
3147 int uc = cd->fcc[c]; /* flip case */
3148 classbits[uc/8] |= (1 << (uc&7));
3149 }
3150 }
3151
3152 continue; /* Go get the next char in the class */
3153 }
3154
3155 /* Handle a lone single character - we can get here for a normal
3156 non-escape char, or after \ that introduces a single character or for an
3157 apparent range that isn't. */
3158
3159 LONE_SINGLE_CHARACTER:
3160
3161 /* Handle a character that cannot go in the bit map */
3162
3163 #ifdef SUPPORT_UTF8
3164 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3165 {
3166 class_utf8 = TRUE;
3167 *class_utf8data++ = XCL_SINGLE;
3168 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3169
3170 #ifdef SUPPORT_UCP
3171 if ((options & PCRE_CASELESS) != 0)
3172 {
3173 unsigned int othercase;
3174 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3175 {
3176 *class_utf8data++ = XCL_SINGLE;
3177 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3178 }
3179 }
3180 #endif /* SUPPORT_UCP */
3181
3182 }
3183 else
3184 #endif /* SUPPORT_UTF8 */
3185
3186 /* Handle a single-byte character */
3187 {
3188 classbits[c/8] |= (1 << (c&7));
3189 if ((options & PCRE_CASELESS) != 0)
3190 {
3191 c = cd->fcc[c]; /* flip case */
3192 classbits[c/8] |= (1 << (c&7));
3193 }
3194 class_charcount++;
3195 class_lastchar = c;
3196 }
3197 }
3198
3199 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3200
3201 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3202
3203 if (c == 0) /* Missing terminating ']' */
3204 {
3205 *errorcodeptr = ERR6;
3206 goto FAILED;
3207 }
3208
3209
3210 /* This code has been disabled because it would mean that \s counts as
3211 an explicit \r or \n reference, and that's not really what is wanted. Now
3212 we set the flag only if there is a literal "\r" or "\n" in the class. */
3213
3214 #if 0
3215 /* Remember whether \r or \n are in this class */
3216
3217 if (negate_class)
3218 {
3219 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3220 }
3221 else
3222 {
3223 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3224 }
3225 #endif
3226
3227
3228 /* If class_charcount is 1, we saw precisely one character whose value is
3229 less than 256. As long as there were no characters >= 128 and there was no
3230 use of \p or \P, in other words, no use of any XCLASS features, we can
3231 optimize.
3232
3233 In UTF-8 mode, we can optimize the negative case only if there were no
3234 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3235 operate on single-bytes only. This is an historical hangover. Maybe one day
3236 we can tidy these opcodes to handle multi-byte characters.
3237
3238 The optimization throws away the bit map. We turn the item into a
3239 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3240 that OP_NOT does not support multibyte characters. In the positive case, it
3241 can cause firstbyte to be set. Otherwise, there can be no first char if
3242 this item is first, whatever repeat count may follow. In the case of
3243 reqbyte, save the previous value for reinstating. */
3244
3245 #ifdef SUPPORT_UTF8
3246 if (class_charcount == 1 && !class_utf8 &&
3247 (!utf8 || !negate_class || class_lastchar < 128))
3248 #else
3249 if (class_charcount == 1)
3250 #endif
3251 {
3252 zeroreqbyte = reqbyte;
3253
3254 /* The OP_NOT opcode works on one-byte characters only. */
3255
3256 if (negate_class)
3257 {
3258 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3259 zerofirstbyte = firstbyte;
3260 *code++ = OP_NOT;
3261 *code++ = class_lastchar;
3262 break;
3263 }
3264
3265 /* For a single, positive character, get the value into mcbuffer, and
3266 then we can handle this with the normal one-character code. */
3267
3268 #ifdef SUPPORT_UTF8
3269 if (utf8 && class_lastchar > 127)
3270 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3271 else
3272 #endif
3273 {
3274 mcbuffer[0] = class_lastchar;
3275 mclength = 1;
3276 }
3277 goto ONE_CHAR;
3278 } /* End of 1-char optimization */
3279
3280 /* The general case - not the one-char optimization. If this is the first
3281 thing in the branch, there can be no first char setting, whatever the
3282 repeat count. Any reqbyte setting must remain unchanged after any kind of
3283 repeat. */
3284
3285 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3286 zerofirstbyte = firstbyte;
3287 zeroreqbyte = reqbyte;
3288
3289 /* If there are characters with values > 255, we have to compile an
3290 extended class, with its own opcode. If there are no characters < 256,
3291 we can omit the bitmap in the actual compiled code. */
3292
3293 #ifdef SUPPORT_UTF8
3294 if (class_utf8)
3295 {
3296 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3297 *code++ = OP_XCLASS;
3298 code += LINK_SIZE;
3299 *code = negate_class? XCL_NOT : 0;
3300
3301 /* If the map is required, move up the extra data to make room for it;
3302 otherwise just move the code pointer to the end of the extra data. */
3303
3304 if (class_charcount > 0)
3305 {
3306 *code++ |= XCL_MAP;
3307 memmove(code + 32, code, class_utf8data - code);
3308 memcpy(code, classbits, 32);
3309 code = class_utf8data + 32;
3310 }
3311 else code = class_utf8data;
3312
3313 /* Now fill in the complete length of the item */
3314
3315 PUT(previous, 1, code - previous);
3316 break; /* End of class handling */
3317 }
3318 #endif
3319
3320 /* If there are no characters > 255, negate the 32-byte map if necessary,
3321 and copy it into the code vector. If this is the first thing in the branch,
3322 there can be no first char setting, whatever the repeat count. Any reqbyte
3323 setting must remain unchanged after any kind of repeat. */
3324
3325 if (negate_class)
3326 {
3327 *code++ = OP_NCLASS;
3328 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3329 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3330 }
3331 else
3332 {
3333 *code++ = OP_CLASS;
3334 memcpy(code, classbits, 32);
3335 }
3336 code += 32;
3337 break;
3338
3339
3340 /* ===================================================================*/
3341 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3342 has been tested above. */
3343
3344 case '{':
3345 if (!is_quantifier) goto NORMAL_CHAR;
3346 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3347 if (*errorcodeptr != 0) goto FAILED;
3348 goto REPEAT;
3349
3350 case '*':
3351 repeat_min = 0;
3352 repeat_max = -1;
3353 goto REPEAT;
3354
3355 case '+':
3356 repeat_min = 1;
3357 repeat_max = -1;
3358 goto REPEAT;
3359
3360 case '?':
3361 repeat_min = 0;
3362 repeat_max = 1;
3363
3364 REPEAT:
3365 if (previous == NULL)
3366 {
3367 *errorcodeptr = ERR9;
3368 goto FAILED;
3369 }
3370
3371 if (repeat_min == 0)
3372 {
3373 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3374 reqbyte = zeroreqbyte; /* Ditto */
3375 }
3376
3377 /* Remember whether this is a variable length repeat */
3378
3379 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3380
3381 op_type = 0; /* Default single-char op codes */
3382 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3383
3384 /* Save start of previous item, in case we have to move it up to make space
3385 for an inserted OP_ONCE for the additional '+' extension. */
3386
3387 tempcode = previous;
3388
3389 /* If the next character is '+', we have a possessive quantifier. This
3390 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3391 If the next character is '?' this is a minimizing repeat, by default,
3392 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3393 repeat type to the non-default. */
3394
3395 if (ptr[1] == '+')
3396 {
3397 repeat_type = 0; /* Force greedy */
3398 possessive_quantifier = TRUE;
3399 ptr++;
3400 }
3401 else if (ptr[1] == '?')
3402 {
3403 repeat_type = greedy_non_default;
3404 ptr++;
3405 }
3406 else repeat_type = greedy_default;
3407
3408 /* If previous was a character match, abolish the item and generate a
3409 repeat item instead. If a char item has a minimum of more than one, ensure
3410 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3411 the first thing in a branch because the x will have gone into firstbyte
3412 instead. */
3413
3414 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3415 {
3416 /* Deal with UTF-8 characters that take up more than one byte. It's
3417 easier to write this out separately than try to macrify it. Use c to
3418 hold the length of the character in bytes, plus 0x80 to flag that it's a
3419 length rather than a small character. */
3420
3421 #ifdef SUPPORT_UTF8
3422 if (utf8 && (code[-1] & 0x80) != 0)
3423 {
3424 uschar *lastchar = code - 1;
3425 while((*lastchar & 0xc0) == 0x80) lastchar--;
3426 c = code - lastchar; /* Length of UTF-8 character */
3427 memcpy(utf8_char, lastchar, c); /* Save the char */
3428 c |= 0x80; /* Flag c as a length */
3429 }
3430 else
3431 #endif
3432
3433 /* Handle the case of a single byte - either with no UTF8 support, or
3434 with UTF-8 disabled, or for a UTF-8 character < 128. */
3435
3436 {
3437 c = code[-1];
3438 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3439 }
3440
3441 /* If the repetition is unlimited, it pays to see if the next thing on
3442 the line is something that cannot possibly match this character. If so,
3443 automatically possessifying this item gains some performance in the case
3444 where the match fails. */
3445
3446 if (!possessive_quantifier &&
3447 repeat_max < 0 &&
3448 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3449 options, cd))
3450 {
3451 repeat_type = 0; /* Force greedy */
3452 possessive_quantifier = TRUE;
3453 }
3454
3455 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3456 }
3457
3458 /* If previous was a single negated character ([^a] or similar), we use
3459 one of the special opcodes, replacing it. The code is shared with single-
3460 character repeats by setting op_type to add a suitable offset into
3461 repeat_type. We can also test for auto-possessification. OP_NOT is
3462 currently used only for single-byte chars. */
3463
3464 else if (*previous == OP_NOT)
3465 {
3466 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3467 c = previous[1];
3468 if (!possessive_quantifier &&
3469 repeat_max < 0 &&
3470 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3471 {
3472 repeat_type = 0; /* Force greedy */
3473 possessive_quantifier = TRUE;
3474 }
3475 goto OUTPUT_SINGLE_REPEAT;
3476 }
3477
3478 /* If previous was a character type match (\d or similar), abolish it and
3479 create a suitable repeat item. The code is shared with single-character
3480 repeats by setting op_type to add a suitable offset into repeat_type. Note
3481 that the Unicode property types will be present only when SUPPORT_UCP is
3482 defined, but we don't wrap the little bits of code here because it just
3483 makes it horribly messy. */
3484
3485 else if (*previous < OP_EODN)
3486 {
3487 uschar *oldcode;
3488 int prop_type, prop_value;
3489 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3490 c = *previous;
3491
3492 if (!possessive_quantifier &&
3493 repeat_max < 0 &&
3494 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3495 {
3496 repeat_type = 0; /* Force greedy */
3497 possessive_quantifier = TRUE;
3498 }
3499
3500 OUTPUT_SINGLE_REPEAT:
3501 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3502 {
3503 prop_type = previous[1];
3504 prop_value = previous[2];
3505 }
3506 else prop_type = prop_value = -1;
3507
3508 oldcode = code;
3509 code = previous; /* Usually overwrite previous item */
3510
3511 /* If the maximum is zero then the minimum must also be zero; Perl allows
3512 this case, so we do too - by simply omitting the item altogether. */
3513
3514 if (repeat_max == 0) goto END_REPEAT;
3515
3516 /* All real repeats make it impossible to handle partial matching (maybe
3517 one day we will be able to remove this restriction). */
3518
3519 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3520
3521 /* Combine the op_type with the repeat_type */
3522
3523 repeat_type += op_type;
3524
3525 /* A minimum of zero is handled either as the special case * or ?, or as
3526 an UPTO, with the maximum given. */
3527
3528 if (repeat_min == 0)
3529 {
3530 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3531 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3532 else
3533 {
3534 *code++ = OP_UPTO + repeat_type;
3535 PUT2INC(code, 0, repeat_max);
3536 }
3537 }
3538
3539 /* A repeat minimum of 1 is optimized into some special cases. If the
3540 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3541 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3542 one less than the maximum. */
3543
3544 else if (repeat_min == 1)
3545 {
3546 if (repeat_max == -1)
3547 *code++ = OP_PLUS + repeat_type;
3548 else
3549 {
3550 code = oldcode; /* leave previous item in place */
3551 if (repeat_max == 1) goto END_REPEAT;
3552 *code++ = OP_UPTO + repeat_type;
3553 PUT2INC(code, 0, repeat_max - 1);
3554 }
3555 }
3556
3557 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3558 handled as an EXACT followed by an UPTO. */
3559
3560 else
3561 {
3562 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3563 PUT2INC(code, 0, repeat_min);
3564
3565 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3566 we have to insert the character for the previous code. For a repeated
3567 Unicode property match, there are two extra bytes that define the
3568 required property. In UTF-8 mode, long characters have their length in
3569 c, with the 0x80 bit as a flag. */
3570
3571 if (repeat_max < 0)
3572 {
3573 #ifdef SUPPORT_UTF8
3574 if (utf8 && c >= 128)
3575 {
3576 memcpy(code, utf8_char, c & 7);
3577 code += c & 7;
3578 }
3579 else
3580 #endif
3581 {
3582 *code++ = c;
3583 if (prop_type >= 0)
3584 {
3585 *code++ = prop_type;
3586 *code++ = prop_value;
3587 }
3588 }
3589 *code++ = OP_STAR + repeat_type;
3590 }
3591
3592 /* Else insert an UPTO if the max is greater than the min, again
3593 preceded by the character, for the previously inserted code. If the
3594 UPTO is just for 1 instance, we can use QUERY instead. */
3595
3596 else if (repeat_max != repeat_min)
3597 {
3598 #ifdef SUPPORT_UTF8
3599 if (utf8 && c >= 128)
3600 {
3601 memcpy(code, utf8_char, c & 7);
3602 code += c & 7;
3603 }
3604 else
3605 #endif
3606 *code++ = c;
3607 if (prop_type >= 0)
3608 {
3609 *code++ = prop_type;
3610 *code++ = prop_value;
3611 }
3612 repeat_max -= repeat_min;
3613
3614 if (repeat_max == 1)
3615 {
3616 *code++ = OP_QUERY + repeat_type;
3617 }
3618 else
3619 {
3620 *code++ = OP_UPTO + repeat_type;
3621 PUT2INC(code, 0, repeat_max);
3622 }
3623 }
3624 }
3625
3626 /* The character or character type itself comes last in all cases. */
3627
3628 #ifdef SUPPORT_UTF8
3629 if (utf8 && c >= 128)
3630 {
3631 memcpy(code, utf8_char, c & 7);
3632 code += c & 7;
3633 }
3634 else
3635 #endif
3636 *code++ = c;
3637
3638 /* For a repeated Unicode property match, there are two extra bytes that
3639 define the required property. */
3640
3641 #ifdef SUPPORT_UCP
3642 if (prop_type >= 0)
3643 {
3644 *code++ = prop_type;
3645 *code++ = prop_value;
3646 }
3647 #endif
3648 }
3649
3650 /* If previous was a character class or a back reference, we put the repeat
3651 stuff after it, but just skip the item if the repeat was {0,0}. */
3652
3653 else if (*previous == OP_CLASS ||
3654 *previous == OP_NCLASS ||
3655 #ifdef SUPPORT_UTF8
3656 *previous == OP_XCLASS ||
3657 #endif
3658 *previous == OP_REF)
3659 {
3660 if (repeat_max == 0)
3661 {
3662 code = previous;
3663 goto END_REPEAT;
3664 }
3665
3666 /* All real repeats make it impossible to handle partial matching (maybe
3667 one day we will be able to remove this restriction). */
3668
3669 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3670
3671 if (repeat_min == 0 && repeat_max == -1)
3672 *code++ = OP_CRSTAR + repeat_type;
3673 else if (repeat_min == 1 && repeat_max == -1)
3674 *code++ = OP_CRPLUS + repeat_type;
3675 else if (repeat_min == 0 && repeat_max == 1)
3676 *code++ = OP_CRQUERY + repeat_type;
3677 else
3678 {
3679 *code++ = OP_CRRANGE + repeat_type;
3680 PUT2INC(code, 0, repeat_min);
3681 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3682 PUT2INC(code, 0, repeat_max);
3683 }
3684 }
3685
3686 /* If previous was a bracket group, we may have to replicate it in certain
3687 cases. */
3688
3689 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3690 *previous == OP_ONCE || *previous == OP_COND)
3691 {
3692 register int i;
3693 int ketoffset = 0;
3694 int len = code - previous;
3695 uschar *bralink = NULL;
3696
3697 /* Repeating a DEFINE group is pointless */
3698
3699 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3700 {
3701 *errorcodeptr = ERR55;
3702 goto FAILED;
3703 }
3704
3705 /* If the maximum repeat count is unlimited, find the end of the bracket
3706 by scanning through from the start, and compute the offset back to it
3707 from the current code pointer. There may be an OP_OPT setting following
3708 the final KET, so we can't find the end just by going back from the code
3709 pointer. */
3710
3711 if (repeat_max == -1)
3712 {
3713 register uschar *ket = previous;
3714 do ket += GET(ket, 1); while (*ket != OP_KET);
3715 ketoffset = code - ket;
3716 }
3717
3718 /* The case of a zero minimum is special because of the need to stick
3719 OP_BRAZERO in front of it, and because the group appears once in the
3720 data, whereas in other cases it appears the minimum number of times. For
3721 this reason, it is simplest to treat this case separately, as otherwise
3722 the code gets far too messy. There are several special subcases when the
3723 minimum is zero. */
3724
3725 if (repeat_min == 0)
3726 {
3727 /* If the maximum is also zero, we just omit the group from the output
3728 altogether. */
3729
3730 if (repeat_max == 0)
3731 {
3732 code = previous;
3733 goto END_REPEAT;
3734 }
3735
3736 /* If the maximum is 1 or unlimited, we just have to stick in the
3737 BRAZERO and do no more at this point. However, we do need to adjust
3738 any OP_RECURSE calls inside the group that refer to the group itself or
3739 any internal or forward referenced group, because the offset is from
3740 the start of the whole regex. Temporarily terminate the pattern while
3741 doing this. */
3742
3743 if (repeat_max <= 1)
3744 {
3745 *code = OP_END;
3746 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3747 memmove(previous+1, previous, len);
3748 code++;
3749 *previous++ = OP_BRAZERO + repeat_type;
3750 }
3751
3752 /* If the maximum is greater than 1 and limited, we have to replicate
3753 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3754 The first one has to be handled carefully because it's the original
3755 copy, which has to be moved up. The remainder can be handled by code
3756 that is common with the non-zero minimum case below. We have to
3757 adjust the value of repeat_max, since one less copy is required. Once
3758 again, we may have to adjust any OP_RECURSE calls inside the group. */
3759
3760 else
3761 {
3762 int offset;
3763 *code = OP_END;
3764 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3765 memmove(previous + 2 + LINK_SIZE, previous, len);
3766 code += 2 + LINK_SIZE;
3767 *previous++ = OP_BRAZERO + repeat_type;
3768 *previous++ = OP_BRA;
3769
3770 /* We chain together the bracket offset fields that have to be
3771 filled in later when the ends of the brackets are reached. */
3772
3773 offset = (bralink == NULL)? 0 : previous - bralink;
3774 bralink = previous;
3775 PUTINC(previous, 0, offset);
3776 }
3777
3778 repeat_max--;
3779 }
3780
3781 /* If the minimum is greater than zero, replicate the group as many
3782 times as necessary, and adjust the maximum to the number of subsequent
3783 copies that we need. If we set a first char from the group, and didn't
3784 set a required char, copy the latter from the former. If there are any
3785 forward reference subroutine calls in the group, there will be entries on
3786 the workspace list; replicate these with an appropriate increment. */
3787
3788 else
3789 {
3790 if (repeat_min > 1)
3791 {
3792 /* In the pre-compile phase, we don't actually do the replication. We
3793 just adjust the length as if we had. Do some paranoid checks for
3794 potential integer overflow. */
3795
3796 if (lengthptr != NULL)
3797 {
3798 int delta = (repeat_min - 1)*length_prevgroup;
3799 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3800 (double)INT_MAX ||
3801 OFLOW_MAX - *lengthptr < delta)
3802 {
3803 *errorcodeptr = ERR20;
3804 goto FAILED;
3805 }
3806 *lengthptr += delta;
3807 }
3808
3809 /* This is compiling for real */
3810
3811 else
3812 {
3813 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3814 for (i = 1; i < repeat_min; i++)
3815 {
3816 uschar *hc;
3817 uschar *this_hwm = cd->hwm;
3818 memcpy(code, previous, len);
3819 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3820 {
3821 PUT(cd->hwm, 0, GET(hc, 0) + len);
3822 cd->hwm += LINK_SIZE;
3823 }
3824 save_hwm = this_hwm;
3825 code += len;
3826 }
3827 }
3828 }
3829
3830 if (repeat_max > 0) repeat_max -= repeat_min;
3831 }
3832
3833 /* This code is common to both the zero and non-zero minimum cases. If
3834 the maximum is limited, it replicates the group in a nested fashion,
3835 remembering the bracket starts on a stack. In the case of a zero minimum,
3836 the first one was set up above. In all cases the repeat_max now specifies
3837 the number of additional copies needed. Again, we must remember to
3838 replicate entries on the forward reference list. */
3839
3840 if (repeat_max >= 0)
3841 {
3842 /* In the pre-compile phase, we don't actually do the replication. We
3843 just adjust the length as if we had. For each repetition we must add 1
3844 to the length for BRAZERO and for all but the last repetition we must
3845 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3846 paranoid checks to avoid integer overflow. */
3847
3848 if (lengthptr != NULL && repeat_max > 0)
3849 {
3850 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3851 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3852 if ((double)repeat_max *
3853 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3854 > (double)INT_MAX ||
3855 OFLOW_MAX - *lengthptr < delta)
3856 {
3857 *errorcodeptr = ERR20;
3858 goto FAILED;
3859 }
3860 *lengthptr += delta;
3861 }
3862
3863 /* This is compiling for real */
3864
3865 else for (i = repeat_max - 1; i >= 0; i--)
3866 {
3867 uschar *hc;
3868 uschar *this_hwm = cd->hwm;
3869
3870 *code++ = OP_BRAZERO + repeat_type;
3871
3872 /* All but the final copy start a new nesting, maintaining the
3873 chain of brackets outstanding. */
3874
3875 if (i != 0)
3876 {
3877 int offset;
3878 *code++ = OP_BRA;
3879 offset = (bralink == NULL)? 0 : code - bralink;
3880 bralink = code;
3881 PUTINC(code, 0, offset);
3882 }
3883
3884 memcpy(code, previous, len);
3885 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3886 {
3887 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3888 cd->hwm += LINK_SIZE;
3889 }
3890 save_hwm = this_hwm;
3891 code += len;
3892 }
3893
3894 /* Now chain through the pending brackets, and fill in their length
3895 fields (which are holding the chain links pro tem). */
3896
3897 while (bralink != NULL)
3898 {
3899 int oldlinkoffset;
3900 int offset = code - bralink + 1;
3901 uschar *bra = code - offset;
3902 oldlinkoffset = GET(bra, 1);
3903 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3904 *code++ = OP_KET;
3905 PUTINC(code, 0, offset);
3906 PUT(bra, 1, offset);
3907 }
3908 }
3909
3910 /* If the maximum is unlimited, set a repeater in the final copy. We
3911 can't just offset backwards from the current code point, because we
3912 don't know if there's been an options resetting after the ket. The
3913 correct offset was computed above.
3914
3915 Then, when we are doing the actual compile phase, check to see whether
3916 this group is a non-atomic one that could match an empty string. If so,
3917 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3918 that runtime checking can be done. [This check is also applied to
3919 atomic groups at runtime, but in a different way.] */
3920
3921 else
3922 {
3923 uschar *ketcode = code - ketoffset;
3924 uschar *bracode = ketcode - GET(ketcode, 1);
3925 *ketcode = OP_KETRMAX + repeat_type;
3926 if (lengthptr == NULL && *bracode != OP_ONCE)
3927 {
3928 uschar *scode = bracode;
3929 do
3930 {
3931 if (could_be_empty_branch(scode, ketcode, utf8))
3932 {
3933 *bracode += OP_SBRA - OP_BRA;
3934 break;
3935 }
3936 scode += GET(scode, 1);
3937 }
3938 while (*scode == OP_ALT);
3939 }
3940 }
3941 }
3942
3943 /* Else there's some kind of shambles */
3944
3945 else
3946 {
3947 *errorcodeptr = ERR11;
3948 goto FAILED;
3949 }
3950
3951 /* If the character following a repeat is '+', or if certain optimization
3952 tests above succeeded, possessive_quantifier is TRUE. For some of the
3953 simpler opcodes, there is a special alternative opcode for this. For
3954 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3955 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3956 but the special opcodes can optimize it a bit. The repeated item starts at
3957 tempcode, not at previous, which might be the first part of a string whose
3958 (former) last char we repeated.
3959
3960 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3961 an 'upto' may follow. We skip over an 'exact' item, and then test the
3962 length of what remains before proceeding. */
3963
3964 if (possessive_quantifier)
3965 {
3966 int len;
3967 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3968 *tempcode == OP_NOTEXACT)
3969 tempcode += _pcre_OP_lengths[*tempcode];
3970 len = code - tempcode;
3971 if (len > 0) switch (*tempcode)
3972 {
3973 case OP_STAR: *tempcode = OP_POSSTAR; break;
3974 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3975 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3976 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3977
3978 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3979 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3980 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3981 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3982
3983 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3984 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3985 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3986 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3987
3988 default:
3989 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3990 code += 1 + LINK_SIZE;
3991 len += 1 + LINK_SIZE;
3992 tempcode[0] = OP_ONCE;
3993 *code++ = OP_KET;
3994 PUTINC(code, 0, len);
3995 PUT(tempcode, 1, len);
3996 break;
3997 }
3998 }
3999
4000 /* In all cases we no longer have a previous item. We also set the
4001 "follows varying string" flag for subsequently encountered reqbytes if
4002 it isn't already set and we have just passed a varying length item. */
4003
4004 END_REPEAT:
4005 previous = NULL;
4006 cd->req_varyopt |= reqvary;
4007 break;
4008
4009
4010 /* ===================================================================*/
4011 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4012 lookbehind or option setting or condition or all the other extended
4013 parenthesis forms. */
4014
4015 case '(':
4016 newoptions = options;
4017 skipbytes = 0;
4018 bravalue = OP_CBRA;
4019 save_hwm = cd->hwm;
4020 reset_bracount = FALSE;
4021
4022 /* First deal with various "verbs" that can be introduced by '*'. */
4023
4024 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4025 {
4026 int i, namelen;
4027 const uschar *name = ++ptr;
4028 previous = NULL;
4029 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4030 if (*ptr == ':')
4031 {
4032 *errorcodeptr = ERR59; /* Not supported */
4033 goto FAILED;
4034 }
4035 if (*ptr != ')')
4036 {
4037 *errorcodeptr = ERR60;
4038 goto FAILED;
4039 }
4040 namelen = ptr - name;
4041 for (i = 0; i < verbcount; i++)
4042 {
4043 if (namelen == verbs[i].len &&
4044 strncmp((char *)name, verbs[i].name, namelen) == 0)
4045 {
4046 *code = verbs[i].op;
4047 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4048 break;
4049 }
4050 }
4051 if (i < verbcount) continue;
4052 *errorcodeptr = ERR60;
4053 goto FAILED;
4054 }
4055
4056 /* Deal with the extended parentheses; all are introduced by '?', and the
4057 appearance of any of them means that this is not a capturing group. */
4058
4059 else if (*ptr == '?')
4060 {
4061 int i, set, unset, namelen;
4062 int *optset;
4063 const uschar *name;
4064 uschar *slot;
4065
4066 switch (*(++ptr))
4067 {
4068 case '#': /* Comment; skip to ket */
4069 ptr++;
4070 while (*ptr != 0 && *ptr != ')') ptr++;
4071 if (*ptr == 0)
4072 {
4073 *errorcodeptr = ERR18;
4074 goto FAILED;
4075 }
4076 continue;
4077
4078
4079 /* ------------------------------------------------------------ */
4080 case '|': /* Reset capture count for each branch */
4081 reset_bracount = TRUE;
4082 /* Fall through */
4083
4084 /* ------------------------------------------------------------ */
4085 case ':': /* Non-capturing bracket */
4086 bravalue = OP_BRA;
4087 ptr++;
4088 break;
4089
4090
4091 /* ------------------------------------------------------------ */
4092 case '(':
4093 bravalue = OP_COND; /* Conditional group */
4094
4095 /* A condition can be an assertion, a number (referring to a numbered
4096 group), a name (referring to a named group), or 'R', referring to
4097 recursion. R<digits> and R&name are also permitted for recursion tests.
4098
4099 There are several syntaxes for testing a named group: (?(name)) is used
4100 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4101
4102 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4103 be the recursive thing or the name 'R' (and similarly for 'R' followed
4104 by digits), and (b) a number could be a name that consists of digits.
4105 In both cases, we look for a name first; if not found, we try the other
4106 cases. */
4107
4108 /* For conditions that are assertions, check the syntax, and then exit
4109 the switch. This will take control down to where bracketed groups,
4110 including assertions, are processed. */
4111
4112 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4113 break;
4114
4115 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4116 below), and all need to skip 3 bytes at the start of the group. */
4117
4118 code[1+LINK_SIZE] = OP_CREF;
4119 skipbytes = 3;
4120 refsign = -1;
4121
4122 /* Check for a test for recursion in a named group. */
4123
4124 if (ptr[1] == 'R' && ptr[2] == '&')
4125 {
4126 terminator = -1;
4127 ptr += 2;
4128 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4129 }
4130
4131 /* Check for a test for a named group's having been set, using the Perl
4132 syntax (?(<name>) or (?('name') */
4133
4134 else if (ptr[1] == '<')
4135 {
4136 terminator = '>';
4137 ptr++;
4138 }
4139 else if (ptr[1] == '\'')
4140 {
4141 terminator = '\'';
4142 ptr++;
4143 }
4144 else
4145 {
4146 terminator = 0;
4147 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4148 }
4149
4150 /* We now expect to read a name; anything else is an error */
4151
4152 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4153 {
4154 ptr += 1; /* To get the right offset */
4155 *errorcodeptr = ERR28;
4156 goto FAILED;
4157 }
4158
4159 /* Read the name, but also get it as a number if it's all digits */
4160
4161 recno = 0;
4162 name = ++ptr;
4163 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4164 {
4165 if (recno >= 0)
4166 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4167 recno * 10 + *ptr - '0' : -1;
4168 ptr++;
4169 }
4170 namelen = ptr - name;
4171
4172 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4173 {
4174 ptr--; /* Error offset */
4175 *errorcodeptr = ERR26;
4176 goto FAILED;
4177 }
4178
4179 /* Do no further checking in the pre-compile phase. */
4180
4181 if (lengthptr != NULL) break;
4182
4183 /* In the real compile we do the work of looking for the actual
4184 reference. If the string started with "+" or "-" we require the rest to
4185 be digits, in which case recno will be set. */
4186
4187 if (refsign > 0)
4188 {
4189 if (recno <= 0)
4190 {
4191 *errorcodeptr = ERR58;
4192 goto FAILED;
4193 }
4194 if (refsign == '-')
4195 {
4196 recno = cd->bracount - recno + 1;
4197 if (recno <= 0)
4198 {
4199 *errorcodeptr = ERR15;
4200 goto FAILED;
4201 }
4202 }
4203 else recno += cd->bracount;
4204 PUT2(code, 2+LINK_SIZE, recno);
4205 break;
4206 }
4207
4208 /* Otherwise (did not start with "+" or "-"), start by looking for the
4209 name. */
4210
4211 slot = cd->name_table;
4212 for (i = 0; i < cd->names_found; i++)
4213 {
4214 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4215 slot += cd->name_entry_size;
4216 }
4217
4218 /* Found a previous named subpattern */
4219
4220 if (i < cd->names_found)
4221 {
4222 recno = GET2(slot, 0);
4223 PUT2(code, 2+LINK_SIZE, recno);
4224 }
4225
4226 /* Search the pattern for a forward reference */
4227
4228 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4229 (options & PCRE_EXTENDED) != 0)) > 0)
4230 {
4231 PUT2(code, 2+LINK_SIZE, i);
4232 }
4233
4234 /* If terminator == 0 it means that the name followed directly after
4235 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4236 some further alternatives to try. For the cases where terminator != 0
4237 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4238 now checked all the possibilities, so give an error. */
4239
4240 else if (terminator != 0)
4241 {
4242 *errorcodeptr = ERR15;
4243 goto FAILED;
4244 }
4245
4246 /* Check for (?(R) for recursion. Allow digits after R to specify a
4247 specific group number. */
4248
4249 else if (*name == 'R')
4250 {
4251 recno = 0;
4252 for (i = 1; i < namelen; i++)
4253 {
4254 if ((digitab[name[i]] & ctype_digit) == 0)
4255 {
4256 *errorcodeptr = ERR15;
4257 goto FAILED;
4258 }
4259 recno = recno * 10 + name[i] - '0';
4260 }
4261 if (recno == 0) recno = RREF_ANY;
4262 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4263 PUT2(code, 2+LINK_SIZE, recno);
4264 }
4265
4266 /* Similarly, check for the (?(DEFINE) "condition", which is always
4267 false. */
4268
4269 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4270 {
4271 code[1+LINK_SIZE] = OP_DEF;
4272 skipbytes = 1;
4273 }
4274
4275 /* Check for the "name" actually being a subpattern number. */
4276
4277 else if (recno > 0)
4278 {
4279 PUT2(code, 2+LINK_SIZE, recno);
4280 }
4281
4282 /* Either an unidentified subpattern, or a reference to (?(0) */
4283
4284 else
4285 {
4286 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4287 goto FAILED;
4288 }
4289 break;
4290
4291
4292 /* ------------------------------------------------------------ */
4293 case '=': /* Positive lookahead */
4294 bravalue = OP_ASSERT;
4295 ptr++;
4296 break;
4297
4298
4299 /* ------------------------------------------------------------ */
4300 case '!': /* Negative lookahead */
4301 ptr++;
4302 if (*ptr == ')') /* Optimize (?!) */
4303 {
4304 *code++ = OP_FAIL;
4305 previous = NULL;
4306 continue;
4307 }
4308 bravalue = OP_ASSERT_NOT;
4309 break;
4310
4311
4312 /* ------------------------------------------------------------ */
4313 case '<': /* Lookbehind or named define */
4314 switch (ptr[1])
4315 {
4316 case '=': /* Positive lookbehind */
4317 bravalue = OP_ASSERTBACK;
4318 ptr += 2;
4319 break;
4320
4321 case '!': /* Negative lookbehind */
4322 bravalue = OP_ASSERTBACK_NOT;
4323 ptr += 2;
4324 break;
4325
4326 default: /* Could be name define, else bad */
4327 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4328 ptr++; /* Correct offset for error */
4329 *errorcodeptr = ERR24;
4330 goto FAILED;
4331 }
4332 break;
4333
4334
4335 /* ------------------------------------------------------------ */
4336 case '>': /* One-time brackets */
4337 bravalue = OP_ONCE;
4338 ptr++;
4339 break;
4340
4341
4342 /* ------------------------------------------------------------ */
4343 case 'C': /* Callout - may be followed by digits; */
4344 previous_callout = code; /* Save for later completion */
4345 after_manual_callout = 1; /* Skip one item before completing */
4346 *code++ = OP_CALLOUT;
4347 {
4348 int n = 0;
4349 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4350 n = n * 10 + *ptr - '0';
4351 if (*ptr != ')')
4352 {
4353 *errorcodeptr = ERR39;
4354 goto FAILED;
4355 }
4356 if (n > 255)
4357 {
4358 *errorcodeptr = ERR38;
4359 goto FAILED;
4360 }
4361 *code++ = n;
4362 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4363 PUT(code, LINK_SIZE, 0); /* Default length */
4364 code += 2 * LINK_SIZE;
4365 }
4366 previous = NULL;
4367 continue;
4368
4369
4370 /* ------------------------------------------------------------ */
4371 case 'P': /* Python-style named subpattern handling */
4372 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4373 {
4374 is_recurse = *ptr == '>';
4375 terminator = ')';
4376 goto NAMED_REF_OR_RECURSE;
4377 }
4378 else if (*ptr != '<') /* Test for Python-style definition */
4379 {
4380 *errorcodeptr = ERR41;
4381 goto FAILED;
4382 }
4383 /* Fall through to handle (?P< as (?< is handled */
4384
4385
4386 /* ------------------------------------------------------------ */
4387 DEFINE_NAME: /* Come here from (?< handling */
4388 case '\'':
4389 {
4390 terminator = (*ptr == '<')? '>' : '\'';
4391 name = ++ptr;
4392
4393 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4394 namelen = ptr - name;
4395
4396 /* In the pre-compile phase, just do a syntax check. */
4397
4398 if (lengthptr != NULL)
4399 {
4400 if (*ptr != terminator)
4401 {
4402 *errorcodeptr = ERR42;
4403 goto FAILED;
4404 }
4405 if (cd->names_found >= MAX_NAME_COUNT)
4406 {
4407 *errorcodeptr = ERR49;
4408 goto FAILED;
4409 }
4410 if (namelen + 3 > cd->name_entry_size)
4411 {
4412 cd->name_entry_size = namelen + 3;
4413 if (namelen > MAX_NAME_SIZE)
4414 {
4415 *errorcodeptr = ERR48;
4416 goto FAILED;
4417 }
4418 }
4419 }
4420
4421 /* In the real compile, create the entry in the table */
4422
4423 else
4424 {
4425 slot = cd->name_table;
4426 for (i = 0; i < cd->names_found; i++)
4427 {
4428 int crc = memcmp(name, slot+2, namelen);
4429 if (crc == 0)
4430 {
4431 if (slot[2+namelen] == 0)
4432 {
4433 if ((options & PCRE_DUPNAMES) == 0)
4434 {
4435 *errorcodeptr = ERR43;
4436 goto FAILED;
4437 }
4438 }
4439 else crc = -1; /* Current name is substring */
4440 }
4441 if (crc < 0)
4442 {
4443 memmove(slot + cd->name_entry_size, slot,
4444 (cd->names_found - i) * cd->name_entry_size);
4445 break;
4446 }
4447 slot += cd->name_entry_size;
4448 }
4449
4450 PUT2(slot, 0, cd->bracount + 1);
4451 memcpy(slot + 2, name, namelen);
4452 slot[2+namelen] = 0;
4453 }
4454 }
4455
4456 /* In both cases, count the number of names we've encountered. */
4457
4458 ptr++; /* Move past > or ' */
4459 cd->names_found++;
4460 goto NUMBERED_GROUP;
4461
4462
4463 /* ------------------------------------------------------------ */
4464 case '&': /* Perl recursion/subroutine syntax */
4465 terminator = ')';
4466 is_recurse = TRUE;
4467 /* Fall through */
4468
4469 /* We come here from the Python syntax above that handles both
4470 references (?P=name) and recursion (?P>name), as well as falling
4471 through from the Perl recursion syntax (?&name). */
4472
4473 NAMED_REF_OR_RECURSE:
4474 name = ++ptr;
4475 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4476 namelen = ptr - name;
4477
4478 /* In the pre-compile phase, do a syntax check and set a dummy
4479 reference number. */
4480
4481 if (lengthptr != NULL)
4482 {
4483 if (*ptr != terminator)
4484 {
4485 *errorcodeptr = ERR42;
4486 goto FAILED;
4487 }
4488 if (namelen > MAX_NAME_SIZE)
4489 {
4490 *errorcodeptr = ERR48;
4491 goto FAILED;
4492 }
4493 recno = 0;
4494 }
4495
4496 /* In the real compile, seek the name in the table */
4497
4498 else
4499 {
4500 slot = cd->name_table;
4501 for (i = 0; i < cd->names_found; i++)
4502 {
4503 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4504 slot += cd->name_entry_size;
4505 }
4506
4507 if (i < cd->names_found) /* Back reference */
4508 {
4509 recno = GET2(slot, 0);
4510 }
4511 else if ((recno = /* Forward back reference */
4512 find_parens(ptr, cd->bracount, name, namelen,
4513 (options & PCRE_EXTENDED) != 0)) <= 0)
4514 {
4515 *errorcodeptr = ERR15;
4516 goto FAILED;
4517 }
4518 }
4519
4520 /* In both phases, we can now go to the code that handles numerical
4521 recursion or backreferences. */
4522
4523 if (is_recurse) goto HANDLE_RECURSION;
4524 else goto HANDLE_REFERENCE;
4525
4526
4527 /* ------------------------------------------------------------ */
4528 case 'R': /* Recursion */
4529 ptr++; /* Same as (?0) */
4530 /* Fall through */
4531
4532
4533 /* ------------------------------------------------------------ */
4534 case '-': case '+':
4535 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4536 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4537 {
4538 const uschar *called;
4539
4540 if ((refsign = *ptr) == '+') ptr++;
4541 else if (refsign == '-')
4542 {
4543 if ((digitab[ptr[1]] & ctype_digit) == 0)
4544 goto OTHER_CHAR_AFTER_QUERY;
4545 ptr++;
4546 }
4547
4548 recno = 0;
4549 while((digitab[*ptr] & ctype_digit) != 0)
4550 recno = recno * 10 + *ptr++ - '0';
4551
4552 if (*ptr != ')')
4553 {
4554 *errorcodeptr = ERR29;
4555 goto FAILED;
4556 }
4557
4558 if (refsign == '-')
4559 {
4560 if (recno == 0)
4561 {
4562 *errorcodeptr = ERR58;
4563 goto FAILED;
4564 }
4565 recno = cd->bracount - recno + 1;
4566 if (recno <= 0)
4567 {
4568 *errorcodeptr = ERR15;
4569 goto FAILED;
4570 }
4571 }
4572 else if (refsign == '+')
4573 {
4574 if (recno == 0)
4575 {
4576 *errorcodeptr = ERR58;
4577 goto FAILED;
4578 }
4579 recno += cd->bracount;
4580 }
4581
4582 /* Come here from code above that handles a named recursion */
4583
4584 HANDLE_RECURSION:
4585
4586 previous = code;
4587 called = cd->start_code;
4588
4589 /* When we are actually compiling, find the bracket that is being
4590 referenced. Temporarily end the regex in case it doesn't exist before
4591 this point. If we end up with a forward reference, first check that
4592 the bracket does occur later so we can give the error (and position)
4593 now. Then remember this forward reference in the workspace so it can
4594 be filled in at the end. */
4595
4596 if (lengthptr == NULL)
4597 {
4598 *code = OP_END;
4599 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4600
4601 /* Forward reference */
4602
4603 if (called == NULL)
4604 {
4605 if (find_parens(ptr, cd->bracount, NULL, recno,
4606 (options & PCRE_EXTENDED) != 0) < 0)
4607 {
4608 *errorcodeptr = ERR15;
4609 goto FAILED;
4610 }
4611 called = cd->start_code + recno;
4612 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4613 }
4614
4615 /* If not a forward reference, and the subpattern is still open,
4616 this is a recursive call. We check to see if this is a left
4617 recursion that could loop for ever, and diagnose that case. */
4618
4619 else if (GET(called, 1) == 0 &&
4620 could_be_empty(called, code, bcptr, utf8))
4621 {
4622 *errorcodeptr = ERR40;
4623 goto FAILED;
4624 }
4625 }
4626
4627 /* Insert the recursion/subroutine item, automatically wrapped inside
4628 "once" brackets. Set up a "previous group" length so that a
4629 subsequent quantifier will work. */
4630
4631 *code = OP_ONCE;
4632 PUT(code, 1, 2 + 2*LINK_SIZE);
4633 code += 1 + LINK_SIZE;
4634
4635 *code = OP_RECURSE;
4636 PUT(code, 1, called - cd->start_code);
4637 code += 1 + LINK_SIZE;
4638
4639 *code = OP_KET;
4640 PUT(code, 1, 2 + 2*LINK_SIZE);
4641 code += 1 + LINK_SIZE;
4642
4643 length_prevgroup = 3 + 3*LINK_SIZE;
4644 }
4645
4646 /* Can't determine a first byte now */
4647
4648 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4649 continue;
4650
4651
4652 /* ------------------------------------------------------------ */
4653 default: /* Other characters: check option setting */
4654 OTHER_CHAR_AFTER_QUERY:
4655 set = unset = 0;
4656 optset = &set;
4657
4658 while (*ptr != ')' && *ptr != ':')
4659 {
4660 switch (*ptr++)
4661 {
4662 case '-': optset = &unset; break;
4663
4664 case 'J': /* Record that it changed in the external options */
4665 *optset |= PCRE_DUPNAMES;
4666 cd->external_flags |= PCRE_JCHANGED;
4667 break;
4668
4669 case 'i': *optset |= PCRE_CASELESS; break;
4670 case 'm': *optset |= PCRE_MULTILINE; break;
4671 case 's': *optset |= PCRE_DOTALL; break;
4672 case 'x': *optset |= PCRE_EXTENDED; break;
4673 case 'U': *optset |= PCRE_UNGREEDY; break;
4674 case 'X': *optset |= PCRE_EXTRA; break;
4675
4676 default: *errorcodeptr = ERR12;
4677 ptr--; /* Correct the offset */
4678 goto FAILED;
4679 }
4680 }
4681
4682 /* Set up the changed option bits, but don't change anything yet. */
4683
4684 newoptions = (options | set) & (~unset);
4685
4686 /* If the options ended with ')' this is not the start of a nested
4687 group with option changes, so the options change at this level. If this
4688 item is right at the start of the pattern, the options can be
4689 abstracted and made external in the pre-compile phase, and ignored in
4690 the compile phase. This can be helpful when matching -- for instance in
4691 caseless checking of required bytes.
4692
4693 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4694 definitely *not* at the start of the pattern because something has been
4695 compiled. In the pre-compile phase, however, the code pointer can have
4696 that value after the start, because it gets reset as code is discarded
4697 during the pre-compile. However, this can happen only at top level - if
4698 we are within parentheses, the starting BRA will still be present. At
4699 any parenthesis level, the length value can be used to test if anything
4700 has been compiled at that level. Thus, a test for both these conditions
4701 is necessary to ensure we correctly detect the start of the pattern in
4702 both phases.
4703
4704 If we are not at the pattern start, compile code to change the ims
4705 options if this setting actually changes any of them. We also pass the
4706 new setting back so that it can be put at the start of any following
4707 branches, and when this group ends (if we are in a group), a resetting
4708 item can be compiled. */
4709
4710 if (*ptr == ')')
4711 {
4712 if (code == cd->start_code + 1 + LINK_SIZE &&
4713 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4714 {
4715 cd->external_options = newoptions;
4716 options = newoptions;
4717 }
4718 else
4719 {
4720 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4721 {
4722 *code++ = OP_OPT;
4723 *code++ = newoptions & PCRE_IMS;
4724 }
4725
4726 /* Change options at this level, and pass them back for use
4727 in subsequent branches. Reset the greedy defaults and the case
4728 value for firstbyte and reqbyte. */
4729
4730 *optionsptr = options = newoptions;
4731 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4732 greedy_non_default = greedy_default ^ 1;
4733 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4734 }
4735
4736 previous = NULL; /* This item can't be repeated */
4737 continue; /* It is complete */
4738 }
4739
4740 /* If the options ended with ':' we are heading into a nested group
4741 with possible change of options. Such groups are non-capturing and are
4742 not assertions of any kind. All we need to do is skip over the ':';
4743 the newoptions value is handled below. */
4744
4745 bravalue = OP_BRA;
4746 ptr++;
4747 } /* End of switch for character following (? */
4748 } /* End of (? handling */
4749
4750 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4751 all unadorned brackets become non-capturing and behave like (?:...)
4752 brackets. */
4753
4754 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4755 {
4756 bravalue = OP_BRA;
4757 }
4758
4759 /* Else we have a capturing group. */
4760
4761 else
4762 {
4763 NUMBERED_GROUP:
4764 cd->bracount += 1;
4765 PUT2(code, 1+LINK_SIZE, cd->bracount);
4766 skipbytes = 2;
4767 }
4768
4769 /* Process nested bracketed regex. Assertions may not be repeated, but
4770 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4771 non-register variable in order to be able to pass its address because some
4772 compilers complain otherwise. Pass in a new setting for the ims options if
4773 they have changed. */
4774
4775 previous = (bravalue >= OP_ONCE)? code : NULL;
4776 *code = bravalue;
4777 tempcode = code;
4778 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4779 length_prevgroup = 0; /* Initialize for pre-compile phase */
4780
4781 if (!compile_regex(
4782 newoptions, /* The complete new option state */
4783 options & PCRE_IMS, /* The previous ims option state */
4784 &tempcode, /* Where to put code (updated) */
4785 &ptr, /* Input pointer (updated) */
4786 errorcodeptr, /* Where to put an error message */
4787 (bravalue == OP_ASSERTBACK ||
4788 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4789 reset_bracount, /* True if (?| group */
4790 skipbytes, /* Skip over bracket number */
4791 &subfirstbyte, /* For possible first char */
4792 &subreqbyte, /* For possible last char */
4793 bcptr, /* Current branch chain */
4794 cd, /* Tables block */
4795 (lengthptr == NULL)? NULL : /* Actual compile phase */
4796 &length_prevgroup /* Pre-compile phase */
4797 ))
4798 goto FAILED;
4799
4800 /* At the end of compiling, code is still pointing to the start of the
4801 group, while tempcode has been updated to point past the end of the group
4802 and any option resetting that may follow it. The pattern pointer (ptr)
4803 is on the bracket. */
4804
4805 /* If this is a conditional bracket, check that there are no more than
4806 two branches in the group, or just one if it's a DEFINE group. We do this
4807 in the real compile phase, not in the pre-pass, where the whole group may
4808 not be available. */
4809
4810 if (bravalue == OP_COND && lengthptr == NULL)
4811 {
4812 uschar *tc = code;
4813 int condcount = 0;
4814
4815 do {
4816 condcount++;
4817 tc += GET(tc,1);
4818 }
4819 while (*tc != OP_KET);
4820
4821 /* A DEFINE group is never obeyed inline (the "condition" is always
4822 false). It must have only one branch. */
4823
4824 if (code[LINK_SIZE+1] == OP_DEF)
4825 {
4826 if (condcount > 1)
4827 {
4828 *errorcodeptr = ERR54;
4829 goto FAILED;
4830 }
4831 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4832 }
4833
4834 /* A "normal" conditional group. If there is just one branch, we must not
4835 make use of its firstbyte or reqbyte, because this is equivalent to an
4836 empty second branch. */
4837
4838 else
4839 {
4840 if (condcount > 2)
4841 {
4842 *errorcodeptr = ERR27;
4843 goto FAILED;
4844 }
4845 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4846 }
4847 }
4848
4849 /* Error if hit end of pattern */
4850
4851 if (*ptr != ')')
4852 {
4853 *errorcodeptr = ERR14;
4854 goto FAILED;
4855 }
4856
4857 /* In the pre-compile phase, update the length by the length of the group,
4858 less the brackets at either end. Then reduce the compiled code to just a
4859 set of non-capturing brackets so that it doesn't use much memory if it is
4860 duplicated by a quantifier.*/
4861
4862 if (lengthptr != NULL)
4863 {
4864 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4865 {
4866 *errorcodeptr = ERR20;
4867 goto FAILED;
4868 }
4869 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4870 *code++ = OP_BRA;
4871 PUTINC(code, 0, 1 + LINK_SIZE);
4872 *code++ = OP_KET;
4873 PUTINC(code, 0, 1 + LINK_SIZE);
4874 break; /* No need to waste time with special character handling */
4875 }
4876
4877 /* Otherwise update the main code pointer to the end of the group. */
4878
4879 code = tempcode;
4880
4881 /* For a DEFINE group, required and first character settings are not
4882 relevant. */
4883
4884 if (bravalue == OP_DEF) break;
4885
4886 /* Handle updating of the required and first characters for other types of
4887 group. Update for normal brackets of all kinds, and conditions with two
4888 branches (see code above). If the bracket is followed by a quantifier with
4889 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4890 zerofirstbyte outside the main loop so that they can be accessed for the
4891 back off. */
4892
4893 zeroreqbyte = reqbyte;
4894 zerofirstbyte = firstbyte;
4895 groupsetfirstbyte = FALSE;
4896
4897 if (bravalue >= OP_ONCE)
4898 {
4899 /* If we have not yet set a firstbyte in this branch, take it from the
4900 subpattern, remembering that it was set here so that a repeat of more
4901 than one can replicate it as reqbyte if necessary. If the subpattern has
4902 no firstbyte, set "none" for the whole branch. In both cases, a zero
4903 repeat forces firstbyte to "none". */
4904
4905 if (firstbyte == REQ_UNSET)
4906 {
4907 if (subfirstbyte >= 0)
4908 {
4909 firstbyte = subfirstbyte;
4910 groupsetfirstbyte = TRUE;
4911 }
4912 else firstbyte = REQ_NONE;
4913 zerofirstbyte = REQ_NONE;
4914 }
4915
4916 /* If firstbyte was previously set, convert the subpattern's firstbyte
4917 into reqbyte if there wasn't one, using the vary flag that was in
4918 existence beforehand. */
4919
4920 else if (subfirstbyte >= 0 && subreqbyte < 0)
4921 subreqbyte = subfirstbyte | tempreqvary;
4922
4923 /* If the subpattern set a required byte (or set a first byte that isn't
4924 really the first byte - see above), set it. */
4925
4926 if (subreqbyte >= 0) reqbyte = subreqbyte;
4927 }
4928
4929 /* For a forward assertion, we take the reqbyte, if set. This can be
4930 helpful if the pattern that follows the assertion doesn't set a different
4931 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4932 for an assertion, however because it leads to incorrect effect for patterns
4933 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4934 of a firstbyte. This is overcome by a scan at the end if there's no
4935 firstbyte, looking for an asserted first char. */
4936
4937 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4938 break; /* End of processing '(' */
4939
4940
4941 /* ===================================================================*/
4942 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4943 are arranged to be the negation of the corresponding OP_values. For the
4944 back references, the values are ESC_REF plus the reference number. Only
4945 back references and those types that consume a character may be repeated.
4946 We can test for values between ESC_b and ESC_Z for the latter; this may
4947 have to change if any new ones are ever created. */
4948
4949 case '\\':
4950 tempptr = ptr;
4951 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4952 if (*errorcodeptr != 0) goto FAILED;
4953
4954 if (c < 0)
4955 {
4956 if (-c == ESC_Q) /* Handle start of quoted string */
4957 {
4958 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4959 else inescq = TRUE;
4960 continue;
4961 }
4962
4963 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4964
4965 /* For metasequences that actually match a character, we disable the
4966 setting of a first character if it hasn't already been set. */
4967
4968 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4969 firstbyte = REQ_NONE;
4970
4971 /* Set values to reset to if this is followed by a zero repeat. */
4972
4973 zerofirstbyte = firstbyte;
4974 zeroreqbyte = reqbyte;
4975
4976 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4977 We also support \k{name} (.NET syntax) */
4978
4979 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4980 {
4981 is_recurse = FALSE;
4982 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4983 goto NAMED_REF_OR_RECURSE;
4984 }
4985
4986 /* Back references are handled specially; must disable firstbyte if
4987 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4988 ':' later. */
4989
4990 if (-c >= ESC_REF)
4991 {
4992 recno = -c - ESC_REF;
4993
4994 HANDLE_REFERENCE: /* Come here from named backref handling */
4995 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4996 previous = code;
4997 *code++ = OP_REF;
4998 PUT2INC(code, 0, recno);
4999 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5000 if (recno > cd->top_backref) cd->top_backref = recno;
5001 }
5002
5003 /* So are Unicode property matches, if supported. */
5004
5005 #ifdef SUPPORT_UCP
5006 else if (-c == ESC_P || -c == ESC_p)
5007 {
5008 BOOL negated;
5009 int pdata;
5010 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5011 if (ptype < 0) goto FAILED;
5012 previous = code;
5013 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5014 *code++ = ptype;
5015 *code++ = pdata;
5016 }
5017 #else
5018
5019 /* If Unicode properties are not supported, \X, \P, and \p are not
5020 allowed. */
5021
5022 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5023 {
5024 *errorcodeptr = ERR45;
5025 goto FAILED;
5026 }
5027 #endif
5028
5029 /* For the rest (including \X when Unicode properties are supported), we
5030 can obtain the OP value by negating the escape value. */
5031
5032 else
5033 {
5034 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5035 *code++ = -c;
5036 }
5037 continue;
5038 }
5039
5040 /* We have a data character whose value is in c. In UTF-8 mode it may have
5041 a value > 127. We set its representation in the length/buffer, and then
5042 handle it as a data character. */
5043
5044 #ifdef SUPPORT_UTF8
5045 if (utf8 && c > 127)
5046 mclength = _pcre_ord2utf8(c, mcbuffer);
5047 else
5048 #endif
5049
5050 {
5051 mcbuffer[0] = c;
5052 mclength = 1;
5053 }
5054 goto ONE_CHAR;
5055
5056
5057 /* ===================================================================*/
5058 /* Handle a literal character. It is guaranteed not to be whitespace or #
5059 when the extended flag is set. If we are in UTF-8 mode, it may be a
5060 multi-byte literal character. */
5061
5062 default:
5063 NORMAL_CHAR:
5064 mclength = 1;
5065 mcbuffer[0] = c;
5066
5067 #ifdef SUPPORT_UTF8
5068 if (utf8 && c >= 0xc0)
5069 {
5070 while ((ptr[1] & 0xc0) == 0x80)
5071 mcbuffer[mclength++] = *(++ptr);
5072 }
5073 #endif
5074
5075 /* At this point we have the character's bytes in mcbuffer, and the length
5076 in mclength. When not in UTF-8 mode, the length is always 1. */
5077
5078 ONE_CHAR:
5079 previous = code;
5080 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5081 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5082
5083 /* Remember if \r or \n were seen */
5084
5085 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5086 cd->external_flags |= PCRE_HASCRORLF;
5087
5088 /* Set the first and required bytes appropriately. If no previous first
5089 byte, set it from this character, but revert to none on a zero repeat.
5090 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5091 repeat. */
5092
5093 if (firstbyte == REQ_UNSET)
5094 {
5095 zerofirstbyte = REQ_NONE;
5096 zeroreqbyte = reqbyte;
5097
5098 /* If the character is more than one byte long, we can set firstbyte
5099 only if it is not to be matched caselessly. */
5100
5101 if (mclength == 1 || req_caseopt == 0)
5102 {
5103 firstbyte = mcbuffer[0] | req_caseopt;
5104 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5105 }
5106 else firstbyte = reqbyte = REQ_NONE;
5107 }
5108
5109 /* firstbyte was previously set; we can set reqbyte only if the length is
5110 1 or the matching is caseful. */
5111
5112 else
5113 {
5114 zerofirstbyte = firstbyte;
5115 zeroreqbyte = reqbyte;
5116 if (mclength == 1 || req_caseopt == 0)
5117 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5118 }
5119
5120 break; /* End of literal character handling */
5121 }
5122 } /* end of big loop */
5123
5124
5125 /* Control never reaches here by falling through, only by a goto for all the
5126 error states. Pass back the position in the pattern so that it can be displayed
5127 to the user for diagnosing the error. */
5128
5129 FAILED:
5130 *ptrptr = ptr;
5131 return FALSE;
5132 }
5133
5134
5135
5136
5137 /*************************************************
5138 * Compile sequence of alternatives *
5139 *************************************************/
5140
5141 /* On entry, ptr is pointing past the bracket character, but on return it
5142 points to the closing bracket, or vertical bar, or end of string. The code
5143 variable is pointing at the byte into which the BRA operator has been stored.
5144 If the ims options are changed at the start (for a (?ims: group) or during any
5145 branch, we need to insert an OP_OPT item at the start of every following branch
5146 to ensure they get set correctly at run time, and also pass the new options
5147 into every subsequent branch compile.
5148
5149 This function is used during the pre-compile phase when we are trying to find
5150 out the amount of memory needed, as well as during the real compile phase. The
5151 value of lengthptr distinguishes the two phases.
5152
5153 Arguments:
5154 options option bits, including any changes for this subpattern
5155 oldims previous settings of ims option bits
5156 codeptr -> the address of the current code pointer
5157 ptrptr -> the address of the current pattern pointer
5158 errorcodeptr -> pointer to error code variable
5159 lookbehind TRUE if this is a lookbehind assertion
5160 reset_bracount TRUE to reset the count for each branch
5161 skipbytes skip this many bytes at start (for brackets and OP_COND)
5162 firstbyteptr place to put the first required character, or a negative number
5163 reqbyteptr place to put the last required character, or a negative number
5164 bcptr pointer to the chain of currently open branches
5165 cd points to the data block with tables pointers etc.
5166 lengthptr NULL during the real compile phase
5167 points to length accumulator during pre-compile phase
5168
5169 Returns: TRUE on success
5170 */
5171
5172 static BOOL
5173 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5174 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5175 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5176 int *lengthptr)
5177 {
5178 const uschar *ptr = *ptrptr;
5179 uschar *code = *codeptr;
5180 uschar *last_branch = code;
5181 uschar *start_bracket = code;
5182 uschar *reverse_count = NULL;
5183 int firstbyte, reqbyte;
5184 int branchfirstbyte, branchreqbyte;
5185 int length;
5186 int orig_bracount;
5187 int max_bracount;
5188 branch_chain bc;
5189
5190 bc.outer = bcptr;
5191 bc.current = code;
5192
5193 firstbyte = reqbyte = REQ_UNSET;
5194
5195 /* Accumulate the length for use in the pre-compile phase. Start with the
5196 length of the BRA and KET and any extra bytes that are required at the
5197 beginning. We accumulate in a local variable to save frequent testing of
5198 lengthptr for NULL. We cannot do this by looking at the value of code at the
5199 start and end of each alternative, because compiled items are discarded during
5200 the pre-compile phase so that the work space is not exceeded. */
5201
5202 length = 2 + 2*LINK_SIZE + skipbytes;
5203
5204 /* WARNING: If the above line is changed for any reason, you must also change
5205 the code that abstracts option settings at the start of the pattern and makes
5206 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5207 pre-compile phase to find out whether anything has yet been compiled or not. */
5208
5209 /* Offset is set zero to mark that this bracket is still open */
5210
5211 PUT(code, 1, 0);
5212 code += 1 + LINK_SIZE + skipbytes;
5213
5214 /* Loop for each alternative branch */
5215
5216 orig_bracount = max_bracount = cd->bracount;
5217 for (;;)
5218 {
5219 /* For a (?| group, reset the capturing bracket count so that each branch
5220 uses the same numbers. */
5221
5222 if (reset_bracount) cd->bracount = orig_bracount;
5223
5224 /* Handle a change of ims options at the start of the branch */
5225
5226 if ((options & PCRE_IMS) != oldims)
5227 {
5228 *code++ = OP_OPT;
5229 *code++ = options & PCRE_IMS;
5230 length += 2;
5231 }
5232
5233 /* Set up dummy OP_REVERSE if lookbehind assertion */
5234
5235 if (lookbehind)
5236 {
5237 *code++ = OP_REVERSE;
5238 reverse_count = code;
5239 PUTINC(code, 0, 0);
5240 length += 1 + LINK_SIZE;
5241 }
5242
5243 /* Now compile the branch; in the pre-compile phase its length gets added
5244 into the length. */
5245
5246 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5247 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5248 {
5249 *ptrptr = ptr;
5250 return FALSE;
5251 }
5252
5253 /* Keep the highest bracket count in case (?| was used and some branch
5254 has fewer than the rest. */
5255
5256 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5257
5258 /* In the real compile phase, there is some post-processing to be done. */
5259
5260 if (lengthptr == NULL)
5261 {
5262 /* If this is the first branch, the firstbyte and reqbyte values for the
5263 branch become the values for the regex. */
5264
5265 if (*last_branch != OP_ALT)
5266 {
5267 firstbyte = branchfirstbyte;
5268 reqbyte = branchreqbyte;
5269 }
5270
5271 /* If this is not the first branch, the first char and reqbyte have to
5272 match the values from all the previous branches, except that if the
5273 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5274 and we set REQ_VARY for the regex. */
5275
5276 else
5277 {
5278 /* If we previously had a firstbyte, but it doesn't match the new branch,
5279 we have to abandon the firstbyte for the regex, but if there was
5280 previously no reqbyte, it takes on the value of the old firstbyte. */
5281
5282 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5283 {
5284 if (reqbyte < 0) reqbyte = firstbyte;
5285 firstbyte = REQ_NONE;
5286 }
5287
5288 /* If we (now or from before) have no firstbyte, a firstbyte from the
5289 branch becomes a reqbyte if there isn't a branch reqbyte. */
5290
5291 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5292 branchreqbyte = branchfirstbyte;
5293
5294 /* Now ensure that the reqbytes match */
5295
5296 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5297 reqbyte = REQ_NONE;
5298 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5299 }
5300
5301 /* If lookbehind, check that this branch matches a fixed-length string, and
5302 put the length into the OP_REVERSE item. Temporarily mark the end of the
5303 branch with OP_END. */
5304
5305 if (lookbehind)
5306 {
5307 int fixed_length;
5308 *code = OP_END;
5309 fixed_length = find_fixedlength(last_branch, options);
5310 DPRINTF(("fixed length = %d\n", fixed_length));
5311 if (fixed_length < 0)
5312 {
5313 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5314 *ptrptr = ptr;
5315 return FALSE;
5316 }
5317 PUT(reverse_count, 0, fixed_length);
5318 }
5319 }
5320
5321 /* Reached end of expression, either ')' or end of pattern. In the real
5322 compile phase, go back through the alternative branches and reverse the chain
5323 of offsets, with the field in the BRA item now becoming an offset to the
5324 first alternative. If there are no alternatives, it points to the end of the
5325 group. The length in the terminating ket is always the length of the whole
5326 bracketed item. If any of the ims options were changed inside the group,
5327 compile a resetting op-code following, except at the very end of the pattern.
5328 Return leaving the pointer at the terminating char. */
5329
5330 if (*ptr != '|')
5331 {
5332 if (lengthptr == NULL)
5333 {
5334 int branch_length = code - last_branch;
5335 do
5336 {
5337 int prev_length = GET(last_branch, 1);
5338 PUT(last_branch, 1, branch_length);
5339 branch_length = prev_length;
5340 last_branch -= branch_length;
5341 }
5342 while (branch_length > 0);
5343 }
5344
5345 /* Fill in the ket */
5346
5347 *code = OP_KET;
5348 PUT(code, 1, code - start_bracket);
5349 code += 1 + LINK_SIZE;
5350
5351 /* Resetting option if needed */
5352
5353 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5354 {
5355 *code++ = OP_OPT;
5356 *code++ = oldims;
5357 length += 2;
5358 }
5359
5360 /* Retain the highest bracket number, in case resetting was used. */
5361
5362 cd->bracount = max_bracount;
5363
5364 /* Set values to pass back */
5365
5366 *codeptr = code;
5367 *ptrptr = ptr;
5368 *firstbyteptr = firstbyte;
5369 *reqbyteptr = reqbyte;
5370 if (lengthptr != NULL)
5371 {
5372 if (OFLOW_MAX - *lengthptr < length)
5373 {
5374 *errorcodeptr = ERR20;
5375 return FALSE;
5376 }
5377 *lengthptr += length;
5378 }
5379 return TRUE;
5380 }
5381
5382 /* Another branch follows. In the pre-compile phase, we can move the code
5383 pointer back to where it was for the start of the first branch. (That is,
5384 pretend that each branch is the only one.)
5385
5386 In the real compile phase, insert an ALT node. Its length field points back
5387 to the previous branch while the bracket remains open. At the end the chain
5388 is reversed. It's done like this so that the start of the bracket has a
5389 zero offset until it is closed, making it possible to detect recursion. */
5390
5391 if (lengthptr != NULL)
5392 {
5393 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5394 length += 1 + LINK_SIZE;
5395 }
5396 else
5397 {
5398 *code = OP_ALT;
5399 PUT(code, 1, code - last_branch);
5400 bc.current = last_branch = code;
5401 code += 1 + LINK_SIZE;
5402 }
5403
5404 ptr++;
5405 }
5406 /* Control never reaches here */
5407 }
5408
5409
5410
5411
5412 /*************************************************
5413 * Check for anchored expression *
5414 *************************************************/
5415
5416 /* Try to find out if this is an anchored regular expression. Consider each
5417 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5418 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5419 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5420 counts, since OP_CIRC can match in the middle.
5421
5422 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5423 This is the code for \G, which means "match at start of match position, taking
5424 into account the match offset".
5425
5426 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5427 because that will try the rest of the pattern at all possible matching points,
5428 so there is no point trying again.... er ....
5429
5430 .... except when the .* appears inside capturing parentheses, and there is a
5431 subsequent back reference to those parentheses. We haven't enough information
5432 to catch that case precisely.
5433
5434 At first, the best we could do was to detect when .* was in capturing brackets
5435 and the highest back reference was greater than or equal to that level.
5436 However, by keeping a bitmap of the first 31 back references, we can catch some
5437 of the more common cases more precisely.
5438
5439 Arguments:
5440 code points to start of expression (the bracket)
5441 options points to the options setting
5442 bracket_map a bitmap of which brackets we are inside while testing; this
5443 handles up to substring 31; after that we just have to take
5444 the less precise approach
5445 backref_map the back reference bitmap
5446
5447 Returns: TRUE or FALSE
5448 */
5449
5450 static BOOL
5451 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5452 unsigned int backref_map)
5453 {
5454 do {
5455 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5456 options, PCRE_MULTILINE, FALSE);
5457 register int op = *scode;
5458
5459 /* Non-capturing brackets */
5460
5461 if (op == OP_BRA)
5462 {
5463 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5464 }
5465
5466 /* Capturing brackets */
5467
5468 else if (op == OP_CBRA)
5469 {
5470 int n = GET2(scode, 1+LINK_SIZE);
5471 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5472 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5473 }
5474
5475 /* Other brackets */
5476
5477 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5478 {
5479 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5480 }
5481
5482 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5483 are or may be referenced. */
5484
5485 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5486 op == OP_TYPEPOSSTAR) &&
5487 (*options & PCRE_DOTALL) != 0)
5488 {
5489 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5490 }
5491
5492 /* Check for explicit anchoring */
5493
5494 else if (op != OP_SOD && op != OP_SOM &&
5495 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5496 return FALSE;
5497 code += GET(code, 1);
5498 }
5499 while (*code == OP_ALT); /* Loop for each alternative */
5500 return TRUE;
5501 }
5502
5503
5504
5505 /*************************************************
5506 * Check for starting with ^ or .* *
5507 *************************************************/
5508
5509 /* This is called to find out if every branch starts with ^ or .* so that
5510 "first char" processing can be done to speed things up in multiline
5511 matching and for non-DOTALL patterns that start with .* (which must start at
5512 the beginning or after \n). As in the case of is_anchored() (see above), we
5513 have to take account of back references to capturing brackets that contain .*
5514 because in that case we can't make the assumption.
5515
5516 Arguments:
5517 code points to start of expression (the bracket)
5518 bracket_map a bitmap of which brackets we are inside while testing; this
5519 handles up to substring 31; after that we just have to take
5520 the less precise approach
5521 backref_map the back reference bitmap
5522
5523 Returns: TRUE or FALSE
5524 */
5525
5526 static BOOL
5527 is_startline(const uschar *code, unsigned int bracket_map,
5528 unsigned int backref_map)
5529 {
5530 do {
5531 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5532 NULL, 0, FALSE);
5533 register int op = *scode;
5534
5535 /* Non-capturing brackets */
5536
5537 if (op == OP_BRA)
5538 {
5539 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5540 }
5541
5542 /* Capturing brackets */
5543
5544 else if (op == OP_CBRA)
5545 {
5546 int n = GET2(scode, 1+LINK_SIZE);
5547 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5548 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5549 }
5550
5551 /* Other brackets */
5552
5553 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5554 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5555
5556 /* .* means "start at start or after \n" if it isn't in brackets that
5557 may be referenced. */
5558
5559 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5560 {
5561 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5562 }
5563
5564 /* Check for explicit circumflex */
5565
5566 else if (op != OP_CIRC) return FALSE;
5567
5568 /* Move on to the next alternative */
5569
5570 code += GET(code, 1);
5571 }
5572 while (*code == OP_ALT); /* Loop for each alternative */
5573 return TRUE;
5574 }
5575
5576
5577
5578 /*************************************************
5579 * Check for asserted fixed first char *
5580 *************************************************/
5581
5582 /* During compilation, the "first char" settings from forward assertions are
5583 discarded, because they can cause conflicts with actual literals that follow.
5584 However, if we end up without a first char setting for an unanchored pattern,
5585 it is worth scanning the regex to see if there is an initial asserted first
5586 char. If all branches start with the same asserted char, or with a bracket all
5587 of whose alternatives start with the same asserted char (recurse ad lib), then
5588 we return that char, otherwise -1.
5589
5590 Arguments:
5591 code points to start of expression (the bracket)
5592 options pointer to the options (used to check casing changes)
5593 inassert TRUE if in an assertion
5594
5595 Returns: -1 or the fixed first char
5596 */
5597
5598 static int
5599 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5600 {
5601 register int c = -1;
5602 do {
5603 int d;
5604 const uschar *scode =
5605 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5606 register int op = *scode;
5607
5608 switch(op)
5609 {
5610 default:
5611 return -1;
5612
5613 case OP_BRA:
5614 case OP_CBRA:
5615 case OP_ASSERT:
5616 case OP_ONCE:
5617 case OP_COND:
5618 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5619 return -1;
5620 if (c < 0) c = d; else if (c != d) return -1;
5621 break;
5622
5623 case OP_EXACT: /* Fall through */
5624 scode += 2;
5625
5626 case OP_CHAR:
5627 case OP_CHARNC:
5628 case OP_PLUS:
5629 case OP_MINPLUS:
5630 case OP_POSPLUS:
5631 if (!inassert) return -1;
5632 if (c < 0)
5633 {
5634 c = scode[1];
5635 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5636 }
5637 else if (c != scode[1]) return -1;
5638 break;
5639 }
5640
5641 code += GET(code, 1);
5642 }
5643 while (*code == OP_ALT);
5644 return c;
5645 }
5646
5647
5648
5649 /*************************************************
5650 * Compile a Regular Expression *
5651 *************************************************/
5652
5653 /* This function takes a string and returns a pointer to a block of store
5654 holding a compiled version of the expression. The original API for this
5655 function had no error code return variable; it is retained for backwards
5656 compatibility. The new function is given a new name.
5657
5658 Arguments:
5659 pattern the regular expression
5660 options various option bits
5661 errorcodeptr pointer to error code variable (pcre_compile2() only)
5662 can be NULL if you don't want a code value
5663 errorptr pointer to pointer to error text
5664 erroroffset ptr offset in pattern where error was detected
5665 tables pointer to character tables or NULL
5666
5667 Returns: pointer to compiled data block, or NULL on error,
5668 with errorptr and erroroffset set
5669 */
5670
5671 PCRE_EXP_DEFN pcre *
5672 pcre_compile(const char *pattern, int options, const char **errorptr,
5673 int *erroroffset, const unsigned char *tables)
5674 {
5675 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5676 }
5677
5678
5679 PCRE_EXP_DEFN pcre *
5680 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5681 const char **errorptr, int *erroroffset, const unsigned char *tables)
5682 {
5683 real_pcre *re;
5684 int length = 1; /* For final END opcode */
5685 int firstbyte, reqbyte, newline;
5686 int errorcode = 0;
5687 int skipatstart = 0;
5688 #ifdef SUPPORT_UTF8
5689 BOOL utf8;
5690 #endif
5691 size_t size;
5692 uschar *code;
5693 const uschar *codestart;
5694 const uschar *ptr;
5695 compile_data compile_block;
5696 compile_data *cd = &compile_block;
5697
5698 /* This space is used for "compiling" into during the first phase, when we are
5699 computing the amount of memory that is needed. Compiled items are thrown away
5700 as soon as possible, so that a fairly large buffer should be sufficient for
5701 this purpose. The same space is used in the second phase for remembering where
5702 to fill in forward references to subpatterns. */
5703
5704 uschar cworkspace[COMPILE_WORK_SIZE];
5705
5706
5707 /* Set this early so that early errors get offset 0. */
5708
5709 ptr = (const uschar *)pattern;
5710
5711 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5712 can do is just return NULL, but we can set a code value if there is a code
5713 pointer. */
5714
5715 if (errorptr == NULL)
5716 {
5717 if (errorcodeptr != NULL) *errorcodeptr = 99;
5718 return NULL;
5719 }
5720
5721 *errorptr = NULL;
5722 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5723
5724 /* However, we can give a message for this error */
5725
5726 if (erroroffset == NULL)
5727 {
5728 errorcode = ERR16;
5729 goto PCRE_EARLY_ERROR_RETURN2;
5730 }
5731
5732 *erroroffset = 0;
5733
5734 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5735
5736 #ifdef SUPPORT_UTF8
5737 utf8 = (options & PCRE_UTF8) != 0;
5738 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5739 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5740 {
5741 errorcode = ERR44;
5742 goto PCRE_EARLY_ERROR_RETURN2;
5743 }
5744 #else
5745 if ((options & PCRE_UTF8) != 0)
5746 {
5747 errorcode = ERR32;
5748 goto PCRE_EARLY_ERROR_RETURN;
5749 }
5750 #endif
5751
5752 if ((options & ~PUBLIC_OPTIONS) != 0)
5753 {
5754 errorcode = ERR17;
5755 goto PCRE_EARLY_ERROR_RETURN;
5756 }
5757
5758 /* Set up pointers to the individual character tables */
5759
5760 if (tables == NULL) tables = _pcre_default_tables;
5761 cd->lcc = tables + lcc_offset;
5762 cd->fcc = tables + fcc_offset;
5763 cd->cbits = tables + cbits_offset;
5764 cd->ctypes = tables + ctypes_offset;
5765
5766 /* Check for global one-time settings at the start of the pattern, and remember
5767 the offset for later. */
5768
5769 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5770 {
5771 int newnl = 0;
5772 int newbsr = 0;
5773
5774 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5775 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5776 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5777 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5778 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5779 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5780 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5781 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5782 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
5783 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5784
5785 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5786 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5787 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5788 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5789
5790 if (newnl != 0)
5791 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5792 else if (newbsr != 0)
5793 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5794 else break;
5795 }
5796
5797 /* Check validity of \R options. */
5798
5799 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5800 {
5801 case 0:
5802 case PCRE_BSR_ANYCRLF:
5803 case PCRE_BSR_UNICODE:
5804 break;
5805 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5806 }
5807
5808 /* Handle different types of newline. The three bits give seven cases. The
5809 current code allows for fixed one- or two-byte sequences, plus "any" and
5810 "anycrlf". */
5811
5812 switch (options & PCRE_NEWLINE_BITS)
5813 {
5814 case 0: newline = NEWLINE; break; /* Build-time default */
5815 case PCRE_NEWLINE_CR: newline = '\r'; break;
5816 case PCRE_NEWLINE_LF: newline = '\n'; break;
5817 case PCRE_NEWLINE_CR+
5818 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5819 case PCRE_NEWLINE_ANY: newline = -1; break;
5820 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5821 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5822 }
5823
5824 if (newline == -2)
5825 {
5826 cd->nltype = NLTYPE_ANYCRLF;
5827 }
5828 else if (newline < 0)
5829 {
5830 cd->nltype = NLTYPE_ANY;
5831 }
5832 else
5833 {
5834 cd->nltype = NLTYPE_FIXED;
5835 if (newline > 255)
5836 {
5837 cd->nllen = 2;
5838 cd->nl[0] = (newline >> 8) & 255;
5839 cd->nl[1] = newline & 255;
5840 }
5841 else
5842 {
5843 cd->nllen = 1;
5844 cd->nl[0] = newline;
5845 }
5846 }
5847
5848 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5849 references to help in deciding whether (.*) can be treated as anchored or not.
5850 */
5851
5852 cd->top_backref = 0;
5853 cd->backref_map = 0;
5854
5855 /* Reflect pattern for debugging output */
5856
5857 DPRINTF(("------------------------------------------------------------------\n"));
5858 DPRINTF(("%s\n", pattern));
5859
5860 /* Pretend to compile the pattern while actually just accumulating the length
5861 of memory required. This behaviour is triggered by passing a non-NULL final
5862 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5863 to compile parts of the pattern into; the compiled code is discarded when it is
5864 no longer needed, so hopefully this workspace will never overflow, though there
5865 is a test for its doing so. */
5866
5867 cd->bracount = 0;
5868 cd->names_found = 0;
5869 cd->name_entry_size = 0;
5870 cd->name_table = NULL;
5871 cd->start_workspace = cworkspace;
5872 cd->start_code = cworkspace;
5873 cd->hwm = cworkspace;
5874 cd->start_pattern = (const uschar *)pattern;
5875 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5876 cd->req_varyopt = 0;
5877 cd->external_options = options;
5878 cd->external_flags = 0;
5879
5880 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5881 don't need to look at the result of the function here. The initial options have
5882 been put into the cd block so that they can be changed if an option setting is
5883 found within the regex right at the beginning. Bringing initial option settings
5884 outside can help speed up starting point checks. */
5885
5886 ptr += skipatstart;
5887 code = cworkspace;
5888 *code = OP_BRA;
5889 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5890 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5891 &length);
5892 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5893
5894 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5895 cd->hwm - cworkspace));
5896
5897 if (length > MAX_PATTERN_SIZE)
5898 {
5899 errorcode = ERR20;
5900 goto PCRE_EARLY_ERROR_RETURN;
5901 }
5902
5903 /* Compute the size of data block needed and get it, either from malloc or
5904 externally provided function. Integer overflow should no longer be possible
5905 because nowadays we limit the maximum value of cd->names_found and
5906 cd->name_entry_size. */
5907
5908 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5909 re = (real_pcre *)(pcre_malloc)(size);
5910
5911 if (re == NULL)
5912 {
5913 errorcode = ERR21;
5914 goto PCRE_EARLY_ERROR_RETURN;
5915 }
5916
5917 /* Put in the magic number, and save the sizes, initial options, internal
5918 flags, and character table pointer. NULL is used for the default character
5919 tables. The nullpad field is at the end; it's there to help in the case when a
5920 regex compiled on a system with 4-byte pointers is run on another with 8-byte
5921 pointers. */
5922
5923 re->magic_number = MAGIC_NUMBER;
5924 re->size = size;
5925 re->options = cd->external_options;
5926 re->flags = cd->external_flags;
5927 re->dummy1 = 0;
5928 re->first_byte = 0;
5929 re->req_byte = 0;
5930 re->name_table_offset = sizeof(real_pcre);
5931 re->name_entry_size = cd->name_entry_size;
5932 re->name_count = cd->names_found;
5933 re->ref_count = 0;
5934 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5935 re->nullpad = NULL;
5936
5937 /* The starting points of the name/number translation table and of the code are
5938 passed around in the compile data block. The start/end pattern and initial
5939 options are already set from the pre-compile phase, as is the name_entry_size
5940 field. Reset the bracket count and the names_found field. Also reset the hwm
5941 field; this time it's used for remembering forward references to subpatterns.
5942 */
5943
5944 cd->bracount = 0;
5945 cd->names_found = 0;
5946 cd->name_table = (uschar *)re + re->name_table_offset;
5947 codestart = cd->name_table + re->name_entry_size * re->name_count;
5948 cd->start_code = codestart;
5949 cd->hwm = cworkspace;
5950 cd->req_varyopt = 0;
5951 cd->had_accept = FALSE;
5952
5953 /* Set up a starting, non-extracting bracket, then compile the expression. On
5954 error, errorcode will be set non-zero, so we don't need to look at the result
5955 of the function here. */
5956
5957 ptr = (const uschar *)pattern + skipatstart;
5958 code = (uschar *)codestart;
5959 *code = OP_BRA;
5960 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5961 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5962 re->top_bracket = cd->bracount;
5963 re->top_backref = cd->top_backref;
5964 re->flags = cd->external_flags;
5965
5966 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
5967
5968 /* If not reached end of pattern on success, there's an excess bracket. */
5969
5970 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5971
5972 /* Fill in the terminating state and check for disastrous overflow, but
5973 if debugging, leave the test till after things are printed out. */
5974
5975 *code++ = OP_END;
5976
5977 #ifndef DEBUG
5978 if (code - codestart > length) errorcode = ERR23;
5979 #endif
5980
5981 /* Fill in any forward references that are required. */
5982
5983 while (errorcode == 0 && cd->hwm > cworkspace)
5984 {
5985 int offset, recno;
5986 const uschar *groupptr;
5987 cd->hwm -= LINK_SIZE;
5988 offset = GET(cd->hwm, 0);
5989 recno = GET(codestart, offset);
5990 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5991 if (groupptr == NULL) errorcode = ERR53;
5992 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5993 }
5994
5995 /* Give an error if there's back reference to a non-existent capturing
5996 subpattern. */
5997
5998 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5999
6000 /* Failed to compile, or error while post-processing */
6001
6002 if (errorcode != 0)
6003 {
6004 (pcre_free)(re);
6005 PCRE_EARLY_ERROR_RETURN:
6006 *erroroffset = ptr - (const uschar *)pattern;
6007 PCRE_EARLY_ERROR_RETURN2:
6008 *errorptr = error_texts[errorcode];
6009 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6010 return NULL;
6011 }
6012
6013 /* If the anchored option was not passed, set the flag if we can determine that
6014 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6015 as starting with .* when DOTALL is set).
6016
6017 Otherwise, if we know what the first byte has to be, save it, because that
6018 speeds up unanchored matches no end. If not, see if we can set the
6019 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6020 start with ^. and also when all branches start with .* for non-DOTALL matches.
6021 */
6022
6023 if ((re->options & PCRE_ANCHORED) == 0)
6024 {
6025 int temp_options = re->options; /* May get changed during these scans */
6026 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6027 re->options |= PCRE_ANCHORED;
6028 else
6029 {
6030 if (firstbyte < 0)
6031 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6032 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6033 {
6034 int ch = firstbyte & 255;
6035 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6036 cd->fcc[ch] == ch)? ch : firstbyte;
6037 re->flags |= PCRE_FIRSTSET;
6038 }
6039 else if (is_startline(codestart, 0, cd->backref_map))
6040 re->flags |= PCRE_STARTLINE;
6041 }
6042 }
6043
6044 /* For an anchored pattern, we use the "required byte" only if it follows a
6045 variable length item in the regex. Remove the caseless flag for non-caseable
6046 bytes. */
6047
6048 if (reqbyte >= 0 &&
6049 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6050 {
6051 int ch = reqbyte & 255;
6052 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6053 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6054 re->flags |= PCRE_REQCHSET;
6055 }
6056
6057 /* Print out the compiled data if debugging is enabled. This is never the
6058 case when building a production library. */
6059
6060 #ifdef DEBUG
6061
6062 printf("Length = %d top_bracket = %d top_backref = %d\n",
6063 length, re->top_bracket, re->top_backref);
6064
6065 printf("Options=%08x\n", re->options);
6066
6067 if ((re->flags & PCRE_FIRSTSET) != 0)
6068 {
6069 int ch = re->first_byte & 255;
6070 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6071 "" : " (caseless)";
6072 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6073 else printf("First char = \\x%02x%s\n", ch, caseless);
6074 }
6075
6076 if ((re->flags & PCRE_REQCHSET) != 0)
6077 {
6078 int ch = re->req_byte & 255;
6079 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6080 "" : " (caseless)";
6081 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6082 else printf("Req char = \\x%02x%s\n", ch, caseless);
6083 }
6084
6085 pcre_printint(re, stdout, TRUE);
6086
6087 /* This check is done here in the debugging case so that the code that
6088 was compiled can be seen. */
6089
6090 if (code - codestart > length)
6091 {
6092 (pcre_free)(re);
6093 *errorptr = error_texts[ERR23];
6094 *erroroffset = ptr - (uschar *)pattern;
6095 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6096 return NULL;
6097 }
6098 #endif /* DEBUG */
6099
6100 return (pcre *)re;
6101 }
6102
6103 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12