/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 286 - (show annotations) (download)
Mon Dec 17 14:46:11 2007 UTC (6 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 195634 byte(s)
Add .gz and .bz2 optional support to pcregrep.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: sets bit number b in
the byte array a. Both arguments and the whole expansion are parenthesized so
that expression arguments, for example SETBIT(map, c + 1), index and shift as
intended. Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid.

check_escape() indexes the ASCII table with (c - '0') and the EBCDIC table with
(c - 0x48), so the first entry of each table corresponds to that base character
and the entries must stay in character-code order. The trailing (ASCII) or
leading (EBCDIC) comment on each row names the characters or code that the row
covers. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
146
147 typedef struct verbitem {
148 int len;
149 int op;
150 } verbitem;
151
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
160
161 static verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
169 };
170
171 static int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence.

The three tables below are parallel: the n-th length in posix_name_lengths and
the n-th triple in posix_class_maps both belong to the n-th name in
posix_names. */
179
180 static const char posix_names[] =
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
184
/* Name lengths (excluding the NUL separators), followed by the 0 terminator. */
185 static const uschar posix_name_lengths[] = {
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
197
198 static const int posix_class_maps[] = {
199 cbit_word, cbit_digit, -2, /* alpha */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii */
204 cbit_space, -1, 1, /* blank - a GNU extension */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
213 };
214
215
/* Two-level stringification: XSTRING(s) lets the preprocessor expand s first
and STRING then turns the expanded text into a string literal. It is used below
to embed the values of MAX_NAME_SIZE and MAX_NAME_COUNT in error messages. */
216 #define STRING(a) # a
217 #define XSTRING(s) STRING(s)
218
219 /* The texts of compile-time error messages. These are "char *" because they
220 are passed to the outside world. Do not ever re-use any error number, because
221 they are documented. Always add a new error instead. Messages marked DEAD below
222 are no longer used. This used to be a table of strings, but in order to reduce
223 the number of relocations needed when a shared library is loaded dynamically,
224 it is now one long string. We cannot use a table of offsets, because the
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226 simply count through to the one we want - this isn't a performance issue
227 because these strings are used only when there is a compilation error.

Because find_error_text() locates message n by skipping n NUL terminators, the
position of a string in this list is its error number: never insert or remove
an entry in the middle. The numeric comments mark every fifth entry. */
228
229 static const char error_texts[] =
230 "no error\0"
231 "\\ at end of pattern\0"
232 "\\c at end of pattern\0"
233 "unrecognized character follows \\\0"
234 "numbers out of order in {} quantifier\0"
235 /* 5 */
236 "number too big in {} quantifier\0"
237 "missing terminating ] for character class\0"
238 "invalid escape sequence in character class\0"
239 "range out of order in character class\0"
240 "nothing to repeat\0"
241 /* 10 */
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243 "internal error: unexpected repeat\0"
244 "unrecognized character after (? or (?-\0"
245 "POSIX named classes are supported only within a class\0"
246 "missing )\0"
247 /* 15 */
248 "reference to non-existent subpattern\0"
249 "erroffset passed as NULL\0"
250 "unknown option bit(s) set\0"
251 "missing ) after comment\0"
252 "parentheses nested too deeply\0" /** DEAD **/
253 /* 20 */
254 "regular expression is too large\0"
255 "failed to get memory\0"
256 "unmatched parentheses\0"
257 "internal error: code overflow\0"
258 "unrecognized character after (?<\0"
259 /* 25 */
260 "lookbehind assertion is not fixed length\0"
261 "malformed number or name after (?(\0"
262 "conditional group contains more than two branches\0"
263 "assertion expected after (?(\0"
264 "(?R or (?[+-]digits must be followed by )\0"
265 /* 30 */
266 "unknown POSIX class name\0"
267 "POSIX collating elements are not supported\0"
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269 "spare error\0" /** DEAD **/
270 "character value in \\x{...} sequence is too large\0"
271 /* 35 */
272 "invalid condition (?(0)\0"
273 "\\C not allowed in lookbehind assertion\0"
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275 "number after (?C is > 255\0"
276 "closing ) for (?C expected\0"
277 /* 40 */
278 "recursive call could loop indefinitely\0"
279 "unrecognized character after (?P\0"
280 "syntax error in subpattern name (missing terminator)\0"
281 "two named subpatterns have the same name\0"
282 "invalid UTF-8 string\0"
283 /* 45 */
284 "support for \\P, \\p, and \\X has not been compiled\0"
285 "malformed \\P or \\p sequence\0"
286 "unknown property name after \\P or \\p\0"
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 /* 50 */
290 "repeated subpattern is too long\0" /** DEAD **/
291 "octal value is greater than \\377 (not in UTF-8 mode)\0"
292 "internal error: overran compiling workspace\0"
293 "internal error: previously-checked referenced subpattern not found\0"
294 "DEFINE group contains more than one branch\0"
295 /* 55 */
296 "repeating a DEFINE group is not allowed\0"
297 "inconsistent NEWLINE options\0"
298 "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300 "(*VERB) with an argument is not supported\0"
301 /* 60 */
302 "(*VERB) not recognized\0"
303 "number is too big\0"
304 "subpattern name expected\0"
305 "digit expected after (?+";
306
307
308 /* Table to identify digits and hex digits. This is used when compiling
309 patterns. Note that the tables in chartables are dependent on the locale, and
310 may mark arbitrary characters as digits - but the PCRE compiling code expects
311 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
312 a private table here. It costs 256 bytes, but it is a lot faster than doing
313 character value tests (at least in some simple cases I timed), and in some
314 applications one wants PCRE to compile efficiently as well as match
315 efficiently.
316
317 For convenience, we use the same bit definitions as in chartables:
318
319 0x04 decimal digit
320 0x08 hexadecimal digit
321
322 Then we can use ctype_digit and ctype_xdigit in the code.

In the tables below, 0x0c (0x04 | 0x08) marks the characters 0-9, which are
both decimal and hexadecimal digits, and 0x08 marks a-f and A-F. Each table is
indexed directly by the character code. */
323
324 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
325 static const unsigned char digitab[] =
326 {
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
333 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
334 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
335 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
339 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
359
360 #else /* This is the "abnormal" case, for EBCDIC systems */
361 static const unsigned char digitab[] =
362 {
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
379 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
387 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
393 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
394 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
395
/* EBCDIC-only character type table. check_escape() tests
(ebcdic_chartab[c] & 0x0E) to decide whether a character following a backslash
is alphanumeric. */
396 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
397 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
398 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
399 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
401 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
405 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
406 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
408 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
410 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
413 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
414 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
415 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
416 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
417 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
418 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
419 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
420 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
421 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
422 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
423 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
424 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
425 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
426 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
427 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
428 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
429 #endif
430
431
432 /* Forward declaration to allow mutual recursion */
433
434 static BOOL
435 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 int *, int *, branch_chain *, compile_data *, int *);
437
438
439
440 /*************************************************
441 * Find an error text *
442 *************************************************/
443
444 /* The error texts are now all in one long string, to save on relocations. As
445 some of the text is of unknown length, we can't use a table of offsets.
446 Instead, just count through the strings. This is not a performance issue
447 because it happens only when there has been a compilation error.
448
449 Argument: the error number
450 Returns: pointer to the error string
451 */
452
453 static const char *
454 find_error_text(int n)
455 {
456 const char *s = error_texts;
457 for (; n > 0; n--) while (*s++ != 0);
458 return s;
459 }
460
461
462 /*************************************************
463 * Handle escapes *
464 *************************************************/
465
466 /* This function is called when a \ has been encountered. It either returns a
467 positive value for a simple escape such as \n, or a negative value which
468 encodes one of the more complicated things such as \d. A backreference to group
469 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471 ptr is pointing at the \. On exit, it is on the final character of the escape
472 sequence.
473
474 Arguments:
475 ptrptr points to the pattern position pointer
476 errorcodeptr points to the errorcode variable
477 bracount number of previous extracting brackets
478 options the options bits
479 isclass TRUE if inside a character class
480
481 Returns: zero or positive => a data character
482 negative => a special escape sequence
483 on error, errorcodeptr is set
484 */
485
486 static int
487 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488 int options, BOOL isclass)
489 {
490 BOOL utf8 = (options & PCRE_UTF8) != 0;
491 const uschar *ptr = *ptrptr + 1;
492 int c, i;
493
494 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495 ptr--; /* Set pointer back to the last byte */
496
497 /* If backslash is at the end of the pattern, it's an error. */
498
499 if (c == 0) *errorcodeptr = ERR1;
500
501 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502 in a table. A non-zero result is something that can be returned immediately.
503 Otherwise further processing may be required. */
504
505 #ifndef EBCDIC /* ASCII coding */
506 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
507 else if ((i = escapes[c - '0']) != 0) c = i;
508
509 #else /* EBCDIC coding */
510 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
511 else if ((i = escapes[c - 0x48]) != 0) c = i;
512 #endif
513
514 /* Escapes that need further processing, or are illegal. */
515
516 else
517 {
518 const uschar *oldptr;
519 BOOL braced, negated;
520
521 switch (c)
522 {
523 /* A number of Perl escapes are not handled by PCRE. We give an explicit
524 error. */
525
526 case 'l':
527 case 'L':
528 case 'N':
529 case 'u':
530 case 'U':
531 *errorcodeptr = ERR37;
532 break;
533
534 /* \g must be followed by a number, either plain or braced. If positive, it
535 is an absolute backreference. If negative, it is a relative backreference.
536 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
537 reference to a named group. This is part of Perl's movement towards a
538 unified syntax for back references. As this is synonymous with \k{name}, we
539 fudge it up by pretending it really was \k. */
540
541 case 'g':
542 if (ptr[1] == '{')
543 {
544 const uschar *p;
545 for (p = ptr+2; *p != 0 && *p != '}'; p++)
546 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
547 if (*p != 0 && *p != '}')
548 {
549 c = -ESC_k;
550 break;
551 }
552 braced = TRUE;
553 ptr++;
554 }
555 else braced = FALSE;
556
557 if (ptr[1] == '-')
558 {
559 negated = TRUE;
560 ptr++;
561 }
562 else negated = FALSE;
563
564 c = 0;
565 while ((digitab[ptr[1]] & ctype_digit) != 0)
566 c = c * 10 + *(++ptr) - '0';
567
568 if (c < 0)
569 {
570 *errorcodeptr = ERR61;
571 break;
572 }
573
574 if (c == 0 || (braced && *(++ptr) != '}'))
575 {
576 *errorcodeptr = ERR57;
577 break;
578 }
579
580 if (negated)
581 {
582 if (c > bracount)
583 {
584 *errorcodeptr = ERR15;
585 break;
586 }
587 c = bracount - (c - 1);
588 }
589
590 c = -(ESC_REF + c);
591 break;
592
593 /* The handling of escape sequences consisting of a string of digits
594 starting with one that is not zero is not straightforward. By experiment,
595 the way Perl works seems to be as follows:
596
597 Outside a character class, the digits are read as a decimal number. If the
598 number is less than 10, or if there are that many previous extracting
599 left brackets, then it is a back reference. Otherwise, up to three octal
600 digits are read to form an escaped byte. Thus \123 is likely to be octal
601 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
602 value is greater than 377, the least significant 8 bits are taken. Inside a
603 character class, \ followed by a digit is always an octal number. */
604
605 case '1': case '2': case '3': case '4': case '5':
606 case '6': case '7': case '8': case '9':
607
608 if (!isclass)
609 {
610 oldptr = ptr;
611 c -= '0';
612 while ((digitab[ptr[1]] & ctype_digit) != 0)
613 c = c * 10 + *(++ptr) - '0';
614 if (c < 0)
615 {
616 *errorcodeptr = ERR61;
617 break;
618 }
619 if (c < 10 || c <= bracount)
620 {
621 c = -(ESC_REF + c);
622 break;
623 }
624 ptr = oldptr; /* Put the pointer back and fall through */
625 }
626
627 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
628 generates a binary zero byte and treats the digit as a following literal.
629 Thus we have to pull back the pointer by one. */
630
631 if ((c = *ptr) >= '8')
632 {
633 ptr--;
634 c = 0;
635 break;
636 }
637
638 /* \0 always starts an octal number, but we may drop through to here with a
639 larger first octal digit. The original code used just to take the least
640 significant 8 bits of octal numbers (I think this is what early Perls used
641 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
642 than 3 octal digits. */
643
644 case '0':
645 c -= '0';
646 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
647 c = c * 8 + *(++ptr) - '0';
648 if (!utf8 && c > 255) *errorcodeptr = ERR51;
649 break;
650
651 /* \x is complicated. \x{ddd} is a character number which can be greater
652 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
653 treated as a data character. */
654
655 case 'x':
656 if (ptr[1] == '{')
657 {
658 const uschar *pt = ptr + 2;
659 int count = 0;
660
661 c = 0;
662 while ((digitab[*pt] & ctype_xdigit) != 0)
663 {
664 register int cc = *pt++;
665 if (c == 0 && cc == '0') continue; /* Leading zeroes */
666 count++;
667
668 #ifndef EBCDIC /* ASCII coding */
669 if (cc >= 'a') cc -= 32; /* Convert to upper case */
670 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
671 #else /* EBCDIC coding */
672 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
673 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
674 #endif
675 }
676
677 if (*pt == '}')
678 {
679 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
680 ptr = pt;
681 break;
682 }
683
684 /* If the sequence of hex digits does not end with '}', then we don't
685 recognize this construct; fall through to the normal \x handling. */
686 }
687
688 /* Read just a single-byte hex-defined char */
689
690 c = 0;
691 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
692 {
693 int cc; /* Some compilers don't like ++ */
694 cc = *(++ptr); /* in initializers */
695 #ifndef EBCDIC /* ASCII coding */
696 if (cc >= 'a') cc -= 32; /* Convert to upper case */
697 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
698 #else /* EBCDIC coding */
699 if (cc <= 'z') cc += 64; /* Convert to upper case */
700 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
701 #endif
702 }
703 break;
704
705 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
706 This coding is ASCII-specific, but then the whole concept of \cx is
707 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
708
709 case 'c':
710 c = *(++ptr);
711 if (c == 0)
712 {
713 *errorcodeptr = ERR2;
714 break;
715 }
716
717 #ifndef EBCDIC /* ASCII coding */
718 if (c >= 'a' && c <= 'z') c -= 32;
719 c ^= 0x40;
720 #else /* EBCDIC coding */
721 if (c >= 'a' && c <= 'z') c += 64;
722 c ^= 0xC0;
723 #endif
724 break;
725
726 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
727 other alphanumeric following \ is an error if PCRE_EXTRA was set;
728 otherwise, for Perl compatibility, it is a literal. This code looks a bit
729 odd, but there used to be some cases other than the default, and there may
730 be again in future, so I haven't "optimized" it. */
731
732 default:
733 if ((options & PCRE_EXTRA) != 0) switch(c)
734 {
735 default:
736 *errorcodeptr = ERR3;
737 break;
738 }
739 break;
740 }
741 }
742
743 *ptrptr = ptr;
744 return c;
745 }
746
747
748
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the table
of known properties. This is called only when PCRE is compiled with support for
Unicode properties. On entry, ptrptr is pointing at the P or p. On exit, it is
pointing at the final character of the escape sequence (or at the character
where parsing stopped, if the sequence is malformed).

The name is either a single character, or a string in {}, optionally preceded
by ^ for negation. A braced name must fit, with its terminator, in the local
32-byte buffer.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching _pcre_utt entry, or -1 with
                   ERR46 (malformed sequence) or ERR47 (unknown name) set
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
  const uschar *ptr = *ptrptr;
  char name[32];
  int ch, len, lo, hi;

  ch = *(++ptr);
  if (ch == 0) goto ERROR_RETURN;

  *negptr = FALSE;

  if (ch != '{') {
    /* Just one following character forms the name */
    name[0] = ch;
    name[1] = 0;
  }
  else {
    if (ptr[1] == '^') {
      *negptr = TRUE;
      ptr++;
    }

    /* Copy up to the closing brace; running out of pattern or of buffer
    space before finding it is an error. */

    for (len = 0; ; len++) {
      if (len >= (int)sizeof(name) - 1) goto ERROR_RETURN;
      ch = *(++ptr);
      if (ch == 0) goto ERROR_RETURN;
      if (ch == '}') break;
      name[len] = ch;
    }
    name[len] = 0;
  }

  *ptrptr = ptr;

  /* Search for a recognized property name using binary chop */

  lo = 0;
  hi = _pcre_utt_size;

  while (lo < hi) {
    int mid = (lo + hi) >> 1;
    int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);
    if (cmp == 0) {
      *dptr = _pcre_utt[mid].value;
      return _pcre_utt[mid].type;
    }
    if (cmp > 0) lo = mid + 1; else hi = mid;
  }

  *errorcodeptr = ERR47;
  *ptrptr = ptr;
  return -1;

ERROR_RETURN:
  *errorcodeptr = ERR46;
  *ptrptr = ptr;
  return -1;
}
#endif
838
839
840
841
842 /*************************************************
843 * Check for counted repeat *
844 *************************************************/
845
846 /* This function is called when a '{' is encountered in a place where it might
847 start a quantifier. It looks ahead to see if it really is a quantifier or not.
848 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
849 where the ddds are digits.
850
851 Arguments:
852 p pointer to the first char after '{'
853
854 Returns: TRUE or FALSE
855 */
856
857 static BOOL
858 is_counted_repeat(const uschar *p)
859 {
860 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
861 while ((digitab[*p] & ctype_digit) != 0) p++;
862 if (*p == '}') return TRUE;
863
864 if (*p++ != ',') return FALSE;
865 if (*p == '}') return TRUE;
866
867 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
868 while ((digitab[*p] & ctype_digit) != 0) p++;
869
870 return (*p == '}');
871 }
872
873
874
875 /*************************************************
876 * Read repeat counts *
877 *************************************************/
878
879 /* Read an item of the form {n,m} and return the values. This is called only
880 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
881 so the syntax is guaranteed to be correct, but we need to check the values.
882
883 Arguments:
884 p pointer to first char after '{'
885 minp pointer to int for min
886 maxp pointer to int for max
887 returned as -1 if no max
888 errorcodeptr points to error code variable
889
890 Returns: pointer to '}' on success;
891 current ptr on error, with errorcodeptr set non-zero
892 */
893
894 static const uschar *
895 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
896 {
897 int min = 0;
898 int max = -1;
899
900 /* Read the minimum value and do a paranoid check: a negative value indicates
901 an integer overflow. */
902
903 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
904 if (min < 0 || min > 65535)
905 {
906 *errorcodeptr = ERR5;
907 return p;
908 }
909
910 /* Read the maximum value if there is one, and again do a paranoid on its size.
911 Also, max must not be less than min. */
912
913 if (*p == '}') max = min; else
914 {
915 if (*(++p) != '}')
916 {
917 max = 0;
918 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919 if (max < 0 || max > 65535)
920 {
921 *errorcodeptr = ERR5;
922 return p;
923 }
924 if (max < min)
925 {
926 *errorcodeptr = ERR4;
927 return p;
928 }
929 }
930 }
931
932 /* Fill in the required variables, and pass back the pointer to the terminating
933 '}'. */
934
935 *minp = min;
936 *maxp = max;
937 return p;
938 }
939
940
941
942 /*************************************************
943 * Find forward referenced subpattern *
944 *************************************************/
945
946 /* This function scans along a pattern's text looking for capturing
947 subpatterns, and counting them. If it finds a named pattern that matches the
948 name it is given, it returns its number. Alternatively, if the name is NULL, it
949 returns when it reaches a given numbered subpattern. This is used for forward
950 references to subpatterns. We know that if (?P< is encountered, the name will
951 be terminated by '>' because that is checked in the first pass.
952
953 Arguments:
954 ptr current position in the pattern
955 count current count of capturing parens so far encountered
956 name name to seek, or NULL if seeking a numbered subpattern
957 lorn name length, or subpattern number if name is NULL
958 xmode TRUE if we are in /x mode
959
960 Returns: the number of the named subpattern, or -1 if not found
961 */
962
963 static int
964 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
965 BOOL xmode)
966 {
967 const uschar *thisname;
968
969 for (; *ptr != 0; ptr++)
970 {
971 int term;
972
973 /* Skip over backslashed characters and also entire \Q...\E */
974
975 if (*ptr == '\\')
976 {
977 if (*(++ptr) == 0) return -1;
978 if (*ptr == 'Q') for (;;)
979 {
980 while (*(++ptr) != 0 && *ptr != '\\');
981 if (*ptr == 0) return -1;
982 if (*(++ptr) == 'E') break;
983 }
984 continue;
985 }
986
987 /* Skip over character classes */
988
989 if (*ptr == '[')
990 {
991 while (*(++ptr) != ']')
992 {
993 if (*ptr == 0) return -1;
994 if (*ptr == '\\')
995 {
996 if (*(++ptr) == 0) return -1;
997 if (*ptr == 'Q') for (;;)
998 {
999 while (*(++ptr) != 0 && *ptr != '\\');
1000 if (*ptr == 0) return -1;
1001 if (*(++ptr) == 'E') break;
1002 }
1003 continue;
1004 }
1005 }
1006 continue;
1007 }
1008
1009 /* Skip comments in /x mode */
1010
1011 if (xmode && *ptr == '#')
1012 {
1013 while (*(++ptr) != 0 && *ptr != '\n');
1014 if (*ptr == 0) return -1;
1015 continue;
1016 }
1017
1018 /* An opening parens must now be a real metacharacter */
1019
1020 if (*ptr != '(') continue;
1021 if (ptr[1] != '?' && ptr[1] != '*')
1022 {
1023 count++;
1024 if (name == NULL && count == lorn) return count;
1025 continue;
1026 }
1027
1028 ptr += 2;
1029 if (*ptr == 'P') ptr++; /* Allow optional P */
1030
1031 /* We have to disambiguate (?<! and (?<= from (?<name> */
1032
1033 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1034 *ptr != '\'')
1035 continue;
1036
1037 count++;
1038
1039 if (name == NULL && count == lorn) return count;
1040 term = *ptr++;
1041 if (term == '<') term = '>';
1042 thisname = ptr;
1043 while (*ptr != term) ptr++;
1044 if (name != NULL && lorn == ptr - thisname &&
1045 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1046 return count;
1047 }
1048
1049 return -1;
1050 }
1051
1052
1053
1054 /*************************************************
1055 * Find first significant op code *
1056 *************************************************/
1057
1058 /* This is called by several functions that scan a compiled expression looking
1059 for a fixed first character, or an anchoring op code etc. It skips over things
1060 that do not influence this. For some calls, a change of option is important.
1061 For some calls, it makes sense to skip negative forward and all backward
1062 assertions, and also the \b assertion; for others it does not.
1063
1064 Arguments:
1065 code pointer to the start of the group
1066 options pointer to external options
1067 optbit the option bit whose changing is significant, or
1068 zero if none are
1069 skipassert TRUE if certain assertions are to be skipped
1070
1071 Returns: pointer to the first significant opcode
1072 */
1073
1074 static const uschar*
1075 first_significant_code(const uschar *code, int *options, int optbit,
1076 BOOL skipassert)
1077 {
1078 for (;;)
1079 {
1080 switch ((int)*code)
1081 {
1082 case OP_OPT:
1083 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084 *options = (int)code[1];
1085 code += 2;
1086 break;
1087
1088 case OP_ASSERT_NOT:
1089 case OP_ASSERTBACK:
1090 case OP_ASSERTBACK_NOT:
1091 if (!skipassert) return code;
1092 do code += GET(code, 1); while (*code == OP_ALT);
1093 code += _pcre_OP_lengths[*code];
1094 break;
1095
1096 case OP_WORD_BOUNDARY:
1097 case OP_NOT_WORD_BOUNDARY:
1098 if (!skipassert) return code;
1099 /* Fall through */
1100
1101 case OP_CALLOUT:
1102 case OP_CREF:
1103 case OP_RREF:
1104 case OP_DEF:
1105 code += _pcre_OP_lengths[*code];
1106 break;
1107
1108 default:
1109 return code;
1110 }
1111 }
1112 /* Control never reaches here */
1113 }
1114
1115
1116
1117
1118 /*************************************************
1119 * Find the fixed length of a pattern *
1120 *************************************************/
1121
1122 /* Scan a pattern and compute the fixed length of subject that will match it,
1123 if the length is fixed. This is needed for dealing with backward assertions.
1124 In UTF8 mode, the result is in characters rather than bytes.
1125
1126 Arguments:
1127 code points to the start of the pattern (the bracket)
1128 options the compiling options
1129
1130 Returns: the fixed length, or -1 if there is no fixed length,
1131 or -2 if \C was encountered
1132 */
1133
1134 static int
1135 find_fixedlength(uschar *code, int options)
1136 {
1137 int length = -1;
1138
1139 register int branchlength = 0;
1140 register uschar *cc = code + 1 + LINK_SIZE;
1141
1142 /* Scan along the opcodes for this branch. If we get to the end of the
1143 branch, check the length against that of the other branches. */
1144
1145 for (;;)
1146 {
1147 int d;
1148 register int op = *cc;
1149 switch (op)
1150 {
1151 case OP_CBRA:
1152 case OP_BRA:
1153 case OP_ONCE:
1154 case OP_COND:
1155 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1156 if (d < 0) return d;
1157 branchlength += d;
1158 do cc += GET(cc, 1); while (*cc == OP_ALT);
1159 cc += 1 + LINK_SIZE;
1160 break;
1161
1162 /* Reached end of a branch; if it's a ket it is the end of a nested
1163 call. If it's ALT it is an alternation in a nested call. If it is
1164 END it's the end of the outer call. All can be handled by the same code. */
1165
1166 case OP_ALT:
1167 case OP_KET:
1168 case OP_KETRMAX:
1169 case OP_KETRMIN:
1170 case OP_END:
1171 if (length < 0) length = branchlength;
1172 else if (length != branchlength) return -1;
1173 if (*cc != OP_ALT) return length;
1174 cc += 1 + LINK_SIZE;
1175 branchlength = 0;
1176 break;
1177
1178 /* Skip over assertive subpatterns */
1179
1180 case OP_ASSERT:
1181 case OP_ASSERT_NOT:
1182 case OP_ASSERTBACK:
1183 case OP_ASSERTBACK_NOT:
1184 do cc += GET(cc, 1); while (*cc == OP_ALT);
1185 /* Fall through */
1186
1187 /* Skip over things that don't match chars */
1188
1189 case OP_REVERSE:
1190 case OP_CREF:
1191 case OP_RREF:
1192 case OP_DEF:
1193 case OP_OPT:
1194 case OP_CALLOUT:
1195 case OP_SOD:
1196 case OP_SOM:
1197 case OP_EOD:
1198 case OP_EODN:
1199 case OP_CIRC:
1200 case OP_DOLL:
1201 case OP_NOT_WORD_BOUNDARY:
1202 case OP_WORD_BOUNDARY:
1203 cc += _pcre_OP_lengths[*cc];
1204 break;
1205
1206 /* Handle literal characters */
1207
1208 case OP_CHAR:
1209 case OP_CHARNC:
1210 case OP_NOT:
1211 branchlength++;
1212 cc += 2;
1213 #ifdef SUPPORT_UTF8
1214 if ((options & PCRE_UTF8) != 0)
1215 {
1216 while ((*cc & 0xc0) == 0x80) cc++;
1217 }
1218 #endif
1219 break;
1220
1221 /* Handle exact repetitions. The count is already in characters, but we
1222 need to skip over a multibyte character in UTF8 mode. */
1223
1224 case OP_EXACT:
1225 branchlength += GET2(cc,1);
1226 cc += 4;
1227 #ifdef SUPPORT_UTF8
1228 if ((options & PCRE_UTF8) != 0)
1229 {
1230 while((*cc & 0x80) == 0x80) cc++;
1231 }
1232 #endif
1233 break;
1234
1235 case OP_TYPEEXACT:
1236 branchlength += GET2(cc,1);
1237 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1238 cc += 4;
1239 break;
1240
1241 /* Handle single-char matchers */
1242
1243 case OP_PROP:
1244 case OP_NOTPROP:
1245 cc += 2;
1246 /* Fall through */
1247
1248 case OP_NOT_DIGIT:
1249 case OP_DIGIT:
1250 case OP_NOT_WHITESPACE:
1251 case OP_WHITESPACE:
1252 case OP_NOT_WORDCHAR:
1253 case OP_WORDCHAR:
1254 case OP_ANY:
1255 branchlength++;
1256 cc++;
1257 break;
1258
1259 /* The single-byte matcher isn't allowed */
1260
1261 case OP_ANYBYTE:
1262 return -2;
1263
1264 /* Check a class for variable quantification */
1265
1266 #ifdef SUPPORT_UTF8
1267 case OP_XCLASS:
1268 cc += GET(cc, 1) - 33;
1269 /* Fall through */
1270 #endif
1271
1272 case OP_CLASS:
1273 case OP_NCLASS:
1274 cc += 33;
1275
1276 switch (*cc)
1277 {
1278 case OP_CRSTAR:
1279 case OP_CRMINSTAR:
1280 case OP_CRQUERY:
1281 case OP_CRMINQUERY:
1282 return -1;
1283
1284 case OP_CRRANGE:
1285 case OP_CRMINRANGE:
1286 if (GET2(cc,1) != GET2(cc,3)) return -1;
1287 branchlength += GET2(cc,1);
1288 cc += 5;
1289 break;
1290
1291 default:
1292 branchlength++;
1293 }
1294 break;
1295
1296 /* Anything else is variable length */
1297
1298 default:
1299 return -1;
1300 }
1301 }
1302 /* Control never gets here */
1303 }
1304
1305
1306
1307
1308 /*************************************************
1309 * Scan compiled regex for numbered bracket *
1310 *************************************************/
1311
1312 /* This little function scans through a compiled pattern until it finds a
1313 capturing bracket with the given number.
1314
1315 Arguments:
1316 code points to start of expression
1317 utf8 TRUE in UTF-8 mode
1318 number the required bracket number
1319
1320 Returns: pointer to the opcode for the bracket, or NULL if not found
1321 */
1322
1323 static const uschar *
1324 find_bracket(const uschar *code, BOOL utf8, int number)
1325 {
1326 for (;;)
1327 {
1328 register int c = *code;
1329 if (c == OP_END) return NULL;
1330
1331 /* XCLASS is used for classes that cannot be represented just by a bit
1332 map. This includes negated single high-valued characters. The length in
1333 the table is zero; the actual length is stored in the compiled code. */
1334
1335 if (c == OP_XCLASS) code += GET(code, 1);
1336
1337 /* Handle capturing bracket */
1338
1339 else if (c == OP_CBRA)
1340 {
1341 int n = GET2(code, 1+LINK_SIZE);
1342 if (n == number) return (uschar *)code;
1343 code += _pcre_OP_lengths[c];
1344 }
1345
1346 /* Otherwise, we can get the item's length from the table, except that for
1347 repeated character types, we have to test for \p and \P, which have an extra
1348 two bytes of parameters. */
1349
1350 else
1351 {
1352 switch(c)
1353 {
1354 case OP_TYPESTAR:
1355 case OP_TYPEMINSTAR:
1356 case OP_TYPEPLUS:
1357 case OP_TYPEMINPLUS:
1358 case OP_TYPEQUERY:
1359 case OP_TYPEMINQUERY:
1360 case OP_TYPEPOSSTAR:
1361 case OP_TYPEPOSPLUS:
1362 case OP_TYPEPOSQUERY:
1363 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1364 break;
1365
1366 case OP_TYPEUPTO:
1367 case OP_TYPEMINUPTO:
1368 case OP_TYPEEXACT:
1369 case OP_TYPEPOSUPTO:
1370 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1371 break;
1372 }
1373
1374 /* Add in the fixed length from the table */
1375
1376 code += _pcre_OP_lengths[c];
1377
1378 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379 a multi-byte character. The length in the table is a minimum, so we have to
1380 arrange to skip the extra bytes. */
1381
1382 #ifdef SUPPORT_UTF8
1383 if (utf8) switch(c)
1384 {
1385 case OP_CHAR:
1386 case OP_CHARNC:
1387 case OP_EXACT:
1388 case OP_UPTO:
1389 case OP_MINUPTO:
1390 case OP_POSUPTO:
1391 case OP_STAR:
1392 case OP_MINSTAR:
1393 case OP_POSSTAR:
1394 case OP_PLUS:
1395 case OP_MINPLUS:
1396 case OP_POSPLUS:
1397 case OP_QUERY:
1398 case OP_MINQUERY:
1399 case OP_POSQUERY:
1400 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1401 break;
1402 }
1403 #endif
1404 }
1405 }
1406 }
1407
1408
1409
1410 /*************************************************
1411 * Scan compiled regex for recursion reference *
1412 *************************************************/
1413
1414 /* This little function scans through a compiled pattern until it finds an
1415 instance of OP_RECURSE.
1416
1417 Arguments:
1418 code points to start of expression
1419 utf8 TRUE in UTF-8 mode
1420
1421 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1422 */
1423
1424 static const uschar *
1425 find_recurse(const uschar *code, BOOL utf8)
1426 {
1427 for (;;)
1428 {
1429 register int c = *code;
1430 if (c == OP_END) return NULL;
1431 if (c == OP_RECURSE) return code;
1432
1433 /* XCLASS is used for classes that cannot be represented just by a bit
1434 map. This includes negated single high-valued characters. The length in
1435 the table is zero; the actual length is stored in the compiled code. */
1436
1437 if (c == OP_XCLASS) code += GET(code, 1);
1438
1439 /* Otherwise, we can get the item's length from the table, except that for
1440 repeated character types, we have to test for \p and \P, which have an extra
1441 two bytes of parameters. */
1442
1443 else
1444 {
1445 switch(c)
1446 {
1447 case OP_TYPESTAR:
1448 case OP_TYPEMINSTAR:
1449 case OP_TYPEPLUS:
1450 case OP_TYPEMINPLUS:
1451 case OP_TYPEQUERY:
1452 case OP_TYPEMINQUERY:
1453 case OP_TYPEPOSSTAR:
1454 case OP_TYPEPOSPLUS:
1455 case OP_TYPEPOSQUERY:
1456 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1457 break;
1458
1459 case OP_TYPEPOSUPTO:
1460 case OP_TYPEUPTO:
1461 case OP_TYPEMINUPTO:
1462 case OP_TYPEEXACT:
1463 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1464 break;
1465 }
1466
1467 /* Add in the fixed length from the table */
1468
1469 code += _pcre_OP_lengths[c];
1470
1471 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472 by a multi-byte character. The length in the table is a minimum, so we have
1473 to arrange to skip the extra bytes. */
1474
1475 #ifdef SUPPORT_UTF8
1476 if (utf8) switch(c)
1477 {
1478 case OP_CHAR:
1479 case OP_CHARNC:
1480 case OP_EXACT:
1481 case OP_UPTO:
1482 case OP_MINUPTO:
1483 case OP_POSUPTO:
1484 case OP_STAR:
1485 case OP_MINSTAR:
1486 case OP_POSSTAR:
1487 case OP_PLUS:
1488 case OP_MINPLUS:
1489 case OP_POSPLUS:
1490 case OP_QUERY:
1491 case OP_MINQUERY:
1492 case OP_POSQUERY:
1493 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1494 break;
1495 }
1496 #endif
1497 }
1498 }
1499 }
1500
1501
1502
1503 /*************************************************
1504 * Scan compiled branch for non-emptiness *
1505 *************************************************/
1506
1507 /* This function scans through a branch of a compiled pattern to see whether it
1508 can match the empty string or not. It is called from could_be_empty()
1509 below and from compile_branch() when checking for an unlimited repeat of a
1510 group that can match nothing. Note that first_significant_code() skips over
1511 backward and negative forward assertions when its final argument is TRUE. If we
1512 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1513 bracket whose current branch will already have been scanned.
1514
1515 Arguments:
1516 code points to start of search
1517 endcode points to where to stop
1518 utf8 TRUE if in UTF8 mode
1519
1520 Returns: TRUE if what is matched could be empty
1521 */
1522
1523 static BOOL
1524 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1525 {
1526 register int c;
1527 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1528 code < endcode;
1529 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1530 {
1531 const uschar *ccode;
1532
1533 c = *code;
1534
1535 /* Skip over forward assertions; the other assertions are skipped by
1536 first_significant_code() with a TRUE final argument. */
1537
1538 if (c == OP_ASSERT)
1539 {
1540 do code += GET(code, 1); while (*code == OP_ALT);
1541 c = *code;
1542 continue;
1543 }
1544
1545 /* Groups with zero repeats can of course be empty; skip them. */
1546
1547 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1548 {
1549 code += _pcre_OP_lengths[c];
1550 do code += GET(code, 1); while (*code == OP_ALT);
1551 c = *code;
1552 continue;
1553 }
1554
1555 /* For other groups, scan the branches. */
1556
1557 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1558 {
1559 BOOL empty_branch;
1560 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1561
1562 /* Scan a closed bracket */
1563
1564 empty_branch = FALSE;
1565 do
1566 {
1567 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1568 empty_branch = TRUE;
1569 code += GET(code, 1);
1570 }
1571 while (*code == OP_ALT);
1572 if (!empty_branch) return FALSE; /* All branches are non-empty */
1573 c = *code;
1574 continue;
1575 }
1576
1577 /* Handle the other opcodes */
1578
1579 switch (c)
1580 {
1581 /* Check for quantifiers after a class. XCLASS is used for classes that
1582 cannot be represented just by a bit map. This includes negated single
1583 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1584 actual length is stored in the compiled code, so we must update "code"
1585 here. */
1586
1587 #ifdef SUPPORT_UTF8
1588 case OP_XCLASS:
1589 ccode = code += GET(code, 1);
1590 goto CHECK_CLASS_REPEAT;
1591 #endif
1592
1593 case OP_CLASS:
1594 case OP_NCLASS:
1595 ccode = code + 33;
1596
1597 #ifdef SUPPORT_UTF8
1598 CHECK_CLASS_REPEAT:
1599 #endif
1600
1601 switch (*ccode)
1602 {
1603 case OP_CRSTAR: /* These could be empty; continue */
1604 case OP_CRMINSTAR:
1605 case OP_CRQUERY:
1606 case OP_CRMINQUERY:
1607 break;
1608
1609 default: /* Non-repeat => class must match */
1610 case OP_CRPLUS: /* These repeats aren't empty */
1611 case OP_CRMINPLUS:
1612 return FALSE;
1613
1614 case OP_CRRANGE:
1615 case OP_CRMINRANGE:
1616 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1617 break;
1618 }
1619 break;
1620
1621 /* Opcodes that must match a character */
1622
1623 case OP_PROP:
1624 case OP_NOTPROP:
1625 case OP_EXTUNI:
1626 case OP_NOT_DIGIT:
1627 case OP_DIGIT:
1628 case OP_NOT_WHITESPACE:
1629 case OP_WHITESPACE:
1630 case OP_NOT_WORDCHAR:
1631 case OP_WORDCHAR:
1632 case OP_ANY:
1633 case OP_ANYBYTE:
1634 case OP_CHAR:
1635 case OP_CHARNC:
1636 case OP_NOT:
1637 case OP_PLUS:
1638 case OP_MINPLUS:
1639 case OP_POSPLUS:
1640 case OP_EXACT:
1641 case OP_NOTPLUS:
1642 case OP_NOTMINPLUS:
1643 case OP_NOTPOSPLUS:
1644 case OP_NOTEXACT:
1645 case OP_TYPEPLUS:
1646 case OP_TYPEMINPLUS:
1647 case OP_TYPEPOSPLUS:
1648 case OP_TYPEEXACT:
1649 return FALSE;
1650
1651 /* These are going to continue, as they may be empty, but we have to
1652 fudge the length for the \p and \P cases. */
1653
1654 case OP_TYPESTAR:
1655 case OP_TYPEMINSTAR:
1656 case OP_TYPEPOSSTAR:
1657 case OP_TYPEQUERY:
1658 case OP_TYPEMINQUERY:
1659 case OP_TYPEPOSQUERY:
1660 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1661 break;
1662
1663 /* Same for these */
1664
1665 case OP_TYPEUPTO:
1666 case OP_TYPEMINUPTO:
1667 case OP_TYPEPOSUPTO:
1668 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1669 break;
1670
1671 /* End of branch */
1672
1673 case OP_KET:
1674 case OP_KETRMAX:
1675 case OP_KETRMIN:
1676 case OP_ALT:
1677 return TRUE;
1678
1679 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1680 MINUPTO, and POSUPTO may be followed by a multibyte character */
1681
1682 #ifdef SUPPORT_UTF8
1683 case OP_STAR:
1684 case OP_MINSTAR:
1685 case OP_POSSTAR:
1686 case OP_QUERY:
1687 case OP_MINQUERY:
1688 case OP_POSQUERY:
1689 case OP_UPTO:
1690 case OP_MINUPTO:
1691 case OP_POSUPTO:
1692 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1693 break;
1694 #endif
1695 }
1696 }
1697
1698 return TRUE;
1699 }
1700
1701
1702
1703 /*************************************************
1704 * Scan compiled regex for non-emptiness *
1705 *************************************************/
1706
1707 /* This function is called to check for left recursive calls. We want to check
1708 the current branch of the current pattern to see if it could match the empty
1709 string. If it could, we must look outwards for branches at other levels,
1710 stopping when we pass beyond the bracket which is the subject of the recursion.
1711
1712 Arguments:
1713 code points to start of the recursion
1714 endcode points to where to stop (current RECURSE item)
1715 bcptr points to the chain of current (unclosed) branch starts
1716 utf8 TRUE if in UTF-8 mode
1717
1718 Returns: TRUE if what is matched could be empty
1719 */
1720
1721 static BOOL
1722 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1723 BOOL utf8)
1724 {
1725 while (bcptr != NULL && bcptr->current >= code)
1726 {
1727 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1728 bcptr = bcptr->outer;
1729 }
1730 return TRUE;
1731 }
1732
1733
1734
1735 /*************************************************
1736 * Check for POSIX class syntax *
1737 *************************************************/
1738
1739 /* This function is called when the sequence "[:" or "[." or "[=" is
1740 encountered in a character class. It checks whether this is followed by an
1741 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1742 ".]" or "=]".
1743
1744 Argument:
1745 ptr pointer to the initial [
1746 endptr where to return the end pointer
1747 cd pointer to compile data
1748
1749 Returns: TRUE or FALSE
1750 */
1751
1752 static BOOL
1753 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1754 {
1755 int terminator; /* Don't combine these lines; the Solaris cc */
1756 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1757 if (*(++ptr) == '^') ptr++;
1758 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1759 if (*ptr == terminator && ptr[1] == ']')
1760 {
1761 *endptr = ptr;
1762 return TRUE;
1763 }
1764 return FALSE;
1765 }
1766
1767
1768
1769
1770 /*************************************************
1771 * Check POSIX class name *
1772 *************************************************/
1773
1774 /* This function is called to check the name given in a POSIX-style class entry
1775 such as [:alnum:].
1776
1777 Arguments:
1778 ptr points to the first letter
1779 len the length of the name
1780
1781 Returns: a value representing the name, or -1 if unknown
1782 */
1783
1784 static int
1785 check_posix_name(const uschar *ptr, int len)
1786 {
1787 const char *pn = posix_names;
1788 register int yield = 0;
1789 while (posix_name_lengths[yield] != 0)
1790 {
1791 if (len == posix_name_lengths[yield] &&
1792 strncmp((const char *)ptr, pn, len) == 0) return yield;
1793 pn += posix_name_lengths[yield] + 1;
1794 yield++;
1795 }
1796 return -1;
1797 }
1798
1799
1800 /*************************************************
1801 * Adjust OP_RECURSE items in repeated group *
1802 *************************************************/
1803
1804 /* OP_RECURSE items contain an offset from the start of the regex to the group
1805 that is referenced. This means that groups can be replicated for fixed
1806 repetition simply by copying (because the recursion is allowed to refer to
1807 earlier groups that are outside the current group). However, when a group is
1808 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1809 it, after it has been compiled. This means that any OP_RECURSE items within it
1810 that refer to the group itself or any contained groups have to have their
1811 offsets adjusted. That is one of the jobs of this function. Before it is called,
1812 the partially compiled regex must be temporarily terminated with OP_END.
1813
1814 This function has been extended with the possibility of forward references for
1815 recursions and subroutine calls. It must also check the list of such references
1816 for the group we are dealing with. If it finds that one of the recursions in
1817 the current group is on this list, it adjusts the offset in the list, not the
1818 value in the reference (which is a group number).
1819
1820 Arguments:
1821 group points to the start of the group
1822 adjust the amount by which the group is to be moved
1823 utf8 TRUE in UTF-8 mode
1824 cd contains pointers to tables etc.
1825 save_hwm the hwm forward reference pointer at the start of the group
1826
1827 Returns: nothing
1828 */
1829
1830 static void
1831 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1832 uschar *save_hwm)
1833 {
1834 uschar *ptr = group;
1835
1836 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1837 {
1838 int offset;
1839 uschar *hc;
1840
1841 /* See if this recursion is on the forward reference list. If so, adjust the
1842 reference. */
1843
1844 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1845 {
1846 offset = GET(hc, 0);
1847 if (cd->start_code + offset == ptr + 1)
1848 {
1849 PUT(hc, 0, offset + adjust);
1850 break;
1851 }
1852 }
1853
1854 /* Otherwise, adjust the recursion offset if it's after the start of this
1855 group. */
1856
1857 if (hc >= cd->hwm)
1858 {
1859 offset = GET(ptr, 1);
1860 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1861 }
1862
1863 ptr += 1 + LINK_SIZE;
1864 }
1865 }
1866
1867
1868
1869 /*************************************************
1870 * Insert an automatic callout point *
1871 *************************************************/
1872
1873 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1874 callout points before each pattern item.
1875
1876 Arguments:
1877 code current code pointer
1878 ptr current pattern pointer
1879 cd pointers to tables etc
1880
1881 Returns: new code pointer
1882 */
1883
1884 static uschar *
1885 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1886 {
1887 *code++ = OP_CALLOUT;
1888 *code++ = 255;
1889 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1890 PUT(code, LINK_SIZE, 0); /* Default length */
1891 return code + 2*LINK_SIZE;
1892 }
1893
1894
1895
1896 /*************************************************
1897 * Complete a callout item *
1898 *************************************************/
1899
1900 /* A callout item contains the length of the next item in the pattern, which
1901 we can't fill in till after we have reached the relevant point. This is used
1902 for both automatic and manual callouts.
1903
1904 Arguments:
1905 previous_callout points to previous callout item
1906 ptr current pattern pointer
1907 cd pointers to tables etc
1908
1909 Returns: nothing
1910 */
1911
1912 static void
1913 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1914 {
1915 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1916 PUT(previous_callout, 2 + LINK_SIZE, length);
1917 }
1918
1919
1920
1921 #ifdef SUPPORT_UCP
1922 /*************************************************
1923 * Get othercase range *
1924 *************************************************/
1925
1926 /* This function is passed the start and end of a class range, in UTF-8 mode
1927 with UCP support. It searches up the characters, looking for internal ranges of
1928 characters in the "other" case. Each call returns the next one, updating the
1929 start address.
1930
1931 Arguments:
1932 cptr points to starting character value; updated
1933 d end value
1934 ocptr where to put start of othercase range
1935 odptr where to put end of othercase range
1936
1937 Yield: TRUE when range returned; FALSE when no more
1938 */
1939
1940 static BOOL
1941 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1942 unsigned int *odptr)
1943 {
1944 unsigned int c, othercase, next;
1945
1946 for (c = *cptr; c <= d; c++)
1947 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1948
1949 if (c > d) return FALSE;
1950
1951 *ocptr = othercase;
1952 next = othercase + 1;
1953
1954 for (++c; c <= d; c++)
1955 {
1956 if (_pcre_ucp_othercase(c) != next) break;
1957 next++;
1958 }
1959
1960 *odptr = next - 1;
1961 *cptr = c;
1962
1963 return TRUE;
1964 }
1965 #endif /* SUPPORT_UCP */
1966
1967
1968
1969 /*************************************************
1970 * Check if auto-possessifying is possible *
1971 *************************************************/
1972
1973 /* This function is called for unlimited repeats of certain items, to see
1974 whether the next thing could possibly match the repeated item. If not, it makes
1975 sense to automatically possessify the repeated item.
1976
1977 Arguments:
1978 op_code the repeated op code
1979 item data for this item, depends on the opcode
1980 utf8 TRUE in UTF-8 mode
1981 utf8_char used for utf8 character bytes, NULL if not relevant
1982 ptr next character in pattern
1983 options options bits
1984 cd contains pointers to tables etc.
1985
1986 Returns: TRUE if possessifying is wanted
1987 */
1988
1989 static BOOL
1990 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1991 const uschar *ptr, int options, compile_data *cd)
1992 {
1993 int next;
1994
1995 /* Skip whitespace and comments in extended mode */
1996
1997 if ((options & PCRE_EXTENDED) != 0)
1998 {
1999 for (;;)
2000 {
2001 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2002 if (*ptr == '#')
2003 {
2004 while (*(++ptr) != 0)
2005 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2006 }
2007 else break;
2008 }
2009 }
2010
2011 /* If the next item is one that we can handle, get its value. A non-negative
2012 value is a character, a negative value is an escape value. */
2013
2014 if (*ptr == '\\')
2015 {
2016 int temperrorcode = 0;
2017 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2018 if (temperrorcode != 0) return FALSE;
2019 ptr++; /* Point after the escape sequence */
2020 }
2021
2022 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2023 {
2024 #ifdef SUPPORT_UTF8
2025 if (utf8) { GETCHARINC(next, ptr); } else
2026 #endif
2027 next = *ptr++;
2028 }
2029
2030 else return FALSE;
2031
2032 /* Skip whitespace and comments in extended mode */
2033
2034 if ((options & PCRE_EXTENDED) != 0)
2035 {
2036 for (;;)
2037 {
2038 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2039 if (*ptr == '#')
2040 {
2041 while (*(++ptr) != 0)
2042 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2043 }
2044 else break;
2045 }
2046 }
2047
2048 /* If the next thing is itself optional, we have to give up. */
2049
2050 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2051 return FALSE;
2052
2053 /* Now compare the next item with the previous opcode. If the previous is a
2054 positive single character match, "item" either contains the character or, if
2055 "item" is greater than 127 in utf8 mode, the character's bytes are in
2056 utf8_char. */
2057
2058
2059 /* Handle cases when the next item is a character. */
2060
2061 if (next >= 0) switch(op_code)
2062 {
2063 case OP_CHAR:
2064 #ifdef SUPPORT_UTF8
2065 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2066 #endif
2067 return item != next;
2068
2069 /* For CHARNC (caseless character) we must check the other case. If we have
2070 Unicode property support, we can use it to test the other case of
2071 high-valued characters. */
2072
2073 case OP_CHARNC:
2074 #ifdef SUPPORT_UTF8
2075 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2076 #endif
2077 if (item == next) return FALSE;
2078 #ifdef SUPPORT_UTF8
2079 if (utf8)
2080 {
2081 unsigned int othercase;
2082 if (next < 128) othercase = cd->fcc[next]; else
2083 #ifdef SUPPORT_UCP
2084 othercase = _pcre_ucp_othercase((unsigned int)next);
2085 #else
2086 othercase = NOTACHAR;
2087 #endif
2088 return (unsigned int)item != othercase;
2089 }
2090 else
2091 #endif /* SUPPORT_UTF8 */
2092 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2093
2094 /* For OP_NOT, "item" must be a single-byte character. */
2095
2096 case OP_NOT:
2097 if (next < 0) return FALSE; /* Not a character */
2098 if (item == next) return TRUE;
2099 if ((options & PCRE_CASELESS) == 0) return FALSE;
2100 #ifdef SUPPORT_UTF8
2101 if (utf8)
2102 {
2103 unsigned int othercase;
2104 if (next < 128) othercase = cd->fcc[next]; else
2105 #ifdef SUPPORT_UCP
2106 othercase = _pcre_ucp_othercase(next);
2107 #else
2108 othercase = NOTACHAR;
2109 #endif
2110 return (unsigned int)item == othercase;
2111 }
2112 else
2113 #endif /* SUPPORT_UTF8 */
2114 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2115
2116 case OP_DIGIT:
2117 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2118
2119 case OP_NOT_DIGIT:
2120 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2121
2122 case OP_WHITESPACE:
2123 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2124
2125 case OP_NOT_WHITESPACE:
2126 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2127
2128 case OP_WORDCHAR:
2129 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2130
2131 case OP_NOT_WORDCHAR:
2132 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2133
2134 case OP_HSPACE:
2135 case OP_NOT_HSPACE:
2136 switch(next)
2137 {
2138 case 0x09:
2139 case 0x20:
2140 case 0xa0:
2141 case 0x1680:
2142 case 0x180e:
2143 case 0x2000:
2144 case 0x2001:
2145 case 0x2002:
2146 case 0x2003:
2147 case 0x2004:
2148 case 0x2005:
2149 case 0x2006:
2150 case 0x2007:
2151 case 0x2008:
2152 case 0x2009:
2153 case 0x200A:
2154 case 0x202f:
2155 case 0x205f:
2156 case 0x3000:
2157 return op_code != OP_HSPACE;
2158 default:
2159 return op_code == OP_HSPACE;
2160 }
2161
2162 case OP_VSPACE:
2163 case OP_NOT_VSPACE:
2164 switch(next)
2165 {
2166 case 0x0a:
2167 case 0x0b:
2168 case 0x0c:
2169 case 0x0d:
2170 case 0x85:
2171 case 0x2028:
2172 case 0x2029:
2173 return op_code != OP_VSPACE;
2174 default:
2175 return op_code == OP_VSPACE;
2176 }
2177
2178 default:
2179 return FALSE;
2180 }
2181
2182
2183 /* Handle the case when the next item is \d, \s, etc. */
2184
2185 switch(op_code)
2186 {
2187 case OP_CHAR:
2188 case OP_CHARNC:
2189 #ifdef SUPPORT_UTF8
2190 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2191 #endif
2192 switch(-next)
2193 {
2194 case ESC_d:
2195 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2196
2197 case ESC_D:
2198 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2199
2200 case ESC_s:
2201 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2202
2203 case ESC_S:
2204 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2205
2206 case ESC_w:
2207 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2208
2209 case ESC_W:
2210 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2211
2212 case ESC_h:
2213 case ESC_H:
2214 switch(item)
2215 {
2216 case 0x09:
2217 case 0x20:
2218 case 0xa0:
2219 case 0x1680:
2220 case 0x180e:
2221 case 0x2000:
2222 case 0x2001:
2223 case 0x2002:
2224 case 0x2003:
2225 case 0x2004:
2226 case 0x2005:
2227 case 0x2006:
2228 case 0x2007:
2229 case 0x2008:
2230 case 0x2009:
2231 case 0x200A:
2232 case 0x202f:
2233 case 0x205f:
2234 case 0x3000:
2235 return -next != ESC_h;
2236 default:
2237 return -next == ESC_h;
2238 }
2239
2240 case ESC_v:
2241 case ESC_V:
2242 switch(item)
2243 {
2244 case 0x0a:
2245 case 0x0b:
2246 case 0x0c:
2247 case 0x0d:
2248 case 0x85:
2249 case 0x2028:
2250 case 0x2029:
2251 return -next != ESC_v;
2252 default:
2253 return -next == ESC_v;
2254 }
2255
2256 default:
2257 return FALSE;
2258 }
2259
2260 case OP_DIGIT:
2261 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2262 next == -ESC_h || next == -ESC_v;
2263
2264 case OP_NOT_DIGIT:
2265 return next == -ESC_d;
2266
2267 case OP_WHITESPACE:
2268 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2269
2270 case OP_NOT_WHITESPACE:
2271 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2272
2273 case OP_HSPACE:
2274 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2275
2276 case OP_NOT_HSPACE:
2277 return next == -ESC_h;
2278
2279 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2280 case OP_VSPACE:
2281 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2282
2283 case OP_NOT_VSPACE:
2284 return next == -ESC_v;
2285
2286 case OP_WORDCHAR:
2287 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2288
2289 case OP_NOT_WORDCHAR:
2290 return next == -ESC_w || next == -ESC_d;
2291
2292 default:
2293 return FALSE;
2294 }
2295
2296 /* Control does not reach here */
2297 }
2298
2299
2300
2301 /*************************************************
2302 * Compile one branch *
2303 *************************************************/
2304
1305 /* Scan the pattern, compiling it into a vector. If the options are
2306 changed during the branch, the pointer is used to change the external options
2307 bits. This function is used during the pre-compile phase when we are trying
2308 to find out the amount of memory needed, as well as during the real compile
2309 phase. The value of lengthptr distinguishes the two phases.
2310
2311 Arguments:
2312 optionsptr pointer to the option bits
2313 codeptr points to the pointer to the current code point
2314 ptrptr points to the current pattern pointer
2315 errorcodeptr points to error code variable
2316 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2317 reqbyteptr set to the last literal character required, else < 0
2318 bcptr points to current branch chain
2319 cd contains pointers to tables etc.
2320 lengthptr NULL during the real compile phase
2321 points to length accumulator during pre-compile phase
2322
2323 Returns: TRUE on success
2324 FALSE, with *errorcodeptr set non-zero on error
2325 */
2326
2327 static BOOL
2328 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2329 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2330 compile_data *cd, int *lengthptr)
2331 {
2332 int repeat_type, op_type;
2333 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2334 int bravalue = 0;
2335 int greedy_default, greedy_non_default;
2336 int firstbyte, reqbyte;
2337 int zeroreqbyte, zerofirstbyte;
2338 int req_caseopt, reqvary, tempreqvary;
2339 int options = *optionsptr;
2340 int after_manual_callout = 0;
2341 int length_prevgroup = 0;
2342 register int c;
2343 register uschar *code = *codeptr;
2344 uschar *last_code = code;
2345 uschar *orig_code = code;
2346 uschar *tempcode;
2347 BOOL inescq = FALSE;
2348 BOOL groupsetfirstbyte = FALSE;
2349 const uschar *ptr = *ptrptr;
2350 const uschar *tempptr;
2351 uschar *previous = NULL;
2352 uschar *previous_callout = NULL;
2353 uschar *save_hwm = NULL;
2354 uschar classbits[32];
2355
2356 #ifdef SUPPORT_UTF8
2357 BOOL class_utf8;
2358 BOOL utf8 = (options & PCRE_UTF8) != 0;
2359 uschar *class_utf8data;
2360 uschar utf8_char[6];
2361 #else
2362 BOOL utf8 = FALSE;
2363 uschar *utf8_char = NULL;
2364 #endif
2365
2366 #ifdef DEBUG
2367 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2368 #endif
2369
2370 /* Set up the default and non-default settings for greediness */
2371
2372 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2373 greedy_non_default = greedy_default ^ 1;
2374
2375 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2376 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2377 matches a non-fixed char first char; reqbyte just remains unset if we never
2378 find one.
2379
2380 When we hit a repeat whose minimum is zero, we may have to adjust these values
2381 to take the zero repeat into account. This is implemented by setting them to
2382 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2383 item types that can be repeated set these backoff variables appropriately. */
2384
2385 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2386
2387 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2388 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2389 value > 255. It is added into the firstbyte or reqbyte variables to record the
2390 case status of the value. This is used only for ASCII characters. */
2391
2392 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2393
2394 /* Switch on next character until the end of the branch */
2395
2396 for (;; ptr++)
2397 {
2398 BOOL negate_class;
2399 BOOL should_flip_negation;
2400 BOOL possessive_quantifier;
2401 BOOL is_quantifier;
2402 BOOL is_recurse;
2403 BOOL reset_bracount;
2404 int class_charcount;
2405 int class_lastchar;
2406 int newoptions;
2407 int recno;
2408 int refsign;
2409 int skipbytes;
2410 int subreqbyte;
2411 int subfirstbyte;
2412 int terminator;
2413 int mclength;
2414 uschar mcbuffer[8];
2415
2416 /* Get next byte in the pattern */
2417
2418 c = *ptr;
2419
2420 /* If we are in the pre-compile phase, accumulate the length used for the
2421 previous cycle of this loop. */
2422
2423 if (lengthptr != NULL)
2424 {
2425 #ifdef DEBUG
2426 if (code > cd->hwm) cd->hwm = code; /* High water info */
2427 #endif
2428 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2429 {
2430 *errorcodeptr = ERR52;
2431 goto FAILED;
2432 }
2433
2434 /* There is at least one situation where code goes backwards: this is the
2435 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2436 the class is simply eliminated. However, it is created first, so we have to
2437 allow memory for it. Therefore, don't ever reduce the length at this point.
2438 */
2439
2440 if (code < last_code) code = last_code;
2441
2442 /* Paranoid check for integer overflow */
2443
2444 if (OFLOW_MAX - *lengthptr < code - last_code)
2445 {
2446 *errorcodeptr = ERR20;
2447 goto FAILED;
2448 }
2449
2450 *lengthptr += code - last_code;
2451 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2452
2453 /* If "previous" is set and it is not at the start of the work space, move
2454 it back to there, in order to avoid filling up the work space. Otherwise,
2455 if "previous" is NULL, reset the current code pointer to the start. */
2456
2457 if (previous != NULL)
2458 {
2459 if (previous > orig_code)
2460 {
2461 memmove(orig_code, previous, code - previous);
2462 code -= previous - orig_code;
2463 previous = orig_code;
2464 }
2465 }
2466 else code = orig_code;
2467
2468 /* Remember where this code item starts so we can pick up the length
2469 next time round. */
2470
2471 last_code = code;
2472 }
2473
2474 /* In the real compile phase, just check the workspace used by the forward
2475 reference list. */
2476
2477 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2478 {
2479 *errorcodeptr = ERR52;
2480 goto FAILED;
2481 }
2482
2483 /* If in \Q...\E, check for the end; if not, we have a literal */
2484
2485 if (inescq && c != 0)
2486 {
2487 if (c == '\\' && ptr[1] == 'E')
2488 {
2489 inescq = FALSE;
2490 ptr++;
2491 continue;
2492 }
2493 else
2494 {
2495 if (previous_callout != NULL)
2496 {
2497 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2498 complete_callout(previous_callout, ptr, cd);
2499 previous_callout = NULL;
2500 }
2501 if ((options & PCRE_AUTO_CALLOUT) != 0)
2502 {
2503 previous_callout = code;
2504 code = auto_callout(code, ptr, cd);
2505 }
2506 goto NORMAL_CHAR;
2507 }
2508 }
2509
2510 /* Fill in length of a previous callout, except when the next thing is
2511 a quantifier. */
2512
2513 is_quantifier = c == '*' || c == '+' || c == '?' ||
2514 (c == '{' && is_counted_repeat(ptr+1));
2515
2516 if (!is_quantifier && previous_callout != NULL &&
2517 after_manual_callout-- <= 0)
2518 {
2519 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2520 complete_callout(previous_callout, ptr, cd);
2521 previous_callout = NULL;
2522 }
2523
2524 /* In extended mode, skip white space and comments */
2525
2526 if ((options & PCRE_EXTENDED) != 0)
2527 {
2528 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2529 if (c == '#')
2530 {
2531 while (*(++ptr) != 0)
2532 {
2533 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2534 }
2535 if (*ptr != 0) continue;
2536
2537 /* Else fall through to handle end of string */
2538 c = 0;
2539 }
2540 }
2541
2542 /* No auto callout for quantifiers. */
2543
2544 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2545 {
2546 previous_callout = code;
2547 code = auto_callout(code, ptr, cd);
2548 }
2549
2550 switch(c)
2551 {
2552 /* ===================================================================*/
2553 case 0: /* The branch terminates at string end */
2554 case '|': /* or | or ) */
2555 case ')':
2556 *firstbyteptr = firstbyte;
2557 *reqbyteptr = reqbyte;
2558 *codeptr = code;
2559 *ptrptr = ptr;
2560 if (lengthptr != NULL)
2561 {
2562 if (OFLOW_MAX - *lengthptr < code - last_code)
2563 {
2564 *errorcodeptr = ERR20;
2565 goto FAILED;
2566 }
2567 *lengthptr += code - last_code; /* To include callout length */
2568 DPRINTF((">> end branch\n"));
2569 }
2570 return TRUE;
2571
2572
2573 /* ===================================================================*/
2574 /* Handle single-character metacharacters. In multiline mode, ^ disables
2575 the setting of any following char as a first character. */
2576
2577 case '^':
2578 if ((options & PCRE_MULTILINE) != 0)
2579 {
2580 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2581 }
2582 previous = NULL;
2583 *code++ = OP_CIRC;
2584 break;
2585
2586 case '$':
2587 previous = NULL;
2588 *code++ = OP_DOLL;
2589 break;
2590
2591 /* There can never be a first char if '.' is first, whatever happens about
2592 repeats. The value of reqbyte doesn't change either. */
2593
2594 case '.':
2595 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2596 zerofirstbyte = firstbyte;
2597 zeroreqbyte = reqbyte;
2598 previous = code;
2599 *code++ = OP_ANY;
2600 break;
2601
2602
2603 /* ===================================================================*/
2604 /* Character classes. If the included characters are all < 256, we build a
2605 32-byte bitmap of the permitted characters, except in the special case
2606 where there is only one such character. For negated classes, we build the
2607 map as usual, then invert it at the end. However, we use a different opcode
2608 so that data characters > 255 can be handled correctly.
2609
2610 If the class contains characters outside the 0-255 range, a different
2611 opcode is compiled. It may optionally have a bit map for characters < 256,
2612 but those above are are explicitly listed afterwards. A flag byte tells
2613 whether the bitmap is present, and whether this is a negated class or not.
2614 */
2615
2616 case '[':
2617 previous = code;
2618
2619 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2620 they are encountered at the top level, so we'll do that too. */
2621
2622 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2623 check_posix_syntax(ptr, &tempptr, cd))
2624 {
2625 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2626 goto FAILED;
2627 }
2628
2629 /* If the first character is '^', set the negation flag and skip it. Also,
2630 if the first few characters (either before or after ^) are \Q\E or \E we
2631 skip them too. This makes for compatibility with Perl. */
2632
2633 negate_class = FALSE;
2634 for (;;)
2635 {
2636 c = *(++ptr);
2637 if (c == '\\')
2638 {
2639 if (ptr[1] == 'E') ptr++;
2640 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2641 else break;
2642 }
2643 else if (!negate_class && c == '^')
2644 negate_class = TRUE;
2645 else break;
2646 }
2647
2648 /* If a class contains a negative special such as \S, we need to flip the
2649 negation flag at the end, so that support for characters > 255 works
2650 correctly (they are all included in the class). */
2651
2652 should_flip_negation = FALSE;
2653
2654 /* Keep a count of chars with values < 256 so that we can optimize the case
2655 of just a single character (as long as it's < 256). However, For higher
2656 valued UTF-8 characters, we don't yet do any optimization. */
2657
2658 class_charcount = 0;
2659 class_lastchar = -1;
2660
2661 /* Initialize the 32-char bit map to all zeros. We build the map in a
2662 temporary bit of memory, in case the class contains only 1 character (less
2663 than 256), because in that case the compiled code doesn't use the bit map.
2664 */
2665
2666 memset(classbits, 0, 32 * sizeof(uschar));
2667
2668 #ifdef SUPPORT_UTF8
2669 class_utf8 = FALSE; /* No chars >= 256 */
2670 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2671 #endif
2672
2673 /* Process characters until ] is reached. By writing this as a "do" it
2674 means that an initial ] is taken as a data character. At the start of the
2675 loop, c contains the first byte of the character. */
2676
2677 if (c != 0) do
2678 {
2679 const uschar *oldptr;
2680
2681 #ifdef SUPPORT_UTF8
2682 if (utf8 && c > 127)
2683 { /* Braces are required because the */
2684 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2685 }
2686 #endif
2687
2688 /* Inside \Q...\E everything is literal except \E */
2689
2690 if (inescq)
2691 {
2692 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2693 {
2694 inescq = FALSE; /* Reset literal state */
2695 ptr++; /* Skip the 'E' */
2696 continue; /* Carry on with next */
2697 }
2698 goto CHECK_RANGE; /* Could be range if \E follows */
2699 }
2700
2701 /* Handle POSIX class names. Perl allows a negation extension of the
2702 form [:^name:]. A square bracket that doesn't match the syntax is
2703 treated as a literal. We also recognize the POSIX constructions
2704 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2705 5.6 and 5.8 do. */
2706
2707 if (c == '[' &&
2708 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2709 check_posix_syntax(ptr, &tempptr, cd))
2710 {
2711 BOOL local_negate = FALSE;
2712 int posix_class, taboffset, tabopt;
2713 register const uschar *cbits = cd->cbits;
2714 uschar pbits[32];
2715
2716 if (ptr[1] != ':')
2717 {
2718 *errorcodeptr = ERR31;
2719 goto FAILED;
2720 }
2721
2722 ptr += 2;
2723 if (*ptr == '^')
2724 {
2725 local_negate = TRUE;
2726 should_flip_negation = TRUE; /* Note negative special */
2727 ptr++;
2728 }
2729
2730 posix_class = check_posix_name(ptr, tempptr - ptr);
2731 if (posix_class < 0)
2732 {
2733 *errorcodeptr = ERR30;
2734 goto FAILED;
2735 }
2736
2737 /* If matching is caseless, upper and lower are converted to
2738 alpha. This relies on the fact that the class table starts with
2739 alpha, lower, upper as the first 3 entries. */
2740
2741 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2742 posix_class = 0;
2743
2744 /* We build the bit map for the POSIX class in a chunk of local store
2745 because we may be adding and subtracting from it, and we don't want to
2746 subtract bits that may be in the main map already. At the end we or the
2747 result into the bit map that is being built. */
2748
2749 posix_class *= 3;
2750
2751 /* Copy in the first table (always present) */
2752
2753 memcpy(pbits, cbits + posix_class_maps[posix_class],
2754 32 * sizeof(uschar));
2755
2756 /* If there is a second table, add or remove it as required. */
2757
2758 taboffset = posix_class_maps[posix_class + 1];
2759 tabopt = posix_class_maps[posix_class + 2];
2760
2761 if (taboffset >= 0)
2762 {
2763 if (tabopt >= 0)
2764 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2765 else
2766 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2767 }
2768
2769 /* Not see if we need to remove any special characters. An option
2770 value of 1 removes vertical space and 2 removes underscore. */
2771
2772 if (tabopt < 0) tabopt = -tabopt;
2773 if (tabopt == 1) pbits[1] &= ~0x3c;
2774 else if (tabopt == 2) pbits[11] &= 0x7f;
2775
2776 /* Add the POSIX table or its complement into the main table that is
2777 being built and we are done. */
2778
2779 if (local_negate)
2780 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2781 else
2782 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2783
2784 ptr = tempptr + 1;
2785 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2786 continue; /* End of POSIX syntax handling */
2787 }
2788
2789 /* Backslash may introduce a single character, or it may introduce one
2790 of the specials, which just set a flag. The sequence \b is a special
2791 case. Inside a class (and only there) it is treated as backspace.
2792 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2793 to 'or' into the one we are building. We assume they have more than one
2794 character in them, so set class_charcount bigger than one. */
2795
2796 if (c == '\\')
2797 {
2798 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2799 if (*errorcodeptr != 0) goto FAILED;
2800
2801 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2802 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2803 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2804 else if (-c == ESC_Q) /* Handle start of quoted string */
2805 {
2806 if (ptr[1] == '\\' && ptr[2] == 'E')
2807 {
2808 ptr += 2; /* avoid empty string */
2809 }
2810 else inescq = TRUE;
2811 continue;
2812 }
2813 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2814
2815 if (c < 0)
2816 {
2817 register const uschar *cbits = cd->cbits;
2818 class_charcount += 2; /* Greater than 1 is what matters */
2819
2820 /* Save time by not doing this in the pre-compile phase. */
2821
2822 if (lengthptr == NULL) switch (-c)
2823 {
2824 case ESC_d:
2825 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2826 continue;
2827
2828 case ESC_D:
2829 should_flip_negation = TRUE;
2830 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2831 continue;
2832
2833 case ESC_w:
2834 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2835 continue;
2836
2837 case ESC_W:
2838 should_flip_negation = TRUE;
2839 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2840 continue;
2841
2842 case ESC_s:
2843 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2844 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2845 continue;
2846
2847 case ESC_S:
2848 should_flip_negation = TRUE;
2849 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2850 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2851 continue;
2852
2853 default: /* Not recognized; fall through */
2854 break; /* Need "default" setting to stop compiler warning. */
2855 }
2856
2857 /* In the pre-compile phase, just do the recognition. */
2858
2859 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2860 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2861
2862 /* We need to deal with \H, \h, \V, and \v in both phases because
2863 they use extra memory. */
2864
2865 if (-c == ESC_h)
2866 {
2867 SETBIT(classbits, 0x09); /* VT */
2868 SETBIT(classbits, 0x20); /* SPACE */
2869 SETBIT(classbits, 0xa0); /* NSBP */
2870 #ifdef SUPPORT_UTF8
2871 if (utf8)
2872 {
2873 class_utf8 = TRUE;
2874 *class_utf8data++ = XCL_SINGLE;
2875 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2876 *class_utf8data++ = XCL_SINGLE;
2877 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2878 *class_utf8data++ = XCL_RANGE;
2879 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2880 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2881 *class_utf8data++ = XCL_SINGLE;
2882 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2883 *class_utf8data++ = XCL_SINGLE;
2884 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2885 *class_utf8data++ = XCL_SINGLE;
2886 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2887 }
2888 #endif
2889 continue;
2890 }
2891
2892 if (-c == ESC_H)
2893 {
2894 for (c = 0; c < 32; c++)
2895 {
2896 int x = 0xff;
2897 switch (c)
2898 {
2899 case 0x09/8: x ^= 1 << (0x09%8); break;
2900 case 0x20/8: x ^= 1 << (0x20%8); break;
2901 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2902 default: break;
2903 }
2904 classbits[c] |= x;
2905 }
2906
2907 #ifdef SUPPORT_UTF8
2908 if (utf8)
2909 {
2910 class_utf8 = TRUE;
2911 *class_utf8data++ = XCL_RANGE;
2912 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2913 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2914 *class_utf8data++ = XCL_RANGE;
2915 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2916 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2917 *class_utf8data++ = XCL_RANGE;
2918 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2919 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2920 *class_utf8data++ = XCL_RANGE;
2921 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2922 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2923 *class_utf8data++ = XCL_RANGE;
2924 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2925 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2926 *class_utf8data++ = XCL_RANGE;
2927 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2928 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2929 *class_utf8data++ = XCL_RANGE;
2930 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2931 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2932 }
2933 #endif
2934 continue;
2935 }
2936
2937 if (-c == ESC_v)
2938 {
2939 SETBIT(classbits, 0x0a); /* LF */
2940 SETBIT(classbits, 0x0b); /* VT */
2941 SETBIT(classbits, 0x0c); /* FF */
2942 SETBIT(classbits, 0x0d); /* CR */
2943 SETBIT(classbits, 0x85); /* NEL */
2944 #ifdef SUPPORT_UTF8
2945 if (utf8)
2946 {
2947 class_utf8 = TRUE;
2948 *class_utf8data++ = XCL_RANGE;
2949 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2950 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2951 }
2952 #endif
2953 continue;
2954 }
2955
2956 if (-c == ESC_V)
2957 {
2958 for (c = 0; c < 32; c++)
2959 {
2960 int x = 0xff;
2961 switch (c)
2962 {
2963 case 0x0a/8: x ^= 1 << (0x0a%8);
2964 x ^= 1 << (0x0b%8);
2965 x ^= 1 << (0x0c%8);
2966 x ^= 1 << (0x0d%8);
2967 break;
2968 case 0x85/8: x ^= 1 << (0x85%8); break;
2969 default: break;
2970 }
2971 classbits[c] |= x;
2972 }
2973
2974 #ifdef SUPPORT_UTF8
2975 if (utf8)
2976 {
2977 class_utf8 = TRUE;
2978 *class_utf8data++ = XCL_RANGE;
2979 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2980 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2981 *class_utf8data++ = XCL_RANGE;
2982 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2983 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2984 }
2985 #endif
2986 continue;
2987 }
2988
2989 /* We need to deal with \P and \p in both phases. */
2990
2991 #ifdef SUPPORT_UCP
2992 if (-c == ESC_p || -c == ESC_P)
2993 {
2994 BOOL negated;
2995 int pdata;
2996 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2997 if (ptype < 0) goto FAILED;
2998 class_utf8 = TRUE;
2999 *class_utf8data++ = ((-c == ESC_p) != negated)?
3000 XCL_PROP : XCL_NOTPROP;
3001 *class_utf8data++ = ptype;
3002 *class_utf8data++ = pdata;
3003 class_charcount -= 2; /* Not a < 256 character */
3004 continue;
3005 }
3006 #endif
3007 /* Unrecognized escapes are faulted if PCRE is running in its
3008 strict mode. By default, for compatibility with Perl, they are
3009 treated as literals. */
3010
3011 if ((options & PCRE_EXTRA) != 0)
3012 {
3013 *errorcodeptr = ERR7;
3014 goto FAILED;
3015 }
3016
3017 class_charcount -= 2; /* Undo the default count from above */
3018 c = *ptr; /* Get the final character and fall through */
3019 }
3020
3021 /* Fall through if we have a single character (c >= 0). This may be
3022 greater than 256 in UTF-8 mode. */
3023
3024 } /* End of backslash handling */
3025
3026 /* A single character may be followed by '-' to form a range. However,
3027 Perl does not permit ']' to be the end of the range. A '-' character
3028 at the end is treated as a literal. Perl ignores orphaned \E sequences
3029 entirely. The code for handling \Q and \E is messy. */
3030
3031 CHECK_RANGE:
3032 while (ptr[1] == '\\' && ptr[2] == 'E')
3033 {
3034 inescq = FALSE;
3035 ptr += 2;
3036 }
3037
3038 oldptr = ptr;
3039
3040 /* Remember \r or \n */
3041
3042 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3043
3044 /* Check for range */
3045
3046 if (!inescq && ptr[1] == '-')
3047 {
3048 int d;
3049 ptr += 2;
3050 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3051
3052 /* If we hit \Q (not followed by \E) at this point, go into escaped
3053 mode. */
3054
3055 while (*ptr == '\\' && ptr[1] == 'Q')
3056 {
3057 ptr += 2;
3058 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3059 inescq = TRUE;
3060 break;
3061 }
3062
3063 if (*ptr == 0 || (!inescq && *ptr == ']'))
3064 {
3065 ptr = oldptr;
3066 goto LONE_SINGLE_CHARACTER;
3067 }
3068
3069 #ifdef SUPPORT_UTF8
3070 if (utf8)
3071 { /* Braces are required because the */
3072 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3073 }
3074 else
3075 #endif
3076 d = *ptr; /* Not UTF-8 mode */
3077
3078 /* The second part of a range can be a single-character escape, but
3079 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3080 in such circumstances. */
3081
3082 if (!inescq && d == '\\')
3083 {
3084 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3085 if (*errorcodeptr != 0) goto FAILED;
3086
3087 /* \b is backspace; \X is literal X; \R is literal R; any other
3088 special means the '-' was literal */
3089
3090 if (d < 0)
3091 {
3092 if (d == -ESC_b) d = '\b';
3093 else if (d == -ESC_X) d = 'X';
3094 else if (d == -ESC_R) d = 'R'; else
3095 {
3096 ptr = oldptr;
3097 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3098 }
3099 }
3100 }
3101
3102 /* Check that the two values are in the correct order. Optimize
3103 one-character ranges */
3104
3105 if (d < c)
3106 {
3107 *errorcodeptr = ERR8;
3108 goto FAILED;
3109 }
3110
3111 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3112
3113 /* Remember \r or \n */
3114
3115 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3116
3117 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3118 matching, we have to use an XCLASS with extra data items. Caseless
3119 matching for characters > 127 is available only if UCP support is
3120 available. */
3121
3122 #ifdef SUPPORT_UTF8
3123 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3124 {
3125 class_utf8 = TRUE;
3126
3127 /* With UCP support, we can find the other case equivalents of
3128 the relevant characters. There may be several ranges. Optimize how
3129 they fit with the basic range. */
3130
3131 #ifdef SUPPORT_UCP
3132 if ((options & PCRE_CASELESS) != 0)
3133 {
3134 unsigned int occ, ocd;
3135 unsigned int cc = c;
3136 unsigned int origd = d;
3137 while (get_othercase_range(&cc, origd, &occ, &ocd))
3138 {
3139 if (occ >= (unsigned int)c &&
3140 ocd <= (unsigned int)d)
3141 continue; /* Skip embedded ranges */
3142
3143 if (occ < (unsigned int)c &&
3144 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3145 { /* if there is overlap, */
3146 c = occ; /* noting that if occ < c */
3147 continue; /* we can't have ocd > d */
3148 } /* because a subrange is */
3149 if (ocd > (unsigned int)d &&
3150 occ <= (unsigned int)d + 1) /* always shorter than */
3151 { /* the basic range. */
3152 d = ocd;
3153 continue;
3154 }
3155
3156 if (occ == ocd)
3157 {
3158 *class_utf8data++ = XCL_SINGLE;
3159 }
3160 else
3161 {
3162 *class_utf8data++ = XCL_RANGE;
3163 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3164 }
3165 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3166 }
3167 }
3168 #endif /* SUPPORT_UCP */
3169
3170 /* Now record the original range, possibly modified for UCP caseless
3171 overlapping ranges. */
3172
3173 *class_utf8data++ = XCL_RANGE;
3174 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3175 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3176
3177 /* With UCP support, we are done. Without UCP support, there is no
3178 caseless matching for UTF-8 characters > 127; we can use the bit map
3179 for the smaller ones. */
3180
3181 #ifdef SUPPORT_UCP
3182 continue; /* With next character in the class */
3183 #else
3184 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3185
3186 /* Adjust upper limit and fall through to set up the map */
3187
3188 d = 127;
3189
3190 #endif /* SUPPORT_UCP */
3191 }
3192 #endif /* SUPPORT_UTF8 */
3193
3194 /* We use the bit map for all cases when not in UTF-8 mode; else
3195 ranges that lie entirely within 0-127 when there is UCP support; else
3196 for partial ranges without UCP support. */
3197
3198 class_charcount += d - c + 1;
3199 class_lastchar = d;
3200
3201 /* We can save a bit of time by skipping this in the pre-compile. */
3202
3203 if (lengthptr == NULL) for (; c <= d; c++)
3204 {
3205 classbits[c/8] |= (1 << (c&7));
3206 if ((options & PCRE_CASELESS) != 0)
3207 {
3208 int uc = cd->fcc[c]; /* flip case */
3209 classbits[uc/8] |= (1 << (uc&7));
3210 }
3211 }
3212
3213 continue; /* Go get the next char in the class */
3214 }
3215
3216 /* Handle a lone single character - we can get here for a normal
3217 non-escape char, or after \ that introduces a single character or for an
3218 apparent range that isn't. */
3219
3220 LONE_SINGLE_CHARACTER:
3221
3222 /* Handle a character that cannot go in the bit map */
3223
3224 #ifdef SUPPORT_UTF8
3225 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3226 {
3227 class_utf8 = TRUE;
3228 *class_utf8data++ = XCL_SINGLE;
3229 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3230
3231 #ifdef SUPPORT_UCP
3232 if ((options & PCRE_CASELESS) != 0)
3233 {
3234 unsigned int othercase;
3235 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3236 {
3237 *class_utf8data++ = XCL_SINGLE;
3238 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3239 }
3240 }
3241 #endif /* SUPPORT_UCP */
3242
3243 }
3244 else
3245 #endif /* SUPPORT_UTF8 */
3246
3247 /* Handle a single-byte character */
3248 {
3249 classbits[c/8] |= (1 << (c&7));
3250 if ((options & PCRE_CASELESS) != 0)
3251 {
3252 c = cd->fcc[c]; /* flip case */
3253 classbits[c/8] |= (1 << (c&7));
3254 }
3255 class_charcount++;
3256 class_lastchar = c;
3257 }
3258 }
3259
3260 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3261
3262 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3263
3264 if (c == 0) /* Missing terminating ']' */
3265 {
3266 *errorcodeptr = ERR6;
3267 goto FAILED;
3268 }
3269
3270
3271 /* This code has been disabled because it would mean that \s counts as
3272 an explicit \r or \n reference, and that's not really what is wanted. Now
3273 we set the flag only if there is a literal "\r" or "\n" in the class. */
3274
3275 #if 0
3276 /* Remember whether \r or \n are in this class */
3277
3278 if (negate_class)
3279 {
3280 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3281 }
3282 else
3283 {
3284 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3285 }
3286 #endif
3287
3288
3289 /* If class_charcount is 1, we saw precisely one character whose value is
3290 less than 256. As long as there were no characters >= 128 and there was no
3291 use of \p or \P, in other words, no use of any XCLASS features, we can
3292 optimize.
3293
3294 In UTF-8 mode, we can optimize the negative case only if there were no
3295 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3296 operate on single-bytes only. This is an historical hangover. Maybe one day
3297 we can tidy these opcodes to handle multi-byte characters.
3298
3299 The optimization throws away the bit map. We turn the item into a
3300 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3301 that OP_NOT does not support multibyte characters. In the positive case, it
3302 can cause firstbyte to be set. Otherwise, there can be no first char if
3303 this item is first, whatever repeat count may follow. In the case of
3304 reqbyte, save the previous value for reinstating. */
3305
3306 #ifdef SUPPORT_UTF8
3307 if (class_charcount == 1 && !class_utf8 &&
3308 (!utf8 || !negate_class || class_lastchar < 128))
3309 #else
3310 if (class_charcount == 1)
3311 #endif
3312 {
3313 zeroreqbyte = reqbyte;
3314
3315 /* The OP_NOT opcode works on one-byte characters only. */
3316
3317 if (negate_class)
3318 {
3319 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3320 zerofirstbyte = firstbyte;
3321 *code++ = OP_NOT;
3322 *code++ = class_lastchar;
3323 break;
3324 }
3325
3326 /* For a single, positive character, get the value into mcbuffer, and
3327 then we can handle this with the normal one-character code. */
3328
3329 #ifdef SUPPORT_UTF8
3330 if (utf8 && class_lastchar > 127)
3331 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3332 else
3333 #endif
3334 {
3335 mcbuffer[0] = class_lastchar;
3336 mclength = 1;
3337 }
3338 goto ONE_CHAR;
3339 } /* End of 1-char optimization */
3340
3341 /* The general case - not the one-char optimization. If this is the first
3342 thing in the branch, there can be no first char setting, whatever the
3343 repeat count. Any reqbyte setting must remain unchanged after any kind of
3344 repeat. */
3345
3346 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3347 zerofirstbyte = firstbyte;
3348 zeroreqbyte = reqbyte;
3349
3350 /* If there are characters with values > 255, we have to compile an
3351 extended class, with its own opcode, unless there was a negated special
3352 such as \S in the class, because in that case all characters > 255 are in
3353 the class, so any that were explicitly given as well can be ignored. If
3354 (when there are explicit characters > 255 that must be listed) there are no
3355 characters < 256, we can omit the bitmap in the actual compiled code. */
3356
3357 #ifdef SUPPORT_UTF8
3358 if (class_utf8 && !should_flip_negation)
3359 {
3360 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3361 *code++ = OP_XCLASS;
3362 code += LINK_SIZE;
3363 *code = negate_class? XCL_NOT : 0;
3364
3365 /* If the map is required, move up the extra data to make room for it;
3366 otherwise just move the code pointer to the end of the extra data. */
3367
3368 if (class_charcount > 0)
3369 {
3370 *code++ |= XCL_MAP;
3371 memmove(code + 32, code, class_utf8data - code);
3372 memcpy(code, classbits, 32);
3373 code = class_utf8data + 32;
3374 }
3375 else code = class_utf8data;
3376
3377 /* Now fill in the complete length of the item */
3378
3379 PUT(previous, 1, code - previous);
3380 break; /* End of class handling */
3381 }
3382 #endif
3383
3384 /* If there are no characters > 255, set the opcode to OP_CLASS or
3385 OP_NCLASS, depending on whether the whole class was negated and whether
3386 there were negative specials such as \S in the class. Then copy the 32-byte
3387 map into the code vector, negating it if necessary. */
3388
3389 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3390 if (negate_class)
3391 {
3392 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3393 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3394 }
3395 else
3396 {
3397 memcpy(code, classbits, 32);
3398 }
3399 code += 32;
3400 break;
3401
3402
3403 /* ===================================================================*/
3404 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3405 has been tested above. */
3406
3407 case '{':
3408 if (!is_quantifier) goto NORMAL_CHAR;
3409 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3410 if (*errorcodeptr != 0) goto FAILED;
3411 goto REPEAT;
3412
3413 case '*':
3414 repeat_min = 0;
3415 repeat_max = -1;
3416 goto REPEAT;
3417
3418 case '+':
3419 repeat_min = 1;
3420 repeat_max = -1;
3421 goto REPEAT;
3422
3423 case '?':
3424 repeat_min = 0;
3425 repeat_max = 1;
3426
3427 REPEAT:
3428 if (previous == NULL)
3429 {
3430 *errorcodeptr = ERR9;
3431 goto FAILED;
3432 }
3433
3434 if (repeat_min == 0)
3435 {
3436 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3437 reqbyte = zeroreqbyte; /* Ditto */
3438 }
3439
3440 /* Remember whether this is a variable length repeat */
3441
3442 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3443
3444 op_type = 0; /* Default single-char op codes */
3445 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3446
3447 /* Save start of previous item, in case we have to move it up to make space
3448 for an inserted OP_ONCE for the additional '+' extension. */
3449
3450 tempcode = previous;
3451
3452 /* If the next character is '+', we have a possessive quantifier. This
3453 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3454 If the next character is '?' this is a minimizing repeat, by default,
3455 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3456 repeat type to the non-default. */
3457
3458 if (ptr[1] == '+')
3459 {
3460 repeat_type = 0; /* Force greedy */
3461 possessive_quantifier = TRUE;
3462 ptr++;
3463 }
3464 else if (ptr[1] == '?')
3465 {
3466 repeat_type = greedy_non_default;
3467 ptr++;
3468 }
3469 else repeat_type = greedy_default;
3470
3471 /* If previous was a character match, abolish the item and generate a
3472 repeat item instead. If a char item has a minimum of more than one, ensure
3473 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3474 the first thing in a branch because the x will have gone into firstbyte
3475 instead. */
3476
3477 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3478 {
3479 /* Deal with UTF-8 characters that take up more than one byte. It's
3480 easier to write this out separately than try to macrify it. Use c to
3481 hold the length of the character in bytes, plus 0x80 to flag that it's a
3482 length rather than a small character. */
3483
3484 #ifdef SUPPORT_UTF8
3485 if (utf8 && (code[-1] & 0x80) != 0)
3486 {
3487 uschar *lastchar = code - 1;
3488 while((*lastchar & 0xc0) == 0x80) lastchar--;
3489 c = code - lastchar; /* Length of UTF-8 character */
3490 memcpy(utf8_char, lastchar, c); /* Save the char */
3491 c |= 0x80; /* Flag c as a length */
3492 }
3493 else
3494 #endif
3495
3496 /* Handle the case of a single byte - either with no UTF8 support, or
3497 with UTF-8 disabled, or for a UTF-8 character < 128. */
3498
3499 {
3500 c = code[-1];
3501 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3502 }
3503
3504 /* If the repetition is unlimited, it pays to see if the next thing on
3505 the line is something that cannot possibly match this character. If so,
3506 automatically possessifying this item gains some performance in the case
3507 where the match fails. */
3508
3509 if (!possessive_quantifier &&
3510 repeat_max < 0 &&
3511 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3512 options, cd))
3513 {
3514 repeat_type = 0; /* Force greedy */
3515 possessive_quantifier = TRUE;
3516 }
3517
3518 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3519 }
3520
3521 /* If previous was a single negated character ([^a] or similar), we use
3522 one of the special opcodes, replacing it. The code is shared with single-
3523 character repeats by setting op_type to add a suitable offset into
3524 repeat_type. We can also test for auto-possessification. OP_NOT is
3525 currently used only for single-byte chars. */
3526
3527 else if (*previous == OP_NOT)
3528 {
3529 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3530 c = previous[1];
3531 if (!possessive_quantifier &&
3532 repeat_max < 0 &&
3533 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3534 {
3535 repeat_type = 0; /* Force greedy */
3536 possessive_quantifier = TRUE;
3537 }
3538 goto OUTPUT_SINGLE_REPEAT;
3539 }
3540
3541 /* If previous was a character type match (\d or similar), abolish it and
3542 create a suitable repeat item. The code is shared with single-character
3543 repeats by setting op_type to add a suitable offset into repeat_type. Note
3544 that the Unicode property types will be present only when SUPPORT_UCP is
3545 defined, but we don't wrap the little bits of code here because it just
3546 makes it horribly messy. */
3547
3548 else if (*previous < OP_EODN)
3549 {
3550 uschar *oldcode;
3551 int prop_type, prop_value;
3552 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3553 c = *previous;
3554
3555 if (!possessive_quantifier &&
3556 repeat_max < 0 &&
3557 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3558 {
3559 repeat_type = 0; /* Force greedy */
3560 possessive_quantifier = TRUE;
3561 }
3562
3563 OUTPUT_SINGLE_REPEAT:
3564 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3565 {
3566 prop_type = previous[1];
3567 prop_value = previous[2];
3568 }
3569 else prop_type = prop_value = -1;
3570
3571 oldcode = code;
3572 code = previous; /* Usually overwrite previous item */
3573
3574 /* If the maximum is zero then the minimum must also be zero; Perl allows
3575 this case, so we do too - by simply omitting the item altogether. */
3576
3577 if (repeat_max == 0) goto END_REPEAT;
3578
3579 /* All real repeats make it impossible to handle partial matching (maybe
3580 one day we will be able to remove this restriction). */
3581
3582 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3583
3584 /* Combine the op_type with the repeat_type */
3585
3586 repeat_type += op_type;
3587
3588 /* A minimum of zero is handled either as the special case * or ?, or as
3589 an UPTO, with the maximum given. */
3590
3591 if (repeat_min == 0)
3592 {
3593 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3594 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3595 else
3596 {
3597 *code++ = OP_UPTO + repeat_type;
3598 PUT2INC(code, 0, repeat_max);
3599 }
3600 }
3601
3602 /* A repeat minimum of 1 is optimized into some special cases. If the
3603 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3604 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3605 one less than the maximum. */
3606
3607 else if (repeat_min == 1)
3608 {
3609 if (repeat_max == -1)
3610 *code++ = OP_PLUS + repeat_type;
3611 else
3612 {
3613 code = oldcode; /* leave previous item in place */
3614 if (repeat_max == 1) goto END_REPEAT;
3615 *code++ = OP_UPTO + repeat_type;
3616 PUT2INC(code, 0, repeat_max - 1);
3617 }
3618 }
3619
3620 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3621 handled as an EXACT followed by an UPTO. */
3622
3623 else
3624 {
3625 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3626 PUT2INC(code, 0, repeat_min);
3627
3628 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3629 we have to insert the character for the previous code. For a repeated
3630 Unicode property match, there are two extra bytes that define the
3631 required property. In UTF-8 mode, long characters have their length in
3632 c, with the 0x80 bit as a flag. */
3633
3634 if (repeat_max < 0)
3635 {
3636 #ifdef SUPPORT_UTF8
3637 if (utf8 && c >= 128)
3638 {
3639 memcpy(code, utf8_char, c & 7);
3640 code += c & 7;
3641 }
3642 else
3643 #endif
3644 {
3645 *code++ = c;
3646 if (prop_type >= 0)
3647 {
3648 *code++ = prop_type;
3649 *code++ = prop_value;
3650 }
3651 }
3652 *code++ = OP_STAR + repeat_type;
3653 }
3654
3655 /* Else insert an UPTO if the max is greater than the min, again
3656 preceded by the character, for the previously inserted code. If the
3657 UPTO is just for 1 instance, we can use QUERY instead. */
3658
3659 else if (repeat_max != repeat_min)
3660 {
3661 #ifdef SUPPORT_UTF8
3662 if (utf8 && c >= 128)
3663 {
3664 memcpy(code, utf8_char, c & 7);
3665 code += c & 7;
3666 }
3667 else
3668 #endif
3669 *code++ = c;
3670 if (prop_type >= 0)
3671 {
3672 *code++ = prop_type;
3673 *code++ = prop_value;
3674 }
3675 repeat_max -= repeat_min;
3676
3677 if (repeat_max == 1)
3678 {
3679 *code++ = OP_QUERY + repeat_type;
3680 }
3681 else
3682 {
3683 *code++ = OP_UPTO + repeat_type;
3684 PUT2INC(code, 0, repeat_max);
3685 }
3686 }
3687 }
3688
3689 /* The character or character type itself comes last in all cases. */
3690
3691 #ifdef SUPPORT_UTF8
3692 if (utf8 && c >= 128)
3693 {
3694 memcpy(code, utf8_char, c & 7);
3695 code += c & 7;
3696 }
3697 else
3698 #endif
3699 *code++ = c;
3700
3701 /* For a repeated Unicode property match, there are two extra bytes that
3702 define the required property. */
3703
3704 #ifdef SUPPORT_UCP
3705 if (prop_type >= 0)
3706 {
3707 *code++ = prop_type;
3708 *code++ = prop_value;
3709 }
3710 #endif
3711 }
3712
3713 /* If previous was a character class or a back reference, we put the repeat
3714 stuff after it, but just skip the item if the repeat was {0,0}. */
3715
3716 else if (*previous == OP_CLASS ||
3717 *previous == OP_NCLASS ||
3718 #ifdef SUPPORT_UTF8
3719 *previous == OP_XCLASS ||
3720 #endif
3721 *previous == OP_REF)
3722 {
3723 if (repeat_max == 0)
3724 {
3725 code = previous;
3726 goto END_REPEAT;
3727 }
3728
3729 /* All real repeats make it impossible to handle partial matching (maybe
3730 one day we will be able to remove this restriction). */
3731
3732 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3733
3734 if (repeat_min == 0 && repeat_max == -1)
3735 *code++ = OP_CRSTAR + repeat_type;
3736 else if (repeat_min == 1 && repeat_max == -1)
3737 *code++ = OP_CRPLUS + repeat_type;
3738 else if (repeat_min == 0 && repeat_max == 1)
3739 *code++ = OP_CRQUERY + repeat_type;
3740 else
3741 {
3742 *code++ = OP_CRRANGE + repeat_type;
3743 PUT2INC(code, 0, repeat_min);
3744 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3745 PUT2INC(code, 0, repeat_max);
3746 }
3747 }
3748
3749 /* If previous was a bracket group, we may have to replicate it in certain
3750 cases. */
3751
3752 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3753 *previous == OP_ONCE || *previous == OP_COND)
3754 {
3755 register int i;
3756 int ketoffset = 0;
3757 int len = code - previous;
3758 uschar *bralink = NULL;
3759
3760 /* Repeating a DEFINE group is pointless */
3761
3762 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3763 {
3764 *errorcodeptr = ERR55;
3765 goto FAILED;
3766 }
3767
3768 /* If the maximum repeat count is unlimited, find the end of the bracket
3769 by scanning through from the start, and compute the offset back to it
3770 from the current code pointer. There may be an OP_OPT setting following
3771 the final KET, so we can't find the end just by going back from the code
3772 pointer. */
3773
3774 if (repeat_max == -1)
3775 {
3776 register uschar *ket = previous;
3777 do ket += GET(ket, 1); while (*ket != OP_KET);
3778 ketoffset = code - ket;
3779 }
3780
3781 /* The case of a zero minimum is special because of the need to stick
3782 OP_BRAZERO in front of it, and because the group appears once in the
3783 data, whereas in other cases it appears the minimum number of times. For
3784 this reason, it is simplest to treat this case separately, as otherwise
3785 the code gets far too messy. There are several special subcases when the
3786 minimum is zero. */
3787
3788 if (repeat_min == 0)
3789 {
3790 /* If the maximum is also zero, we just omit the group from the output
3791 altogether. */
3792
3793 if (repeat_max == 0)
3794 {
3795 code = previous;
3796 goto END_REPEAT;
3797 }
3798
3799 /* If the maximum is 1 or unlimited, we just have to stick in the
3800 BRAZERO and do no more at this point. However, we do need to adjust
3801 any OP_RECURSE calls inside the group that refer to the group itself or
3802 any internal or forward referenced group, because the offset is from
3803 the start of the whole regex. Temporarily terminate the pattern while
3804 doing this. */
3805
3806 if (repeat_max <= 1)
3807 {
3808 *code = OP_END;
3809 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3810 memmove(previous+1, previous, len);
3811 code++;
3812 *previous++ = OP_BRAZERO + repeat_type;
3813 }
3814
3815 /* If the maximum is greater than 1 and limited, we have to replicate
3816 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3817 The first one has to be handled carefully because it's the original
3818 copy, which has to be moved up. The remainder can be handled by code
3819 that is common with the non-zero minimum case below. We have to
3820 adjust the value of repeat_max, since one less copy is required. Once
3821 again, we may have to adjust any OP_RECURSE calls inside the group. */
3822
3823 else
3824 {
3825 int offset;
3826 *code = OP_END;
3827 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3828 memmove(previous + 2 + LINK_SIZE, previous, len);
3829 code += 2 + LINK_SIZE;
3830 *previous++ = OP_BRAZERO + repeat_type;
3831 *previous++ = OP_BRA;
3832
3833 /* We chain together the bracket offset fields that have to be
3834 filled in later when the ends of the brackets are reached. */
3835
3836 offset = (bralink == NULL)? 0 : previous - bralink;
3837 bralink = previous;
3838 PUTINC(previous, 0, offset);
3839 }
3840
3841 repeat_max--;
3842 }
3843
3844 /* If the minimum is greater than zero, replicate the group as many
3845 times as necessary, and adjust the maximum to the number of subsequent
3846 copies that we need. If we set a first char from the group, and didn't
3847 set a required char, copy the latter from the former. If there are any
3848 forward reference subroutine calls in the group, there will be entries on
3849 the workspace list; replicate these with an appropriate increment. */
3850
3851 else
3852 {
3853 if (repeat_min > 1)
3854 {
3855 /* In the pre-compile phase, we don't actually do the replication. We
3856 just adjust the length as if we had. Do some paranoid checks for
3857 potential integer overflow. */
3858
3859 if (lengthptr != NULL)
3860 {
3861 int delta = (repeat_min - 1)*length_prevgroup;
3862 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3863 (double)INT_MAX ||
3864 OFLOW_MAX - *lengthptr < delta)
3865 {
3866 *errorcodeptr = ERR20;
3867 goto FAILED;
3868 }
3869 *lengthptr += delta;
3870 }
3871
3872 /* This is compiling for real */
3873
3874 else
3875 {
3876 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3877 for (i = 1; i < repeat_min; i++)
3878 {
3879 uschar *hc;
3880 uschar *this_hwm = cd->hwm;
3881 memcpy(code, previous, len);
3882 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3883 {
3884 PUT(cd->hwm, 0, GET(hc, 0) + len);
3885 cd->hwm += LINK_SIZE;
3886 }
3887 save_hwm = this_hwm;
3888 code += len;
3889 }
3890 }
3891 }
3892
3893 if (repeat_max > 0) repeat_max -= repeat_min;
3894 }
3895
3896 /* This code is common to both the zero and non-zero minimum cases. If
3897 the maximum is limited, it replicates the group in a nested fashion,
3898 remembering the bracket starts on a stack. In the case of a zero minimum,
3899 the first one was set up above. In all cases the repeat_max now specifies
3900 the number of additional copies needed. Again, we must remember to
3901 replicate entries on the forward reference list. */
3902
3903 if (repeat_max >= 0)
3904 {
3905 /* In the pre-compile phase, we don't actually do the replication. We
3906 just adjust the length as if we had. For each repetition we must add 1
3907 to the length for BRAZERO and for all but the last repetition we must
3908 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3909 paranoid checks to avoid integer overflow. */
3910
3911 if (lengthptr != NULL && repeat_max > 0)
3912 {
3913 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3914 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3915 if ((double)repeat_max *
3916 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3917 > (double)INT_MAX ||
3918 OFLOW_MAX - *lengthptr < delta)
3919 {
3920 *errorcodeptr = ERR20;
3921 goto FAILED;
3922 }
3923 *lengthptr += delta;
3924 }
3925
3926 /* This is compiling for real */
3927
3928 else for (i = repeat_max - 1; i >= 0; i--)
3929 {
3930 uschar *hc;
3931 uschar *this_hwm = cd->hwm;
3932
3933 *code++ = OP_BRAZERO + repeat_type;
3934
3935 /* All but the final copy start a new nesting, maintaining the
3936 chain of brackets outstanding. */
3937
3938 if (i != 0)
3939 {
3940 int offset;
3941 *code++ = OP_BRA;
3942 offset = (bralink == NULL)? 0 : code - bralink;
3943 bralink = code;
3944 PUTINC(code, 0, offset);
3945 }
3946
3947 memcpy(code, previous, len);
3948 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3949 {
3950 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3951 cd->hwm += LINK_SIZE;
3952 }
3953 save_hwm = this_hwm;
3954 code += len;
3955 }
3956
3957 /* Now chain through the pending brackets, and fill in their length
3958 fields (which are holding the chain links pro tem). */
3959
3960 while (bralink != NULL)
3961 {
3962 int oldlinkoffset;
3963 int offset = code - bralink + 1;
3964 uschar *bra = code - offset;
3965 oldlinkoffset = GET(bra, 1);
3966 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3967 *code++ = OP_KET;
3968 PUTINC(code, 0, offset);
3969 PUT(bra, 1, offset);
3970 }
3971 }
3972
3973 /* If the maximum is unlimited, set a repeater in the final copy. We
3974 can't just offset backwards from the current code point, because we
3975 don't know if there's been an options resetting after the ket. The
3976 correct offset was computed above.
3977
3978 Then, when we are doing the actual compile phase, check to see whether
3979 this group is a non-atomic one that could match an empty string. If so,
3980 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3981 that runtime checking can be done. [This check is also applied to
3982 atomic groups at runtime, but in a different way.] */
3983
3984 else
3985 {
3986 uschar *ketcode = code - ketoffset;
3987 uschar *bracode = ketcode - GET(ketcode, 1);
3988 *ketcode = OP_KETRMAX + repeat_type;
3989 if (lengthptr == NULL && *bracode != OP_ONCE)
3990 {
3991 uschar *scode = bracode;
3992 do
3993 {
3994 if (could_be_empty_branch(scode, ketcode, utf8))
3995 {
3996 *bracode += OP_SBRA - OP_BRA;
3997 break;
3998 }
3999 scode += GET(scode, 1);
4000 }
4001 while (*scode == OP_ALT);
4002 }
4003 }
4004 }
4005
4006 /* Else there's some kind of shambles */
4007
4008 else
4009 {
4010 *errorcodeptr = ERR11;
4011 goto FAILED;
4012 }
4013
4014 /* If the character following a repeat is '+', or if certain optimization
4015 tests above succeeded, possessive_quantifier is TRUE. For some of the
4016 simpler opcodes, there is a special alternative opcode for this. For
4017 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4018 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4019 but the special opcodes can optimize it a bit. The repeated item starts at
4020 tempcode, not at previous, which might be the first part of a string whose
4021 (former) last char we repeated.
4022
4023 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4024 an 'upto' may follow. We skip over an 'exact' item, and then test the
4025 length of what remains before proceeding. */
4026
4027 if (possessive_quantifier)
4028 {
4029 int len;
4030 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4031 *tempcode == OP_NOTEXACT)
4032 tempcode += _pcre_OP_lengths[*tempcode] +
4033 ((*tempcode == OP_TYPEEXACT &&
4034 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4035 len = code - tempcode;
4036 if (len > 0) switch (*tempcode)
4037 {
4038 case OP_STAR: *tempcode = OP_POSSTAR; break;
4039 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4040 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4041 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4042
4043 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4044 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4045 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4046 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4047
4048 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4049 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4050 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4051 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4052
4053 default:
4054 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4055 code += 1 + LINK_SIZE;
4056 len += 1 + LINK_SIZE;
4057 tempcode[0] = OP_ONCE;
4058 *code++ = OP_KET;
4059 PUTINC(code, 0, len);
4060 PUT(tempcode, 1, len);
4061 break;
4062 }
4063 }
4064
4065 /* In all cases we no longer have a previous item. We also set the
4066 "follows varying string" flag for subsequently encountered reqbytes if
4067 it isn't already set and we have just passed a varying length item. */
4068
4069 END_REPEAT:
4070 previous = NULL;
4071 cd->req_varyopt |= reqvary;
4072 break;
4073
4074
4075 /* ===================================================================*/
4076 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4077 lookbehind or option setting or condition or all the other extended
4078 parenthesis forms. */
4079
4080 case '(':
4081 newoptions = options;
4082 skipbytes = 0;
4083 bravalue = OP_CBRA;
4084 save_hwm = cd->hwm;
4085 reset_bracount = FALSE;
4086
4087 /* First deal with various "verbs" that can be introduced by '*'. */
4088
4089 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4090 {
4091 int i, namelen;
4092 const char *vn = verbnames;
4093 const uschar *name = ++ptr;
4094 previous = NULL;
4095 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4096 if (*ptr == ':')
4097 {
4098 *errorcodeptr = ERR59; /* Not supported */
4099 goto FAILED;
4100 }
4101 if (*ptr != ')')
4102 {
4103 *errorcodeptr = ERR60;
4104 goto FAILED;
4105 }
4106 namelen = ptr - name;
4107 for (i = 0; i < verbcount; i++)
4108 {
4109 if (namelen == verbs[i].len &&
4110 strncmp((char *)name, vn, namelen) == 0)
4111 {
4112 *code = verbs[i].op;
4113 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4114 break;
4115 }
4116 vn += verbs[i].len + 1;
4117 }
4118 if (i < verbcount) continue;
4119 *errorcodeptr = ERR60;
4120 goto FAILED;
4121 }
4122
4123 /* Deal with the extended parentheses; all are introduced by '?', and the
4124 appearance of any of them means that this is not a capturing group. */
4125
4126 else if (*ptr == '?')
4127 {
4128 int i, set, unset, namelen;
4129 int *optset;
4130 const uschar *name;
4131 uschar *slot;
4132
4133 switch (*(++ptr))
4134 {
4135 case '#': /* Comment; skip to ket */
4136 ptr++;
4137 while (*ptr != 0 && *ptr != ')') ptr++;
4138 if (*ptr == 0)
4139 {
4140 *errorcodeptr = ERR18;
4141 goto FAILED;
4142 }
4143 continue;
4144
4145
4146 /* ------------------------------------------------------------ */
4147 case '|': /* Reset capture count for each branch */
4148 reset_bracount = TRUE;
4149 /* Fall through */
4150
4151 /* ------------------------------------------------------------ */
4152 case ':': /* Non-capturing bracket */
4153 bravalue = OP_BRA;
4154 ptr++;
4155 break;
4156
4157
4158 /* ------------------------------------------------------------ */
4159 case '(':
4160 bravalue = OP_COND; /* Conditional group */
4161
4162 /* A condition can be an assertion, a number (referring to a numbered
4163 group), a name (referring to a named group), or 'R', referring to
4164 recursion. R<digits> and R&name are also permitted for recursion tests.
4165
4166 There are several syntaxes for testing a named group: (?(name)) is used
4167 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4168
4169 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4170 be the recursive thing or the name 'R' (and similarly for 'R' followed
4171 by digits), and (b) a number could be a name that consists of digits.
4172 In both cases, we look for a name first; if not found, we try the other
4173 cases. */
4174
4175 /* For conditions that are assertions, check the syntax, and then exit
4176 the switch. This will take control down to where bracketed groups,
4177 including assertions, are processed. */
4178
4179 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4180 break;
4181
4182 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4183 below), and all need to skip 3 bytes at the start of the group. */
4184
4185 code[1+LINK_SIZE] = OP_CREF;
4186 skipbytes = 3;
4187 refsign = -1;
4188
4189 /* Check for a test for recursion in a named group. */
4190
4191 if (ptr[1] == 'R' && ptr[2] == '&')
4192 {
4193 terminator = -1;
4194 ptr += 2;
4195 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4196 }
4197
4198 /* Check for a test for a named group's having been set, using the Perl
4199 syntax (?(<name>) or (?('name') */
4200
4201 else if (ptr[1] == '<')
4202 {
4203 terminator = '>';
4204 ptr++;
4205 }
4206 else if (ptr[1] == '\'')
4207 {
4208 terminator = '\'';
4209 ptr++;
4210 }
4211 else
4212 {
4213 terminator = 0;
4214 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4215 }
4216
4217 /* We now expect to read a name; anything else is an error */
4218
4219 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4220 {
4221 ptr += 1; /* To get the right offset */
4222 *errorcodeptr = ERR28;
4223 goto FAILED;
4224 }
4225
4226 /* Read the name, but also get it as a number if it's all digits */
4227
4228 recno = 0;
4229 name = ++ptr;
4230 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4231 {
4232 if (recno >= 0)
4233 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4234 recno * 10 + *ptr - '0' : -1;
4235 ptr++;
4236 }
4237 namelen = ptr - name;
4238
4239 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4240 {
4241 ptr--; /* Error offset */
4242 *errorcodeptr = ERR26;
4243 goto FAILED;
4244 }
4245
4246 /* Do no further checking in the pre-compile phase. */
4247
4248 if (lengthptr != NULL) break;
4249
4250 /* In the real compile we do the work of looking for the actual
4251 reference. If the string started with "+" or "-" we require the rest to
4252 be digits, in which case recno will be set. */
4253
4254 if (refsign > 0)
4255 {
4256 if (recno <= 0)
4257 {
4258 *errorcodeptr = ERR58;
4259 goto FAILED;
4260 }
4261 recno = (refsign == '-')?
4262 cd->bracount - recno + 1 : recno +cd->bracount;
4263 if (recno <= 0 || recno > cd->final_bracount)
4264 {
4265 *errorcodeptr = ERR15;
4266 goto FAILED;
4267 }
4268 PUT2(code, 2+LINK_SIZE, recno);
4269 break;
4270 }
4271
4272 /* Otherwise (did not start with "+" or "-"), start by looking for the
4273 name. */
4274
4275 slot = cd->name_table;
4276 for (i = 0; i < cd->names_found; i++)
4277 {
4278 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4279 slot += cd->name_entry_size;
4280 }
4281
4282 /* Found a previous named subpattern */
4283
4284 if (i < cd->names_found)
4285 {
4286 recno = GET2(slot, 0);
4287 PUT2(code, 2+LINK_SIZE, recno);
4288 }
4289
4290 /* Search the pattern for a forward reference */
4291
4292 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4293 (options & PCRE_EXTENDED) != 0)) > 0)
4294 {
4295 PUT2(code, 2+LINK_SIZE, i);
4296 }
4297
4298 /* If terminator == 0 it means that the name followed directly after
4299 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4300 some further alternatives to try. For the cases where terminator != 0
4301 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4302 now checked all the possibilities, so give an error. */
4303
4304 else if (terminator != 0)
4305 {
4306 *errorcodeptr = ERR15;
4307 goto FAILED;
4308 }
4309
4310 /* Check for (?(R) for recursion. Allow digits after R to specify a
4311 specific group number. */
4312
4313 else if (*name == 'R')
4314 {
4315 recno = 0;
4316 for (i = 1; i < namelen; i++)
4317 {
4318 if ((digitab[name[i]] & ctype_digit) == 0)
4319 {
4320 *errorcodeptr = ERR15;
4321 goto FAILED;
4322 }
4323 recno = recno * 10 + name[i] - '0';
4324 }
4325 if (recno == 0) recno = RREF_ANY;
4326 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4327 PUT2(code, 2+LINK_SIZE, recno);
4328 }
4329
4330 /* Similarly, check for the (?(DEFINE) "condition", which is always
4331 false. */
4332
4333 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4334 {
4335 code[1+LINK_SIZE] = OP_DEF;
4336 skipbytes = 1;
4337 }
4338
4339 /* Check for the "name" actually being a subpattern number. We are
4340 in the second pass here, so final_bracount is set. */
4341
4342 else if (recno > 0 && recno <= cd->final_bracount)
4343 {
4344 PUT2(code, 2+LINK_SIZE, recno);
4345 }
4346
4347 /* Either an unidentified subpattern, or a reference to (?(0) */
4348
4349 else
4350 {
4351 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4352 goto FAILED;
4353 }
4354 break;
4355
4356
4357 /* ------------------------------------------------------------ */
4358 case '=': /* Positive lookahead */
4359 bravalue = OP_ASSERT;
4360 ptr++;
4361 break;
4362
4363
4364 /* ------------------------------------------------------------ */
4365 case '!': /* Negative lookahead */
4366 ptr++;
4367 if (*ptr == ')') /* Optimize (?!) */
4368 {
4369 *code++ = OP_FAIL;
4370 previous = NULL;
4371 continue;
4372 }
4373 bravalue = OP_ASSERT_NOT;
4374 break;
4375
4376
4377 /* ------------------------------------------------------------ */
4378 case '<': /* Lookbehind or named define */
4379 switch (ptr[1])
4380 {
4381 case '=': /* Positive lookbehind */
4382 bravalue = OP_ASSERTBACK;
4383 ptr += 2;
4384 break;
4385
4386 case '!': /* Negative lookbehind */
4387 bravalue = OP_ASSERTBACK_NOT;
4388 ptr += 2;
4389 break;
4390
4391 default: /* Could be name define, else bad */
4392 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4393 ptr++; /* Correct offset for error */
4394 *errorcodeptr = ERR24;
4395 goto FAILED;
4396 }
4397 break;
4398
4399
4400 /* ------------------------------------------------------------ */
4401 case '>': /* One-time brackets */
4402 bravalue = OP_ONCE;
4403 ptr++;
4404 break;
4405
4406
4407 /* ------------------------------------------------------------ */
4408 case 'C': /* Callout - may be followed by digits; */
4409 previous_callout = code; /* Save for later completion */
4410 after_manual_callout = 1; /* Skip one item before completing */
4411 *code++ = OP_CALLOUT;
4412 {
4413 int n = 0;
4414 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4415 n = n * 10 + *ptr - '0';
4416 if (*ptr != ')')
4417 {
4418 *errorcodeptr = ERR39;
4419 goto FAILED;
4420 }
4421 if (n > 255)
4422 {
4423 *errorcodeptr = ERR38;
4424 goto FAILED;
4425 }
4426 *code++ = n;
4427 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4428 PUT(code, LINK_SIZE, 0); /* Default length */
4429 code += 2 * LINK_SIZE;
4430 }
4431 previous = NULL;
4432 continue;
4433
4434
4435 /* ------------------------------------------------------------ */
4436 case 'P': /* Python-style named subpattern handling */
4437 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4438 {
4439 is_recurse = *ptr == '>';
4440 terminator = ')';
4441 goto NAMED_REF_OR_RECURSE;
4442 }
4443 else if (*ptr != '<') /* Test for Python-style definition */
4444 {
4445 *errorcodeptr = ERR41;
4446 goto FAILED;
4447 }
4448 /* Fall through to handle (?P< as (?< is handled */
4449
4450
4451 /* ------------------------------------------------------------ */
4452 DEFINE_NAME: /* Come here from (?< handling */
4453 case '\'':
4454 {
4455 terminator = (*ptr == '<')? '>' : '\'';
4456 name = ++ptr;
4457
4458 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4459 namelen = ptr - name;
4460
4461 /* In the pre-compile phase, just do a syntax check. */
4462
4463 if (lengthptr != NULL)
4464 {
4465 if (*ptr != terminator)
4466 {
4467 *errorcodeptr = ERR42;
4468 goto FAILED;
4469 }
4470 if (cd->names_found >= MAX_NAME_COUNT)
4471 {
4472 *errorcodeptr = ERR49;
4473 goto FAILED;
4474 }
4475 if (namelen + 3 > cd->name_entry_size)
4476 {
4477 cd->name_entry_size = namelen + 3;
4478 if (namelen > MAX_NAME_SIZE)
4479 {
4480 *errorcodeptr = ERR48;
4481 goto FAILED;
4482 }
4483 }
4484 }
4485
4486 /* In the real compile, create the entry in the table */
4487
4488 else
4489 {
4490 slot = cd->name_table;
4491 for (i = 0; i < cd->names_found; i++)
4492 {
4493 int crc = memcmp(name, slot+2, namelen);
4494 if (crc == 0)
4495 {
4496 if (slot[2+namelen] == 0)
4497 {
4498 if ((options & PCRE_DUPNAMES) == 0)
4499 {
4500 *errorcodeptr = ERR43;
4501 goto FAILED;
4502 }
4503 }
4504 else crc = -1; /* Current name is substring */
4505 }
4506 if (crc < 0)
4507 {
4508 memmove(slot + cd->name_entry_size, slot,
4509 (cd->names_found - i) * cd->name_entry_size);
4510 break;
4511 }
4512 slot += cd->name_entry_size;
4513 }
4514
4515 PUT2(slot, 0, cd->bracount + 1);
4516 memcpy(slot + 2, name, namelen);
4517 slot[2+namelen] = 0;
4518 }
4519 }
4520
4521 /* In both cases, count the number of names we've encountered. */
4522
4523 ptr++; /* Move past > or ' */
4524 cd->names_found++;
4525 goto NUMBERED_GROUP;
4526
4527
4528 /* ------------------------------------------------------------ */
4529 case '&': /* Perl recursion/subroutine syntax */
4530 terminator = ')';
4531 is_recurse = TRUE;
4532 /* Fall through */
4533
4534 /* We come here from the Python syntax above that handles both
4535 references (?P=name) and recursion (?P>name), as well as falling
4536 through from the Perl recursion syntax (?&name). We also come here from
4537 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4538 .NET syntax. */
4539
4540 NAMED_REF_OR_RECURSE:
4541 name = ++ptr;
4542 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4543 namelen = ptr - name;
4544
4545 /* In the pre-compile phase, do a syntax check and set a dummy
4546 reference number. */
4547
4548 if (lengthptr != NULL)
4549 {
4550 if (namelen == 0)
4551 {
4552 *errorcodeptr = ERR62;
4553 goto FAILED;
4554 }
4555 if (*ptr != terminator)
4556 {
4557 *errorcodeptr = ERR42;
4558 goto FAILED;
4559 }
4560 if (namelen > MAX_NAME_SIZE)
4561 {
4562 *errorcodeptr = ERR48;
4563 goto FAILED;
4564 }
4565 recno = 0;
4566 }
4567
4568 /* In the real compile, seek the name in the table. We check the name
4569 first, and then check that we have reached the end of the name in the
4570 table. That way, if the name is longer than any in the table,
4571 the comparison will fail without reading beyond the table entry. */
4572
4573 else
4574 {
4575 slot = cd->name_table;
4576 for (i = 0; i < cd->names_found; i++)
4577 {
4578 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4579 slot[2+namelen] == 0)
4580 break;
4581 slot += cd->name_entry_size;
4582 }
4583
4584 if (i < cd->names_found) /* Back reference */
4585 {
4586 recno = GET2(slot, 0);
4587 }
4588 else if ((recno = /* Forward back reference */
4589 find_parens(ptr, cd->bracount, name, namelen,
4590 (options & PCRE_EXTENDED) != 0)) <= 0)
4591 {
4592 *errorcodeptr = ERR15;
4593 goto FAILED;
4594 }
4595 }
4596
4597 /* In both phases, we can now go to the code that handles numerical
4598 recursion or backreferences. */
4599
4600 if (is_recurse) goto HANDLE_RECURSION;
4601 else goto HANDLE_REFERENCE;
4602
4603
4604 /* ------------------------------------------------------------ */
4605 case 'R': /* Recursion */
4606 ptr++; /* Same as (?0) */
4607 /* Fall through */
4608
4609
4610 /* ------------------------------------------------------------ */
4611 case '-': case '+':
4612 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4613 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4614 {
4615 const uschar *called;
4616
4617 if ((refsign = *ptr) == '+')
4618 {
4619 ptr++;
4620 if ((digitab[*ptr] & ctype_digit) == 0)
4621 {
4622 *errorcodeptr = ERR63;
4623 goto FAILED;
4624 }
4625 }
4626 else if (refsign == '-')
4627 {
4628 if ((digitab[ptr[1]] & ctype_digit) == 0)
4629 goto OTHER_CHAR_AFTER_QUERY;
4630 ptr++;
4631 }
4632
4633 recno = 0;
4634 while((digitab[*ptr] & ctype_digit) != 0)
4635 recno = recno * 10 + *ptr++ - '0';
4636
4637 if (*ptr != ')')
4638 {
4639 *errorcodeptr = ERR29;
4640 goto FAILED;
4641 }
4642
4643 if (refsign == '-')
4644 {
4645 if (recno == 0)
4646 {
4647 *errorcodeptr = ERR58;
4648 goto FAILED;
4649 }
4650 recno = cd->bracount - recno + 1;
4651 if (recno <= 0)
4652 {
4653 *errorcodeptr = ERR15;
4654 goto FAILED;
4655 }
4656 }
4657 else if (refsign == '+')
4658 {
4659 if (recno == 0)
4660 {
4661 *errorcodeptr = ERR58;
4662 goto FAILED;
4663 }
4664 recno += cd->bracount;
4665 }
4666
4667 /* Come here from code above that handles a named recursion */
4668
4669 HANDLE_RECURSION:
4670
4671 previous = code;
4672 called = cd->start_code;
4673
4674 /* When we are actually compiling, find the bracket that is being
4675 referenced. Temporarily end the regex in case it doesn't exist before
4676 this point. If we end up with a forward reference, first check that
4677 the bracket does occur later so we can give the error (and position)
4678 now. Then remember this forward reference in the workspace so it can
4679 be filled in at the end. */
4680
4681 if (lengthptr == NULL)
4682 {
4683 *code = OP_END;
4684 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4685
4686 /* Forward reference */
4687
4688 if (called == NULL)
4689 {
4690 if (find_parens(ptr, cd->bracount, NULL, recno,
4691 (options & PCRE_EXTENDED) != 0) < 0)
4692 {
4693 *errorcodeptr = ERR15;
4694 goto FAILED;
4695 }
4696 called = cd->start_code + recno;
4697 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4698 }
4699
4700 /* If not a forward reference, and the subpattern is still open,
4701 this is a recursive call. We check to see if this is a left
4702 recursion that could loop for ever, and diagnose that case. */
4703
4704 else if (GET(called, 1) == 0 &&
4705 could_be_empty(called, code, bcptr, utf8))
4706 {
4707 *errorcodeptr = ERR40;
4708 goto FAILED;
4709 }
4710 }
4711
4712 /* Insert the recursion/subroutine item, automatically wrapped inside
4713 "once" brackets. Set up a "previous group" length so that a
4714 subsequent quantifier will work. */
4715
4716 *code = OP_ONCE;
4717 PUT(code, 1, 2 + 2*LINK_SIZE);
4718 code += 1 + LINK_SIZE;
4719
4720 *code = OP_RECURSE;
4721 PUT(code, 1, called - cd->start_code);
4722 code += 1 + LINK_SIZE;
4723
4724 *code = OP_KET;
4725 PUT(code, 1, 2 + 2*LINK_SIZE);
4726 code += 1 + LINK_SIZE;
4727
4728 length_prevgroup = 3 + 3*LINK_SIZE;
4729 }
4730
4731 /* Can't determine a first byte now */
4732
4733 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4734 continue;
4735
4736
4737 /* ------------------------------------------------------------ */
4738 default: /* Other characters: check option setting */
4739 OTHER_CHAR_AFTER_QUERY:
4740 set = unset = 0;
4741 optset = &set;
4742
4743 while (*ptr != ')' && *ptr != ':')
4744 {
4745 switch (*ptr++)
4746 {
4747 case '-': optset = &unset; break;
4748
4749 case 'J': /* Record that it changed in the external options */
4750 *optset |= PCRE_DUPNAMES;
4751 cd->external_flags |= PCRE_JCHANGED;
4752 break;
4753
4754 case 'i': *optset |= PCRE_CASELESS; break;
4755 case 'm': *optset |= PCRE_MULTILINE; break;
4756 case 's': *optset |= PCRE_DOTALL; break;
4757 case 'x': *optset |= PCRE_EXTENDED; break;
4758 case 'U': *optset |= PCRE_UNGREEDY; break;
4759 case 'X': *optset |= PCRE_EXTRA; break;
4760
4761 default: *errorcodeptr = ERR12;
4762 ptr--; /* Correct the offset */
4763 goto FAILED;
4764 }
4765 }
4766
4767 /* Set up the changed option bits, but don't change anything yet. */
4768
4769 newoptions = (options | set) & (~unset);
4770
4771 /* If the options ended with ')' this is not the start of a nested
4772 group with option changes, so the options change at this level. If this
4773 item is right at the start of the pattern, the options can be
4774 abstracted and made external in the pre-compile phase, and ignored in
4775 the compile phase. This can be helpful when matching -- for instance in
4776 caseless checking of required bytes.
4777
4778 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4779 definitely *not* at the start of the pattern because something has been
4780 compiled. In the pre-compile phase, however, the code pointer can have
4781 that value after the start, because it gets reset as code is discarded
4782 during the pre-compile. However, this can happen only at top level - if
4783 we are within parentheses, the starting BRA will still be present. At
4784 any parenthesis level, the length value can be used to test if anything
4785 has been compiled at that level. Thus, a test for both these conditions
4786 is necessary to ensure we correctly detect the start of the pattern in
4787 both phases.
4788
4789 If we are not at the pattern start, compile code to change the ims
4790 options if this setting actually changes any of them. We also pass the
4791 new setting back so that it can be put at the start of any following
4792 branches, and when this group ends (if we are in a group), a resetting
4793 item can be compiled. */
4794
4795 if (*ptr == ')')
4796 {
4797 if (code == cd->start_code + 1 + LINK_SIZE &&
4798 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4799 {
4800 cd->external_options = newoptions;
4801 options = newoptions;
4802 }
4803 else
4804 {
4805 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4806 {
4807 *code++ = OP_OPT;
4808 *code++ = newoptions & PCRE_IMS;
4809 }
4810
4811 /* Change options at this level, and pass them back for use
4812 in subsequent branches. Reset the greedy defaults and the case
4813 value for firstbyte and reqbyte. */
4814
4815 *optionsptr = options = newoptions;
4816 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4817 greedy_non_default = greedy_default ^ 1;
4818 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4819 }
4820
4821 previous = NULL; /* This item can't be repeated */
4822 continue; /* It is complete */
4823 }
4824
4825 /* If the options ended with ':' we are heading into a nested group
4826 with possible change of options. Such groups are non-capturing and are
4827 not assertions of any kind. All we need to do is skip over the ':';
4828 the newoptions value is handled below. */
4829
4830 bravalue = OP_BRA;
4831 ptr++;
4832 } /* End of switch for character following (? */
4833 } /* End of (? handling */
4834
4835 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4836 all unadorned brackets become non-capturing and behave like (?:...)
4837 brackets. */
4838
4839 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4840 {
4841 bravalue = OP_BRA;
4842 }
4843
4844 /* Else we have a capturing group. */
4845
4846 else
4847 {
4848 NUMBERED_GROUP:
4849 cd->bracount += 1;
4850 PUT2(code, 1+LINK_SIZE, cd->bracount);
4851 skipbytes = 2;
4852 }
4853
4854 /* Process nested bracketed regex. Assertions may not be repeated, but
4855 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4856 non-register variable in order to be able to pass its address because some
4857 compilers complain otherwise. Pass in a new setting for the ims options if
4858 they have changed. */
4859
4860 previous = (bravalue >= OP_ONCE)? code : NULL;
4861 *code = bravalue;
4862 tempcode = code;
4863 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4864 length_prevgroup = 0; /* Initialize for pre-compile phase */
4865
4866 if (!compile_regex(
4867 newoptions, /* The complete new option state */
4868 options & PCRE_IMS, /* The previous ims option state */
4869 &tempcode, /* Where to put code (updated) */
4870 &ptr, /* Input pointer (updated) */
4871 errorcodeptr, /* Where to put an error message */
4872 (bravalue == OP_ASSERTBACK ||
4873 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4874 reset_bracount, /* True if (?| group */
4875 skipbytes, /* Skip over bracket number */
4876 &subfirstbyte, /* For possible first char */
4877 &subreqbyte, /* For possible last char */
4878 bcptr, /* Current branch chain */
4879 cd, /* Tables block */
4880 (lengthptr == NULL)? NULL : /* Actual compile phase */
4881 &length_prevgroup /* Pre-compile phase */
4882 ))
4883 goto FAILED;
4884
4885 /* At the end of compiling, code is still pointing to the start of the
4886 group, while tempcode has been updated to point past the end of the group
4887 and any option resetting that may follow it. The pattern pointer (ptr)
4888 is on the bracket. */
4889
4890 /* If this is a conditional bracket, check that there are no more than
4891 two branches in the group, or just one if it's a DEFINE group. We do this
4892 in the real compile phase, not in the pre-pass, where the whole group may
4893 not be available. */
4894
4895 if (bravalue == OP_COND && lengthptr == NULL)
4896 {
4897 uschar *tc = code;
4898 int condcount = 0;
4899
4900 do {
4901 condcount++;
4902 tc += GET(tc,1);
4903 }
4904 while (*tc != OP_KET);
4905
4906 /* A DEFINE group is never obeyed inline (the "condition" is always
4907 false). It must have only one branch. */
4908
4909 if (code[LINK_SIZE+1] == OP_DEF)
4910 {
4911 if (condcount > 1)
4912 {
4913 *errorcodeptr = ERR54;
4914 goto FAILED;
4915 }
4916 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4917 }
4918
4919 /* A "normal" conditional group. If there is just one branch, we must not
4920 make use of its firstbyte or reqbyte, because this is equivalent to an
4921 empty second branch. */
4922
4923 else
4924 {
4925 if (condcount > 2)
4926 {
4927 *errorcodeptr = ERR27;
4928 goto FAILED;
4929 }
4930 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4931 }
4932 }
4933
4934 /* Error if hit end of pattern */
4935
4936 if (*ptr != ')')
4937 {
4938 *errorcodeptr = ERR14;
4939 goto FAILED;
4940 }
4941
4942 /* In the pre-compile phase, update the length by the length of the group,
4943 less the brackets at either end. Then reduce the compiled code to just a
4944 set of non-capturing brackets so that it doesn't use much memory if it is
4945 duplicated by a quantifier.*/
4946
4947 if (lengthptr != NULL)
4948 {
4949 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4950 {
4951 *errorcodeptr = ERR20;
4952 goto FAILED;
4953 }
4954 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4955 *code++ = OP_BRA;
4956 PUTINC(code, 0, 1 + LINK_SIZE);
4957 *code++ = OP_KET;
4958 PUTINC(code, 0, 1 + LINK_SIZE);
4959 break; /* No need to waste time with special character handling */
4960 }
4961
4962 /* Otherwise update the main code pointer to the end of the group. */
4963
4964 code = tempcode;
4965
4966 /* For a DEFINE group, required and first character settings are not
4967 relevant. */
4968
4969 if (bravalue == OP_DEF) break;
4970
4971 /* Handle updating of the required and first characters for other types of
4972 group. Update for normal brackets of all kinds, and conditions with two
4973 branches (see code above). If the bracket is followed by a quantifier with
4974 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4975 zerofirstbyte outside the main loop so that they can be accessed for the
4976 back off. */
4977
4978 zeroreqbyte = reqbyte;
4979 zerofirstbyte = firstbyte;
4980 groupsetfirstbyte = FALSE;
4981
4982 if (bravalue >= OP_ONCE)
4983 {
4984 /* If we have not yet set a firstbyte in this branch, take it from the
4985 subpattern, remembering that it was set here so that a repeat of more
4986 than one can replicate it as reqbyte if necessary. If the subpattern has
4987 no firstbyte, set "none" for the whole branch. In both cases, a zero
4988 repeat forces firstbyte to "none". */
4989
4990 if (firstbyte == REQ_UNSET)
4991 {
4992 if (subfirstbyte >= 0)
4993 {
4994 firstbyte = subfirstbyte;
4995 groupsetfirstbyte = TRUE;
4996 }
4997 else firstbyte = REQ_NONE;
4998 zerofirstbyte = REQ_NONE;
4999 }
5000
5001 /* If firstbyte was previously set, convert the subpattern's firstbyte
5002 into reqbyte if there wasn't one, using the vary flag that was in
5003 existence beforehand. */
5004
5005 else if (subfirstbyte >= 0 && subreqbyte < 0)
5006 subreqbyte = subfirstbyte | tempreqvary;
5007
5008 /* If the subpattern set a required byte (or set a first byte that isn't
5009 really the first byte - see above), set it. */
5010
5011 if (subreqbyte >= 0) reqbyte = subreqbyte;
5012 }
5013
5014 /* For a forward assertion, we take the reqbyte, if set. This can be
5015 helpful if the pattern that follows the assertion doesn't set a different
5016 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5017 for an assertion, however, because it leads to incorrect effects for patterns
5018 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5019 of a firstbyte. This is overcome by a scan at the end if there's no
5020 firstbyte, looking for an asserted first char. */
5021
5022 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5023 break; /* End of processing '(' */
5024
5025
5026 /* ===================================================================*/
5027 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5028 are arranged to be the negation of the corresponding OP_values. For the
5029 back references, the values are ESC_REF plus the reference number. Only
5030 back references and those types that consume a character may be repeated.
5031 We can test for values between ESC_b and ESC_Z for the latter; this may
5032 have to change if any new ones are ever created. */
5033
5034 case '\\':
5035 tempptr = ptr;
5036 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5037 if (*errorcodeptr != 0) goto FAILED;
5038
5039 if (c < 0)
5040 {
5041 if (-c == ESC_Q) /* Handle start of quoted string */
5042 {
5043 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5044 else inescq = TRUE;
5045 continue;
5046 }
5047
5048 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5049
5050 /* For metasequences that actually match a character, we disable the
5051 setting of a first character if it hasn't already been set. */
5052
5053 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5054 firstbyte = REQ_NONE;
5055
5056 /* Set values to reset to if this is followed by a zero repeat. */
5057
5058 zerofirstbyte = firstbyte;
5059 zeroreqbyte = reqbyte;
5060
5061 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5062 We also support \k{name} (.NET syntax) */
5063
5064 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5065 {
5066 is_recurse = FALSE;
5067 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5068 goto NAMED_REF_OR_RECURSE;
5069 }
5070
5071 /* Back references are handled specially; must disable firstbyte if
5072 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5073 ':' later. */
5074
5075 if (-c >= ESC_REF)
5076 {
5077 recno = -c - ESC_REF;
5078
5079 HANDLE_REFERENCE: /* Come here from named backref handling */
5080 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5081 previous = code;
5082 *code++ = OP_REF;
5083 PUT2INC(code, 0, recno);
5084 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5085 if (recno > cd->top_backref) cd->top_backref = recno;
5086 }
5087
5088 /* So are Unicode property matches, if supported. */
5089
5090 #ifdef SUPPORT_UCP
5091 else if (-c == ESC_P || -c == ESC_p)
5092 {
5093 BOOL negated;
5094 int pdata;
5095 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5096 if (ptype < 0) goto FAILED;
5097 previous = code;
5098 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5099 *code++ = ptype;
5100 *code++ = pdata;
5101 }
5102 #else
5103
5104 /* If Unicode properties are not supported, \X, \P, and \p are not
5105 allowed. */
5106
5107 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5108 {
5109 *errorcodeptr = ERR45;
5110 goto FAILED;
5111 }
5112 #endif
5113
5114 /* For the rest (including \X when Unicode properties are supported), we
5115 can obtain the OP value by negating the escape value. */
5116
5117 else
5118 {
5119 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5120 *code++ = -c;
5121 }
5122 continue;
5123 }
5124
5125 /* We have a data character whose value is in c. In UTF-8 mode it may have
5126 a value > 127. We set its representation in the length/buffer, and then
5127 handle it as a data character. */
5128
5129 #ifdef SUPPORT_UTF8
5130 if (utf8 && c > 127)
5131 mclength = _pcre_ord2utf8(c, mcbuffer);
5132 else
5133 #endif
5134
5135 {
5136 mcbuffer[0] = c;
5137 mclength = 1;
5138 }
5139 goto ONE_CHAR;
5140
5141
5142 /* ===================================================================*/
5143 /* Handle a literal character. It is guaranteed not to be whitespace or #
5144 when the extended flag is set. If we are in UTF-8 mode, it may be a
5145 multi-byte literal character. */
5146
5147 default:
5148 NORMAL_CHAR:
5149 mclength = 1;
5150 mcbuffer[0] = c;
5151
5152 #ifdef SUPPORT_UTF8
5153 if (utf8 && c >= 0xc0)
5154 {
5155 while ((ptr[1] & 0xc0) == 0x80)
5156 mcbuffer[mclength++] = *(++ptr);
5157 }
5158 #endif
5159
5160 /* At this point we have the character's bytes in mcbuffer, and the length
5161 in mclength. When not in UTF-8 mode, the length is always 1. */
5162
5163 ONE_CHAR:
5164 previous = code;
5165 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5166 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5167
5168 /* Remember if \r or \n were seen */
5169
5170 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5171 cd->external_flags |= PCRE_HASCRORLF;
5172
5173 /* Set the first and required bytes appropriately. If no previous first
5174 byte, set it from this character, but revert to none on a zero repeat.
5175 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5176 repeat. */
5177
5178 if (firstbyte == REQ_UNSET)
5179 {
5180 zerofirstbyte = REQ_NONE;
5181 zeroreqbyte = reqbyte;
5182
5183 /* If the character is more than one byte long, we can set firstbyte
5184 only if it is not to be matched caselessly. */
5185
5186 if (mclength == 1 || req_caseopt == 0)
5187 {
5188 firstbyte = mcbuffer[0] | req_caseopt;
5189 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5190 }
5191 else firstbyte = reqbyte = REQ_NONE;
5192 }
5193
5194 /* firstbyte was previously set; we can set reqbyte only if the length is
5195 1 or the matching is caseful. */
5196
5197 else
5198 {
5199 zerofirstbyte = firstbyte;
5200 zeroreqbyte = reqbyte;
5201 if (mclength == 1 || req_caseopt == 0)
5202 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5203 }
5204
5205 break; /* End of literal character handling */
5206 }
5207 } /* end of big loop */
5208
5209
5210 /* Control never reaches here by falling through, only by a goto for all the
5211 error states. Pass back the position in the pattern so that it can be displayed
5212 to the user for diagnosing the error. */
5213
5214 FAILED:
5215 *ptrptr = ptr;
5216 return FALSE;
5217 }
5218
5219
5220
5221
5222 /*************************************************
5223 * Compile sequence of alternatives *
5224 *************************************************/
5225
5226 /* On entry, ptr is pointing past the bracket character, but on return it
5227 points to the closing bracket, or vertical bar, or end of string. The code
5228 variable is pointing at the byte into which the BRA operator has been stored.
5229 If the ims options are changed at the start (for a (?ims: group) or during any
5230 branch, we need to insert an OP_OPT item at the start of every following branch
5231 to ensure they get set correctly at run time, and also pass the new options
5232 into every subsequent branch compile.
5233
5234 This function is used during the pre-compile phase when we are trying to find
5235 out the amount of memory needed, as well as during the real compile phase. The
5236 value of lengthptr distinguishes the two phases.
5237
5238 Arguments:
5239 options option bits, including any changes for this subpattern
5240 oldims previous settings of ims option bits
5241 codeptr -> the address of the current code pointer
5242 ptrptr -> the address of the current pattern pointer
5243 errorcodeptr -> pointer to error code variable
5244 lookbehind TRUE if this is a lookbehind assertion
5245 reset_bracount TRUE to reset the count for each branch
5246 skipbytes skip this many bytes at start (for brackets and OP_COND)
5247 firstbyteptr place to put the first required character, or a negative number
5248 reqbyteptr place to put the last required character, or a negative number
5249 bcptr pointer to the chain of currently open branches
5250 cd points to the data block with tables pointers etc.
5251 lengthptr NULL during the real compile phase
5252 points to length accumulator during pre-compile phase
5253
5254 Returns: TRUE on success
5255 */
5256
/* Compiles one bracketed group: each '|'-separated alternative is handed to
compile_branch(), the alternatives are chained together with OP_ALT, and the
group is closed with OP_KET. When lengthptr is non-NULL (the pre-compile pass)
the emitted code is only scratch and what matters is the byte count that is
added to *lengthptr on return. */
5257 static BOOL
5258 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5259 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5260 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5261 int *lengthptr)
5262 {
5263 const uschar *ptr = *ptrptr;
5264 uschar *code = *codeptr;
/* last_branch tracks the start of the most recent branch: initially the
opening bracket, later each OP_ALT item as it is written. */
5265 uschar *last_branch = code;
5266 uschar *start_bracket = code;
/* Points at the length field of the current branch's OP_REVERSE item (only
set when lookbehind is TRUE). */
5267 uschar *reverse_count = NULL;
5268 int firstbyte, reqbyte;
5269 int branchfirstbyte, branchreqbyte;
5270 int length;
5271 int orig_bracount;
5272 int max_bracount;
5273 branch_chain bc;
5274
/* Push this group onto the chain of currently open branches that is passed
down to compile_branch(). */
5275 bc.outer = bcptr;
5276 bc.current = code;
5277
5278 firstbyte = reqbyte = REQ_UNSET;
5279
5280 /* Accumulate the length for use in the pre-compile phase. Start with the
5281 length of the BRA and KET and any extra bytes that are required at the
5282 beginning. We accumulate in a local variable to save frequent testing of
5283 lengthptr for NULL. We cannot do this by looking at the value of code at the
5284 start and end of each alternative, because compiled items are discarded during
5285 the pre-compile phase so that the work space is not exceeded. */
5286
5287 length = 2 + 2*LINK_SIZE + skipbytes;
5288
5289 /* WARNING: If the above line is changed for any reason, you must also change
5290 the code that abstracts option settings at the start of the pattern and makes
5291 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5292 pre-compile phase to find out whether anything has yet been compiled or not. */
5293
5294 /* Offset is set zero to mark that this bracket is still open */
5295
5296 PUT(code, 1, 0);
5297 code += 1 + LINK_SIZE + skipbytes;
5298
5299 /* Loop for each alternative branch */
5300
5301 orig_bracount = max_bracount = cd->bracount;
5302 for (;;)
5303 {
5304 /* For a (?| group, reset the capturing bracket count so that each branch
5305 uses the same numbers. */
5306
5307 if (reset_bracount) cd->bracount = orig_bracount;
5308
5309 /* Handle a change of ims options at the start of the branch */
5310
5311 if ((options & PCRE_IMS) != oldims)
5312 {
5313 *code++ = OP_OPT;
5314 *code++ = options & PCRE_IMS;
5315 length += 2;
5316 }
5317
5318 /* Set up dummy OP_REVERSE if lookbehind assertion */
5319
5320 if (lookbehind)
5321 {
5322 *code++ = OP_REVERSE;
5323 reverse_count = code;
5324 PUTINC(code, 0, 0);
5325 length += 1 + LINK_SIZE;
5326 }
5327
5328 /* Now compile the branch; in the pre-compile phase its length gets added
5329 into the length. */
5330
5331 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5332 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5333 {
5334 *ptrptr = ptr;
5335 return FALSE;
5336 }
5337
5338 /* Keep the highest bracket count in case (?| was used and some branch
5339 has fewer than the rest. */
5340
5341 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5342
5343 /* In the real compile phase, there is some post-processing to be done. */
5344
5345 if (lengthptr == NULL)
5346 {
5347 /* If this is the first branch, the firstbyte and reqbyte values for the
5348 branch become the values for the regex. */
5349
5350 if (*last_branch != OP_ALT)
5351 {
5352 firstbyte = branchfirstbyte;
5353 reqbyte = branchreqbyte;
5354 }
5355
5356 /* If this is not the first branch, the first char and reqbyte have to
5357 match the values from all the previous branches, except that if the
5358 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5359 and we set REQ_VARY for the regex. */
5360
5361 else
5362 {
5363 /* If we previously had a firstbyte, but it doesn't match the new branch,
5364 we have to abandon the firstbyte for the regex, but if there was
5365 previously no reqbyte, it takes on the value of the old firstbyte. */
5366
5367 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5368 {
5369 if (reqbyte < 0) reqbyte = firstbyte;
5370 firstbyte = REQ_NONE;
5371 }
5372
5373 /* If we (now or from before) have no firstbyte, a firstbyte from the
5374 branch becomes a reqbyte if there isn't a branch reqbyte. */
5375
5376 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5377 branchreqbyte = branchfirstbyte;
5378
5379 /* Now ensure that the reqbytes match */
5380
5381 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5382 reqbyte = REQ_NONE;
5383 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5384 }
5385
5386 /* If lookbehind, check that this branch matches a fixed-length string, and
5387 put the length into the OP_REVERSE item. Temporarily mark the end of the
5388 branch with OP_END. */
5389
5390 if (lookbehind)
5391 {
5392 int fixed_length;
5393 *code = OP_END;
5394 fixed_length = find_fixedlength(last_branch, options);
5395 DPRINTF(("fixed length = %d\n", fixed_length));
/* Any negative result is a failure; -2 is reported as ERR36 and every other
negative value as ERR25. */
5396 if (fixed_length < 0)
5397 {
5398 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5399 *ptrptr = ptr;
5400 return FALSE;
5401 }
5402 PUT(reverse_count, 0, fixed_length);
5403 }
5404 }
5405
5406 /* Reached end of expression, either ')' or end of pattern. In the real
5407 compile phase, go back through the alternative branches and reverse the chain
5408 of offsets, with the field in the BRA item now becoming an offset to the
5409 first alternative. If there are no alternatives, it points to the end of the
5410 group. The length in the terminating ket is always the length of the whole
5411 bracketed item. If any of the ims options were changed inside the group,
5412 compile a resetting op-code following, except at the very end of the pattern.
5413 Return leaving the pointer at the terminating char. */
5414
5415 if (*ptr != '|')
5416 {
5417 if (lengthptr == NULL)
5418 {
/* While the group is open each OP_ALT link field holds the distance back to
the previous branch, and the opening bracket holds zero. Walk that chain
backwards, overwriting each back-distance with the forward length of its
branch; the loop stops once the zero from the opening bracket has been read. */
5419 int branch_length = code - last_branch;
5420 do
5421 {
5422 int prev_length = GET(last_branch, 1);
5423 PUT(last_branch, 1, branch_length);
5424 branch_length = prev_length;
5425 last_branch -= branch_length;
5426 }
5427 while (branch_length > 0);
5428 }
5429
5430 /* Fill in the ket */
5431
5432 *code = OP_KET;
5433 PUT(code, 1, code - start_bracket);
5434 code += 1 + LINK_SIZE;
5435
5436 /* Resetting option if needed */
5437
5438 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5439 {
5440 *code++ = OP_OPT;
5441 *code++ = oldims;
5442 length += 2;
5443 }
5444
5445 /* Retain the highest bracket number, in case resetting was used. */
5446
5447 cd->bracount = max_bracount;
5448
5449 /* Set values to pass back */
5450
5451 *codeptr = code;
5452 *ptrptr = ptr;
5453 *firstbyteptr = firstbyte;
5454 *reqbyteptr = reqbyte;
5455 if (lengthptr != NULL)
5456 {
/* The test is written as a subtraction so that the sum *lengthptr + length is
never formed when it would exceed OFLOW_MAX. */
5457 if (OFLOW_MAX - *lengthptr < length)
5458 {
5459 *errorcodeptr = ERR20;
5460 return FALSE;
5461 }
5462 *lengthptr += length;
5463 }
5464 return TRUE;
5465 }
5466
5467 /* Another branch follows. In the pre-compile phase, we can move the code
5468 pointer back to where it was for the start of the first branch. (That is,
5469 pretend that each branch is the only one.)
5470
5471 In the real compile phase, insert an ALT node. Its length field points back
5472 to the previous branch while the bracket remains open. At the end the chain
5473 is reversed. It's done like this so that the start of the bracket has a
5474 zero offset until it is closed, making it possible to detect recursion. */
5475
5476 if (lengthptr != NULL)
5477 {
5478 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5479 length += 1 + LINK_SIZE;
5480 }
5481 else
5482 {
5483 *code = OP_ALT;
5484 PUT(code, 1, code - last_branch);
5485 bc.current = last_branch = code;
5486 code += 1 + LINK_SIZE;
5487 }
5488
/* Step over the '|' so that the next iteration starts on the following
branch. */
5489 ptr++;
5490 }
5491 /* Control never reaches here */
5492 }
5493
5494
5495
5496
5497 /*************************************************
5498 * Check for anchored expression *
5499 *************************************************/
5500
5501 /* Try to find out if this is an anchored regular expression. Consider each
5502 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5503 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5504 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5505 counts, since OP_CIRC can match in the middle.
5506
5507 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5508 This is the code for \G, which means "match at start of match position, taking
5509 into account the match offset".
5510
5511 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5512 because that will try the rest of the pattern at all possible matching points,
5513 so there is no point trying again.... er ....
5514
5515 .... except when the .* appears inside capturing parentheses, and there is a
5516 subsequent back reference to those parentheses. We haven't enough information
5517 to catch that case precisely.
5518
5519 At first, the best we could do was to detect when .* was in capturing brackets
5520 and the highest back reference was greater than or equal to that level.
5521 However, by keeping a bitmap of the first 31 back references, we can catch some
5522 of the more common cases more precisely.
5523
5524 Arguments:
5525 code points to start of expression (the bracket)
5526 options points to the options setting
5527 bracket_map a bitmap of which brackets we are inside while testing; this
5528 handles up to substring 31; after that we just have to take
5529 the less precise approach
5530 backref_map the back reference bitmap
5531
5532 Returns: TRUE or FALSE
5533 */
5534
5535 static BOOL
5536 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5537 unsigned int backref_map)
5538 {
5539 do {
5540 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5541 options, PCRE_MULTILINE, FALSE);
5542 register int op = *scode;
5543
5544 /* Non-capturing brackets */
5545
5546 if (op == OP_BRA)
5547 {
5548 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5549 }
5550
5551 /* Capturing brackets */
5552
5553 else if (op == OP_CBRA)
5554 {
5555 int n = GET2(scode, 1+LINK_SIZE);
5556 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5557 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5558 }
5559
5560 /* Other brackets */
5561
5562 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5563 {
5564 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5565 }
5566
5567 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5568 are or may be referenced. */
5569
5570 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5571 op == OP_TYPEPOSSTAR) &&
5572 (*options & PCRE_DOTALL) != 0)
5573 {
5574 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5575 }
5576
5577 /* Check for explicit anchoring */
5578
5579 else if (op != OP_SOD && op != OP_SOM &&
5580 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5581 return FALSE;
5582 code += GET(code, 1);
5583 }
5584 while (*code == OP_ALT); /* Loop for each alternative */
5585 return TRUE;
5586 }
5587
5588
5589
5590 /*************************************************
5591 * Check for starting with ^ or .* *
5592 *************************************************/
5593
5594 /* This is called to find out if every branch starts with ^ or .* so that
5595 "first char" processing can be done to speed things up in multiline
5596 matching and for non-DOTALL patterns that start with .* (which must start at
5597 the beginning or after \n). As in the case of is_anchored() (see above), we
5598 have to take account of back references to capturing brackets that contain .*
5599 because in that case we can't make the assumption.
5600
5601 Arguments:
5602 code points to start of expression (the bracket)
5603 bracket_map a bitmap of which brackets we are inside while testing; this
5604 handles up to substring 31; after that we just have to take
5605 the less precise approach
5606 backref_map the back reference bitmap
5607
5608 Returns: TRUE or FALSE
5609 */
5610
5611 static BOOL
5612 is_startline(const uschar *code, unsigned int bracket_map,
5613 unsigned int backref_map)
5614 {
5615 do {
5616 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5617 NULL, 0, FALSE);
5618 register int op = *scode;
5619
5620 /* Non-capturing brackets */
5621
5622 if (op == OP_BRA)
5623 {
5624 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5625 }
5626
5627 /* Capturing brackets */
5628
5629 else if (op == OP_CBRA)
5630 {
5631 int n = GET2(scode, 1+LINK_SIZE);
5632 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5633 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5634 }
5635
5636 /* Other brackets */
5637
5638 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5639 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5640
5641 /* .* means "start at start or after \n" if it isn't in brackets that
5642 may be referenced. */
5643
5644 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5645 {
5646 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5647 }
5648
5649 /* Check for explicit circumflex */
5650
5651 else if (op != OP_CIRC) return FALSE;
5652
5653 /* Move on to the next alternative */
5654
5655 code += GET(code, 1);
5656 }
5657 while (*code == OP_ALT); /* Loop for each alternative */
5658 return TRUE;
5659 }
5660
5661
5662
5663 /*************************************************
5664 * Check for asserted fixed first char *
5665 *************************************************/
5666
5667 /* During compilation, the "first char" settings from forward assertions are
5668 discarded, because they can cause conflicts with actual literals that follow.
5669 However, if we end up without a first char setting for an unanchored pattern,
5670 it is worth scanning the regex to see if there is an initial asserted first
5671 char. If all branches start with the same asserted char, or with a bracket all
5672 of whose alternatives start with the same asserted char (recurse ad lib), then
5673 we return that char, otherwise -1.
5674
5675 Arguments:
5676 code points to start of expression (the bracket)
5677 options pointer to the options (used to check casing changes)
5678 inassert TRUE if in an assertion
5679
5680 Returns: -1 or the fixed first char
5681 */
5682
5683 static int
5684 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5685 {
5686 register int c = -1;
5687 do {
5688 int d;
5689 const uschar *scode =
5690 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5691 register int op = *scode;
5692
5693 switch(op)
5694 {
5695 default:
5696 return -1;
5697
5698 case OP_BRA:
5699 case OP_CBRA:
5700 case OP_ASSERT:
5701 case OP_ONCE:
5702 case OP_COND:
5703 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5704 return -1;
5705 if (c < 0) c = d; else if (c != d) return -1;
5706 break;
5707
5708 case OP_EXACT: /* Fall through */
5709 scode += 2;
5710
5711 case OP_CHAR:
5712 case OP_CHARNC:
5713 case OP_PLUS:
5714 case OP_MINPLUS:
5715 case OP_POSPLUS:
5716 if (!inassert) return -1;
5717 if (c < 0)
5718 {
5719 c = scode[1];
5720 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5721 }
5722 else if (c != scode[1]) return -1;
5723 break;
5724 }
5725
5726 code += GET(code, 1);
5727 }
5728 while (*code == OP_ALT);
5729 return c;
5730 }
5731
5732
5733
5734 /*************************************************
5735 * Compile a Regular Expression *
5736 *************************************************/
5737
5738 /* This function takes a string and returns a pointer to a block of store
5739 holding a compiled version of the expression. The original API for this
5740 function had no error code return variable; it is retained for backwards
5741 compatibility. The new function is given a new name.
5742
5743 Arguments:
5744 pattern the regular expression
5745 options various option bits
5746 errorcodeptr pointer to error code variable (pcre_compile2() only)
5747 can be NULL if you don't want a code value
5748 errorptr pointer to pointer to error text
5749 erroroffset ptr offset in pattern where error was detected
5750 tables pointer to character tables or NULL
5751
5752 Returns: pointer to compiled data block, or NULL on error,
5753 with errorptr and erroroffset set
5754 */
5755
5756 PCRE_EXP_DEFN pcre *
5757 pcre_compile(const char *pattern, int options, const char **errorptr,
5758 int *erroroffset, const unsigned char *tables)
5759 {
5760 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5761 }
5762
5763
/* Two-pass compiler entry point. The first pass runs compile_regex() over a
workspace on the stack with a non-NULL length pointer, purely to measure how
much memory the compiled pattern needs. The result block is then obtained with
pcre_malloc and the second pass compiles into it for real. Afterwards forward
references are filled in and the anchoring, first byte and required byte
information is recorded in the block. On any failure NULL is returned, with the
error text, offset and (optionally) code passed back through the pointer
arguments. */
5764 PCRE_EXP_DEFN pcre *
5765 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5766 const char **errorptr, int *erroroffset, const unsigned char *tables)
5767 {
5768 real_pcre *re;
5769 int length = 1; /* For final END opcode */
5770 int firstbyte, reqbyte, newline;
5771 int errorcode = 0;
5772 int skipatstart = 0;
5773 #ifdef SUPPORT_UTF8
5774 BOOL utf8;
5775 #endif
5776 size_t size;
5777 uschar *code;
5778 const uschar *codestart;
5779 const uschar *ptr;
5780 compile_data compile_block;
5781 compile_data *cd = &compile_block;
5782
5783 /* This space is used for "compiling" into during the first phase, when we are
5784 computing the amount of memory that is needed. Compiled items are thrown away
5785 as soon as possible, so that a fairly large buffer should be sufficient for
5786 this purpose. The same space is used in the second phase for remembering where
5787 to fill in forward references to subpatterns. */
5788
5789 uschar cworkspace[COMPILE_WORK_SIZE];
5790
5791
5792 /* Set this early so that early errors get offset 0. */
5793
5794 ptr = (const uschar *)pattern;
5795
5796 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5797 can do is just return NULL, but we can set a code value if there is a code
5798 pointer. */
5799
5800 if (errorptr == NULL)
5801 {
/* Note that 99 is a bare literal here, not one of the ERRnn names used for
every other error in this function. */
5802 if (errorcodeptr != NULL) *errorcodeptr = 99;
5803 return NULL;
5804 }
5805
5806 *errorptr = NULL;
5807 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5808
5809 /* However, we can give a message for this error */
5810
5811 if (erroroffset == NULL)
5812 {
5813 errorcode = ERR16;
5814 goto PCRE_EARLY_ERROR_RETURN2;
5815 }
5816
5817 *erroroffset = 0;
5818
5819 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5820
5821 #ifdef SUPPORT_UTF8
/* The result of the validity check is written straight into *erroroffset. A
non-negative value is treated as a failure, and the jump goes to
PCRE_EARLY_ERROR_RETURN2 so that the offset just stored is not overwritten.
NOTE(review): when the check passes, *erroroffset is left holding the negative
return value rather than 0 - confirm that no caller reads it on success. */
5822 utf8 = (options & PCRE_UTF8) != 0;
5823 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5824 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5825 {
5826 errorcode = ERR44;
5827 goto PCRE_EARLY_ERROR_RETURN2;
5828 }
5829 #else
5830 if ((options & PCRE_UTF8) != 0)
5831 {
5832 errorcode = ERR32;
5833 goto PCRE_EARLY_ERROR_RETURN;
5834 }
5835 #endif
5836
5837 if ((options & ~PUBLIC_OPTIONS) != 0)
5838 {
5839 errorcode = ERR17;
5840 goto PCRE_EARLY_ERROR_RETURN;
5841 }
5842
5843 /* Set up pointers to the individual character tables */
5844
5845 if (tables == NULL) tables = _pcre_default_tables;
5846 cd->lcc = tables + lcc_offset;
5847 cd->fcc = tables + fcc_offset;
5848 cd->cbits = tables + cbits_offset;
5849 cd->ctypes = tables + ctypes_offset;
5850
5851 /* Check for global one-time settings at the start of the pattern, and remember
5852 the offset for later. */
5853
/* Each recognised "(*NAME)" item is skipped and overrides the corresponding
bits of the caller's options; the loop stops at the first item that is not
recognised. The closing ')' is part of every compared string, so testing "CR)"
before "CRLF)" and "ANY)" before "ANYCRLF)" cannot match the longer names by
mistake. */
5854 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5855 {
5856 int newnl = 0;
5857 int newbsr = 0;
5858
5859 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5860 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5861 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5862 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5863 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5864 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5865 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5866 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5867 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
5868 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5869
5870 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5871 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5872 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5873 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5874
5875 if (newnl != 0)
5876 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5877 else if (newbsr != 0)
5878 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5879 else break;
5880 }
5881
5882 /* Check validity of \R options. */
5883
5884 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5885 {
5886 case 0:
5887 case PCRE_BSR_ANYCRLF:
5888 case PCRE_BSR_UNICODE:
5889 break;
5890 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5891 }
5892
5893 /* Handle different types of newline. The three bits give seven cases. The
5894 current code allows for fixed one- or two-byte sequences, plus "any" and
5895 "anycrlf". */
5896
/* A fixed newline is encoded in a single int: one byte for a one-character
newline, or the first character in bits 8-15 and the second in bits 0-7 for a
two-character one. The negative values -1 and -2 stand for "any" and "anycrlf"
respectively. */
5897 switch (options & PCRE_NEWLINE_BITS)
5898 {
5899 case 0: newline = NEWLINE; break; /* Build-time default */
5900 case PCRE_NEWLINE_CR: newline = '\r'; break;
5901 case PCRE_NEWLINE_LF: newline = '\n'; break;
5902 case PCRE_NEWLINE_CR+
5903 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5904 case PCRE_NEWLINE_ANY: newline = -1; break;
5905 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5906 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5907 }
5908
5909 if (newline == -2)
5910 {
5911 cd->nltype = NLTYPE_ANYCRLF;
5912 }
5913 else if (newline < 0)
5914 {
5915 cd->nltype = NLTYPE_ANY;
5916 }
5917 else
5918 {
5919 cd->nltype = NLTYPE_FIXED;
5920 if (newline > 255)
5921 {
5922 cd->nllen = 2;
5923 cd->nl[0] = (newline >> 8) & 255;
5924 cd->nl[1] = newline & 255;
5925 }
5926 else
5927 {
5928 cd->nllen = 1;
5929 cd->nl[0] = newline;
5930 }
5931 }
5932
5933 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5934 references to help in deciding whether (.*) can be treated as anchored or not.
5935 */
5936
5937 cd->top_backref = 0;
5938 cd->backref_map = 0;
5939
5940 /* Reflect pattern for debugging output */
5941
5942 DPRINTF(("------------------------------------------------------------------\n"));
5943 DPRINTF(("%s\n", pattern));
5944
5945 /* Pretend to compile the pattern while actually just accumulating the length
5946 of memory required. This behaviour is triggered by passing a non-NULL final
5947 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5948 to compile parts of the pattern into; the compiled code is discarded when it is
5949 no longer needed, so hopefully this workspace will never overflow, though there
5950 is a test for its doing so. */
5951
5952 cd->bracount = cd->final_bracount = 0;
5953 cd->names_found = 0;
5954 cd->name_entry_size = 0;
5955 cd->name_table = NULL;
5956 cd->start_workspace = cworkspace;
5957 cd->start_code = cworkspace;
5958 cd->hwm = cworkspace;
5959 cd->start_pattern = (const uschar *)pattern;
5960 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5961 cd->req_varyopt = 0;
5962 cd->external_options = options;
5963 cd->external_flags = 0;
5964
5965 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5966 don't need to look at the result of the function here. The initial options have
5967 been put into the cd block so that they can be changed if an option setting is
5968 found within the regex right at the beginning. Bringing initial option settings
5969 outside can help speed up starting point checks. */
5970
5971 ptr += skipatstart;
5972 code = cworkspace;
5973 *code = OP_BRA;
5974 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5975 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5976 &length);
5977 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5978
5979 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5980 cd->hwm - cworkspace));
5981
5982 if (length > MAX_PATTERN_SIZE)
5983 {
5984 errorcode = ERR20;
5985 goto PCRE_EARLY_ERROR_RETURN;
5986 }
5987
5988 /* Compute the size of data block needed and get it, either from malloc or
5989 externally provided function. Integer overflow should no longer be possible
5990 because nowadays we limit the maximum value of cd->names_found and
5991 cd->name_entry_size. */
5992
5993 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5994 re = (real_pcre *)(pcre_malloc)(size);
5995
5996 if (re == NULL)
5997 {
5998 errorcode = ERR21;
5999 goto PCRE_EARLY_ERROR_RETURN;
6000 }
6001
6002 /* Put in the magic number, and save the sizes, initial options, internal
6003 flags, and character table pointer. NULL is used for the default character
6004 tables. The nullpad field is at the end; it's there to help in the case when a
6005 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6006 pointers. */
6007
6008 re->magic_number = MAGIC_NUMBER;
6009 re->size = size;
6010 re->options = cd->external_options;
6011 re->flags = cd->external_flags;
6012 re->dummy1 = 0;
6013 re->first_byte = 0;
6014 re->req_byte = 0;
6015 re->name_table_offset = sizeof(real_pcre);
6016 re->name_entry_size = cd->name_entry_size;
6017 re->name_count = cd->names_found;
6018 re->ref_count = 0;
6019 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6020 re->nullpad = NULL;
6021
6022 /* The starting points of the name/number translation table and of the code are
6023 passed around in the compile data block. The start/end pattern and initial
6024 options are already set from the pre-compile phase, as is the name_entry_size
6025 field. Reset the bracket count and the names_found field. Also reset the hwm
6026 field; this time it's used for remembering forward references to subpatterns.
6027 */
6028
/* Layout of the block: the real_pcre header, then the name table
(name_entry_size * name_count bytes), then the compiled code. */
6029 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6030 cd->bracount = 0;
6031 cd->names_found = 0;
6032 cd->name_table = (uschar *)re + re->name_table_offset;
6033 codestart = cd->name_table + re->name_entry_size * re->name_count;
6034 cd->start_code = codestart;
6035 cd->hwm = cworkspace;
6036 cd->req_varyopt = 0;
6037 cd->had_accept = FALSE;
6038
6039 /* Set up a starting, non-extracting bracket, then compile the expression. On
6040 error, errorcode will be set non-zero, so we don't need to look at the result
6041 of the function here. */
6042
6043 ptr = (const uschar *)pattern + skipatstart;
6044 code = (uschar *)codestart;
6045 *code = OP_BRA;
6046 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6047 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6048 re->top_bracket = cd->bracount;
6049 re->top_backref = cd->top_backref;
6050 re->flags = cd->external_flags;
6051
6052 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6053
6054 /* If not reached end of pattern on success, there's an excess bracket. */
6055
6056 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6057
6058 /* Fill in the terminating state and check for disastrous overflow, but
6059 if debugging, leave the test till after things are printed out. */
6060
6061 *code++ = OP_END;
6062
6063 #ifndef DEBUG
6064 if (code - codestart > length) errorcode = ERR23;
6065 #endif
6066
6067 /* Fill in any forward references that are required. */
6068
/* cd->hwm is used as a stack of LINK_SIZE-byte entries in cworkspace. Each
entry is popped and read as an offset into the compiled code; the value found
at that offset is the wanted group number, which is then overwritten with the
offset of the group located by find_bracket(). */
6069 while (errorcode == 0 && cd->hwm > cworkspace)
6070 {
6071 int offset, recno;
6072 const uschar *groupptr;
6073 cd->hwm -= LINK_SIZE;
6074 offset = GET(cd->hwm, 0);
6075 recno = GET(codestart, offset);
6076 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6077 if (groupptr == NULL) errorcode = ERR53;
6078 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6079 }
6080
6081 /* Give an error if there's a back reference to a non-existent capturing
6082 subpattern. */
6083
6084 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6085
6086 /* Failed to compile, or error while post-processing */
6087
6088 if (errorcode != 0)
6089 {
6090 (pcre_free)(re);
/* The two labels sit inside this block, after the free: the early failures
above jump straight to them, and none of those jumps happens while a result
block is held. PCRE_EARLY_ERROR_RETURN2 additionally skips the recomputation
of *erroroffset from ptr. */
6091 PCRE_EARLY_ERROR_RETURN:
6092 *erroroffset = ptr - (const uschar *)pattern;
6093 PCRE_EARLY_ERROR_RETURN2:
6094 *errorptr = find_error_text(errorcode);
6095 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6096 return NULL;
6097 }
6098
6099 /* If the anchored option was not passed, set the flag if we can determine that
6100 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6101 as starting with .* when DOTALL is set).
6102
6103 Otherwise, if we know what the first byte has to be, save it, because that
6104 speeds up unanchored matches no end. If not, see if we can set the
6105 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6106 start with ^, and also when all branches start with .* for non-DOTALL matches.
6107 */
6108
6109 if ((re->options & PCRE_ANCHORED) == 0)
6110 {
6111 int temp_options = re->options; /* May get changed during these scans */
6112 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6113 re->options |= PCRE_ANCHORED;
6114 else
6115 {
6116 if (firstbyte < 0)
6117 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6118 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6119 {
6120 int ch = firstbyte & 255;
6121 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6122 cd->fcc[ch] == ch)? ch : firstbyte;
6123 re->flags |= PCRE_FIRSTSET;
6124 }
6125 else if (is_startline(codestart, 0, cd->backref_map))
6126 re->flags |= PCRE_STARTLINE;
6127 }
6128 }
6129
6130 /* For an anchored pattern, we use the "required byte" only if it follows a
6131 variable length item in the regex. Remove the caseless flag for non-caseable
6132 bytes. */
6133
6134 if (reqbyte >= 0 &&
6135 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6136 {
6137 int ch = reqbyte & 255;
6138 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6139 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6140 re->flags |= PCRE_REQCHSET;
6141 }
6142
6143 /* Print out the compiled data if debugging is enabled. This is never the
6144 case when building a production library. */
6145
6146 #ifdef DEBUG
6147
6148 printf("Length = %d top_bracket = %d top_backref = %d\n",
6149 length, re->top_bracket, re->top_backref);
6150
6151 printf("Options=%08x\n", re->options);
6152
6153 if ((re->flags & PCRE_FIRSTSET) != 0)
6154 {
6155 int ch = re->first_byte & 255;
6156 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6157 "" : " (caseless)";
6158 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6159 else printf("First char = \\x%02x%s\n", ch, caseless);
6160 }
6161
6162 if ((re->flags & PCRE_REQCHSET) != 0)
6163 {
6164 int ch = re->req_byte & 255;
6165 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6166 "" : " (caseless)";
6167 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6168 else printf("Req char = \\x%02x%s\n", ch, caseless);
6169 }
6170
6171 pcre_printint(re, stdout, TRUE);
6172
6173 /* This check is done here in the debugging case so that the code that
6174 was compiled can be seen. */
6175
6176 if (code - codestart > length)
6177 {
6178 (pcre_free)(re);
6179 *errorptr = find_error_text(ERR23);
6180 *erroroffset = ptr - (uschar *)pattern;
6181 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6182 return NULL;
6183 }
6184 #endif /* DEBUG */
6185
6186 return (pcre *)re;
6187 }
6188
6189 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12