/[pcre]/code/tags/pcre-7.8/pcre_compile.c
ViewVC logotype

Contents of /code/tags/pcre-7.8/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 243 - (show annotations) (download)
Thu Sep 13 09:28:14 2007 UTC (7 years, 3 months ago) by ph10
Original Path: code/trunk/pcre_compile.c
File MIME type: text/plain
File size: 193564 byte(s)
Detrailed files for 7.4-RC1 test release.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a (eight bits per byte). Each argument and the whole expansion are
parenthesized so that expression arguments such as SETBIT(map, c + 1) index and
shift by the intended value. Note that b is evaluated twice, so it must not
have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = { /* indexed by (character - '0') */
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g; the entry for a is the data value 7 */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z; zero entries need further processing or are invalid */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = { /* indexed by (character - 0x48); row comments give the hex code of the first entry */
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
146
147 typedef struct verbitem {
148 int len;
149 int op;
150 } verbitem;
151
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
160
161 static verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
169 };
170
171 static int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence. */
179
180 static const char posix_names[] = /* class names, separated by zero bytes */
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
184
185 static const uschar posix_name_lengths[] = { /* one length per name, in the same order as posix_names */
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; /* the final 0 terminates the list */
187
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
197
198 static const int posix_class_maps[] = { /* three ints per class: base map, second map or -1, add/subtract and tweak code */
199 cbit_word, cbit_digit, -2, /* alpha - word minus digit, then underscore removed */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */
204 cbit_space, -1, 1, /* blank - a GNU extension; space with vertical space characters removed */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
213 };
214
215
216 #define STRING(a) # a /* turns the argument, exactly as written, into a string literal */
217 #define XSTRING(s) STRING(s) /* macro-expands s first, then stringifies the result */
218
219 /* The texts of compile-time error messages. These are "char *" because they
220 are passed to the outside world. Do not ever re-use any error number, because
221 they are documented. Always add a new error instead. Messages marked DEAD below
222 are no longer used. This used to be a table of strings, but in order to reduce
223 the number of relocations needed when a shared library is loaded dynamically,
224 it is now one long string. We cannot use a table of offsets, because the
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226 simply count through to the one we want - this isn't a performance issue
227 because these strings are used only when there is a compilation error. */
228
229 static const char error_texts[] = /* texts in error-number order, separated by zero bytes; find_error_text() counts through them */
230 "no error\0"
231 "\\ at end of pattern\0"
232 "\\c at end of pattern\0"
233 "unrecognized character follows \\\0"
234 "numbers out of order in {} quantifier\0"
235 /* 5 */
236 "number too big in {} quantifier\0"
237 "missing terminating ] for character class\0"
238 "invalid escape sequence in character class\0"
239 "range out of order in character class\0"
240 "nothing to repeat\0"
241 /* 10 */
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243 "internal error: unexpected repeat\0"
244 "unrecognized character after (?\0"
245 "POSIX named classes are supported only within a class\0"
246 "missing )\0"
247 /* 15 */
248 "reference to non-existent subpattern\0"
249 "erroffset passed as NULL\0"
250 "unknown option bit(s) set\0"
251 "missing ) after comment\0"
252 "parentheses nested too deeply\0" /** DEAD **/
253 /* 20 */
254 "regular expression is too large\0"
255 "failed to get memory\0"
256 "unmatched parentheses\0"
257 "internal error: code overflow\0"
258 "unrecognized character after (?<\0"
259 /* 25 */
260 "lookbehind assertion is not fixed length\0"
261 "malformed number or name after (?(\0"
262 "conditional group contains more than two branches\0"
263 "assertion expected after (?(\0"
264 "(?R or (?[+-]digits must be followed by )\0"
265 /* 30 */
266 "unknown POSIX class name\0"
267 "POSIX collating elements are not supported\0"
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269 "spare error\0" /** DEAD **/
270 "character value in \\x{...} sequence is too large\0"
271 /* 35 */
272 "invalid condition (?(0)\0"
273 "\\C not allowed in lookbehind assertion\0"
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275 "number after (?C is > 255\0"
276 "closing ) for (?C expected\0"
277 /* 40 */
278 "recursive call could loop indefinitely\0"
279 "unrecognized character after (?P\0"
280 "syntax error in subpattern name (missing terminator)\0"
281 "two named subpatterns have the same name\0"
282 "invalid UTF-8 string\0"
283 /* 45 */
284 "support for \\P, \\p, and \\X has not been compiled\0"
285 "malformed \\P or \\p sequence\0"
286 "unknown property name after \\P or \\p\0"
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 /* 50 */
290 "repeated subpattern is too long\0" /** DEAD **/
291 "octal value is greater than \\377 (not in UTF-8 mode)\0"
292 "internal error: overran compiling workspace\0"
293 "internal error: previously-checked referenced subpattern not found\0"
294 "DEFINE group contains more than one branch\0"
295 /* 55 */
296 "repeating a DEFINE group is not allowed\0"
297 "inconsistent NEWLINE options\0"
298 "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300 "(*VERB) with an argument is not supported\0"
301 /* 60 */
302 "(*VERB) not recognized\0"
303 "number is too big";
304
305
306 /* Table to identify digits and hex digits. This is used when compiling
307 patterns. Note that the tables in chartables are dependent on the locale, and
308 may mark arbitrary characters as digits - but the PCRE compiling code expects
309 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
310 a private table here. It costs 256 bytes, but it is a lot faster than doing
311 character value tests (at least in some simple cases I timed), and in some
312 applications one wants PCRE to compile efficiently as well as match
313 efficiently.
314
315 For convenience, we use the same bit definitions as in chartables:
316
317 0x04 decimal digit
318 0x08 hexadecimal digit
319
320 Then we can use ctype_digit and ctype_xdigit in the code. */
321
322 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
323 static const unsigned char digitab[] = /* indexed by character value; 0x04 = decimal digit, 0x08 = hex digit */
324 {
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - ' */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
331 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 (both decimal and hex digits) */
332 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
333 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G (A-F are hex digits only) */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
337 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g (a-f are hex digits only) */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
357
358 #else /* This is the "abnormal" case, for EBCDIC systems */
359 static const unsigned char digitab[] = /* indexed by character value; 0x04 = decimal digit, 0x08 = hex digit */
360 {
361 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
362 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
377 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 (a-f are hex digits only) */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
385 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 (A-F are hex digits only) */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
391 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 (both decimal and hex digits) */
392 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
393
394 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup; indexed by character value, check_escape() tests the 0x0E bits to detect alphamerics */
395 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
396 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
397 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
399 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
403 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
404 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
406 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
408 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
411 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
412 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
413 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
414 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
415 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
416 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
417 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
418 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
419 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
420 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
421 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
422 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
423 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
424 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
425 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
426 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
427 #endif
428
429
430 /* Definition to allow mutual recursion */
431
432 static BOOL
433 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
434 int *, int *, branch_chain *, compile_data *, int *);
435
436
437
438 /*************************************************
439 * Find an error text *
440 *************************************************/
441
442 /* The error texts are now all in one long string, to save on relocations. As
443 some of the text is of unknown length, we can't use a table of offsets.
444 Instead, just count through the strings. This is not a performance issue
445 because it happens only when there has been a compilation error.
446
447 Argument: the error number
448 Returns: pointer to the error string
449 */
450
451 static const char *
452 find_error_text(int n)
453 {
454 const char *s = error_texts;
455 for (; n > 0; n--) while (*s++ != 0);
456 return s;
457 }
458
459
460 /*************************************************
461 * Handle escapes *
462 *************************************************/
463
464 /* This function is called when a \ has been encountered. It either returns a
465 positive value for a simple escape such as \n, or a negative value which
466 encodes one of the more complicated things such as \d. A backreference to group
467 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
468 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
469 ptr is pointing at the \. On exit, it is on the final character of the escape
470 sequence.
471
472 Arguments:
473 ptrptr points to the pattern position pointer
474 errorcodeptr points to the errorcode variable
475 bracount number of previous extracting brackets
476 options the options bits
477 isclass TRUE if inside a character class
478
479 Returns: zero or positive => a data character
480 negative => a special escape sequence
481 on error, errorcodeptr is set
482 */
483
484 static int
485 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
486 int options, BOOL isclass)
487 {
488 BOOL utf8 = (options & PCRE_UTF8) != 0;
489 const uschar *ptr = *ptrptr + 1;
490 int c, i;
491
492 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
493 ptr--; /* Set pointer back to the last byte */
494
495 /* If backslash is at the end of the pattern, it's an error. */
496
497 if (c == 0) *errorcodeptr = ERR1;
498
499 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
500 a table. A non-zero result is something that can be returned immediately.
501 Otherwise further processing may be required. */
502
503 #ifndef EBCDIC /* ASCII coding */
504 else if (c < '0' || c > 'z') {} /* Not alphameric */
505 else if ((i = escapes[c - '0']) != 0) c = i;
506
507 #else /* EBCDIC coding */
508 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
509 else if ((i = escapes[c - 0x48]) != 0) c = i;
510 #endif
511
512 /* Escapes that need further processing, or are illegal. */
513
514 else
515 {
516 const uschar *oldptr;
517 BOOL braced, negated;
518
519 switch (c)
520 {
521 /* A number of Perl escapes are not handled by PCRE. We give an explicit
522 error. */
523
524 case 'l':
525 case 'L':
526 case 'N':
527 case 'u':
528 case 'U':
529 *errorcodeptr = ERR37;
530 break;
531
532 /* \g must be followed by a number, either plain or braced. If positive, it
533 is an absolute backreference. If negative, it is a relative backreference.
534 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
535 reference to a named group. This is part of Perl's movement towards a
536 unified syntax for back references. As this is synonymous with \k{name}, we
537 fudge it up by pretending it really was \k. */
538
539 case 'g':
540 if (ptr[1] == '{')
541 {
542 const uschar *p;
543 for (p = ptr+2; *p != 0 && *p != '}'; p++)
544 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
545 if (*p != 0 && *p != '}')
546 {
547 c = -ESC_k;
548 break;
549 }
550 braced = TRUE;
551 ptr++;
552 }
553 else braced = FALSE;
554
555 if (ptr[1] == '-')
556 {
557 negated = TRUE;
558 ptr++;
559 }
560 else negated = FALSE;
561
562 c = 0;
563 while ((digitab[ptr[1]] & ctype_digit) != 0)
564 c = c * 10 + *(++ptr) - '0';
565
566 if (c < 0)
567 {
568 *errorcodeptr = ERR61;
569 break;
570 }
571
572 if (c == 0 || (braced && *(++ptr) != '}'))
573 {
574 *errorcodeptr = ERR57;
575 break;
576 }
577
578 if (negated)
579 {
580 if (c > bracount)
581 {
582 *errorcodeptr = ERR15;
583 break;
584 }
585 c = bracount - (c - 1);
586 }
587
588 c = -(ESC_REF + c);
589 break;
590
591 /* The handling of escape sequences consisting of a string of digits
592 starting with one that is not zero is not straightforward. By experiment,
593 the way Perl works seems to be as follows:
594
595 Outside a character class, the digits are read as a decimal number. If the
596 number is less than 10, or if there are that many previous extracting
597 left brackets, then it is a back reference. Otherwise, up to three octal
598 digits are read to form an escaped byte. Thus \123 is likely to be octal
599 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
600 value is greater than 377, the least significant 8 bits are taken. Inside a
601 character class, \ followed by a digit is always an octal number. */
602
603 case '1': case '2': case '3': case '4': case '5':
604 case '6': case '7': case '8': case '9':
605
606 if (!isclass)
607 {
608 oldptr = ptr;
609 c -= '0';
610 while ((digitab[ptr[1]] & ctype_digit) != 0)
611 c = c * 10 + *(++ptr) - '0';
612 if (c < 0)
613 {
614 *errorcodeptr = ERR61;
615 break;
616 }
617 if (c < 10 || c <= bracount)
618 {
619 c = -(ESC_REF + c);
620 break;
621 }
622 ptr = oldptr; /* Put the pointer back and fall through */
623 }
624
625 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
626 generates a binary zero byte and treats the digit as a following literal.
627 Thus we have to pull back the pointer by one. */
628
629 if ((c = *ptr) >= '8')
630 {
631 ptr--;
632 c = 0;
633 break;
634 }
635
636 /* \0 always starts an octal number, but we may drop through to here with a
637 larger first octal digit. The original code used just to take the least
638 significant 8 bits of octal numbers (I think this is what early Perls used
639 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
640 than 3 octal digits. */
641
642 case '0':
643 c -= '0';
644 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
645 c = c * 8 + *(++ptr) - '0';
646 if (!utf8 && c > 255) *errorcodeptr = ERR51;
647 break;
648
649 /* \x is complicated. \x{ddd} is a character number which can be greater
650 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
651 treated as a data character. */
652
653 case 'x':
654 if (ptr[1] == '{')
655 {
656 const uschar *pt = ptr + 2;
657 int count = 0;
658
659 c = 0;
660 while ((digitab[*pt] & ctype_xdigit) != 0)
661 {
662 register int cc = *pt++;
663 if (c == 0 && cc == '0') continue; /* Leading zeroes */
664 count++;
665
666 #ifndef EBCDIC /* ASCII coding */
667 if (cc >= 'a') cc -= 32; /* Convert to upper case */
668 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
669 #else /* EBCDIC coding */
670 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
671 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
672 #endif
673 }
674
675 if (*pt == '}')
676 {
677 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
678 ptr = pt;
679 break;
680 }
681
682 /* If the sequence of hex digits does not end with '}', then we don't
683 recognize this construct; fall through to the normal \x handling. */
684 }
685
686 /* Read just a single-byte hex-defined char */
687
688 c = 0;
689 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
690 {
691 int cc; /* Some compilers don't like ++ */
692 cc = *(++ptr); /* in initializers */
693 #ifndef EBCDIC /* ASCII coding */
694 if (cc >= 'a') cc -= 32; /* Convert to upper case */
695 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
696 #else /* EBCDIC coding */
697 if (cc <= 'z') cc += 64; /* Convert to upper case */
698 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
699 #endif
700 }
701 break;
702
703 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
704 This coding is ASCII-specific, but then the whole concept of \cx is
705 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
706
707 case 'c':
708 c = *(++ptr);
709 if (c == 0)
710 {
711 *errorcodeptr = ERR2;
712 break;
713 }
714
715 #ifndef EBCDIC /* ASCII coding */
716 if (c >= 'a' && c <= 'z') c -= 32;
717 c ^= 0x40;
718 #else /* EBCDIC coding */
719 if (c >= 'a' && c <= 'z') c += 64;
720 c ^= 0xC0;
721 #endif
722 break;
723
724 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
725 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
726 for Perl compatibility, it is a literal. This code looks a bit odd, but
727 there used to be some cases other than the default, and there may be again
728 in future, so I haven't "optimized" it. */
729
730 default:
731 if ((options & PCRE_EXTRA) != 0) switch(c)
732 {
733 default:
734 *errorcodeptr = ERR3;
735 break;
736 }
737 break;
738 }
739 }
740
741 *ptrptr = ptr;
742 return c;
743 }
744
745
746
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Called after \P or \p has been seen, when PCRE is built with support for
Unicode properties. The property name is either the single character that
follows, or a name in braces which may start with ^ to request negation. The
name is then looked up in the _pcre_utt table by binary chop. On entry, *ptrptr
points at the P or p; on exit it points at the last character that was read.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching table entry, or -1 with
                   *errorcodeptr set to ERR46 (malformed sequence) or ERR47
                   (name not in the table)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The simple form: just one character after the P or p */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The braced form: an optional ^, then the name up to the closing brace. The
name must fit in the buffer with room for its terminating zero. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  while (len < (int)sizeof(name) - 1)
    {
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == '}') break;
    name[len++] = ch;
    }

  if (ch != '}') goto ERROR_RETURN;    /* Buffer filled before } was seen */
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop over the half-open range [lo, hi) of table entries */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The name is well-formed but unknown; *ptrptr has already been updated */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
836
837
838
839
840 /*************************************************
841 * Check for counted repeat *
842 *************************************************/
843
844 /* This function is called when a '{' is encountered in a place where it might
845 start a quantifier. It looks ahead to see if it really is a quantifier or not.
846 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
847 where the ddds are digits.
848
849 Arguments:
850 p pointer to the first char after '{'
851
852 Returns: TRUE or FALSE
853 */
854
855 static BOOL
856 is_counted_repeat(const uschar *p)
857 {
858 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
859 while ((digitab[*p] & ctype_digit) != 0) p++;
860 if (*p == '}') return TRUE;
861
862 if (*p++ != ',') return FALSE;
863 if (*p == '}') return TRUE;
864
865 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
866 while ((digitab[*p] & ctype_digit) != 0) p++;
867
868 return (*p == '}');
869 }
870
871
872
873 /*************************************************
874 * Read repeat counts *
875 *************************************************/
876
877 /* Read an item of the form {n,m} and return the values. This is called only
878 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
879 so the syntax is guaranteed to be correct, but we need to check the values.
880
881 Arguments:
882 p pointer to first char after '{'
883 minp pointer to int for min
884 maxp pointer to int for max
885 returned as -1 if no max
886 errorcodeptr points to error code variable
887
888 Returns: pointer to '}' on success;
889 current ptr on error, with errorcodeptr set non-zero
890 */
891
892 static const uschar *
893 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
894 {
895 int min = 0;
896 int max = -1;
897
898 /* Read the minimum value and do a paranoid check: a negative value indicates
899 an integer overflow. */
900
901 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
902 if (min < 0 || min > 65535)
903 {
904 *errorcodeptr = ERR5;
905 return p;
906 }
907
908 /* Read the maximum value if there is one, and again do a paranoid on its size.
909 Also, max must not be less than min. */
910
911 if (*p == '}') max = min; else
912 {
913 if (*(++p) != '}')
914 {
915 max = 0;
916 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
917 if (max < 0 || max > 65535)
918 {
919 *errorcodeptr = ERR5;
920 return p;
921 }
922 if (max < min)
923 {
924 *errorcodeptr = ERR4;
925 return p;
926 }
927 }
928 }
929
930 /* Fill in the required variables, and pass back the pointer to the terminating
931 '}'. */
932
933 *minp = min;
934 *maxp = max;
935 return p;
936 }
937
938
939
940 /*************************************************
941 * Find forward referenced subpattern *
942 *************************************************/
943
944 /* This function scans along a pattern's text looking for capturing
945 subpatterns, and counting them. If it finds a named pattern that matches the
946 name it is given, it returns its number. Alternatively, if the name is NULL, it
947 returns when it reaches a given numbered subpattern. This is used for forward
948 references to subpatterns. We know that if (?P< is encountered, the name will
949 be terminated by '>' because that is checked in the first pass.
950
951 Arguments:
952 ptr current position in the pattern
953 count current count of capturing parens so far encountered
954 name name to seek, or NULL if seeking a numbered subpattern
955 lorn name length, or subpattern number if name is NULL
956 xmode TRUE if we are in /x mode
957
958 Returns: the number of the named subpattern, or -1 if not found
959 */
960
961 static int
962 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
963 BOOL xmode)
964 {
965 const uschar *thisname;
966
967 for (; *ptr != 0; ptr++)
968 {
969 int term;
970
971 /* Skip over backslashed characters and also entire \Q...\E */
972
973 if (*ptr == '\\')
974 {
975 if (*(++ptr) == 0) return -1;
976 if (*ptr == 'Q') for (;;)
977 {
978 while (*(++ptr) != 0 && *ptr != '\\');
979 if (*ptr == 0) return -1;
980 if (*(++ptr) == 'E') break;
981 }
982 continue;
983 }
984
985 /* Skip over character classes */
986
987 if (*ptr == '[')
988 {
989 while (*(++ptr) != ']')
990 {
991 if (*ptr == 0) return -1;
992 if (*ptr == '\\')
993 {
994 if (*(++ptr) == 0) return -1;
995 if (*ptr == 'Q') for (;;)
996 {
997 while (*(++ptr) != 0 && *ptr != '\\');
998 if (*ptr == 0) return -1;
999 if (*(++ptr) == 'E') break;
1000 }
1001 continue;
1002 }
1003 }
1004 continue;
1005 }
1006
1007 /* Skip comments in /x mode */
1008
1009 if (xmode && *ptr == '#')
1010 {
1011 while (*(++ptr) != 0 && *ptr != '\n');
1012 if (*ptr == 0) return -1;
1013 continue;
1014 }
1015
1016 /* An opening parens must now be a real metacharacter */
1017
1018 if (*ptr != '(') continue;
1019 if (ptr[1] != '?' && ptr[1] != '*')
1020 {
1021 count++;
1022 if (name == NULL && count == lorn) return count;
1023 continue;
1024 }
1025
1026 ptr += 2;
1027 if (*ptr == 'P') ptr++; /* Allow optional P */
1028
1029 /* We have to disambiguate (?<! and (?<= from (?<name> */
1030
1031 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1032 *ptr != '\'')
1033 continue;
1034
1035 count++;
1036
1037 if (name == NULL && count == lorn) return count;
1038 term = *ptr++;
1039 if (term == '<') term = '>';
1040 thisname = ptr;
1041 while (*ptr != term) ptr++;
1042 if (name != NULL && lorn == ptr - thisname &&
1043 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1044 return count;
1045 }
1046
1047 return -1;
1048 }
1049
1050
1051
1052 /*************************************************
1053 * Find first significant op code *
1054 *************************************************/
1055
1056 /* This is called by several functions that scan a compiled expression looking
1057 for a fixed first character, or an anchoring op code etc. It skips over things
1058 that do not influence this. For some calls, a change of option is important.
1059 For some calls, it makes sense to skip negative forward and all backward
1060 assertions, and also the \b assertion; for others it does not.
1061
1062 Arguments:
1063 code pointer to the start of the group
1064 options pointer to external options
1065 optbit the option bit whose changing is significant, or
1066 zero if none are
1067 skipassert TRUE if certain assertions are to be skipped
1068
1069 Returns: pointer to the first significant opcode
1070 */
1071
1072 static const uschar*
1073 first_significant_code(const uschar *code, int *options, int optbit,
1074 BOOL skipassert)
1075 {
1076 for (;;)
1077 {
1078 switch ((int)*code)
1079 {
1080 case OP_OPT:
1081 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1082 *options = (int)code[1];
1083 code += 2;
1084 break;
1085
1086 case OP_ASSERT_NOT:
1087 case OP_ASSERTBACK:
1088 case OP_ASSERTBACK_NOT:
1089 if (!skipassert) return code;
1090 do code += GET(code, 1); while (*code == OP_ALT);
1091 code += _pcre_OP_lengths[*code];
1092 break;
1093
1094 case OP_WORD_BOUNDARY:
1095 case OP_NOT_WORD_BOUNDARY:
1096 if (!skipassert) return code;
1097 /* Fall through */
1098
1099 case OP_CALLOUT:
1100 case OP_CREF:
1101 case OP_RREF:
1102 case OP_DEF:
1103 code += _pcre_OP_lengths[*code];
1104 break;
1105
1106 default:
1107 return code;
1108 }
1109 }
1110 /* Control never reaches here */
1111 }
1112
1113
1114
1115
1116 /*************************************************
1117 * Find the fixed length of a pattern *
1118 *************************************************/
1119
1120 /* Scan a pattern and compute the fixed length of subject that will match it,
1121 if the length is fixed. This is needed for dealing with backward assertions.
1122 In UTF8 mode, the result is in characters rather than bytes.
1123
1124 Arguments:
1125 code points to the start of the pattern (the bracket)
1126 options the compiling options
1127
1128 Returns: the fixed length, or -1 if there is no fixed length,
1129 or -2 if \C was encountered
1130 */
1131
1132 static int
1133 find_fixedlength(uschar *code, int options)
1134 {
1135 int length = -1;
1136
1137 register int branchlength = 0;
1138 register uschar *cc = code + 1 + LINK_SIZE;
1139
1140 /* Scan along the opcodes for this branch. If we get to the end of the
1141 branch, check the length against that of the other branches. */
1142
1143 for (;;)
1144 {
1145 int d;
1146 register int op = *cc;
1147 switch (op)
1148 {
1149 case OP_CBRA:
1150 case OP_BRA:
1151 case OP_ONCE:
1152 case OP_COND:
1153 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1154 if (d < 0) return d;
1155 branchlength += d;
1156 do cc += GET(cc, 1); while (*cc == OP_ALT);
1157 cc += 1 + LINK_SIZE;
1158 break;
1159
1160 /* Reached end of a branch; if it's a ket it is the end of a nested
1161 call. If it's ALT it is an alternation in a nested call. If it is
1162 END it's the end of the outer call. All can be handled by the same code. */
1163
1164 case OP_ALT:
1165 case OP_KET:
1166 case OP_KETRMAX:
1167 case OP_KETRMIN:
1168 case OP_END:
1169 if (length < 0) length = branchlength;
1170 else if (length != branchlength) return -1;
1171 if (*cc != OP_ALT) return length;
1172 cc += 1 + LINK_SIZE;
1173 branchlength = 0;
1174 break;
1175
1176 /* Skip over assertive subpatterns */
1177
1178 case OP_ASSERT:
1179 case OP_ASSERT_NOT:
1180 case OP_ASSERTBACK:
1181 case OP_ASSERTBACK_NOT:
1182 do cc += GET(cc, 1); while (*cc == OP_ALT);
1183 /* Fall through */
1184
1185 /* Skip over things that don't match chars */
1186
1187 case OP_REVERSE:
1188 case OP_CREF:
1189 case OP_RREF:
1190 case OP_DEF:
1191 case OP_OPT:
1192 case OP_CALLOUT:
1193 case OP_SOD:
1194 case OP_SOM:
1195 case OP_EOD:
1196 case OP_EODN:
1197 case OP_CIRC:
1198 case OP_DOLL:
1199 case OP_NOT_WORD_BOUNDARY:
1200 case OP_WORD_BOUNDARY:
1201 cc += _pcre_OP_lengths[*cc];
1202 break;
1203
1204 /* Handle literal characters */
1205
1206 case OP_CHAR:
1207 case OP_CHARNC:
1208 case OP_NOT:
1209 branchlength++;
1210 cc += 2;
1211 #ifdef SUPPORT_UTF8
1212 if ((options & PCRE_UTF8) != 0)
1213 {
1214 while ((*cc & 0xc0) == 0x80) cc++;
1215 }
1216 #endif
1217 break;
1218
1219 /* Handle exact repetitions. The count is already in characters, but we
1220 need to skip over a multibyte character in UTF8 mode. */
1221
1222 case OP_EXACT:
1223 branchlength += GET2(cc,1);
1224 cc += 4;
1225 #ifdef SUPPORT_UTF8
1226 if ((options & PCRE_UTF8) != 0)
1227 {
1228 while((*cc & 0x80) == 0x80) cc++;
1229 }
1230 #endif
1231 break;
1232
1233 case OP_TYPEEXACT:
1234 branchlength += GET2(cc,1);
1235 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1236 cc += 4;
1237 break;
1238
1239 /* Handle single-char matchers */
1240
1241 case OP_PROP:
1242 case OP_NOTPROP:
1243 cc += 2;
1244 /* Fall through */
1245
1246 case OP_NOT_DIGIT:
1247 case OP_DIGIT:
1248 case OP_NOT_WHITESPACE:
1249 case OP_WHITESPACE:
1250 case OP_NOT_WORDCHAR:
1251 case OP_WORDCHAR:
1252 case OP_ANY:
1253 branchlength++;
1254 cc++;
1255 break;
1256
1257 /* The single-byte matcher isn't allowed */
1258
1259 case OP_ANYBYTE:
1260 return -2;
1261
1262 /* Check a class for variable quantification */
1263
1264 #ifdef SUPPORT_UTF8
1265 case OP_XCLASS:
1266 cc += GET(cc, 1) - 33;
1267 /* Fall through */
1268 #endif
1269
1270 case OP_CLASS:
1271 case OP_NCLASS:
1272 cc += 33;
1273
1274 switch (*cc)
1275 {
1276 case OP_CRSTAR:
1277 case OP_CRMINSTAR:
1278 case OP_CRQUERY:
1279 case OP_CRMINQUERY:
1280 return -1;
1281
1282 case OP_CRRANGE:
1283 case OP_CRMINRANGE:
1284 if (GET2(cc,1) != GET2(cc,3)) return -1;
1285 branchlength += GET2(cc,1);
1286 cc += 5;
1287 break;
1288
1289 default:
1290 branchlength++;
1291 }
1292 break;
1293
1294 /* Anything else is variable length */
1295
1296 default:
1297 return -1;
1298 }
1299 }
1300 /* Control never gets here */
1301 }
1302
1303
1304
1305
1306 /*************************************************
1307 * Scan compiled regex for numbered bracket *
1308 *************************************************/
1309
1310 /* This little function scans through a compiled pattern until it finds a
1311 capturing bracket with the given number.
1312
1313 Arguments:
1314 code points to start of expression
1315 utf8 TRUE in UTF-8 mode
1316 number the required bracket number
1317
1318 Returns: pointer to the opcode for the bracket, or NULL if not found
1319 */
1320
1321 static const uschar *
1322 find_bracket(const uschar *code, BOOL utf8, int number)
1323 {
1324 for (;;)
1325 {
1326 register int c = *code;
1327 if (c == OP_END) return NULL;
1328
1329 /* XCLASS is used for classes that cannot be represented just by a bit
1330 map. This includes negated single high-valued characters. The length in
1331 the table is zero; the actual length is stored in the compiled code. */
1332
1333 if (c == OP_XCLASS) code += GET(code, 1);
1334
1335 /* Handle capturing bracket */
1336
1337 else if (c == OP_CBRA)
1338 {
1339 int n = GET2(code, 1+LINK_SIZE);
1340 if (n == number) return (uschar *)code;
1341 code += _pcre_OP_lengths[c];
1342 }
1343
1344 /* Otherwise, we can get the item's length from the table, except that for
1345 repeated character types, we have to test for \p and \P, which have an extra
1346 two bytes of parameters. */
1347
1348 else
1349 {
1350 switch(c)
1351 {
1352 case OP_TYPESTAR:
1353 case OP_TYPEMINSTAR:
1354 case OP_TYPEPLUS:
1355 case OP_TYPEMINPLUS:
1356 case OP_TYPEQUERY:
1357 case OP_TYPEMINQUERY:
1358 case OP_TYPEPOSSTAR:
1359 case OP_TYPEPOSPLUS:
1360 case OP_TYPEPOSQUERY:
1361 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1362 break;
1363
1364 case OP_TYPEUPTO:
1365 case OP_TYPEMINUPTO:
1366 case OP_TYPEEXACT:
1367 case OP_TYPEPOSUPTO:
1368 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1369 break;
1370 }
1371
1372 /* Add in the fixed length from the table */
1373
1374 code += _pcre_OP_lengths[c];
1375
1376 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1377 a multi-byte character. The length in the table is a minimum, so we have to
1378 arrange to skip the extra bytes. */
1379
1380 #ifdef SUPPORT_UTF8
1381 if (utf8) switch(c)
1382 {
1383 case OP_CHAR:
1384 case OP_CHARNC:
1385 case OP_EXACT:
1386 case OP_UPTO:
1387 case OP_MINUPTO:
1388 case OP_POSUPTO:
1389 case OP_STAR:
1390 case OP_MINSTAR:
1391 case OP_POSSTAR:
1392 case OP_PLUS:
1393 case OP_MINPLUS:
1394 case OP_POSPLUS:
1395 case OP_QUERY:
1396 case OP_MINQUERY:
1397 case OP_POSQUERY:
1398 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1399 break;
1400 }
1401 #endif
1402 }
1403 }
1404 }
1405
1406
1407
1408 /*************************************************
1409 * Scan compiled regex for recursion reference *
1410 *************************************************/
1411
1412 /* This little function scans through a compiled pattern until it finds an
1413 instance of OP_RECURSE.
1414
1415 Arguments:
1416 code points to start of expression
1417 utf8 TRUE in UTF-8 mode
1418
1419 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1420 */
1421
1422 static const uschar *
1423 find_recurse(const uschar *code, BOOL utf8)
1424 {
1425 for (;;)
1426 {
1427 register int c = *code;
1428 if (c == OP_END) return NULL;
1429 if (c == OP_RECURSE) return code;
1430
1431 /* XCLASS is used for classes that cannot be represented just by a bit
1432 map. This includes negated single high-valued characters. The length in
1433 the table is zero; the actual length is stored in the compiled code. */
1434
1435 if (c == OP_XCLASS) code += GET(code, 1);
1436
1437 /* Otherwise, we can get the item's length from the table, except that for
1438 repeated character types, we have to test for \p and \P, which have an extra
1439 two bytes of parameters. */
1440
1441 else
1442 {
1443 switch(c)
1444 {
1445 case OP_TYPESTAR:
1446 case OP_TYPEMINSTAR:
1447 case OP_TYPEPLUS:
1448 case OP_TYPEMINPLUS:
1449 case OP_TYPEQUERY:
1450 case OP_TYPEMINQUERY:
1451 case OP_TYPEPOSSTAR:
1452 case OP_TYPEPOSPLUS:
1453 case OP_TYPEPOSQUERY:
1454 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1455 break;
1456
1457 case OP_TYPEPOSUPTO:
1458 case OP_TYPEUPTO:
1459 case OP_TYPEMINUPTO:
1460 case OP_TYPEEXACT:
1461 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1462 break;
1463 }
1464
1465 /* Add in the fixed length from the table */
1466
1467 code += _pcre_OP_lengths[c];
1468
1469 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1470 by a multi-byte character. The length in the table is a minimum, so we have
1471 to arrange to skip the extra bytes. */
1472
1473 #ifdef SUPPORT_UTF8
1474 if (utf8) switch(c)
1475 {
1476 case OP_CHAR:
1477 case OP_CHARNC:
1478 case OP_EXACT:
1479 case OP_UPTO:
1480 case OP_MINUPTO:
1481 case OP_POSUPTO:
1482 case OP_STAR:
1483 case OP_MINSTAR:
1484 case OP_POSSTAR:
1485 case OP_PLUS:
1486 case OP_MINPLUS:
1487 case OP_POSPLUS:
1488 case OP_QUERY:
1489 case OP_MINQUERY:
1490 case OP_POSQUERY:
1491 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1492 break;
1493 }
1494 #endif
1495 }
1496 }
1497 }
1498
1499
1500
1501 /*************************************************
1502 * Scan compiled branch for non-emptiness *
1503 *************************************************/
1504
1505 /* This function scans through a branch of a compiled pattern to see whether it
1506 can match the empty string or not. It is called from could_be_empty()
1507 below and from compile_branch() when checking for an unlimited repeat of a
1508 group that can match nothing. Note that first_significant_code() skips over
1509 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1510 struck an inner bracket whose current branch will already have been scanned.
1511
1512 Arguments:
1513 code points to start of search
1514 endcode points to where to stop
1515 utf8 TRUE if in UTF8 mode
1516
1517 Returns: TRUE if what is matched could be empty
1518 */
1519
1520 static BOOL
1521 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1522 {
1523 register int c;
1524 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1525 code < endcode;
1526 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1527 {
1528 const uschar *ccode;
1529
1530 c = *code;
1531
1532 /* Groups with zero repeats can of course be empty; skip them. */
1533
1534 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1535 {
1536 code += _pcre_OP_lengths[c];
1537 do code += GET(code, 1); while (*code == OP_ALT);
1538 c = *code;
1539 continue;
1540 }
1541
1542 /* For other groups, scan the branches. */
1543
1544 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1545 {
1546 BOOL empty_branch;
1547 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1548
1549 /* Scan a closed bracket */
1550
1551 empty_branch = FALSE;
1552 do
1553 {
1554 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1555 empty_branch = TRUE;
1556 code += GET(code, 1);
1557 }
1558 while (*code == OP_ALT);
1559 if (!empty_branch) return FALSE; /* All branches are non-empty */
1560 c = *code;
1561 continue;
1562 }
1563
1564 /* Handle the other opcodes */
1565
1566 switch (c)
1567 {
1568 /* Check for quantifiers after a class. XCLASS is used for classes that
1569 cannot be represented just by a bit map. This includes negated single
1570 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1571 actual length is stored in the compiled code, so we must update "code"
1572 here. */
1573
1574 #ifdef SUPPORT_UTF8
1575 case OP_XCLASS:
1576 ccode = code += GET(code, 1);
1577 goto CHECK_CLASS_REPEAT;
1578 #endif
1579
1580 case OP_CLASS:
1581 case OP_NCLASS:
1582 ccode = code + 33;
1583
1584 #ifdef SUPPORT_UTF8
1585 CHECK_CLASS_REPEAT:
1586 #endif
1587
1588 switch (*ccode)
1589 {
1590 case OP_CRSTAR: /* These could be empty; continue */
1591 case OP_CRMINSTAR:
1592 case OP_CRQUERY:
1593 case OP_CRMINQUERY:
1594 break;
1595
1596 default: /* Non-repeat => class must match */
1597 case OP_CRPLUS: /* These repeats aren't empty */
1598 case OP_CRMINPLUS:
1599 return FALSE;
1600
1601 case OP_CRRANGE:
1602 case OP_CRMINRANGE:
1603 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1604 break;
1605 }
1606 break;
1607
1608 /* Opcodes that must match a character */
1609
1610 case OP_PROP:
1611 case OP_NOTPROP:
1612 case OP_EXTUNI:
1613 case OP_NOT_DIGIT:
1614 case OP_DIGIT:
1615 case OP_NOT_WHITESPACE:
1616 case OP_WHITESPACE:
1617 case OP_NOT_WORDCHAR:
1618 case OP_WORDCHAR:
1619 case OP_ANY:
1620 case OP_ANYBYTE:
1621 case OP_CHAR:
1622 case OP_CHARNC:
1623 case OP_NOT:
1624 case OP_PLUS:
1625 case OP_MINPLUS:
1626 case OP_POSPLUS:
1627 case OP_EXACT:
1628 case OP_NOTPLUS:
1629 case OP_NOTMINPLUS:
1630 case OP_NOTPOSPLUS:
1631 case OP_NOTEXACT:
1632 case OP_TYPEPLUS:
1633 case OP_TYPEMINPLUS:
1634 case OP_TYPEPOSPLUS:
1635 case OP_TYPEEXACT:
1636 return FALSE;
1637
1638 /* These are going to continue, as they may be empty, but we have to
1639 fudge the length for the \p and \P cases. */
1640
1641 case OP_TYPESTAR:
1642 case OP_TYPEMINSTAR:
1643 case OP_TYPEPOSSTAR:
1644 case OP_TYPEQUERY:
1645 case OP_TYPEMINQUERY:
1646 case OP_TYPEPOSQUERY:
1647 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1648 break;
1649
1650 /* Same for these */
1651
1652 case OP_TYPEUPTO:
1653 case OP_TYPEMINUPTO:
1654 case OP_TYPEPOSUPTO:
1655 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1656 break;
1657
1658 /* End of branch */
1659
1660 case OP_KET:
1661 case OP_KETRMAX:
1662 case OP_KETRMIN:
1663 case OP_ALT:
1664 return TRUE;
1665
1666 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1667 MINUPTO, and POSUPTO may be followed by a multibyte character */
1668
1669 #ifdef SUPPORT_UTF8
1670 case OP_STAR:
1671 case OP_MINSTAR:
1672 case OP_POSSTAR:
1673 case OP_QUERY:
1674 case OP_MINQUERY:
1675 case OP_POSQUERY:
1676 case OP_UPTO:
1677 case OP_MINUPTO:
1678 case OP_POSUPTO:
1679 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1680 break;
1681 #endif
1682 }
1683 }
1684
1685 return TRUE;
1686 }
1687
1688
1689
1690 /*************************************************
1691 * Scan compiled regex for non-emptiness *
1692 *************************************************/
1693
1694 /* This function is called to check for left recursive calls. We want to check
1695 the current branch of the current pattern to see if it could match the empty
1696 string. If it could, we must look outwards for branches at other levels,
1697 stopping when we pass beyond the bracket which is the subject of the recursion.
1698
1699 Arguments:
1700 code points to start of the recursion
1701 endcode points to where to stop (current RECURSE item)
1702 bcptr points to the chain of current (unclosed) branch starts
1703 utf8 TRUE if in UTF-8 mode
1704
1705 Returns: TRUE if what is matched could be empty
1706 */
1707
1708 static BOOL
1709 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1710 BOOL utf8)
1711 {
1712 while (bcptr != NULL && bcptr->current >= code)
1713 {
1714 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1715 bcptr = bcptr->outer;
1716 }
1717 return TRUE;
1718 }
1719
1720
1721
1722 /*************************************************
1723 * Check for POSIX class syntax *
1724 *************************************************/
1725
1726 /* This function is called when the sequence "[:" or "[." or "[=" is
1727 encountered in a character class. It checks whether this is followed by an
1728 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1729 ".]" or "=]".
1730
1731 Argument:
1732 ptr pointer to the initial [
1733 endptr where to return the end pointer
1734 cd pointer to compile data
1735
1736 Returns: TRUE or FALSE
1737 */
1738
1739 static BOOL
1740 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1741 {
1742 int terminator; /* Don't combine these lines; the Solaris cc */
1743 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1744 if (*(++ptr) == '^') ptr++;
1745 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1746 if (*ptr == terminator && ptr[1] == ']')
1747 {
1748 *endptr = ptr;
1749 return TRUE;
1750 }
1751 return FALSE;
1752 }
1753
1754
1755
1756
1757 /*************************************************
1758 * Check POSIX class name *
1759 *************************************************/
1760
1761 /* This function is called to check the name given in a POSIX-style class entry
1762 such as [:alnum:].
1763
1764 Arguments:
1765 ptr points to the first letter
1766 len the length of the name
1767
1768 Returns: a value representing the name, or -1 if unknown
1769 */
1770
1771 static int
1772 check_posix_name(const uschar *ptr, int len)
1773 {
1774 const char *pn = posix_names;
1775 register int yield = 0;
1776 while (posix_name_lengths[yield] != 0)
1777 {
1778 if (len == posix_name_lengths[yield] &&
1779 strncmp((const char *)ptr, pn, len) == 0) return yield;
1780 pn += posix_name_lengths[yield] + 1;
1781 yield++;
1782 }
1783 return -1;
1784 }
1785
1786
1787 /*************************************************
1788 * Adjust OP_RECURSE items in repeated group *
1789 *************************************************/
1790
1791 /* OP_RECURSE items contain an offset from the start of the regex to the group
1792 that is referenced. This means that groups can be replicated for fixed
1793 repetition simply by copying (because the recursion is allowed to refer to
1794 earlier groups that are outside the current group). However, when a group is
1795 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1796 it, after it has been compiled. This means that any OP_RECURSE items within it
1797 that refer to the group itself or any contained groups have to have their
1798 offsets adjusted. That one of the jobs of this function. Before it is called,
1799 the partially compiled regex must be temporarily terminated with OP_END.
1800
1801 This function has been extended with the possibility of forward references for
1802 recursions and subroutine calls. It must also check the list of such references
1803 for the group we are dealing with. If it finds that one of the recursions in
1804 the current group is on this list, it adjusts the offset in the list, not the
1805 value in the reference (which is a group number).
1806
1807 Arguments:
1808 group points to the start of the group
1809 adjust the amount by which the group is to be moved
1810 utf8 TRUE in UTF-8 mode
1811 cd contains pointers to tables etc.
1812 save_hwm the hwm forward reference pointer at the start of the group
1813
1814 Returns: nothing
1815 */
1816
1817 static void
1818 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1819 uschar *save_hwm)
1820 {
1821 uschar *ptr = group;
1822
1823 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1824 {
1825 int offset;
1826 uschar *hc;
1827
1828 /* See if this recursion is on the forward reference list. If so, adjust the
1829 reference. */
1830
1831 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1832 {
1833 offset = GET(hc, 0);
1834 if (cd->start_code + offset == ptr + 1)
1835 {
1836 PUT(hc, 0, offset + adjust);
1837 break;
1838 }
1839 }
1840
1841 /* Otherwise, adjust the recursion offset if it's after the start of this
1842 group. */
1843
1844 if (hc >= cd->hwm)
1845 {
1846 offset = GET(ptr, 1);
1847 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1848 }
1849
1850 ptr += 1 + LINK_SIZE;
1851 }
1852 }
1853
1854
1855
1856 /*************************************************
1857 * Insert an automatic callout point *
1858 *************************************************/
1859
1860 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1861 callout points before each pattern item.
1862
1863 Arguments:
1864 code current code pointer
1865 ptr current pattern pointer
1866 cd pointers to tables etc
1867
1868 Returns: new code pointer
1869 */
1870
1871 static uschar *
1872 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1873 {
1874 *code++ = OP_CALLOUT;
1875 *code++ = 255;
1876 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1877 PUT(code, LINK_SIZE, 0); /* Default length */
1878 return code + 2*LINK_SIZE;
1879 }
1880
1881
1882
1883 /*************************************************
1884 * Complete a callout item *
1885 *************************************************/
1886
1887 /* A callout item contains the length of the next item in the pattern, which
1888 we can't fill in till after we have reached the relevant point. This is used
1889 for both automatic and manual callouts.
1890
1891 Arguments:
1892 previous_callout points to previous callout item
1893 ptr current pattern pointer
1894 cd pointers to tables etc
1895
1896 Returns: nothing
1897 */
1898
1899 static void
1900 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1901 {
1902 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1903 PUT(previous_callout, 2 + LINK_SIZE, length);
1904 }
1905
1906
1907
1908 #ifdef SUPPORT_UCP
1909 /*************************************************
1910 * Get othercase range *
1911 *************************************************/
1912
1913 /* This function is passed the start and end of a class range, in UTF-8 mode
1914 with UCP support. It searches up the characters, looking for internal ranges of
1915 characters in the "other" case. Each call returns the next one, updating the
1916 start address.
1917
1918 Arguments:
1919 cptr points to starting character value; updated
1920 d end value
1921 ocptr where to put start of othercase range
1922 odptr where to put end of othercase range
1923
1924 Yield: TRUE when range returned; FALSE when no more
1925 */
1926
1927 static BOOL
1928 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1929 unsigned int *odptr)
1930 {
1931 unsigned int c, othercase, next;
1932
1933 for (c = *cptr; c <= d; c++)
1934 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1935
1936 if (c > d) return FALSE;
1937
1938 *ocptr = othercase;
1939 next = othercase + 1;
1940
1941 for (++c; c <= d; c++)
1942 {
1943 if (_pcre_ucp_othercase(c) != next) break;
1944 next++;
1945 }
1946
1947 *odptr = next - 1;
1948 *cptr = c;
1949
1950 return TRUE;
1951 }
1952 #endif /* SUPPORT_UCP */
1953
1954
1955
1956 /*************************************************
1957 * Check if auto-possessifying is possible *
1958 *************************************************/
1959
1960 /* This function is called for unlimited repeats of certain items, to see
1961 whether the next thing could possibly match the repeated item. If not, it makes
1962 sense to automatically possessify the repeated item.
1963
1964 Arguments:
1965 op_code the repeated op code
1966 this data for this item, depends on the opcode
1967 utf8 TRUE in UTF-8 mode
1968 utf8_char used for utf8 character bytes, NULL if not relevant
1969 ptr next character in pattern
1970 options options bits
1971 cd contains pointers to tables etc.
1972
1973 Returns: TRUE if possessifying is wanted
1974 */
1975
1976 static BOOL
1977 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1978 const uschar *ptr, int options, compile_data *cd)
1979 {
1980 int next;
1981
1982 /* Skip whitespace and comments in extended mode */
1983
1984 if ((options & PCRE_EXTENDED) != 0)
1985 {
1986 for (;;)
1987 {
1988 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1989 if (*ptr == '#')
1990 {
1991 while (*(++ptr) != 0)
1992 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1993 }
1994 else break;
1995 }
1996 }
1997
1998 /* If the next item is one that we can handle, get its value. A non-negative
1999 value is a character, a negative value is an escape value. */
2000
2001 if (*ptr == '\\')
2002 {
2003 int temperrorcode = 0;
2004 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2005 if (temperrorcode != 0) return FALSE;
2006 ptr++; /* Point after the escape sequence */
2007 }
2008
2009 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2010 {
2011 #ifdef SUPPORT_UTF8
2012 if (utf8) { GETCHARINC(next, ptr); } else
2013 #endif
2014 next = *ptr++;
2015 }
2016
2017 else return FALSE;
2018
2019 /* Skip whitespace and comments in extended mode */
2020
2021 if ((options & PCRE_EXTENDED) != 0)
2022 {
2023 for (;;)
2024 {
2025 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2026 if (*ptr == '#')
2027 {
2028 while (*(++ptr) != 0)
2029 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2030 }
2031 else break;
2032 }
2033 }
2034
2035 /* If the next thing is itself optional, we have to give up. */
2036
2037 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2038 return FALSE;
2039
2040 /* Now compare the next item with the previous opcode. If the previous is a
2041 positive single character match, "item" either contains the character or, if
2042 "item" is greater than 127 in utf8 mode, the character's bytes are in
2043 utf8_char. */
2044
2045
2046 /* Handle cases when the next item is a character. */
2047
2048 if (next >= 0) switch(op_code)
2049 {
2050 case OP_CHAR:
2051 #ifdef SUPPORT_UTF8
2052 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2053 #endif
2054 return item != next;
2055
2056 /* For CHARNC (caseless character) we must check the other case. If we have
2057 Unicode property support, we can use it to test the other case of
2058 high-valued characters. */
2059
2060 case OP_CHARNC:
2061 #ifdef SUPPORT_UTF8
2062 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2063 #endif
2064 if (item == next) return FALSE;
2065 #ifdef SUPPORT_UTF8
2066 if (utf8)
2067 {
2068 unsigned int othercase;
2069 if (next < 128) othercase = cd->fcc[next]; else
2070 #ifdef SUPPORT_UCP
2071 othercase = _pcre_ucp_othercase((unsigned int)next);
2072 #else
2073 othercase = NOTACHAR;
2074 #endif
2075 return (unsigned int)item != othercase;
2076 }
2077 else
2078 #endif /* SUPPORT_UTF8 */
2079 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2080
2081 /* For OP_NOT, "item" must be a single-byte character. */
2082
2083 case OP_NOT:
2084 if (next < 0) return FALSE; /* Not a character */
2085 if (item == next) return TRUE;
2086 if ((options & PCRE_CASELESS) == 0) return FALSE;
2087 #ifdef SUPPORT_UTF8
2088 if (utf8)
2089 {
2090 unsigned int othercase;
2091 if (next < 128) othercase = cd->fcc[next]; else
2092 #ifdef SUPPORT_UCP
2093 othercase = _pcre_ucp_othercase(next);
2094 #else
2095 othercase = NOTACHAR;
2096 #endif
2097 return (unsigned int)item == othercase;
2098 }
2099 else
2100 #endif /* SUPPORT_UTF8 */
2101 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2102
2103 case OP_DIGIT:
2104 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2105
2106 case OP_NOT_DIGIT:
2107 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2108
2109 case OP_WHITESPACE:
2110 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2111
2112 case OP_NOT_WHITESPACE:
2113 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2114
2115 case OP_WORDCHAR:
2116 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2117
2118 case OP_NOT_WORDCHAR:
2119 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2120
2121 case OP_HSPACE:
2122 case OP_NOT_HSPACE:
2123 switch(next)
2124 {
2125 case 0x09:
2126 case 0x20:
2127 case 0xa0:
2128 case 0x1680:
2129 case 0x180e:
2130 case 0x2000:
2131 case 0x2001:
2132 case 0x2002:
2133 case 0x2003:
2134 case 0x2004:
2135 case 0x2005:
2136 case 0x2006:
2137 case 0x2007:
2138 case 0x2008:
2139 case 0x2009:
2140 case 0x200A:
2141 case 0x202f:
2142 case 0x205f:
2143 case 0x3000:
2144 return op_code != OP_HSPACE;
2145 default:
2146 return op_code == OP_HSPACE;
2147 }
2148
2149 case OP_VSPACE:
2150 case OP_NOT_VSPACE:
2151 switch(next)
2152 {
2153 case 0x0a:
2154 case 0x0b:
2155 case 0x0c:
2156 case 0x0d:
2157 case 0x85:
2158 case 0x2028:
2159 case 0x2029:
2160 return op_code != OP_VSPACE;
2161 default:
2162 return op_code == OP_VSPACE;
2163 }
2164
2165 default:
2166 return FALSE;
2167 }
2168
2169
2170 /* Handle the case when the next item is \d, \s, etc. */
2171
2172 switch(op_code)
2173 {
2174 case OP_CHAR:
2175 case OP_CHARNC:
2176 #ifdef SUPPORT_UTF8
2177 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2178 #endif
2179 switch(-next)
2180 {
2181 case ESC_d:
2182 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2183
2184 case ESC_D:
2185 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2186
2187 case ESC_s:
2188 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2189
2190 case ESC_S:
2191 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2192
2193 case ESC_w:
2194 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2195
2196 case ESC_W:
2197 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2198
2199 case ESC_h:
2200 case ESC_H:
2201 switch(item)
2202 {
2203 case 0x09:
2204 case 0x20:
2205 case 0xa0:
2206 case 0x1680:
2207 case 0x180e:
2208 case 0x2000:
2209 case 0x2001:
2210 case 0x2002:
2211 case 0x2003:
2212 case 0x2004:
2213 case 0x2005:
2214 case 0x2006:
2215 case 0x2007:
2216 case 0x2008:
2217 case 0x2009:
2218 case 0x200A:
2219 case 0x202f:
2220 case 0x205f:
2221 case 0x3000:
2222 return -next != ESC_h;
2223 default:
2224 return -next == ESC_h;
2225 }
2226
2227 case ESC_v:
2228 case ESC_V:
2229 switch(item)
2230 {
2231 case 0x0a:
2232 case 0x0b:
2233 case 0x0c:
2234 case 0x0d:
2235 case 0x85:
2236 case 0x2028:
2237 case 0x2029:
2238 return -next != ESC_v;
2239 default:
2240 return -next == ESC_v;
2241 }
2242
2243 default:
2244 return FALSE;
2245 }
2246
2247 case OP_DIGIT:
2248 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2249 next == -ESC_h || next == -ESC_v;
2250
2251 case OP_NOT_DIGIT:
2252 return next == -ESC_d;
2253
2254 case OP_WHITESPACE:
2255 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2256
2257 case OP_NOT_WHITESPACE:
2258 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2259
2260 case OP_HSPACE:
2261 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2262
2263 case OP_NOT_HSPACE:
2264 return next == -ESC_h;
2265
2266 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2267 case OP_VSPACE:
2268 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2269
2270 case OP_NOT_VSPACE:
2271 return next == -ESC_v;
2272
2273 case OP_WORDCHAR:
2274 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2275
2276 case OP_NOT_WORDCHAR:
2277 return next == -ESC_w || next == -ESC_d;
2278
2279 default:
2280 return FALSE;
2281 }
2282
2283 /* Control does not reach here */
2284 }
2285
2286
2287
2288 /*************************************************
2289 * Compile one branch *
2290 *************************************************/
2291
2292 /* Scan the pattern, compiling it into a vector. If the options are
2293 changed during the branch, the pointer is used to change the external options
2294 bits. This function is used during the pre-compile phase when we are trying
2295 to find out the amount of memory needed, as well as during the real compile
2296 phase. The value of lengthptr distinguishes the two phases.
2297
2298 Arguments:
2299 optionsptr pointer to the option bits
2300 codeptr points to the pointer to the current code point
2301 ptrptr points to the current pattern pointer
2302 errorcodeptr points to error code variable
2303 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2304 reqbyteptr set to the last literal character required, else < 0
2305 bcptr points to current branch chain
2306 cd contains pointers to tables etc.
2307 lengthptr NULL during the real compile phase
2308 points to length accumulator during pre-compile phase
2309
2310 Returns: TRUE on success
2311 FALSE, with *errorcodeptr set non-zero on error
2312 */
2313
2314 static BOOL
2315 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2316 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2317 compile_data *cd, int *lengthptr)
2318 {
2319 int repeat_type, op_type;
2320 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2321 int bravalue = 0;
2322 int greedy_default, greedy_non_default;
2323 int firstbyte, reqbyte;
2324 int zeroreqbyte, zerofirstbyte;
2325 int req_caseopt, reqvary, tempreqvary;
2326 int options = *optionsptr;
2327 int after_manual_callout = 0;
2328 int length_prevgroup = 0;
2329 register int c;
2330 register uschar *code = *codeptr;
2331 uschar *last_code = code;
2332 uschar *orig_code = code;
2333 uschar *tempcode;
2334 BOOL inescq = FALSE;
2335 BOOL groupsetfirstbyte = FALSE;
2336 const uschar *ptr = *ptrptr;
2337 const uschar *tempptr;
2338 uschar *previous = NULL;
2339 uschar *previous_callout = NULL;
2340 uschar *save_hwm = NULL;
2341 uschar classbits[32];
2342
2343 #ifdef SUPPORT_UTF8
2344 BOOL class_utf8;
2345 BOOL utf8 = (options & PCRE_UTF8) != 0;
2346 uschar *class_utf8data;
2347 uschar utf8_char[6];
2348 #else
2349 BOOL utf8 = FALSE;
2350 uschar *utf8_char = NULL;
2351 #endif
2352
2353 #ifdef DEBUG
2354 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2355 #endif
2356
2357 /* Set up the default and non-default settings for greediness */
2358
2359 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2360 greedy_non_default = greedy_default ^ 1;
2361
2362 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2363 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2364 matches a non-fixed char first char; reqbyte just remains unset if we never
2365 find one.
2366
2367 When we hit a repeat whose minimum is zero, we may have to adjust these values
2368 to take the zero repeat into account. This is implemented by setting them to
2369 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2370 item types that can be repeated set these backoff variables appropriately. */
2371
2372 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2373
2374 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2375 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2376 value > 255. It is added into the firstbyte or reqbyte variables to record the
2377 case status of the value. This is used only for ASCII characters. */
2378
2379 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2380
2381 /* Switch on next character until the end of the branch */
2382
2383 for (;; ptr++)
2384 {
2385 BOOL negate_class;
2386 BOOL possessive_quantifier;
2387 BOOL is_quantifier;
2388 BOOL is_recurse;
2389 BOOL reset_bracount;
2390 int class_charcount;
2391 int class_lastchar;
2392 int newoptions;
2393 int recno;
2394 int refsign;
2395 int skipbytes;
2396 int subreqbyte;
2397 int subfirstbyte;
2398 int terminator;
2399 int mclength;
2400 uschar mcbuffer[8];
2401
2402 /* Get next byte in the pattern */
2403
2404 c = *ptr;
2405
2406 /* If we are in the pre-compile phase, accumulate the length used for the
2407 previous cycle of this loop. */
2408
2409 if (lengthptr != NULL)
2410 {
2411 #ifdef DEBUG
2412 if (code > cd->hwm) cd->hwm = code; /* High water info */
2413 #endif
2414 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2415 {
2416 *errorcodeptr = ERR52;
2417 goto FAILED;
2418 }
2419
2420 /* There is at least one situation where code goes backwards: this is the
2421 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2422 the class is simply eliminated. However, it is created first, so we have to
2423 allow memory for it. Therefore, don't ever reduce the length at this point.
2424 */
2425
2426 if (code < last_code) code = last_code;
2427
2428 /* Paranoid check for integer overflow */
2429
2430 if (OFLOW_MAX - *lengthptr < code - last_code)
2431 {
2432 *errorcodeptr = ERR20;
2433 goto FAILED;
2434 }
2435
2436 *lengthptr += code - last_code;
2437 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2438
2439 /* If "previous" is set and it is not at the start of the work space, move
2440 it back to there, in order to avoid filling up the work space. Otherwise,
2441 if "previous" is NULL, reset the current code pointer to the start. */
2442
2443 if (previous != NULL)
2444 {
2445 if (previous > orig_code)
2446 {
2447 memmove(orig_code, previous, code - previous);
2448 code -= previous - orig_code;
2449 previous = orig_code;
2450 }
2451 }
2452 else code = orig_code;
2453
2454 /* Remember where this code item starts so we can pick up the length
2455 next time round. */
2456
2457 last_code = code;
2458 }
2459
2460 /* In the real compile phase, just check the workspace used by the forward
2461 reference list. */
2462
2463 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2464 {
2465 *errorcodeptr = ERR52;
2466 goto FAILED;
2467 }
2468
2469 /* If in \Q...\E, check for the end; if not, we have a literal */
2470
2471 if (inescq && c != 0)
2472 {
2473 if (c == '\\' && ptr[1] == 'E')
2474 {
2475 inescq = FALSE;
2476 ptr++;
2477 continue;
2478 }
2479 else
2480 {
2481 if (previous_callout != NULL)
2482 {
2483 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2484 complete_callout(previous_callout, ptr, cd);
2485 previous_callout = NULL;
2486 }
2487 if ((options & PCRE_AUTO_CALLOUT) != 0)
2488 {
2489 previous_callout = code;
2490 code = auto_callout(code, ptr, cd);
2491 }
2492 goto NORMAL_CHAR;
2493 }
2494 }
2495
2496 /* Fill in length of a previous callout, except when the next thing is
2497 a quantifier. */
2498
2499 is_quantifier = c == '*' || c == '+' || c == '?' ||
2500 (c == '{' && is_counted_repeat(ptr+1));
2501
2502 if (!is_quantifier && previous_callout != NULL &&
2503 after_manual_callout-- <= 0)
2504 {
2505 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2506 complete_callout(previous_callout, ptr, cd);
2507 previous_callout = NULL;
2508 }
2509
2510 /* In extended mode, skip white space and comments */
2511
2512 if ((options & PCRE_EXTENDED) != 0)
2513 {
2514 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2515 if (c == '#')
2516 {
2517 while (*(++ptr) != 0)
2518 {
2519 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2520 }
2521 if (*ptr != 0) continue;
2522
2523 /* Else fall through to handle end of string */
2524 c = 0;
2525 }
2526 }
2527
2528 /* No auto callout for quantifiers. */
2529
2530 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2531 {
2532 previous_callout = code;
2533 code = auto_callout(code, ptr, cd);
2534 }
2535
2536 switch(c)
2537 {
2538 /* ===================================================================*/
2539 case 0: /* The branch terminates at string end */
2540 case '|': /* or | or ) */
2541 case ')':
2542 *firstbyteptr = firstbyte;
2543 *reqbyteptr = reqbyte;
2544 *codeptr = code;
2545 *ptrptr = ptr;
2546 if (lengthptr != NULL)
2547 {
2548 if (OFLOW_MAX - *lengthptr < code - last_code)
2549 {
2550 *errorcodeptr = ERR20;
2551 goto FAILED;
2552 }
2553 *lengthptr += code - last_code; /* To include callout length */
2554 DPRINTF((">> end branch\n"));
2555 }
2556 return TRUE;
2557
2558
2559 /* ===================================================================*/
2560 /* Handle single-character metacharacters. In multiline mode, ^ disables
2561 the setting of any following char as a first character. */
2562
2563 case '^':
2564 if ((options & PCRE_MULTILINE) != 0)
2565 {
2566 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2567 }
2568 previous = NULL;
2569 *code++ = OP_CIRC;
2570 break;
2571
2572 case '$':
2573 previous = NULL;
2574 *code++ = OP_DOLL;
2575 break;
2576
2577 /* There can never be a first char if '.' is first, whatever happens about
2578 repeats. The value of reqbyte doesn't change either. */
2579
2580 case '.':
2581 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2582 zerofirstbyte = firstbyte;
2583 zeroreqbyte = reqbyte;
2584 previous = code;
2585 *code++ = OP_ANY;
2586 break;
2587
2588
2589 /* ===================================================================*/
2590 /* Character classes. If the included characters are all < 256, we build a
2591 32-byte bitmap of the permitted characters, except in the special case
2592 where there is only one such character. For negated classes, we build the
2593 map as usual, then invert it at the end. However, we use a different opcode
2594 so that data characters > 255 can be handled correctly.
2595
2596 If the class contains characters outside the 0-255 range, a different
2597 opcode is compiled. It may optionally have a bit map for characters < 256,
2598 but those above are explicitly listed afterwards. A flag byte tells
2599 whether the bitmap is present, and whether this is a negated class or not.
2600 */
2601
2602 case '[':
2603 previous = code;
2604
2605 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2606 they are encountered at the top level, so we'll do that too. */
2607
2608 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2609 check_posix_syntax(ptr, &tempptr, cd))
2610 {
2611 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2612 goto FAILED;
2613 }
2614
2615 /* If the first character is '^', set the negation flag and skip it. Also,
2616 if the first few characters (either before or after ^) are \Q\E or \E we
2617 skip them too. This makes for compatibility with Perl. */
2618
2619 negate_class = FALSE;
2620 for (;;)
2621 {
2622 c = *(++ptr);
2623 if (c == '\\')
2624 {
2625 if (ptr[1] == 'E') ptr++;
2626 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2627 else break;
2628 }
2629 else if (!negate_class && c == '^')
2630 negate_class = TRUE;
2631 else break;
2632 }
2633
2634 /* Keep a count of chars with values < 256 so that we can optimize the case
2635 of just a single character (as long as it's < 256). However, For higher
2636 valued UTF-8 characters, we don't yet do any optimization. */
2637
2638 class_charcount = 0;
2639 class_lastchar = -1;
2640
2641 /* Initialize the 32-char bit map to all zeros. We build the map in a
2642 temporary bit of memory, in case the class contains only 1 character (less
2643 than 256), because in that case the compiled code doesn't use the bit map.
2644 */
2645
2646 memset(classbits, 0, 32 * sizeof(uschar));
2647
2648 #ifdef SUPPORT_UTF8
2649 class_utf8 = FALSE; /* No chars >= 256 */
2650 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2651 #endif
2652
2653 /* Process characters until ] is reached. By writing this as a "do" it
2654 means that an initial ] is taken as a data character. At the start of the
2655 loop, c contains the first byte of the character. */
2656
2657 if (c != 0) do
2658 {
2659 const uschar *oldptr;
2660
2661 #ifdef SUPPORT_UTF8
2662 if (utf8 && c > 127)
2663 { /* Braces are required because the */
2664 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2665 }
2666 #endif
2667
2668 /* Inside \Q...\E everything is literal except \E */
2669
2670 if (inescq)
2671 {
2672 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2673 {
2674 inescq = FALSE; /* Reset literal state */
2675 ptr++; /* Skip the 'E' */
2676 continue; /* Carry on with next */
2677 }
2678 goto CHECK_RANGE; /* Could be range if \E follows */
2679 }
2680
2681 /* Handle POSIX class names. Perl allows a negation extension of the
2682 form [:^name:]. A square bracket that doesn't match the syntax is
2683 treated as a literal. We also recognize the POSIX constructions
2684 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2685 5.6 and 5.8 do. */
2686
2687 if (c == '[' &&
2688 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2689 check_posix_syntax(ptr, &tempptr, cd))
2690 {
2691 BOOL local_negate = FALSE;
2692 int posix_class, taboffset, tabopt;
2693 register const uschar *cbits = cd->cbits;
2694 uschar pbits[32];
2695
2696 if (ptr[1] != ':')
2697 {
2698 *errorcodeptr = ERR31;
2699 goto FAILED;
2700 }
2701
2702 ptr += 2;
2703 if (*ptr == '^')
2704 {
2705 local_negate = TRUE;
2706 ptr++;
2707 }
2708
2709 posix_class = check_posix_name(ptr, tempptr - ptr);
2710 if (posix_class < 0)
2711 {
2712 *errorcodeptr = ERR30;
2713 goto FAILED;
2714 }
2715
2716 /* If matching is caseless, upper and lower are converted to
2717 alpha. This relies on the fact that the class table starts with
2718 alpha, lower, upper as the first 3 entries. */
2719
2720 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2721 posix_class = 0;
2722
2723 /* We build the bit map for the POSIX class in a chunk of local store
2724 because we may be adding and subtracting from it, and we don't want to
2725 subtract bits that may be in the main map already. At the end we or the
2726 result into the bit map that is being built. */
2727
2728 posix_class *= 3;
2729
2730 /* Copy in the first table (always present) */
2731
2732 memcpy(pbits, cbits + posix_class_maps[posix_class],
2733 32 * sizeof(uschar));
2734
2735 /* If there is a second table, add or remove it as required. */
2736
2737 taboffset = posix_class_maps[posix_class + 1];
2738 tabopt = posix_class_maps[posix_class + 2];
2739
2740 if (taboffset >= 0)
2741 {
2742 if (tabopt >= 0)
2743 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2744 else
2745 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2746 }
2747
2748 /* Now see if we need to remove any special characters. An option
2749 value of 1 removes vertical space and 2 removes underscore. */
2750
2751 if (tabopt < 0) tabopt = -tabopt;
2752 if (tabopt == 1) pbits[1] &= ~0x3c;
2753 else if (tabopt == 2) pbits[11] &= 0x7f;
2754
2755 /* Add the POSIX table or its complement into the main table that is
2756 being built and we are done. */
2757
2758 if (local_negate)
2759 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2760 else
2761 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2762
2763 ptr = tempptr + 1;
2764 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2765 continue; /* End of POSIX syntax handling */
2766 }
2767
2768 /* Backslash may introduce a single character, or it may introduce one
2769 of the specials, which just set a flag. The sequence \b is a special
2770 case. Inside a class (and only there) it is treated as backspace.
2771 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2772 to 'or' into the one we are building. We assume they have more than one
2773 character in them, so set class_charcount bigger than one. */
2774
2775 if (c == '\\')
2776 {
2777 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2778 if (*errorcodeptr != 0) goto FAILED;
2779
2780 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2781 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2782 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2783 else if (-c == ESC_Q) /* Handle start of quoted string */
2784 {
2785 if (ptr[1] == '\\' && ptr[2] == 'E')
2786 {
2787 ptr += 2; /* avoid empty string */
2788 }
2789 else inescq = TRUE;
2790 continue;
2791 }
2792 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2793
2794 if (c < 0)
2795 {
2796 register const uschar *cbits = cd->cbits;
2797 class_charcount += 2; /* Greater than 1 is what matters */
2798
2799 /* Save time by not doing this in the pre-compile phase. */
2800
2801 if (lengthptr == NULL) switch (-c)
2802 {
2803 case ESC_d:
2804 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2805 continue;
2806
2807 case ESC_D:
2808 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2809 continue;
2810
2811 case ESC_w:
2812 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2813 continue;
2814
2815 case ESC_W:
2816 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2817 continue;
2818
2819 case ESC_s:
2820 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2821 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2822 continue;
2823
2824 case ESC_S:
2825 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2826 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2827 continue;
2828
2829 case ESC_E: /* Perl ignores an orphan \E */
2830 continue;
2831
2832 default: /* Not recognized; fall through */
2833 break; /* Need "default" setting to stop compiler warning. */
2834 }
2835
2836 /* In the pre-compile phase, just do the recognition. */
2837
2838 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2839 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2840
2841 /* We need to deal with \H, \h, \V, and \v in both phases because
2842 they use extra memory. */
2843
2844 if (-c == ESC_h)
2845 {
2846 SETBIT(classbits, 0x09); /* VT */
2847 SETBIT(classbits, 0x20); /* SPACE */
2848 SETBIT(classbits, 0xa0); /* NSBP */
2849 #ifdef SUPPORT_UTF8
2850 if (utf8)
2851 {
2852 class_utf8 = TRUE;
2853 *class_utf8data++ = XCL_SINGLE;
2854 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2855 *class_utf8data++ = XCL_SINGLE;
2856 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2857 *class_utf8data++ = XCL_RANGE;
2858 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2859 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2860 *class_utf8data++ = XCL_SINGLE;
2861 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2862 *class_utf8data++ = XCL_SINGLE;
2863 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2864 *class_utf8data++ = XCL_SINGLE;
2865 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2866 }
2867 #endif
2868 continue;
2869 }
2870
2871 if (-c == ESC_H)
2872 {
2873 for (c = 0; c < 32; c++)
2874 {
2875 int x = 0xff;
2876 switch (c)
2877 {
2878 case 0x09/8: x ^= 1 << (0x09%8); break;
2879 case 0x20/8: x ^= 1 << (0x20%8); break;
2880 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2881 default: break;
2882 }
2883 classbits[c] |= x;
2884 }
2885
2886 #ifdef SUPPORT_UTF8
2887 if (utf8)
2888 {
2889 class_utf8 = TRUE;
2890 *class_utf8data++ = XCL_RANGE;
2891 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2892 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2893 *class_utf8data++ = XCL_RANGE;
2894 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2895 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2896 *class_utf8data++ = XCL_RANGE;
2897 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2898 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2899 *class_utf8data++ = XCL_RANGE;
2900 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2901 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2902 *class_utf8data++ = XCL_RANGE;
2903 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2904 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2905 *class_utf8data++ = XCL_RANGE;
2906 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2907 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2908 *class_utf8data++ = XCL_RANGE;
2909 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2910 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2911 }
2912 #endif
2913 continue;
2914 }
2915
2916 if (-c == ESC_v)
2917 {
2918 SETBIT(classbits, 0x0a); /* LF */
2919 SETBIT(classbits, 0x0b); /* VT */
2920 SETBIT(classbits, 0x0c); /* FF */
2921 SETBIT(classbits, 0x0d); /* CR */
2922 SETBIT(classbits, 0x85); /* NEL */
2923 #ifdef SUPPORT_UTF8
2924 if (utf8)
2925 {
2926 class_utf8 = TRUE;
2927 *class_utf8data++ = XCL_RANGE;
2928 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2929 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2930 }
2931 #endif
2932 continue;
2933 }
2934
2935 if (-c == ESC_V)
2936 {
2937 for (c = 0; c < 32; c++)
2938 {
2939 int x = 0xff;
2940 switch (c)
2941 {
2942 case 0x0a/8: x ^= 1 << (0x0a%8);
2943 x ^= 1 << (0x0b%8);
2944 x ^= 1 << (0x0c%8);
2945 x ^= 1 << (0x0d%8);
2946 break;
2947 case 0x85/8: x ^= 1 << (0x85%8); break;
2948 default: break;
2949 }
2950 classbits[c] |= x;
2951 }
2952
2953 #ifdef SUPPORT_UTF8
2954 if (utf8)
2955 {
2956 class_utf8 = TRUE;
2957 *class_utf8data++ = XCL_RANGE;
2958 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2959 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2960 *class_utf8data++ = XCL_RANGE;
2961 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2962 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2963 }
2964 #endif
2965 continue;
2966 }
2967
2968 /* We need to deal with \P and \p in both phases. */
2969
2970 #ifdef SUPPORT_UCP
2971 if (-c == ESC_p || -c == ESC_P)
2972 {
2973 BOOL negated;
2974 int pdata;
2975 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2976 if (ptype < 0) goto FAILED;
2977 class_utf8 = TRUE;
2978 *class_utf8data++ = ((-c == ESC_p) != negated)?
2979 XCL_PROP : XCL_NOTPROP;
2980 *class_utf8data++ = ptype;
2981 *class_utf8data++ = pdata;
2982 class_charcount -= 2; /* Not a < 256 character */
2983 continue;
2984 }
2985 #endif
2986 /* Unrecognized escapes are faulted if PCRE is running in its
2987 strict mode. By default, for compatibility with Perl, they are
2988 treated as literals. */
2989
2990 if ((options & PCRE_EXTRA) != 0)
2991 {
2992 *errorcodeptr = ERR7;
2993 goto FAILED;
2994 }
2995
2996 class_charcount -= 2; /* Undo the default count from above */
2997 c = *ptr; /* Get the final character and fall through */
2998 }
2999
3000 /* Fall through if we have a single character (c >= 0). This may be
3001 greater than 256 in UTF-8 mode. */
3002
3003 } /* End of backslash handling */
3004
3005 /* A single character may be followed by '-' to form a range. However,
3006 Perl does not permit ']' to be the end of the range. A '-' character
3007 at the end is treated as a literal. Perl ignores orphaned \E sequences
3008 entirely. The code for handling \Q and \E is messy. */
3009
3010 CHECK_RANGE:
3011 while (ptr[1] == '\\' && ptr[2] == 'E')
3012 {
3013 inescq = FALSE;
3014 ptr += 2;
3015 }
3016
3017 oldptr = ptr;
3018
3019 /* Remember \r or \n */
3020
3021 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3022
3023 /* Check for range */
3024
3025 if (!inescq && ptr[1] == '-')
3026 {
3027 int d;
3028 ptr += 2;
3029 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3030
3031 /* If we hit \Q (not followed by \E) at this point, go into escaped
3032 mode. */
3033
3034 while (*ptr == '\\' && ptr[1] == 'Q')
3035 {
3036 ptr += 2;
3037 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3038 inescq = TRUE;
3039 break;
3040 }
3041
3042 if (*ptr == 0 || (!inescq && *ptr == ']'))
3043 {
3044 ptr = oldptr;
3045 goto LONE_SINGLE_CHARACTER;
3046 }
3047
3048 #ifdef SUPPORT_UTF8
3049 if (utf8)
3050 { /* Braces are required because the */
3051 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3052 }
3053 else
3054 #endif
3055 d = *ptr; /* Not UTF-8 mode */
3056
3057 /* The second part of a range can be a single-character escape, but
3058 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3059 in such circumstances. */
3060
3061 if (!inescq && d == '\\')
3062 {
3063 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3064 if (*errorcodeptr != 0) goto FAILED;
3065
3066 /* \b is backspace; \X is literal X; \R is literal R; any other
3067 special means the '-' was literal */
3068
3069 if (d < 0)
3070 {
3071 if (d == -ESC_b) d = '\b';
3072 else if (d == -ESC_X) d = 'X';
3073 else if (d == -ESC_R) d = 'R'; else
3074 {
3075 ptr = oldptr;
3076 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3077 }
3078 }
3079 }
3080
3081 /* Check that the two values are in the correct order. Optimize
3082 one-character ranges */
3083
3084 if (d < c)
3085 {
3086 *errorcodeptr = ERR8;
3087 goto FAILED;
3088 }
3089
3090 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3091
3092 /* Remember \r or \n */
3093
3094 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3095
3096 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3097 matching, we have to use an XCLASS with extra data items. Caseless
3098 matching for characters > 127 is available only if UCP support is
3099 available. */
3100
3101 #ifdef SUPPORT_UTF8
3102 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3103 {
3104 class_utf8 = TRUE;
3105
3106 /* With UCP support, we can find the other case equivalents of
3107 the relevant characters. There may be several ranges. Optimize how
3108 they fit with the basic range. */
3109
3110 #ifdef SUPPORT_UCP
3111 if ((options & PCRE_CASELESS) != 0)
3112 {
3113 unsigned int occ, ocd;
3114 unsigned int cc = c;
3115 unsigned int origd = d;
3116 while (get_othercase_range(&cc, origd, &occ, &ocd))
3117 {
3118 if (occ >= (unsigned int)c &&
3119 ocd <= (unsigned int)d)
3120 continue; /* Skip embedded ranges */
3121
3122 if (occ < (unsigned int)c &&
3123 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3124 { /* if there is overlap, */
3125 c = occ; /* noting that if occ < c */
3126 continue; /* we can't have ocd > d */
3127 } /* because a subrange is */
3128 if (ocd > (unsigned int)d &&
3129 occ <= (unsigned int)d + 1) /* always shorter than */
3130 { /* the basic range. */
3131 d = ocd;
3132 continue;
3133 }
3134
3135 if (occ == ocd)
3136 {
3137 *class_utf8data++ = XCL_SINGLE;
3138 }
3139 else
3140 {
3141 *class_utf8data++ = XCL_RANGE;
3142 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3143 }
3144 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3145 }
3146 }
3147 #endif /* SUPPORT_UCP */
3148
3149 /* Now record the original range, possibly modified for UCP caseless
3150 overlapping ranges. */
3151
3152 *class_utf8data++ = XCL_RANGE;
3153 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3154 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3155
3156 /* With UCP support, we are done. Without UCP support, there is no
3157 caseless matching for UTF-8 characters > 127; we can use the bit map
3158 for the smaller ones. */
3159
3160 #ifdef SUPPORT_UCP
3161 continue; /* With next character in the class */
3162 #else
3163 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3164
3165 /* Adjust upper limit and fall through to set up the map */
3166
3167 d = 127;
3168
3169 #endif /* SUPPORT_UCP */
3170 }
3171 #endif /* SUPPORT_UTF8 */
3172
3173 /* We use the bit map for all cases when not in UTF-8 mode; else
3174 ranges that lie entirely within 0-127 when there is UCP support; else
3175 for partial ranges without UCP support. */
3176
3177 class_charcount += d - c + 1;
3178 class_lastchar = d;
3179
3180 /* We can save a bit of time by skipping this in the pre-compile. */
3181
3182 if (lengthptr == NULL) for (; c <= d; c++)
3183 {
3184 classbits[c/8] |= (1 << (c&7));
3185 if ((options & PCRE_CASELESS) != 0)
3186 {
3187 int uc = cd->fcc[c]; /* flip case */
3188 classbits[uc/8] |= (1 << (uc&7));
3189 }
3190 }
3191
3192 continue; /* Go get the next char in the class */
3193 }
3194
3195 /* Handle a lone single character - we can get here for a normal
3196 non-escape char, or after \ that introduces a single character or for an
3197 apparent range that isn't. */
3198
3199 LONE_SINGLE_CHARACTER:
3200
3201 /* Handle a character that cannot go in the bit map */
3202
3203 #ifdef SUPPORT_UTF8
3204 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3205 {
3206 class_utf8 = TRUE;
3207 *class_utf8data++ = XCL_SINGLE;
3208 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3209
3210 #ifdef SUPPORT_UCP
3211 if ((options & PCRE_CASELESS) != 0)
3212 {
3213 unsigned int othercase;
3214 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3215 {
3216 *class_utf8data++ = XCL_SINGLE;
3217 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3218 }
3219 }
3220 #endif /* SUPPORT_UCP */
3221
3222 }
3223 else
3224 #endif /* SUPPORT_UTF8 */
3225
3226 /* Handle a single-byte character */
3227 {
3228 classbits[c/8] |= (1 << (c&7));
3229 if ((options & PCRE_CASELESS) != 0)
3230 {
3231 c = cd->fcc[c]; /* flip case */
3232 classbits[c/8] |= (1 << (c&7));
3233 }
3234 class_charcount++;
3235 class_lastchar = c;
3236 }
3237 }
3238
3239 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3240
3241 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3242
3243 if (c == 0) /* Missing terminating ']' */
3244 {
3245 *errorcodeptr = ERR6;
3246 goto FAILED;
3247 }
3248
3249
3250 /* This code has been disabled because it would mean that \s counts as
3251 an explicit \r or \n reference, and that's not really what is wanted. Now
3252 we set the flag only if there is a literal "\r" or "\n" in the class. */
3253
3254 #if 0
3255 /* Remember whether \r or \n are in this class */
3256
3257 if (negate_class)
3258 {
3259 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3260 }
3261 else
3262 {
3263 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3264 }
3265 #endif
3266
3267
3268 /* If class_charcount is 1, we saw precisely one character whose value is
3269 less than 256. As long as there were no characters >= 128 and there was no
3270 use of \p or \P, in other words, no use of any XCLASS features, we can
3271 optimize.
3272
3273 In UTF-8 mode, we can optimize the negative case only if there were no
3274 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3275 operate on single-bytes only. This is an historical hangover. Maybe one day
3276 we can tidy these opcodes to handle multi-byte characters.
3277
3278 The optimization throws away the bit map. We turn the item into a
3279 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3280 that OP_NOT does not support multibyte characters. In the positive case, it
3281 can cause firstbyte to be set. Otherwise, there can be no first char if
3282 this item is first, whatever repeat count may follow. In the case of
3283 reqbyte, save the previous value for reinstating. */
3284
3285 #ifdef SUPPORT_UTF8
3286 if (class_charcount == 1 && !class_utf8 &&
3287 (!utf8 || !negate_class || class_lastchar < 128))
3288 #else
3289 if (class_charcount == 1)
3290 #endif
3291 {
3292 zeroreqbyte = reqbyte;
3293
3294 /* The OP_NOT opcode works on one-byte characters only. */
3295
3296 if (negate_class)
3297 {
3298 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3299 zerofirstbyte = firstbyte;
3300 *code++ = OP_NOT;
3301 *code++ = class_lastchar;
3302 break;
3303 }
3304
3305 /* For a single, positive character, get the value into mcbuffer, and
3306 then we can handle this with the normal one-character code. */
3307
3308 #ifdef SUPPORT_UTF8
3309 if (utf8 && class_lastchar > 127)
3310 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3311 else
3312 #endif
3313 {
3314 mcbuffer[0] = class_lastchar;
3315 mclength = 1;
3316 }
3317 goto ONE_CHAR;
3318 } /* End of 1-char optimization */
3319
3320 /* The general case - not the one-char optimization. If this is the first
3321 thing in the branch, there can be no first char setting, whatever the
3322 repeat count. Any reqbyte setting must remain unchanged after any kind of
3323 repeat. */
3324
3325 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3326 zerofirstbyte = firstbyte;
3327 zeroreqbyte = reqbyte;
3328
3329 /* If there are characters with values > 255, we have to compile an
3330 extended class, with its own opcode. If there are no characters < 256,
3331 we can omit the bitmap in the actual compiled code. */
3332
3333 #ifdef SUPPORT_UTF8
3334 if (class_utf8)
3335 {
3336 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3337 *code++ = OP_XCLASS;
3338 code += LINK_SIZE;
3339 *code = negate_class? XCL_NOT : 0;
3340
3341 /* If the map is required, move up the extra data to make room for it;
3342 otherwise just move the code pointer to the end of the extra data. */
3343
3344 if (class_charcount > 0)
3345 {
3346 *code++ |= XCL_MAP;
3347 memmove(code + 32, code, class_utf8data - code);
3348 memcpy(code, classbits, 32);
3349 code = class_utf8data + 32;
3350 }
3351 else code = class_utf8data;
3352
3353 /* Now fill in the complete length of the item */
3354
3355 PUT(previous, 1, code - previous);
3356 break; /* End of class handling */
3357 }
3358 #endif
3359
3360 /* If there are no characters > 255, negate the 32-byte map if necessary,
3361 and copy it into the code vector. If this is the first thing in the branch,
3362 there can be no first char setting, whatever the repeat count. Any reqbyte
3363 setting must remain unchanged after any kind of repeat. */
3364
3365 if (negate_class)
3366 {
3367 *code++ = OP_NCLASS;
3368 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3369 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3370 }
3371 else
3372 {
3373 *code++ = OP_CLASS;
3374 memcpy(code, classbits, 32);
3375 }
3376 code += 32;
3377 break;
3378
3379
3380 /* ===================================================================*/
3381 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3382 has been tested above. */
3383
3384 case '{':
3385 if (!is_quantifier) goto NORMAL_CHAR;
3386 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3387 if (*errorcodeptr != 0) goto FAILED;
3388 goto REPEAT;
3389
3390 case '*':
3391 repeat_min = 0;
3392 repeat_max = -1;
3393 goto REPEAT;
3394
3395 case '+':
3396 repeat_min = 1;
3397 repeat_max = -1;
3398 goto REPEAT;
3399
3400 case '?':
3401 repeat_min = 0;
3402 repeat_max = 1;
3403
3404 REPEAT:
3405 if (previous == NULL)
3406 {
3407 *errorcodeptr = ERR9;
3408 goto FAILED;
3409 }
3410
3411 if (repeat_min == 0)
3412 {
3413 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3414 reqbyte = zeroreqbyte; /* Ditto */
3415 }
3416
3417 /* Remember whether this is a variable length repeat */
3418
3419 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3420
3421 op_type = 0; /* Default single-char op codes */
3422 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3423
3424 /* Save start of previous item, in case we have to move it up to make space
3425 for an inserted OP_ONCE for the additional '+' extension. */
3426
3427 tempcode = previous;
3428
3429 /* If the next character is '+', we have a possessive quantifier. This
3430 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3431 If the next character is '?' this is a minimizing repeat, by default,
3432 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3433 repeat type to the non-default. */
3434
3435 if (ptr[1] == '+')
3436 {
3437 repeat_type = 0; /* Force greedy */
3438 possessive_quantifier = TRUE;
3439 ptr++;
3440 }
3441 else if (ptr[1] == '?')
3442 {
3443 repeat_type = greedy_non_default;
3444 ptr++;
3445 }
3446 else repeat_type = greedy_default;
3447
3448 /* If previous was a character match, abolish the item and generate a
3449 repeat item instead. If a char item has a minimum of more than one, ensure
3450 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3451 the first thing in a branch because the x will have gone into firstbyte
3452 instead. */
3453
3454 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3455 {
3456 /* Deal with UTF-8 characters that take up more than one byte. It's
3457 easier to write this out separately than try to macrify it. Use c to
3458 hold the length of the character in bytes, plus 0x80 to flag that it's a
3459 length rather than a small character. */
3460
3461 #ifdef SUPPORT_UTF8
3462 if (utf8 && (code[-1] & 0x80) != 0)
3463 {
3464 uschar *lastchar = code - 1;
3465 while((*lastchar & 0xc0) == 0x80) lastchar--;
3466 c = code - lastchar; /* Length of UTF-8 character */
3467 memcpy(utf8_char, lastchar, c); /* Save the char */
3468 c |= 0x80; /* Flag c as a length */
3469 }
3470 else
3471 #endif
3472
3473 /* Handle the case of a single byte - either with no UTF8 support, or
3474 with UTF-8 disabled, or for a UTF-8 character < 128. */
3475
3476 {
3477 c = code[-1];
3478 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3479 }
3480
3481 /* If the repetition is unlimited, it pays to see if the next thing on
3482 the line is something that cannot possibly match this character. If so,
3483 automatically possessifying this item gains some performance in the case
3484 where the match fails. */
3485
3486 if (!possessive_quantifier &&
3487 repeat_max < 0 &&
3488 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3489 options, cd))
3490 {
3491 repeat_type = 0; /* Force greedy */
3492 possessive_quantifier = TRUE;
3493 }
3494
3495 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3496 }
3497
3498 /* If previous was a single negated character ([^a] or similar), we use
3499 one of the special opcodes, replacing it. The code is shared with single-
3500 character repeats by setting op_type to add a suitable offset into
3501 repeat_type. We can also test for auto-possessification. OP_NOT is
3502 currently used only for single-byte chars. */
3503
3504 else if (*previous == OP_NOT)
3505 {
3506 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3507 c = previous[1];
3508 if (!possessive_quantifier &&
3509 repeat_max < 0 &&
3510 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3511 {
3512 repeat_type = 0; /* Force greedy */
3513 possessive_quantifier = TRUE;
3514 }
3515 goto OUTPUT_SINGLE_REPEAT;
3516 }
3517
3518 /* If previous was a character type match (\d or similar), abolish it and
3519 create a suitable repeat item. The code is shared with single-character
3520 repeats by setting op_type to add a suitable offset into repeat_type. Note
3521 that the Unicode property types will be present only when SUPPORT_UCP is
3522 defined, but we don't wrap the little bits of code here because it just
3523 makes it horribly messy. */
3524
3525 else if (*previous < OP_EODN)
3526 {
3527 uschar *oldcode;
3528 int prop_type, prop_value;
3529 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3530 c = *previous;
3531
3532 if (!possessive_quantifier &&
3533 repeat_max < 0 &&
3534 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3535 {
3536 repeat_type = 0; /* Force greedy */
3537 possessive_quantifier = TRUE;
3538 }
3539
3540 OUTPUT_SINGLE_REPEAT:
3541 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3542 {
3543 prop_type = previous[1];
3544 prop_value = previous[2];
3545 }
3546 else prop_type = prop_value = -1;
3547
3548 oldcode = code;
3549 code = previous; /* Usually overwrite previous item */
3550
3551 /* If the maximum is zero then the minimum must also be zero; Perl allows
3552 this case, so we do too - by simply omitting the item altogether. */
3553
3554 if (repeat_max == 0) goto END_REPEAT;
3555
3556 /* All real repeats make it impossible to handle partial matching (maybe
3557 one day we will be able to remove this restriction). */
3558
3559 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3560
3561 /* Combine the op_type with the repeat_type */
3562
3563 repeat_type += op_type;
3564
3565 /* A minimum of zero is handled either as the special case * or ?, or as
3566 an UPTO, with the maximum given. */
3567
3568 if (repeat_min == 0)
3569 {
3570 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3571 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3572 else
3573 {
3574 *code++ = OP_UPTO + repeat_type;
3575 PUT2INC(code, 0, repeat_max);
3576 }
3577 }
3578
3579 /* A repeat minimum of 1 is optimized into some special cases. If the
3580 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3581 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3582 one less than the maximum. */
3583
3584 else if (repeat_min == 1)
3585 {
3586 if (repeat_max == -1)
3587 *code++ = OP_PLUS + repeat_type;
3588 else
3589 {
3590 code = oldcode; /* leave previous item in place */
3591 if (repeat_max == 1) goto END_REPEAT;
3592 *code++ = OP_UPTO + repeat_type;
3593 PUT2INC(code, 0, repeat_max - 1);
3594 }
3595 }
3596
3597 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3598 handled as an EXACT followed by an UPTO. */
3599
3600 else
3601 {
3602 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3603 PUT2INC(code, 0, repeat_min);
3604
3605 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3606 we have to insert the character for the previous code. For a repeated
3607 Unicode property match, there are two extra bytes that define the
3608 required property. In UTF-8 mode, long characters have their length in
3609 c, with the 0x80 bit as a flag. */
3610
3611 if (repeat_max < 0)
3612 {
3613 #ifdef SUPPORT_UTF8
3614 if (utf8 && c >= 128)
3615 {
3616 memcpy(code, utf8_char, c & 7);
3617 code += c & 7;
3618 }
3619 else
3620 #endif
3621 {
3622 *code++ = c;
3623 if (prop_type >= 0)
3624 {
3625 *code++ = prop_type;
3626 *code++ = prop_value;
3627 }
3628 }
3629 *code++ = OP_STAR + repeat_type;
3630 }
3631
3632 /* Else insert an UPTO if the max is greater than the min, again
3633 preceded by the character, for the previously inserted code. If the
3634 UPTO is just for 1 instance, we can use QUERY instead. */
3635
3636 else if (repeat_max != repeat_min)
3637 {
3638 #ifdef SUPPORT_UTF8
3639 if (utf8 && c >= 128)
3640 {
3641 memcpy(code, utf8_char, c & 7);
3642 code += c & 7;
3643 }
3644 else
3645 #endif
3646 *code++ = c;
3647 if (prop_type >= 0)
3648 {
3649 *code++ = prop_type;
3650 *code++ = prop_value;
3651 }
3652 repeat_max -= repeat_min;
3653
3654 if (repeat_max == 1)
3655 {
3656 *code++ = OP_QUERY + repeat_type;
3657 }
3658 else
3659 {
3660 *code++ = OP_UPTO + repeat_type;
3661 PUT2INC(code, 0, repeat_max);
3662 }
3663 }
3664 }
3665
3666 /* The character or character type itself comes last in all cases. */
3667
3668 #ifdef SUPPORT_UTF8
3669 if (utf8 && c >= 128)
3670 {
3671 memcpy(code, utf8_char, c & 7);
3672 code += c & 7;
3673 }
3674 else
3675 #endif
3676 *code++ = c;
3677
3678 /* For a repeated Unicode property match, there are two extra bytes that
3679 define the required property. */
3680
3681 #ifdef SUPPORT_UCP
3682 if (prop_type >= 0)
3683 {
3684 *code++ = prop_type;
3685 *code++ = prop_value;
3686 }
3687 #endif
3688 }
3689
3690 /* If previous was a character class or a back reference, we put the repeat
3691 stuff after it, but just skip the item if the repeat was {0,0}. */
3692
3693 else if (*previous == OP_CLASS ||
3694 *previous == OP_NCLASS ||
3695 #ifdef SUPPORT_UTF8
3696 *previous == OP_XCLASS ||
3697 #endif
3698 *previous == OP_REF)
3699 {
3700 if (repeat_max == 0)
3701 {
3702 code = previous;
3703 goto END_REPEAT;
3704 }
3705
3706 /* All real repeats make it impossible to handle partial matching (maybe
3707 one day we will be able to remove this restriction). */
3708
3709 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3710
3711 if (repeat_min == 0 && repeat_max == -1)
3712 *code++ = OP_CRSTAR + repeat_type;
3713 else if (repeat_min == 1 && repeat_max == -1)
3714 *code++ = OP_CRPLUS + repeat_type;
3715 else if (repeat_min == 0 && repeat_max == 1)
3716 *code++ = OP_CRQUERY + repeat_type;
3717 else
3718 {
3719 *code++ = OP_CRRANGE + repeat_type;
3720 PUT2INC(code, 0, repeat_min);
3721 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3722 PUT2INC(code, 0, repeat_max);
3723 }
3724 }
3725
3726 /* If previous was a bracket group, we may have to replicate it in certain
3727 cases. */
3728
3729 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3730 *previous == OP_ONCE || *previous == OP_COND)
3731 {
3732 register int i;
3733 int ketoffset = 0;
3734 int len = code - previous;
3735 uschar *bralink = NULL;
3736
3737 /* Repeating a DEFINE group is pointless */
3738
3739 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3740 {
3741 *errorcodeptr = ERR55;
3742 goto FAILED;
3743 }
3744
3745 /* If the maximum repeat count is unlimited, find the end of the bracket
3746 by scanning through from the start, and compute the offset back to it
3747 from the current code pointer. There may be an OP_OPT setting following
3748 the final KET, so we can't find the end just by going back from the code
3749 pointer. */
3750
3751 if (repeat_max == -1)
3752 {
3753 register uschar *ket = previous;
3754 do ket += GET(ket, 1); while (*ket != OP_KET);
3755 ketoffset = code - ket;
3756 }
3757
3758 /* The case of a zero minimum is special because of the need to stick
3759 OP_BRAZERO in front of it, and because the group appears once in the
3760 data, whereas in other cases it appears the minimum number of times. For
3761 this reason, it is simplest to treat this case separately, as otherwise
3762 the code gets far too messy. There are several special subcases when the
3763 minimum is zero. */
3764
3765 if (repeat_min == 0)
3766 {
3767 /* If the maximum is also zero, we just omit the group from the output
3768 altogether. */
3769
3770 if (repeat_max == 0)
3771 {
3772 code = previous;
3773 goto END_REPEAT;
3774 }
3775
3776 /* If the maximum is 1 or unlimited, we just have to stick in the
3777 BRAZERO and do no more at this point. However, we do need to adjust
3778 any OP_RECURSE calls inside the group that refer to the group itself or
3779 any internal or forward referenced group, because the offset is from
3780 the start of the whole regex. Temporarily terminate the pattern while
3781 doing this. */
3782
3783 if (repeat_max <= 1)
3784 {
3785 *code = OP_END;
3786 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3787 memmove(previous+1, previous, len);
3788 code++;
3789 *previous++ = OP_BRAZERO + repeat_type;
3790 }
3791
3792 /* If the maximum is greater than 1 and limited, we have to replicate
3793 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3794 The first one has to be handled carefully because it's the original
3795 copy, which has to be moved up. The remainder can be handled by code
3796 that is common with the non-zero minimum case below. We have to
3797 adjust the value of repeat_max, since one less copy is required. Once
3798 again, we may have to adjust any OP_RECURSE calls inside the group. */
3799
3800 else
3801 {
3802 int offset;
3803 *code = OP_END;
3804 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3805 memmove(previous + 2 + LINK_SIZE, previous, len);
3806 code += 2 + LINK_SIZE;
3807 *previous++ = OP_BRAZERO + repeat_type;
3808 *previous++ = OP_BRA;
3809
3810 /* We chain together the bracket offset fields that have to be
3811 filled in later when the ends of the brackets are reached. */
3812
3813 offset = (bralink == NULL)? 0 : previous - bralink;
3814 bralink = previous;
3815 PUTINC(previous, 0, offset);
3816 }
3817
3818 repeat_max--;
3819 }
3820
3821 /* If the minimum is greater than zero, replicate the group as many
3822 times as necessary, and adjust the maximum to the number of subsequent
3823 copies that we need. If we set a first char from the group, and didn't
3824 set a required char, copy the latter from the former. If there are any
3825 forward reference subroutine calls in the group, there will be entries on
3826 the workspace list; replicate these with an appropriate increment. */
3827
3828 else
3829 {
3830 if (repeat_min > 1)
3831 {
3832 /* In the pre-compile phase, we don't actually do the replication. We
3833 just adjust the length as if we had. Do some paranoid checks for
3834 potential integer overflow. */
3835
3836 if (lengthptr != NULL)
3837 {
3838 int delta = (repeat_min - 1)*length_prevgroup;
3839 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3840 (double)INT_MAX ||
3841 OFLOW_MAX - *lengthptr < delta)
3842 {
3843 *errorcodeptr = ERR20;
3844 goto FAILED;
3845 }
3846 *lengthptr += delta;
3847 }
3848
3849 /* This is compiling for real */
3850
3851 else
3852 {
3853 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3854 for (i = 1; i < repeat_min; i++)
3855 {
3856 uschar *hc;
3857 uschar *this_hwm = cd->hwm;
3858 memcpy(code, previous, len);
3859 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3860 {
3861 PUT(cd->hwm, 0, GET(hc, 0) + len);
3862 cd->hwm += LINK_SIZE;
3863 }
3864 save_hwm = this_hwm;
3865 code += len;
3866 }
3867 }
3868 }
3869
3870 if (repeat_max > 0) repeat_max -= repeat_min;
3871 }
3872
3873 /* This code is common to both the zero and non-zero minimum cases. If
3874 the maximum is limited, it replicates the group in a nested fashion,
3875 remembering the bracket starts on a stack. In the case of a zero minimum,
3876 the first one was set up above. In all cases the repeat_max now specifies
3877 the number of additional copies needed. Again, we must remember to
3878 replicate entries on the forward reference list. */
3879
3880 if (repeat_max >= 0)
3881 {
3882 /* In the pre-compile phase, we don't actually do the replication. We
3883 just adjust the length as if we had. For each repetition we must add 1
3884 to the length for BRAZERO and for all but the last repetition we must
3885 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3886 paranoid checks to avoid integer overflow. */
3887
3888 if (lengthptr != NULL && repeat_max > 0)
3889 {
3890 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3891 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3892 if ((double)repeat_max *
3893 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3894 > (double)INT_MAX ||
3895 OFLOW_MAX - *lengthptr < delta)
3896 {
3897 *errorcodeptr = ERR20;
3898 goto FAILED;
3899 }
3900 *lengthptr += delta;
3901 }
3902
3903 /* This is compiling for real */
3904
3905 else for (i = repeat_max - 1; i >= 0; i--)
3906 {
3907 uschar *hc;
3908 uschar *this_hwm = cd->hwm;
3909
3910 *code++ = OP_BRAZERO + repeat_type;
3911
3912 /* All but the final copy start a new nesting, maintaining the
3913 chain of brackets outstanding. */
3914
3915 if (i != 0)
3916 {
3917 int offset;
3918 *code++ = OP_BRA;
3919 offset = (bralink == NULL)? 0 : code - bralink;
3920 bralink = code;
3921 PUTINC(code, 0, offset);
3922 }
3923
3924 memcpy(code, previous, len);
3925 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3926 {
3927 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3928 cd->hwm += LINK_SIZE;
3929 }
3930 save_hwm = this_hwm;
3931 code += len;
3932 }
3933
3934 /* Now chain through the pending brackets, and fill in their length
3935 fields (which are holding the chain links pro tem). */
3936
3937 while (bralink != NULL)
3938 {
3939 int oldlinkoffset;
3940 int offset = code - bralink + 1;
3941 uschar *bra = code - offset;
3942 oldlinkoffset = GET(bra, 1);
3943 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3944 *code++ = OP_KET;
3945 PUTINC(code, 0, offset);
3946 PUT(bra, 1, offset);
3947 }
3948 }
3949
3950 /* If the maximum is unlimited, set a repeater in the final copy. We
3951 can't just offset backwards from the current code point, because we
3952 don't know if there's been an options resetting after the ket. The
3953 correct offset was computed above.
3954
3955 Then, when we are doing the actual compile phase, check to see whether
3956 this group is a non-atomic one that could match an empty string. If so,
3957 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3958 that runtime checking can be done. [This check is also applied to
3959 atomic groups at runtime, but in a different way.] */
3960
3961 else
3962 {
3963 uschar *ketcode = code - ketoffset;
3964 uschar *bracode = ketcode - GET(ketcode, 1);
3965 *ketcode = OP_KETRMAX + repeat_type;
3966 if (lengthptr == NULL && *bracode != OP_ONCE)
3967 {
3968 uschar *scode = bracode;
3969 do
3970 {
3971 if (could_be_empty_branch(scode, ketcode, utf8))
3972 {
3973 *bracode += OP_SBRA - OP_BRA;
3974 break;
3975 }
3976 scode += GET(scode, 1);
3977 }
3978 while (*scode == OP_ALT);
3979 }
3980 }
3981 }
3982
3983 /* Else there's some kind of shambles */
3984
3985 else
3986 {
3987 *errorcodeptr = ERR11;
3988 goto FAILED;
3989 }
3990
3991 /* If the character following a repeat is '+', or if certain optimization
3992 tests above succeeded, possessive_quantifier is TRUE. For some of the
3993 simpler opcodes, there is a special alternative opcode for this. For
3994 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3995 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3996 but the special opcodes can optimize it a bit. The repeated item starts at
3997 tempcode, not at previous, which might be the first part of a string whose
3998 (former) last char we repeated.
3999
4000 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4001 an 'upto' may follow. We skip over an 'exact' item, and then test the
4002 length of what remains before proceeding. */
4003
4004 if (possessive_quantifier)
4005 {
4006 int len;
4007 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4008 *tempcode == OP_NOTEXACT)
4009 tempcode += _pcre_OP_lengths[*tempcode];
4010 len = code - tempcode;
4011 if (len > 0) switch (*tempcode)
4012 {
4013 case OP_STAR: *tempcode = OP_POSSTAR; break;
4014 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4015 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4016 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4017
4018 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4019 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4020 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4021 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4022
4023 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4024 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4025 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4026 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4027
4028 default:
4029 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4030 code += 1 + LINK_SIZE;
4031 len += 1 + LINK_SIZE;
4032 tempcode[0] = OP_ONCE;
4033 *code++ = OP_KET;
4034 PUTINC(code, 0, len);
4035 PUT(tempcode, 1, len);
4036 break;
4037 }
4038 }
4039
4040 /* In all cases we no longer have a previous item. We also set the
4041 "follows varying string" flag for subsequently encountered reqbytes if
4042 it isn't already set and we have just passed a varying length item. */
4043
4044 END_REPEAT:
4045 previous = NULL;
4046 cd->req_varyopt |= reqvary;
4047 break;
4048
4049
4050 /* ===================================================================*/
4051 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4052 lookbehind or option setting or condition or all the other extended
4053 parenthesis forms. */
4054
4055 case '(':
4056 newoptions = options;
4057 skipbytes = 0;
4058 bravalue = OP_CBRA;
4059 save_hwm = cd->hwm;
4060 reset_bracount = FALSE;
4061
4062 /* First deal with various "verbs" that can be introduced by '*'. */
4063
4064 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4065 {
4066 int i, namelen;
4067 const char *vn = verbnames;
4068 const uschar *name = ++ptr;
4069 previous = NULL;
4070 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4071 if (*ptr == ':')
4072 {
4073 *errorcodeptr = ERR59; /* Not supported */
4074 goto FAILED;
4075 }
4076 if (*ptr != ')')
4077 {
4078 *errorcodeptr = ERR60;
4079 goto FAILED;
4080 }
4081 namelen = ptr - name;
4082 for (i = 0; i < verbcount; i++)
4083 {
4084 if (namelen == verbs[i].len &&
4085 strncmp((char *)name, vn, namelen) == 0)
4086 {
4087 *code = verbs[i].op;
4088 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4089 break;
4090 }
4091 vn += verbs[i].len + 1;
4092 }
4093 if (i < verbcount) continue;
4094 *errorcodeptr = ERR60;
4095 goto FAILED;
4096 }
4097
4098 /* Deal with the extended parentheses; all are introduced by '?', and the
4099 appearance of any of them means that this is not a capturing group. */
4100
4101 else if (*ptr == '?')
4102 {
4103 int i, set, unset, namelen;
4104 int *optset;
4105 const uschar *name;
4106 uschar *slot;
4107
4108 switch (*(++ptr))
4109 {
4110 case '#': /* Comment; skip to ket */
4111 ptr++;
4112 while (*ptr != 0 && *ptr != ')') ptr++;
4113 if (*ptr == 0)
4114 {
4115 *errorcodeptr = ERR18;
4116 goto FAILED;
4117 }
4118 continue;
4119
4120
4121 /* ------------------------------------------------------------ */
4122 case '|': /* Reset capture count for each branch */
4123 reset_bracount = TRUE;
4124 /* Fall through */
4125
4126 /* ------------------------------------------------------------ */
4127 case ':': /* Non-capturing bracket */
4128 bravalue = OP_BRA;
4129 ptr++;
4130 break;
4131
4132
4133 /* ------------------------------------------------------------ */
4134 case '(':
4135 bravalue = OP_COND; /* Conditional group */
4136
4137 /* A condition can be an assertion, a number (referring to a numbered
4138 group), a name (referring to a named group), or 'R', referring to
4139 recursion. R<digits> and R&name are also permitted for recursion tests.
4140
4141 There are several syntaxes for testing a named group: (?(name)) is used
4142 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4143
4144 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4145 be the recursive thing or the name 'R' (and similarly for 'R' followed
4146 by digits), and (b) a number could be a name that consists of digits.
4147 In both cases, we look for a name first; if not found, we try the other
4148 cases. */
4149
4150 /* For conditions that are assertions, check the syntax, and then exit
4151 the switch. This will take control down to where bracketed groups,
4152 including assertions, are processed. */
4153
4154 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4155 break;
4156
4157 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4158 below), and all need to skip 3 bytes at the start of the group. */
4159
4160 code[1+LINK_SIZE] = OP_CREF;
4161 skipbytes = 3;
4162 refsign = -1;
4163
4164 /* Check for a test for recursion in a named group. */
4165
4166 if (ptr[1] == 'R' && ptr[2] == '&')
4167 {
4168 terminator = -1;
4169 ptr += 2;
4170 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4171 }
4172
4173 /* Check for a test for a named group's having been set, using the Perl
4174 syntax (?(<name>) or (?('name') */
4175
4176 else if (ptr[1] == '<')
4177 {
4178 terminator = '>';
4179 ptr++;
4180 }
4181 else if (ptr[1] == '\'')
4182 {
4183 terminator = '\'';
4184 ptr++;
4185 }
4186 else
4187 {
4188 terminator = 0;
4189 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4190 }
4191
4192 /* We now expect to read a name; anything else is an error */
4193
4194 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4195 {
4196 ptr += 1; /* To get the right offset */
4197 *errorcodeptr = ERR28;
4198 goto FAILED;
4199 }
4200
4201 /* Read the name, but also get it as a number if it's all digits */
4202
4203 recno = 0;
4204 name = ++ptr;
4205 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4206 {
4207 if (recno >= 0)
4208 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4209 recno * 10 + *ptr - '0' : -1;
4210 ptr++;
4211 }
4212 namelen = ptr - name;
4213
4214 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4215 {
4216 ptr--; /* Error offset */
4217 *errorcodeptr = ERR26;
4218 goto FAILED;
4219 }
4220
4221 /* Do no further checking in the pre-compile phase. */
4222
4223 if (lengthptr != NULL) break;
4224
4225 /* In the real compile we do the work of looking for the actual
4226 reference. If the string started with "+" or "-" we require the rest to
4227 be digits, in which case recno will be set. */
4228
4229 if (refsign > 0)
4230 {
4231 if (recno <= 0)
4232 {
4233 *errorcodeptr = ERR58;
4234 goto FAILED;
4235 }
4236 if (refsign == '-')
4237 {
4238 recno = cd->bracount - recno + 1;
4239 if (recno <= 0)
4240 {
4241 *errorcodeptr = ERR15;
4242 goto FAILED;
4243 }
4244 }
4245 else recno += cd->bracount;
4246 PUT2(code, 2+LINK_SIZE, recno);
4247 break;
4248 }
4249
4250 /* Otherwise (did not start with "+" or "-"), start by looking for the
4251 name. */
4252
4253 slot = cd->name_table;
4254 for (i = 0; i < cd->names_found; i++)
4255 {
4256 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4257 slot += cd->name_entry_size;
4258 }
4259
4260 /* Found a previous named subpattern */
4261
4262 if (i < cd->names_found)
4263 {
4264 recno = GET2(slot, 0);
4265 PUT2(code, 2+LINK_SIZE, recno);
4266 }
4267
4268 /* Search the pattern for a forward reference */
4269
4270 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4271 (options & PCRE_EXTENDED) != 0)) > 0)
4272 {
4273 PUT2(code, 2+LINK_SIZE, i);
4274 }
4275
4276 /* If terminator == 0 it means that the name followed directly after
4277 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4278 some further alternatives to try. For the cases where terminator != 0
4279 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4280 now checked all the possibilities, so give an error. */
4281
4282 else if (terminator != 0)
4283 {
4284 *errorcodeptr = ERR15;
4285 goto FAILED;
4286 }
4287
4288 /* Check for (?(R) for recursion. Allow digits after R to specify a
4289 specific group number. */
4290
4291 else if (*name == 'R')
4292 {
4293 recno = 0;
4294 for (i = 1; i < namelen; i++)
4295 {
4296 if ((digitab[name[i]] & ctype_digit) == 0)
4297 {
4298 *errorcodeptr = ERR15;
4299 goto FAILED;
4300 }
4301 recno = recno * 10 + name[i] - '0';
4302 }
4303 if (recno == 0) recno = RREF_ANY;
4304 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4305 PUT2(code, 2+LINK_SIZE, recno);
4306 }
4307
4308 /* Similarly, check for the (?(DEFINE) "condition", which is always
4309 false. */
4310
4311 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4312 {
4313 code[1+LINK_SIZE] = OP_DEF;
4314 skipbytes = 1;
4315 }
4316
4317 /* Check for the "name" actually being a subpattern number. */
4318
4319 else if (recno > 0)
4320 {
4321 PUT2(code, 2+LINK_SIZE, recno);
4322 }
4323
4324 /* Either an unidentified subpattern, or a reference to (?(0) */
4325
4326 else
4327 {
4328 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4329 goto FAILED;
4330 }
4331 break;
4332
4333
4334 /* ------------------------------------------------------------ */
4335 case '=': /* Positive lookahead */
4336 bravalue = OP_ASSERT;
4337 ptr++;
4338 break;
4339
4340
4341 /* ------------------------------------------------------------ */
4342 case '!': /* Negative lookahead */
4343 ptr++;
4344 if (*ptr == ')') /* Optimize (?!) */
4345 {
4346 *code++ = OP_FAIL;
4347 previous = NULL;
4348 continue;
4349 }
4350 bravalue = OP_ASSERT_NOT;
4351 break;
4352
4353
4354 /* ------------------------------------------------------------ */
4355 case '<': /* Lookbehind or named define */
4356 switch (ptr[1])
4357 {
4358 case '=': /* Positive lookbehind */
4359 bravalue = OP_ASSERTBACK;
4360 ptr += 2;
4361 break;
4362
4363 case '!': /* Negative lookbehind */
4364 bravalue = OP_ASSERTBACK_NOT;
4365 ptr += 2;
4366 break;
4367
4368 default: /* Could be name define, else bad */
4369 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4370 ptr++; /* Correct offset for error */
4371 *errorcodeptr = ERR24;
4372 goto FAILED;
4373 }
4374 break;
4375
4376
4377 /* ------------------------------------------------------------ */
4378 case '>': /* One-time brackets */
4379 bravalue = OP_ONCE;
4380 ptr++;
4381 break;
4382
4383
4384 /* ------------------------------------------------------------ */
4385 case 'C': /* Callout - may be followed by digits; */
4386 previous_callout = code; /* Save for later completion */
4387 after_manual_callout = 1; /* Skip one item before completing */
4388 *code++ = OP_CALLOUT;
4389 {
4390 int n = 0;
4391 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4392 n = n * 10 + *ptr - '0';
4393 if (*ptr != ')')
4394 {
4395 *errorcodeptr = ERR39;
4396 goto FAILED;
4397 }
4398 if (n > 255)
4399 {
4400 *errorcodeptr = ERR38;
4401 goto FAILED;
4402 }
4403 *code++ = n;
4404 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4405 PUT(code, LINK_SIZE, 0); /* Default length */
4406 code += 2 * LINK_SIZE;
4407 }
4408 previous = NULL;
4409 continue;
4410
4411
4412 /* ------------------------------------------------------------ */
4413 case 'P': /* Python-style named subpattern handling */
4414 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4415 {
4416 is_recurse = *ptr == '>';
4417 terminator = ')';
4418 goto NAMED_REF_OR_RECURSE;
4419 }
4420 else if (*ptr != '<') /* Test for Python-style definition */
4421 {
4422 *errorcodeptr = ERR41;
4423 goto FAILED;
4424 }
4425 /* Fall through to handle (?P< as (?< is handled */
4426
4427
4428 /* ------------------------------------------------------------ */
4429 DEFINE_NAME: /* Come here from (?< handling */
4430 case '\'':
4431 {
4432 terminator = (*ptr == '<')? '>' : '\'';
4433 name = ++ptr;
4434
4435 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4436 namelen = ptr - name;
4437
4438 /* In the pre-compile phase, just do a syntax check. */
4439
4440 if (lengthptr != NULL)
4441 {
4442 if (*ptr != terminator)
4443 {
4444 *errorcodeptr = ERR42;
4445 goto FAILED;
4446 }
4447 if (cd->names_found >= MAX_NAME_COUNT)
4448 {
4449 *errorcodeptr = ERR49;
4450 goto FAILED;
4451 }
4452 if (namelen + 3 > cd->name_entry_size)
4453 {
4454 cd->name_entry_size = namelen + 3;
4455 if (namelen > MAX_NAME_SIZE)
4456 {
4457 *errorcodeptr = ERR48;
4458 goto FAILED;
4459 }
4460 }
4461 }
4462
4463 /* In the real compile, create the entry in the table */
4464
4465 else
4466 {
4467 slot = cd->name_table;
4468 for (i = 0; i < cd->names_found; i++)
4469 {
4470 int crc = memcmp(name, slot+2, namelen);
4471 if (crc == 0)
4472 {
4473 if (slot[2+namelen] == 0)
4474 {
4475 if ((options & PCRE_DUPNAMES) == 0)
4476 {
4477 *errorcodeptr = ERR43;
4478 goto FAILED;
4479 }
4480 }
4481 else crc = -1; /* Current name is substring */
4482 }
4483 if (crc < 0)
4484 {
4485 memmove(slot + cd->name_entry_size, slot,
4486 (cd->names_found - i) * cd->name_entry_size);
4487 break;
4488 }
4489 slot += cd->name_entry_size;
4490 }
4491
4492 PUT2(slot, 0, cd->bracount + 1);
4493 memcpy(slot + 2, name, namelen);
4494 slot[2+namelen] = 0;
4495 }
4496 }
4497
4498 /* In both cases, count the number of names we've encountered. */
4499
4500 ptr++; /* Move past > or ' */
4501 cd->names_found++;
4502 goto NUMBERED_GROUP;
4503
4504
4505 /* ------------------------------------------------------------ */
4506 case '&': /* Perl recursion/subroutine syntax */
4507 terminator = ')';
4508 is_recurse = TRUE;
4509 /* Fall through */
4510
4511 /* We come here from the Python syntax above that handles both
4512 references (?P=name) and recursion (?P>name), as well as falling
4513 through from the Perl recursion syntax (?&name). */
4514
4515 NAMED_REF_OR_RECURSE:
4516 name = ++ptr;
4517 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4518 namelen = ptr - name;
4519
4520 /* In the pre-compile phase, do a syntax check and set a dummy
4521 reference number. */
4522
4523 if (lengthptr != NULL)
4524 {
4525 if (*ptr != terminator)
4526 {
4527 *errorcodeptr = ERR42;
4528 goto FAILED;
4529 }
4530 if (namelen > MAX_NAME_SIZE)
4531 {
4532 *errorcodeptr = ERR48;
4533 goto FAILED;
4534 }
4535 recno = 0;
4536 }
4537
4538 /* In the real compile, seek the name in the table */
4539
4540 else
4541 {
4542 slot = cd->name_table;
4543 for (i = 0; i < cd->names_found; i++)
4544 {
4545 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4546 slot += cd->name_entry_size;
4547 }
4548
4549 if (i < cd->names_found) /* Back reference */
4550 {
4551 recno = GET2(slot, 0);
4552 }
4553 else if ((recno = /* Forward back reference */
4554 find_parens(ptr, cd->bracount, name, namelen,
4555 (options & PCRE_EXTENDED) != 0)) <= 0)
4556 {
4557 *errorcodeptr = ERR15;
4558 goto FAILED;
4559 }
4560 }
4561
4562 /* In both phases, we can now go to the code that handles numerical
4563 recursion or backreferences. */
4564
4565 if (is_recurse) goto HANDLE_RECURSION;
4566 else goto HANDLE_REFERENCE;
4567
4568
4569 /* ------------------------------------------------------------ */
4570 case 'R': /* Recursion */
4571 ptr++; /* Same as (?0) */
4572 /* Fall through */
4573
4574
4575 /* ------------------------------------------------------------ */
4576 case '-': case '+':
4577 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4578 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4579 {
4580 const uschar *called;
4581
4582 if ((refsign = *ptr) == '+') ptr++;
4583 else if (refsign == '-')
4584 {
4585 if ((digitab[ptr[1]] & ctype_digit) == 0)
4586 goto OTHER_CHAR_AFTER_QUERY;
4587 ptr++;
4588 }
4589
4590 recno = 0;
4591 while((digitab[*ptr] & ctype_digit) != 0)
4592 recno = recno * 10 + *ptr++ - '0';
4593
4594 if (*ptr != ')')
4595 {
4596 *errorcodeptr = ERR29;
4597 goto FAILED;
4598 }
4599
4600 if (refsign == '-')
4601 {
4602 if (recno == 0)
4603 {
4604 *errorcodeptr = ERR58;
4605 goto FAILED;
4606 }
4607 recno = cd->bracount - recno + 1;
4608 if (recno <= 0)
4609 {
4610 *errorcodeptr = ERR15;
4611 goto FAILED;
4612 }
4613 }
4614 else if (refsign == '+')
4615 {
4616 if (recno == 0)
4617 {
4618 *errorcodeptr = ERR58;
4619 goto FAILED;
4620 }
4621 recno += cd->bracount;
4622 }
4623
4624 /* Come here from code above that handles a named recursion */
4625
4626 HANDLE_RECURSION:
4627
4628 previous = code;
4629 called = cd->start_code;
4630
4631 /* When we are actually compiling, find the bracket that is being
4632 referenced. Temporarily end the regex in case it doesn't exist before
4633 this point. If we end up with a forward reference, first check that
4634 the bracket does occur later so we can give the error (and position)
4635 now. Then remember this forward reference in the workspace so it can
4636 be filled in at the end. */
4637
4638 if (lengthptr == NULL)
4639 {
4640 *code = OP_END;
4641 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4642
4643 /* Forward reference */
4644
4645 if (called == NULL)
4646 {
4647 if (find_parens(ptr, cd->bracount, NULL, recno,
4648 (options & PCRE_EXTENDED) != 0) < 0)
4649 {
4650 *errorcodeptr = ERR15;
4651 goto FAILED;
4652 }
4653 called = cd->start_code + recno;
4654 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4655 }
4656
4657 /* If not a forward reference, and the subpattern is still open,
4658 this is a recursive call. We check to see if this is a left
4659 recursion that could loop for ever, and diagnose that case. */
4660
4661 else if (GET(called, 1) == 0 &&
4662 could_be_empty(called, code, bcptr, utf8))
4663 {
4664 *errorcodeptr = ERR40;
4665 goto FAILED;
4666 }
4667 }
4668
4669 /* Insert the recursion/subroutine item, automatically wrapped inside
4670 "once" brackets. Set up a "previous group" length so that a
4671 subsequent quantifier will work. */
4672
4673 *code = OP_ONCE;
4674 PUT(code, 1, 2 + 2*LINK_SIZE);
4675 code += 1 + LINK_SIZE;
4676
4677 *code = OP_RECURSE;
4678 PUT(code, 1, called - cd->start_code);
4679 code += 1 + LINK_SIZE;
4680
4681 *code = OP_KET;
4682 PUT(code, 1, 2 + 2*LINK_SIZE);
4683 code += 1 + LINK_SIZE;
4684
4685 length_prevgroup = 3 + 3*LINK_SIZE;
4686 }
4687
4688 /* Can't determine a first byte now */
4689
4690 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4691 continue;
4692
4693
4694 /* ------------------------------------------------------------ */
4695 default: /* Other characters: check option setting */
4696 OTHER_CHAR_AFTER_QUERY:
4697 set = unset = 0;
4698 optset = &set;
4699
4700 while (*ptr != ')' && *ptr != ':')
4701 {
4702 switch (*ptr++)
4703 {
4704 case '-': optset = &unset; break;
4705
4706 case 'J': /* Record that it changed in the external options */
4707 *optset |= PCRE_DUPNAMES;
4708 cd->external_flags |= PCRE_JCHANGED;
4709 break;
4710
4711 case 'i': *optset |= PCRE_CASELESS; break;
4712 case 'm': *optset |= PCRE_MULTILINE; break;
4713 case 's': *optset |= PCRE_DOTALL; break;
4714 case 'x': *optset |= PCRE_EXTENDED; break;
4715 case 'U': *optset |= PCRE_UNGREEDY; break;
4716 case 'X': *optset |= PCRE_EXTRA; break;
4717
4718 default: *errorcodeptr = ERR12;
4719 ptr--; /* Correct the offset */
4720 goto FAILED;
4721 }
4722 }
4723
4724 /* Set up the changed option bits, but don't change anything yet. */
4725
4726 newoptions = (options | set) & (~unset);
4727
4728 /* If the options ended with ')' this is not the start of a nested
4729 group with option changes, so the options change at this level. If this
4730 item is right at the start of the pattern, the options can be
4731 abstracted and made external in the pre-compile phase, and ignored in
4732 the compile phase. This can be helpful when matching -- for instance in
4733 caseless checking of required bytes.
4734
4735 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4736 definitely *not* at the start of the pattern because something has been
4737 compiled. In the pre-compile phase, however, the code pointer can have
4738 that value after the start, because it gets reset as code is discarded
4739 during the pre-compile. However, this can happen only at top level - if
4740 we are within parentheses, the starting BRA will still be present. At
4741 any parenthesis level, the length value can be used to test if anything
4742 has been compiled at that level. Thus, a test for both these conditions
4743 is necessary to ensure we correctly detect the start of the pattern in
4744 both phases.
4745
4746 If we are not at the pattern start, compile code to change the ims
4747 options if this setting actually changes any of them. We also pass the
4748 new setting back so that it can be put at the start of any following
4749 branches, and when this group ends (if we are in a group), a resetting
4750 item can be compiled. */
4751
4752 if (*ptr == ')')
4753 {
4754 if (code == cd->start_code + 1 + LINK_SIZE &&
4755 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4756 {
4757 cd->external_options = newoptions;
4758 options = newoptions;
4759 }
4760 else
4761 {
4762 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4763 {
4764 *code++ = OP_OPT;
4765 *code++ = newoptions & PCRE_IMS;
4766 }
4767
4768 /* Change options at this level, and pass them back for use
4769 in subsequent branches. Reset the greedy defaults and the case
4770 value for firstbyte and reqbyte. */
4771
4772 *optionsptr = options = newoptions;
4773 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4774 greedy_non_default = greedy_default ^ 1;
4775 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4776 }
4777
4778 previous = NULL; /* This item can't be repeated */
4779 continue; /* It is complete */
4780 }
4781
4782 /* If the options ended with ':' we are heading into a nested group
4783 with possible change of options. Such groups are non-capturing and are
4784 not assertions of any kind. All we need to do is skip over the ':';
4785 the newoptions value is handled below. */
4786
4787 bravalue = OP_BRA;
4788 ptr++;
4789 } /* End of switch for character following (? */
4790 } /* End of (? handling */
4791
4792 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4793 all unadorned brackets become non-capturing and behave like (?:...)
4794 brackets. */
4795
4796 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4797 {
4798 bravalue = OP_BRA;
4799 }
4800
4801 /* Else we have a capturing group. */
4802
4803 else
4804 {
4805 NUMBERED_GROUP:
4806 cd->bracount += 1;
4807 PUT2(code, 1+LINK_SIZE, cd->bracount);
4808 skipbytes = 2;
4809 }
4810
4811 /* Process nested bracketed regex. Assertions may not be repeated, but
4812 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4813 non-register variable in order to be able to pass its address because some
4814 compilers complain otherwise. Pass in a new setting for the ims options if
4815 they have changed. */
4816
4817 previous = (bravalue >= OP_ONCE)? code : NULL;
4818 *code = bravalue;
4819 tempcode = code;
4820 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4821 length_prevgroup = 0; /* Initialize for pre-compile phase */
4822
4823 if (!compile_regex(
4824 newoptions, /* The complete new option state */
4825 options & PCRE_IMS, /* The previous ims option state */
4826 &tempcode, /* Where to put code (updated) */
4827 &ptr, /* Input pointer (updated) */
4828 errorcodeptr, /* Where to put an error message */
4829 (bravalue == OP_ASSERTBACK ||
4830 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4831 reset_bracount, /* True if (?| group */
4832 skipbytes, /* Skip over bracket number */
4833 &subfirstbyte, /* For possible first char */
4834 &subreqbyte, /* For possible last char */
4835 bcptr, /* Current branch chain */
4836 cd, /* Tables block */
4837 (lengthptr == NULL)? NULL : /* Actual compile phase */
4838 &length_prevgroup /* Pre-compile phase */
4839 ))
4840 goto FAILED;
4841
4842 /* At the end of compiling, code is still pointing to the start of the
4843 group, while tempcode has been updated to point past the end of the group
4844 and any option resetting that may follow it. The pattern pointer (ptr)
4845 is on the bracket. */
4846
4847 /* If this is a conditional bracket, check that there are no more than
4848 two branches in the group, or just one if it's a DEFINE group. We do this
4849 in the real compile phase, not in the pre-pass, where the whole group may
4850 not be available. */
4851
4852 if (bravalue == OP_COND && lengthptr == NULL)
4853 {
4854 uschar *tc = code;
4855 int condcount = 0;
4856
4857 do {
4858 condcount++;
4859 tc += GET(tc,1);
4860 }
4861 while (*tc != OP_KET);
4862
4863 /* A DEFINE group is never obeyed inline (the "condition" is always
4864 false). It must have only one branch. */
4865
4866 if (code[LINK_SIZE+1] == OP_DEF)
4867 {
4868 if (condcount > 1)
4869 {
4870 *errorcodeptr = ERR54;
4871 goto FAILED;
4872 }
4873 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4874 }
4875
4876 /* A "normal" conditional group. If there is just one branch, we must not
4877 make use of its firstbyte or reqbyte, because this is equivalent to an
4878 empty second branch. */
4879
4880 else
4881 {
4882 if (condcount > 2)
4883 {
4884 *errorcodeptr = ERR27;
4885 goto FAILED;
4886 }
4887 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4888 }
4889 }
4890
4891 /* Error if hit end of pattern */
4892
4893 if (*ptr != ')')
4894 {
4895 *errorcodeptr = ERR14;
4896 goto FAILED;
4897 }
4898
4899 /* In the pre-compile phase, update the length by the length of the group,
4900 less the brackets at either end. Then reduce the compiled code to just a
4901 set of non-capturing brackets so that it doesn't use much memory if it is
4902 duplicated by a quantifier.*/
4903
4904 if (lengthptr != NULL)
4905 {
4906 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4907 {
4908 *errorcodeptr = ERR20;
4909 goto FAILED;
4910 }
4911 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4912 *code++ = OP_BRA;
4913 PUTINC(code, 0, 1 + LINK_SIZE);
4914 *code++ = OP_KET;
4915 PUTINC(code, 0, 1 + LINK_SIZE);
4916 break; /* No need to waste time with special character handling */
4917 }
4918
4919 /* Otherwise update the main code pointer to the end of the group. */
4920
4921 code = tempcode;
4922
4923 /* For a DEFINE group, required and first character settings are not
4924 relevant. */
4925
4926 if (bravalue == OP_DEF) break;
4927
4928 /* Handle updating of the required and first characters for other types of
4929 group. Update for normal brackets of all kinds, and conditions with two
4930 branches (see code above). If the bracket is followed by a quantifier with
4931 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4932 zerofirstbyte outside the main loop so that they can be accessed for the
4933 back off. */
4934
4935 zeroreqbyte = reqbyte;
4936 zerofirstbyte = firstbyte;
4937 groupsetfirstbyte = FALSE;
4938
4939 if (bravalue >= OP_ONCE)
4940 {
4941 /* If we have not yet set a firstbyte in this branch, take it from the
4942 subpattern, remembering that it was set here so that a repeat of more
4943 than one can replicate it as reqbyte if necessary. If the subpattern has
4944 no firstbyte, set "none" for the whole branch. In both cases, a zero
4945 repeat forces firstbyte to "none". */
4946
4947 if (firstbyte == REQ_UNSET)
4948 {
4949 if (subfirstbyte >= 0)
4950 {
4951 firstbyte = subfirstbyte;
4952 groupsetfirstbyte = TRUE;
4953 }
4954 else firstbyte = REQ_NONE;
4955 zerofirstbyte = REQ_NONE;
4956 }
4957
4958 /* If firstbyte was previously set, convert the subpattern's firstbyte
4959 into reqbyte if there wasn't one, using the vary flag that was in
4960 existence beforehand. */
4961
4962 else if (subfirstbyte >= 0 && subreqbyte < 0)
4963 subreqbyte = subfirstbyte | tempreqvary;
4964
4965 /* If the subpattern set a required byte (or set a first byte that isn't
4966 really the first byte - see above), set it. */
4967
4968 if (subreqbyte >= 0) reqbyte = subreqbyte;
4969 }
4970
4971 /* For a forward assertion, we take the reqbyte, if set. This can be
4972 helpful if the pattern that follows the assertion doesn't set a different
4973 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4974 for an assertion, however, because it leads to incorrect effects for patterns
4975 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4976 of a firstbyte. This is overcome by a scan at the end if there's no
4977 firstbyte, looking for an asserted first char. */
4978
4979 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4980 break; /* End of processing '(' */
4981
4982
4983 /* ===================================================================*/
4984 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4985 are arranged to be the negation of the corresponding OP_values. For the
4986 back references, the values are ESC_REF plus the reference number. Only
4987 back references and those types that consume a character may be repeated.
4988 We can test for values between ESC_b and ESC_Z for the latter; this may
4989 have to change if any new ones are ever created. */
4990
4991 case '\\':
4992 tempptr = ptr;
4993 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4994 if (*errorcodeptr != 0) goto FAILED;
4995
4996 if (c < 0)
4997 {
4998 if (-c == ESC_Q) /* Handle start of quoted string */
4999 {
5000 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5001 else inescq = TRUE;
5002 continue;
5003 }
5004
5005 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5006
5007 /* For metasequences that actually match a character, we disable the
5008 setting of a first character if it hasn't already been set. */
5009
5010 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5011 firstbyte = REQ_NONE;
5012
5013 /* Set values to reset to if this is followed by a zero repeat. */
5014
5015 zerofirstbyte = firstbyte;
5016 zeroreqbyte = reqbyte;
5017
5018 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5019 We also support \k{name} (.NET syntax) */
5020
5021 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5022 {
5023 is_recurse = FALSE;
5024 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5025 goto NAMED_REF_OR_RECURSE;
5026 }
5027
5028 /* Back references are handled specially; must disable firstbyte if
5029 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5030 ':' later. */
5031
5032 if (-c >= ESC_REF)
5033 {
5034 recno = -c - ESC_REF;
5035
5036 HANDLE_REFERENCE: /* Come here from named backref handling */
5037 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5038 previous = code;
5039 *code++ = OP_REF;
5040 PUT2INC(code, 0, recno);
5041 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5042 if (recno > cd->top_backref) cd->top_backref = recno;
5043 }
5044
5045 /* So are Unicode property matches, if supported. */
5046
5047 #ifdef SUPPORT_UCP
5048 else if (-c == ESC_P || -c == ESC_p)
5049 {
5050 BOOL negated;
5051 int pdata;
5052 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5053 if (ptype < 0) goto FAILED;
5054 previous = code;
5055 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5056 *code++ = ptype;
5057 *code++ = pdata;
5058 }
5059 #else
5060
5061 /* If Unicode properties are not supported, \X, \P, and \p are not
5062 allowed. */
5063
5064 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5065 {
5066 *errorcodeptr = ERR45;
5067 goto FAILED;
5068 }
5069 #endif
5070
5071 /* For the rest (including \X when Unicode properties are supported), we
5072 can obtain the OP value by negating the escape value. */
5073
5074 else
5075 {
5076 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5077 *code++ = -c;
5078 }
5079 continue;
5080 }
5081
5082 /* We have a data character whose value is in c. In UTF-8 mode it may have
5083 a value > 127. We set its representation in the length/buffer, and then
5084 handle it as a data character. */
5085
5086 #ifdef SUPPORT_UTF8
5087 if (utf8 && c > 127)
5088 mclength = _pcre_ord2utf8(c, mcbuffer);
5089 else
5090 #endif
5091
5092 {
5093 mcbuffer[0] = c;
5094 mclength = 1;
5095 }
5096 goto ONE_CHAR;
5097
5098
5099 /* ===================================================================*/
5100 /* Handle a literal character. It is guaranteed not to be whitespace or #
5101 when the extended flag is set. If we are in UTF-8 mode, it may be a
5102 multi-byte literal character. */
5103
5104 default:
5105 NORMAL_CHAR:
5106 mclength = 1;
5107 mcbuffer[0] = c;
5108
5109 #ifdef SUPPORT_UTF8
5110 if (utf8 && c >= 0xc0)
5111 {
5112 while ((ptr[1] & 0xc0) == 0x80)
5113 mcbuffer[mclength++] = *(++ptr);
5114 }
5115 #endif
5116
5117 /* At this point we have the character's bytes in mcbuffer, and the length
5118 in mclength. When not in UTF-8 mode, the length is always 1. */
5119
5120 ONE_CHAR:
5121 previous = code;
5122 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5123 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5124
5125 /* Remember if \r or \n were seen */
5126
5127 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5128 cd->external_flags |= PCRE_HASCRORLF;
5129
5130 /* Set the first and required bytes appropriately. If no previous first
5131 byte, set it from this character, but revert to none on a zero repeat.
5132 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5133 repeat. */
5134
5135 if (firstbyte == REQ_UNSET)
5136 {
5137 zerofirstbyte = REQ_NONE;
5138 zeroreqbyte = reqbyte;
5139
5140 /* If the character is more than one byte long, we can set firstbyte
5141 only if it is not to be matched caselessly. */
5142
5143 if (mclength == 1 || req_caseopt == 0)
5144 {
5145 firstbyte = mcbuffer[0] | req_caseopt;
5146 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5147 }
5148 else firstbyte = reqbyte = REQ_NONE;
5149 }
5150
5151 /* firstbyte was previously set; we can set reqbyte only if the length is
5152 1 or the matching is caseful. */
5153
5154 else
5155 {
5156 zerofirstbyte = firstbyte;
5157 zeroreqbyte = reqbyte;
5158 if (mclength == 1 || req_caseopt == 0)
5159 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5160 }
5161
5162 break; /* End of literal character handling */
5163 }
5164 } /* end of big loop */
5165
5166
5167 /* Control never reaches here by falling through, only by a goto for all the
5168 error states. Pass back the position in the pattern so that it can be displayed
5169 to the user for diagnosing the error. */
5170
5171 FAILED:
5172 *ptrptr = ptr;
5173 return FALSE;
5174 }
5175
5176
5177
5178
5179 /*************************************************
5180 * Compile sequence of alternatives *
5181 *************************************************/
5182
5183 /* On entry, ptr is pointing past the bracket character, but on return it
5184 points to the closing bracket, or vertical bar, or end of string. The code
5185 variable is pointing at the byte into which the BRA operator has been stored.
5186 If the ims options are changed at the start (for a (?ims: group) or during any
5187 branch, we need to insert an OP_OPT item at the start of every following branch
5188 to ensure they get set correctly at run time, and also pass the new options
5189 into every subsequent branch compile.
5190
5191 This function is used during the pre-compile phase when we are trying to find
5192 out the amount of memory needed, as well as during the real compile phase. The
5193 value of lengthptr distinguishes the two phases.
5194
5195 Arguments:
5196 options option bits, including any changes for this subpattern
5197 oldims previous settings of ims option bits
5198 codeptr -> the address of the current code pointer
5199 ptrptr -> the address of the current pattern pointer
5200 errorcodeptr -> pointer to error code variable
5201 lookbehind TRUE if this is a lookbehind assertion
5202 reset_bracount TRUE to reset the count for each branch
5203 skipbytes skip this many bytes at start (for brackets and OP_COND)
5204 firstbyteptr place to put the first required character, or a negative number
5205 reqbyteptr place to put the last required character, or a negative number
5206 bcptr pointer to the chain of currently open branches
5207 cd points to the data block with tables pointers etc.
5208 lengthptr NULL during the real compile phase
5209 points to length accumulator during pre-compile phase
5210
5211 Returns: TRUE on success
5212 */
5213
5214 static BOOL
5215 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5216 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5217 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5218 int *lengthptr)
5219 {
5220 const uschar *ptr = *ptrptr;
5221 uschar *code = *codeptr;
5222 uschar *last_branch = code;
5223 uschar *start_bracket = code;
5224 uschar *reverse_count = NULL;
5225 int firstbyte, reqbyte;
5226 int branchfirstbyte, branchreqbyte;
5227 int length;
5228 int orig_bracount;
5229 int max_bracount;
5230 branch_chain bc;
5231
5232 bc.outer = bcptr;
5233 bc.current = code;
5234
5235 firstbyte = reqbyte = REQ_UNSET;
5236
5237 /* Accumulate the length for use in the pre-compile phase. Start with the
5238 length of the BRA and KET and any extra bytes that are required at the
5239 beginning. We accumulate in a local variable to save frequent testing of
5240 lenthptr for NULL. We cannot do this by looking at the value of code at the
5241 start and end of each alternative, because compiled items are discarded during
5242 the pre-compile phase so that the work space is not exceeded. */
5243
5244 length = 2 + 2*LINK_SIZE + skipbytes;
5245
5246 /* WARNING: If the above line is changed for any reason, you must also change
5247 the code that abstracts option settings at the start of the pattern and makes
5248 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5249 pre-compile phase to find out whether anything has yet been compiled or not. */
5250
5251 /* Offset is set zero to mark that this bracket is still open */
5252
5253 PUT(code, 1, 0);
5254 code += 1 + LINK_SIZE + skipbytes;
5255
5256 /* Loop for each alternative branch */
5257
5258 orig_bracount = max_bracount = cd->bracount;
5259 for (;;)
5260 {
5261 /* For a (?| group, reset the capturing bracket count so that each branch
5262 uses the same numbers. */
5263
5264 if (reset_bracount) cd->bracount = orig_bracount;
5265
5266 /* Handle a change of ims options at the start of the branch */
5267
5268 if ((options & PCRE_IMS) != oldims)
5269 {
5270 *code++ = OP_OPT;
5271 *code++ = options & PCRE_IMS;
5272 length += 2;
5273 }
5274
5275 /* Set up dummy OP_REVERSE if lookbehind assertion */
5276
5277 if (lookbehind)
5278 {
5279 *code++ = OP_REVERSE;
5280 reverse_count = code;
5281 PUTINC(code, 0, 0);
5282 length += 1 + LINK_SIZE;
5283 }
5284
5285 /* Now compile the branch; in the pre-compile phase its length gets added
5286 into the length. */
5287
5288 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5289 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5290 {
5291 *ptrptr = ptr;
5292 return FALSE;
5293 }
5294
5295 /* Keep the highest bracket count in case (?| was used and some branch
5296 has fewer than the rest. */
5297
5298 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5299
5300 /* In the real compile phase, there is some post-processing to be done. */
5301
5302 if (lengthptr == NULL)
5303 {
5304 /* If this is the first branch, the firstbyte and reqbyte values for the
5305 branch become the values for the regex. */
5306
5307 if (*last_branch != OP_ALT)
5308 {
5309 firstbyte = branchfirstbyte;
5310 reqbyte = branchreqbyte;
5311 }
5312
5313 /* If this is not the first branch, the first char and reqbyte have to
5314 match the values from all the previous branches, except that if the
5315 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5316 and we set REQ_VARY for the regex. */
5317
5318 else
5319 {
5320 /* If we previously had a firstbyte, but it doesn't match the new branch,
5321 we have to abandon the firstbyte for the regex, but if there was
5322 previously no reqbyte, it takes on the value of the old firstbyte. */
5323
5324 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5325 {
5326 if (reqbyte < 0) reqbyte = firstbyte;
5327 firstbyte = REQ_NONE;
5328 }
5329
5330 /* If we (now or from before) have no firstbyte, a firstbyte from the
5331 branch becomes a reqbyte if there isn't a branch reqbyte. */
5332
5333 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5334 branchreqbyte = branchfirstbyte;
5335
5336 /* Now ensure that the reqbytes match */
5337
5338 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5339 reqbyte = REQ_NONE;
5340 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5341 }
5342
5343 /* If lookbehind, check that this branch matches a fixed-length string, and
5344 put the length into the OP_REVERSE item. Temporarily mark the end of the
5345 branch with OP_END. */
5346
5347 if (lookbehind)
5348 {
5349 int fixed_length;
5350 *code = OP_END;
5351 fixed_length = find_fixedlength(last_branch, options);
5352 DPRINTF(("fixed length = %d\n", fixed_length));
5353 if (fixed_length < 0)
5354 {
5355 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5356 *ptrptr = ptr;
5357 return FALSE;
5358 }
5359 PUT(reverse_count, 0, fixed_length);
5360 }
5361 }
5362
5363 /* Reached end of expression, either ')' or end of pattern. In the real
5364 compile phase, go back through the alternative branches and reverse the chain
5365 of offsets, with the field in the BRA item now becoming an offset to the
5366 first alternative. If there are no alternatives, it points to the end of the
5367 group. The length in the terminating ket is always the length of the whole
5368 bracketed item. If any of the ims options were changed inside the group,
5369 compile a resetting op-code following, except at the very end of the pattern.
5370 Return leaving the pointer at the terminating char. */
5371
5372 if (*ptr != '|')
5373 {
5374 if (lengthptr == NULL)
5375 {
5376 int branch_length = code - last_branch;
5377 do
5378 {
5379 int prev_length = GET(last_branch, 1);
5380 PUT(last_branch, 1, branch_length);
5381 branch_length = prev_length;
5382 last_branch -= branch_length;
5383 }
5384 while (branch_length > 0);
5385 }
5386
5387 /* Fill in the ket */
5388
5389 *code = OP_KET;
5390 PUT(code, 1, code - start_bracket);
5391 code += 1 + LINK_SIZE;
5392
5393 /* Resetting option if needed */
5394
5395 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5396 {
5397 *code++ = OP_OPT;
5398 *code++ = oldims;
5399 length += 2;
5400 }
5401
5402 /* Retain the highest bracket number, in case resetting was used. */
5403
5404 cd->bracount = max_bracount;
5405
5406 /* Set values to pass back */
5407
5408 *codeptr = code;
5409 *ptrptr = ptr;
5410 *firstbyteptr = firstbyte;
5411 *reqbyteptr = reqbyte;
5412 if (lengthptr != NULL)
5413 {
5414 if (OFLOW_MAX - *lengthptr < length)
5415 {
5416 *errorcodeptr = ERR20;
5417 return FALSE;
5418 }
5419 *lengthptr += length;
5420 }
5421 return TRUE;
5422 }
5423
5424 /* Another branch follows. In the pre-compile phase, we can move the code
5425 pointer back to where it was for the start of the first branch. (That is,
5426 pretend that each branch is the only one.)
5427
5428 In the real compile phase, insert an ALT node. Its length field points back
5429 to the previous branch while the bracket remains open. At the end the chain
5430 is reversed. It's done like this so that the start of the bracket has a
5431 zero offset until it is closed, making it possible to detect recursion. */
5432
5433 if (lengthptr != NULL)
5434 {
5435 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5436 length += 1 + LINK_SIZE;
5437 }
5438 else
5439 {
5440 *code = OP_ALT;
5441 PUT(code, 1, code - last_branch);
5442 bc.current = last_branch = code;
5443 code += 1 + LINK_SIZE;
5444 }
5445
5446 ptr++;
5447 }
5448 /* Control never reaches here */
5449 }
5450
5451
5452
5453
5454 /*************************************************
5455 * Check for anchored expression *
5456 *************************************************/
5457
5458 /* Try to find out if this is an anchored regular expression. Consider each
5459 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5460 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5461 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5462 counts, since OP_CIRC can match in the middle.
5463
5464 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5465 This is the code for \G, which means "match at start of match position, taking
5466 into account the match offset".
5467
5468 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5469 because that will try the rest of the pattern at all possible matching points,
5470 so there is no point trying again.... er ....
5471
5472 .... except when the .* appears inside capturing parentheses, and there is a
5473 subsequent back reference to those parentheses. We haven't enough information
5474 to catch that case precisely.
5475
5476 At first, the best we could do was to detect when .* was in capturing brackets
5477 and the highest back reference was greater than or equal to that level.
5478 However, by keeping a bitmap of the first 31 back references, we can catch some
5479 of the more common cases more precisely.
5480
5481 Arguments:
5482 code points to start of expression (the bracket)
5483 options points to the options setting
5484 bracket_map a bitmap of which brackets we are inside while testing; this
5485 handles up to substring 31; after that we just have to take
5486 the less precise approach
5487 backref_map the back reference bitmap
5488
5489 Returns: TRUE or FALSE
5490 */
5491
5492 static BOOL
5493 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5494 unsigned int backref_map)
5495 {
5496 do {
5497 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5498 options, PCRE_MULTILINE, FALSE);
5499 register int op = *scode;
5500
5501 /* Non-capturing brackets */
5502
5503 if (op == OP_BRA)
5504 {
5505 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5506 }
5507
5508 /* Capturing brackets */
5509
5510 else if (op == OP_CBRA)
5511 {
5512 int n = GET2(scode, 1+LINK_SIZE);
5513 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5514 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5515 }
5516
5517 /* Other brackets */
5518
5519 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5520 {
5521 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5522 }
5523
5524 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5525 are or may be referenced. */
5526
5527 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5528 op == OP_TYPEPOSSTAR) &&
5529 (*options & PCRE_DOTALL) != 0)
5530 {
5531 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5532 }
5533
5534 /* Check for explicit anchoring */
5535
5536 else if (op != OP_SOD && op != OP_SOM &&
5537 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5538 return FALSE;
5539 code += GET(code, 1);
5540 }
5541 while (*code == OP_ALT); /* Loop for each alternative */
5542 return TRUE;
5543 }
5544
5545
5546
5547 /*************************************************
5548 * Check for starting with ^ or .* *
5549 *************************************************/
5550
5551 /* This is called to find out if every branch starts with ^ or .* so that
5552 "first char" processing can be done to speed things up in multiline
5553 matching and for non-DOTALL patterns that start with .* (which must start at
5554 the beginning or after \n). As in the case of is_anchored() (see above), we
5555 have to take account of back references to capturing brackets that contain .*
5556 because in that case we can't make the assumption.
5557
5558 Arguments:
5559 code points to start of expression (the bracket)
5560 bracket_map a bitmap of which brackets we are inside while testing; this
5561 handles up to substring 31; after that we just have to take
5562 the less precise approach
5563 backref_map the back reference bitmap
5564
5565 Returns: TRUE or FALSE
5566 */
5567
5568 static BOOL
5569 is_startline(const uschar *code, unsigned int bracket_map,
5570 unsigned int backref_map)
5571 {
5572 do {
5573 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5574 NULL, 0, FALSE);
5575 register int op = *scode;
5576
5577 /* Non-capturing brackets */
5578
5579 if (op == OP_BRA)
5580 {
5581 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5582 }
5583
5584 /* Capturing brackets */
5585
5586 else if (op == OP_CBRA)
5587 {
5588 int n = GET2(scode, 1+LINK_SIZE);
5589 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5590 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5591 }
5592
5593 /* Other brackets */
5594
5595 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5596 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5597
5598 /* .* means "start at start or after \n" if it isn't in brackets that
5599 may be referenced. */
5600
5601 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5602 {
5603 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5604 }
5605
5606 /* Check for explicit circumflex */
5607
5608 else if (op != OP_CIRC) return FALSE;
5609
5610 /* Move on to the next alternative */
5611
5612 code += GET(code, 1);
5613 }
5614 while (*code == OP_ALT); /* Loop for each alternative */
5615 return TRUE;
5616 }
5617
5618
5619
5620 /*************************************************
5621 * Check for asserted fixed first char *
5622 *************************************************/
5623
5624 /* During compilation, the "first char" settings from forward assertions are
5625 discarded, because they can cause conflicts with actual literals that follow.
5626 However, if we end up without a first char setting for an unanchored pattern,
5627 it is worth scanning the regex to see if there is an initial asserted first
5628 char. If all branches start with the same asserted char, or with a bracket all
5629 of whose alternatives start with the same asserted char (recurse ad lib), then
5630 we return that char, otherwise -1.
5631
5632 Arguments:
5633 code points to start of expression (the bracket)
5634 options pointer to the options (used to check casing changes)
5635 inassert TRUE if in an assertion
5636
5637 Returns: -1 or the fixed first char
5638 */
5639
5640 static int
5641 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5642 {
5643 register int c = -1;
5644 do {
5645 int d;
5646 const uschar *scode =
5647 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5648 register int op = *scode;
5649
5650 switch(op)
5651 {
5652 default:
5653 return -1;
5654
5655 case OP_BRA:
5656 case OP_CBRA:
5657 case OP_ASSERT:
5658 case OP_ONCE:
5659 case OP_COND:
5660 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5661 return -1;
5662 if (c < 0) c = d; else if (c != d) return -1;
5663 break;
5664
5665 case OP_EXACT: /* Fall through */
5666 scode += 2;
5667
5668 case OP_CHAR:
5669 case OP_CHARNC:
5670 case OP_PLUS:
5671 case OP_MINPLUS:
5672 case OP_POSPLUS:
5673 if (!inassert) return -1;
5674 if (c < 0)
5675 {
5676 c = scode[1];
5677 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5678 }
5679 else if (c != scode[1]) return -1;
5680 break;
5681 }
5682
5683 code += GET(code, 1);
5684 }
5685 while (*code == OP_ALT);
5686 return c;
5687 }
5688
5689
5690
5691 /*************************************************
5692 * Compile a Regular Expression *
5693 *************************************************/
5694
5695 /* This function takes a string and returns a pointer to a block of store
5696 holding a compiled version of the expression. The original API for this
5697 function had no error code return variable; it is retained for backwards
5698 compatibility. The new function is given a new name.
5699
5700 Arguments:
5701 pattern the regular expression
5702 options various option bits
5703 errorcodeptr pointer to error code variable (pcre_compile2() only)
5704 can be NULL if you don't want a code value
5705 errorptr pointer to pointer to error text
5706 erroroffset ptr offset in pattern where error was detected
5707 tables pointer to character tables or NULL
5708
5709 Returns: pointer to compiled data block, or NULL on error,
5710 with errorptr and erroroffset set
5711 */
5712
5713 PCRE_EXP_DEFN pcre *
5714 pcre_compile(const char *pattern, int options, const char **errorptr,
5715 int *erroroffset, const unsigned char *tables)
5716 {
5717 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5718 }
5719
5720
5721 PCRE_EXP_DEFN pcre *
5722 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5723 const char **errorptr, int *erroroffset, const unsigned char *tables)
5724 {
5725 real_pcre *re;
5726 int length = 1; /* For final END opcode */
5727 int firstbyte, reqbyte, newline;
5728 int errorcode = 0;
5729 int skipatstart = 0;
5730 #ifdef SUPPORT_UTF8
5731 BOOL utf8;
5732 #endif
5733 size_t size;
5734 uschar *code;
5735 const uschar *codestart;
5736 const uschar *ptr;
5737 compile_data compile_block;
5738 compile_data *cd = &compile_block;
5739
5740 /* This space is used for "compiling" into during the first phase, when we are
5741 computing the amount of memory that is needed. Compiled items are thrown away
5742 as soon as possible, so that a fairly large buffer should be sufficient for
5743 this purpose. The same space is used in the second phase for remembering where
5744 to fill in forward references to subpatterns. */
5745
5746 uschar cworkspace[COMPILE_WORK_SIZE];
5747
5748
5749 /* Set this early so that early errors get offset 0. */
5750
5751 ptr = (const uschar *)pattern;
5752
5753 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5754 can do is just return NULL, but we can set a code value if there is a code
5755 pointer. */
5756
5757 if (errorptr == NULL)
5758 {
5759 if (errorcodeptr != NULL) *errorcodeptr = 99;
5760 return NULL;
5761 }
5762
5763 *errorptr = NULL;
5764 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5765
5766 /* However, we can give a message for this error */
5767
5768 if (erroroffset == NULL)
5769 {
5770 errorcode = ERR16;
5771 goto PCRE_EARLY_ERROR_RETURN2;
5772 }
5773
5774 *erroroffset = 0;
5775
5776 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5777
5778 #ifdef SUPPORT_UTF8
5779 utf8 = (options & PCRE_UTF8) != 0;
5780 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5781 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5782 {
5783 errorcode = ERR44;
5784 goto PCRE_EARLY_ERROR_RETURN2;
5785 }
5786 #else
5787 if ((options & PCRE_UTF8) != 0)
5788 {
5789 errorcode = ERR32;
5790 goto PCRE_EARLY_ERROR_RETURN;
5791 }
5792 #endif
5793
5794 if ((options & ~PUBLIC_OPTIONS) != 0)
5795 {
5796 errorcode = ERR17;
5797 goto PCRE_EARLY_ERROR_RETURN;
5798 }
5799
5800 /* Set up pointers to the individual character tables */
5801
5802 if (tables == NULL) tables = _pcre_default_tables;
5803 cd->lcc = tables + lcc_offset;
5804 cd->fcc = tables + fcc_offset;
5805 cd->cbits = tables + cbits_offset;
5806 cd->ctypes = tables + ctypes_offset;
5807
5808 /* Check for global one-time settings at the start of the pattern, and remember
5809 the offset for later. */
5810
5811 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5812 {
5813 int newnl = 0;
5814 int newbsr = 0;
5815
5816 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5817 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5818 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5819 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5820 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5821 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5822 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5823 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5824 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
5825 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5826
5827 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5828 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5829 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5830 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5831
5832 if (newnl != 0)
5833 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5834 else if (newbsr != 0)
5835 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5836 else break;
5837 }
5838
5839 /* Check validity of \R options. */
5840
5841 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5842 {
5843 case 0:
5844 case PCRE_BSR_ANYCRLF:
5845 case PCRE_BSR_UNICODE:
5846 break;
5847 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5848 }
5849
5850 /* Handle different types of newline. The three bits give seven cases. The
5851 current code allows for fixed one- or two-byte sequences, plus "any" and
5852 "anycrlf". */
5853
5854 switch (options & PCRE_NEWLINE_BITS)
5855 {
5856 case 0: newline = NEWLINE; break; /* Build-time default */
5857 case PCRE_NEWLINE_CR: newline = '\r'; break;
5858 case PCRE_NEWLINE_LF: newline = '\n'; break;
5859 case PCRE_NEWLINE_CR+
5860 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5861 case PCRE_NEWLINE_ANY: newline = -1; break;
5862 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5863 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5864 }
5865
5866 if (newline == -2)
5867 {
5868 cd->nltype = NLTYPE_ANYCRLF;
5869 }
5870 else if (newline < 0)
5871 {
5872 cd->nltype = NLTYPE_ANY;
5873 }
5874 else
5875 {
5876 cd->nltype = NLTYPE_FIXED;
5877 if (newline > 255)
5878 {
5879 cd->nllen = 2;
5880 cd->nl[0] = (newline >> 8) & 255;
5881 cd->nl[1] = newline & 255;
5882 }
5883 else
5884 {
5885 cd->nllen = 1;
5886 cd->nl[0] = newline;
5887 }
5888 }
5889
5890 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5891 references to help in deciding whether (.*) can be treated as anchored or not.
5892 */
5893
5894 cd->top_backref = 0;
5895 cd->backref_map = 0;
5896
5897 /* Reflect pattern for debugging output */
5898
5899 DPRINTF(("------------------------------------------------------------------\n"));
5900 DPRINTF(("%s\n", pattern));
5901
5902 /* Pretend to compile the pattern while actually just accumulating the length
5903 of memory required. This behaviour is triggered by passing a non-NULL final
5904 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5905 to compile parts of the pattern into; the compiled code is discarded when it is
5906 no longer needed, so hopefully this workspace will never overflow, though there
5907 is a test for its doing so. */
5908
5909 cd->bracount = 0;
5910 cd->names_found = 0;
5911 cd->name_entry_size = 0;
5912 cd->name_table = NULL;
5913 cd->start_workspace = cworkspace;
5914 cd->start_code = cworkspace;
5915 cd->hwm = cworkspace;
5916 cd->start_pattern = (const uschar *)pattern;
5917 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5918 cd->req_varyopt = 0;
5919 cd->external_options = options;
5920 cd->external_flags = 0;
5921
5922 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5923 don't need to look at the result of the function here. The initial options have
5924 been put into the cd block so that they can be changed if an option setting is
5925 found within the regex right at the beginning. Bringing initial option settings
5926 outside can help speed up starting point checks. */
5927
5928 ptr += skipatstart;
5929 code = cworkspace;
5930 *code = OP_BRA;
5931 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5932 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5933 &length);
5934 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5935
5936 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5937 cd->hwm - cworkspace));
5938
5939 if (length > MAX_PATTERN_SIZE)
5940 {
5941 errorcode = ERR20;
5942 goto PCRE_EARLY_ERROR_RETURN;
5943 }
5944
5945 /* Compute the size of data block needed and get it, either from malloc or
5946 externally provided function. Integer overflow should no longer be possible
5947 because nowadays we limit the maximum value of cd->names_found and
5948 cd->name_entry_size. */
5949
5950 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5951 re = (real_pcre *)(pcre_malloc)(size);
5952
5953 if (re == NULL)
5954 {
5955 errorcode = ERR21;
5956 goto PCRE_EARLY_ERROR_RETURN;
5957 }
5958
5959 /* Put in the magic number, and save the sizes, initial options, internal
5960 flags, and character table pointer. NULL is used for the default character
5961 tables. The nullpad field is at the end; it's there to help in the case when a
5962 regex compiled on a system with 4-byte pointers is run on another with 8-byte
5963 pointers. */
5964
5965 re->magic_number = MAGIC_NUMBER;
5966 re->size = size;
5967 re->options = cd->external_options;
5968 re->flags = cd->external_flags;
5969 re->dummy1 = 0;
5970 re->first_byte = 0;
5971 re->req_byte = 0;
5972 re->name_table_offset = sizeof(real_pcre);
5973 re->name_entry_size = cd->name_entry_size;
5974 re->name_count = cd->names_found;
5975 re->ref_count = 0;
5976 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5977 re->nullpad = NULL;
5978
5979 /* The starting points of the name/number translation table and of the code are
5980 passed around in the compile data block. The start/end pattern and initial
5981 options are already set from the pre-compile phase, as is the name_entry_size
5982 field. Reset the bracket count and the names_found field. Also reset the hwm
5983 field; this time it's used for remembering forward references to subpatterns.
5984 */
5985
5986 cd->bracount = 0;
5987 cd->names_found = 0;
5988 cd->name_table = (uschar *)re + re->name_table_offset;
5989 codestart = cd->name_table + re->name_entry_size * re->name_count;
5990 cd->start_code = codestart;
5991 cd->hwm = cworkspace;
5992 cd->req_varyopt = 0;
5993 cd->had_accept = FALSE;
5994
5995 /* Set up a starting, non-extracting bracket, then compile the expression. On
5996 error, errorcode will be set non-zero, so we don't need to look at the result
5997 of the function here. */
5998
5999 ptr = (const uschar *)pattern + skipatstart;
6000 code = (uschar *)codestart;
6001 *code = OP_BRA;
6002 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6003 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6004 re->top_bracket = cd->bracount;
6005 re->top_backref = cd->top_backref;
6006 re->flags = cd->external_flags;
6007
6008 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6009
6010 /* If not reached end of pattern on success, there's an excess bracket. */
6011
6012 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6013
6014 /* Fill in the terminating state and check for disastrous overflow, but
6015 if debugging, leave the test till after things are printed out. */
6016
6017 *code++ = OP_END;
6018
6019 #ifndef DEBUG
6020 if (code - codestart > length) errorcode = ERR23;
6021 #endif
6022
6023 /* Fill in any forward references that are required. */
6024
6025 while (errorcode == 0 && cd->hwm > cworkspace)
6026 {
6027 int offset, recno;
6028 const uschar *groupptr;
6029 cd->hwm -= LINK_SIZE;
6030 offset = GET(cd->hwm, 0);
6031 recno = GET(codestart, offset);
6032 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6033 if (groupptr == NULL) errorcode = ERR53;
6034 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6035 }
6036
6037 /* Give an error if there's back reference to a non-existent capturing
6038 subpattern. */
6039
6040 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6041
6042 /* Failed to compile, or error while post-processing */
6043
6044 if (errorcode != 0)
6045 {
6046 (pcre_free)(re);
6047 PCRE_EARLY_ERROR_RETURN:
6048 *erroroffset = ptr - (const uschar *)pattern;
6049 PCRE_EARLY_ERROR_RETURN2:
6050 *errorptr = find_error_text(errorcode);
6051 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6052 return NULL;
6053 }
6054
6055 /* If the anchored option was not passed, set the flag if we can determine that
6056 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6057 as starting with .* when DOTALL is set).
6058
6059 Otherwise, if we know what the first byte has to be, save it, because that
6060 speeds up unanchored matches no end. If not, see if we can set the
6061 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6062 start with ^. and also when all branches start with .* for non-DOTALL matches.
6063 */
6064
6065 if ((re->options & PCRE_ANCHORED) == 0)
6066 {
6067 int temp_options = re->options; /* May get changed during these scans */
6068 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6069 re->options |= PCRE_ANCHORED;
6070 else
6071 {
6072 if (firstbyte < 0)
6073 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6074 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6075 {
6076 int ch = firstbyte & 255;
6077 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6078 cd->fcc[ch] == ch)? ch : firstbyte;
6079 re->flags |= PCRE_FIRSTSET;
6080 }
6081 else if (is_startline(codestart, 0, cd->backref_map))
6082 re->flags |= PCRE_STARTLINE;
6083 }
6084 }
6085
6086 /* For an anchored pattern, we use the "required byte" only if it follows a
6087 variable length item in the regex. Remove the caseless flag for non-caseable
6088 bytes. */
6089
6090 if (reqbyte >= 0 &&
6091 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6092 {
6093 int ch = reqbyte & 255;
6094 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6095 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6096 re->flags |= PCRE_REQCHSET;
6097 }
6098
6099 /* Print out the compiled data if debugging is enabled. This is never the
6100 case when building a production library. */
6101
6102 #ifdef DEBUG
6103
6104 printf("Length = %d top_bracket = %d top_backref = %d\n",
6105 length, re->top_bracket, re->top_backref);
6106
6107 printf("Options=%08x\n", re->options);
6108
6109 if ((re->flags & PCRE_FIRSTSET) != 0)
6110 {
6111 int ch = re->first_byte & 255;
6112 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6113 "" : " (caseless)";
6114 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6115 else printf("First char = \\x%02x%s\n", ch, caseless);
6116 }
6117
6118 if ((re->flags & PCRE_REQCHSET) != 0)
6119 {
6120 int ch = re->req_byte & 255;
6121 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6122 "" : " (caseless)";
6123 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6124 else printf("Req char = \\x%02x%s\n", ch, caseless);
6125 }
6126
6127 pcre_printint(re, stdout, TRUE);
6128
6129 /* This check is done here in the debugging case so that the code that
6130 was compiled can be seen. */
6131
6132 if (code - codestart > length)
6133 {
6134 (pcre_free)(re);
6135 *errorptr = find_error_text(ERR23);
6136 *erroroffset = ptr - (uschar *)pattern;
6137 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6138 return NULL;
6139 }
6140 #endif /* DEBUG */
6141
6142 return (pcre *)re;
6143 }
6144
6145 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12