/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 264 - (show annotations) (download)
Tue Nov 13 11:07:16 2007 UTC (7 years ago) by ph10
File MIME type: text/plain
File size: 194262 byte(s)
Fix [\S] etc. bug in UTF-8 mode with characters > 255.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte-array bitmap a. Both arguments are parenthesized in the expansion so
that an expression argument, as in SETBIT(map, c + 1), selects the intended
byte and bit. The bit number is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) (a)[(b)/8] |= (1 << ((b)%8))
67
/* Upper limit used when checking that the integer holding the compiled pattern
length is not about to overflow. It is set a little below INT_MAX so that the
bytes that terminate a group can be added without a separate check each time. */

#define OFLOW_MAX (INT_MAX - 20)
74
75
/*************************************************
*      Code parameters and static tables         *
*************************************************/

/* Size of the stack workspace used by the first, pre-compile phase, whose job
is to find out how much memory the compiled pattern needs. The pattern is
partly compiled into this space, and compiled parts are thrown away as soon as
possible, so an overrun should never happen; the code nevertheless checks for
one. The largest usage observed is 218, so this value is very generous.

The second, real compile phase re-uses the workspace to remember forward
references to groups so that they can be filled in at the end. Each entry in
that list takes LINK_SIZE bytes, so there is plenty of room even when LINK_SIZE
is 4. */

#define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
146
147 typedef struct verbitem {
148 int len;
149 int op;
150 } verbitem;
151
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
160
161 static verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
169 };
170
171 static int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence. */
179
180 static const char posix_names[] =
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
184
185 static const uschar posix_name_lengths[] = {
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
197
198 static const int posix_class_maps[] = {
199 cbit_word, cbit_digit, -2, /* alpha */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii */
204 cbit_space, -1, 1, /* blank - a GNU extension */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
213 };
214
215
/* Two-level stringification, so that a macro argument is expanded before it
is turned into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of the compile-time error messages. They are "char *" because they
are passed to the outside world. Error numbers are documented, so a number must
never be re-used: always add a new message at the end. Messages marked DEAD are
no longer used. To reduce the number of relocations needed when a shared
library is loaded dynamically, the messages are held in one long string, each
one terminated by a binary zero, rather than in a table of strings. A table of
offsets cannot be used because the lengths of inserts such as
XSTRING(MAX_NAME_SIZE) are not known. Instead, the code just counts through to
the wanted message; that is not a performance problem because it happens only
when there is a compilation error. The marker comments give the number of the
message that follows them. */

static const char error_texts[] =
  /* 0 */
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  "internal error: unexpected repeat\0"
  "unrecognized character after (?\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?(\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is not compiled with PCRE_UTF8 support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{...} sequence is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced name or an optionally braced non-zero number\0"
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
  "(*VERB) with an argument is not supported\0"
  /* 60 */
  "(*VERB) not recognized\0"
  "number is too big";
304
305
/* Table that identifies decimal and hexadecimal digits while a pattern is
being compiled. The tables in chartables depend on the locale and may mark
arbitrary characters as digits, but the compiling code must treat only 0-9,
a-f and A-F specially, which is why there is a private table here. It costs 256
bytes, but it is a lot faster than doing character value tests (at least in
some simple cases that were timed), and some applications want PCRE to compile
efficiently as well as match efficiently.

For convenience, the bit definitions are the same as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

so that ctype_digit and ctype_xdigit can be used in the code. A decimal digit
is also a hexadecimal digit, hence the value 0x0c for 0-9. */

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   0 -   7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   8 -  15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  16 -  23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  24 -  31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* space - ' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   ( - /   */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,    /*   0 - 7   */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,    /*   8 - ?   */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,    /*   @ - G   */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   H - O   */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   P - W   */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   X - _   */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,    /*   ` - g   */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   h - o   */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   p - w   */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   x - 127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 128 - 135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 136 - 143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 144 - 151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 152 - 159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 160 - 167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 168 - 175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 176 - 183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 184 - 191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 192 - 199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 200 - 207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 208 - 215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 216 - 223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 224 - 231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 232 - 239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 240 - 247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};   /* 248 - 255 */

#else           /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   0 -   7   (00) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   8 -  15        */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  16 -  23   (10) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  24 -  31        */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  32 -  39   (20) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  40 -  47        */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  48 -  55   (30) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  56 -  63        */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* space - 71  (40) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  72 -  |         */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   & -  87   (50) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  88 -  95        */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   - - 103   (60) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 104 -  ?         */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 112 - 119   (70) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 120 -  "         */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,    /* 128 -  g    (80) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   h - 143        */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 144 -  p    (90) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   q - 159        */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 160 -  x    (A0) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   y - 175        */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   ^ - 183   (B0) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 184 - 191        */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,    /*   { -  G    (C0) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   H - 207        */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   } -  P    (D0) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   Q - 223        */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   \ -  X    (E0) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   Y - 239        */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,    /*   0 -  7    (F0) */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};   /*   8 - 255        */

/* Partial duplicate of the character-type table for EBCDIC systems.
check_escape() tests the 0x0E bits of an entry to decide whether the character
that follows a backslash is alphameric. */

static const unsigned char ebcdic_chartab[] = {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,    /*   0 -   7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,    /*   8 -  15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,    /*  16 -  23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  24 -  31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,    /*  32 -  39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  40 -  47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  48 -  55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*  56 -  63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* space - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,    /*  72 -  |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   & -  87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,    /*  88 -  95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   - - 103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,    /* 104 -  ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 112 - 119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 120 -  "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,    /* 128 -  g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,    /*   h - 143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,    /* 144 -  p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,    /*   q - 159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,    /* 160 -  x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,    /*   y - 175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /*   ^ - 183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,    /* 184 - 191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,    /*   { -  G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,    /*   H - 207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,    /*   } -  P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,    /*   Q - 223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,    /*   \ -  X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,    /*   Y - 239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,    /*   0 -  7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};   /*   8 - 255 */
#endif
428
429
430 /* Definition to allow mutual recursion */
431
432 static BOOL
433 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
434 int *, int *, branch_chain *, compile_data *, int *);
435
436
437
438 /*************************************************
439 * Find an error text *
440 *************************************************/
441
442 /* The error texts are now all in one long string, to save on relocations. As
443 some of the text is of unknown length, we can't use a table of offsets.
444 Instead, just count through the strings. This is not a performance issue
445 because it happens only when there has been a compilation error.
446
447 Argument: the error number
448 Returns: pointer to the error string
449 */
450
451 static const char *
452 find_error_text(int n)
453 {
454 const char *s = error_texts;
455 for (; n > 0; n--) while (*s++ != 0);
456 return s;
457 }
458
459
460 /*************************************************
461 * Handle escapes *
462 *************************************************/
463
464 /* This function is called when a \ has been encountered. It either returns a
465 positive value for a simple escape such as \n, or a negative value which
466 encodes one of the more complicated things such as \d. A backreference to group
467 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
468 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
469 ptr is pointing at the \. On exit, it is on the final character of the escape
470 sequence.
471
472 Arguments:
473 ptrptr points to the pattern position pointer
474 errorcodeptr points to the errorcode variable
475 bracount number of previous extracting brackets
476 options the options bits
477 isclass TRUE if inside a character class
478
479 Returns: zero or positive => a data character
480 negative => a special escape sequence
481 on error, errorcodeptr is set
482 */
483
484 static int
485 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
486 int options, BOOL isclass)
487 {
488 BOOL utf8 = (options & PCRE_UTF8) != 0;
489 const uschar *ptr = *ptrptr + 1;
490 int c, i;
491
492 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
493 ptr--; /* Set pointer back to the last byte */
494
495 /* If backslash is at the end of the pattern, it's an error. */
496
497 if (c == 0) *errorcodeptr = ERR1;
498
499 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
500 a table. A non-zero result is something that can be returned immediately.
501 Otherwise further processing may be required. */
502
503 #ifndef EBCDIC /* ASCII coding */
504 else if (c < '0' || c > 'z') {} /* Not alphameric */
505 else if ((i = escapes[c - '0']) != 0) c = i;
506
507 #else /* EBCDIC coding */
508 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
509 else if ((i = escapes[c - 0x48]) != 0) c = i;
510 #endif
511
512 /* Escapes that need further processing, or are illegal. */
513
514 else
515 {
516 const uschar *oldptr;
517 BOOL braced, negated;
518
519 switch (c)
520 {
521 /* A number of Perl escapes are not handled by PCRE. We give an explicit
522 error. */
523
524 case 'l':
525 case 'L':
526 case 'N':
527 case 'u':
528 case 'U':
529 *errorcodeptr = ERR37;
530 break;
531
532 /* \g must be followed by a number, either plain or braced. If positive, it
533 is an absolute backreference. If negative, it is a relative backreference.
534 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
535 reference to a named group. This is part of Perl's movement towards a
536 unified syntax for back references. As this is synonymous with \k{name}, we
537 fudge it up by pretending it really was \k. */
538
539 case 'g':
540 if (ptr[1] == '{')
541 {
542 const uschar *p;
543 for (p = ptr+2; *p != 0 && *p != '}'; p++)
544 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
545 if (*p != 0 && *p != '}')
546 {
547 c = -ESC_k;
548 break;
549 }
550 braced = TRUE;
551 ptr++;
552 }
553 else braced = FALSE;
554
555 if (ptr[1] == '-')
556 {
557 negated = TRUE;
558 ptr++;
559 }
560 else negated = FALSE;
561
562 c = 0;
563 while ((digitab[ptr[1]] & ctype_digit) != 0)
564 c = c * 10 + *(++ptr) - '0';
565
566 if (c < 0)
567 {
568 *errorcodeptr = ERR61;
569 break;
570 }
571
572 if (c == 0 || (braced && *(++ptr) != '}'))
573 {
574 *errorcodeptr = ERR57;
575 break;
576 }
577
578 if (negated)
579 {
580 if (c > bracount)
581 {
582 *errorcodeptr = ERR15;
583 break;
584 }
585 c = bracount - (c - 1);
586 }
587
588 c = -(ESC_REF + c);
589 break;
590
591 /* The handling of escape sequences consisting of a string of digits
592 starting with one that is not zero is not straightforward. By experiment,
593 the way Perl works seems to be as follows:
594
595 Outside a character class, the digits are read as a decimal number. If the
596 number is less than 10, or if there are that many previous extracting
597 left brackets, then it is a back reference. Otherwise, up to three octal
598 digits are read to form an escaped byte. Thus \123 is likely to be octal
599 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
600 value is greater than 377, the least significant 8 bits are taken. Inside a
601 character class, \ followed by a digit is always an octal number. */
602
603 case '1': case '2': case '3': case '4': case '5':
604 case '6': case '7': case '8': case '9':
605
606 if (!isclass)
607 {
608 oldptr = ptr;
609 c -= '0';
610 while ((digitab[ptr[1]] & ctype_digit) != 0)
611 c = c * 10 + *(++ptr) - '0';
612 if (c < 0)
613 {
614 *errorcodeptr = ERR61;
615 break;
616 }
617 if (c < 10 || c <= bracount)
618 {
619 c = -(ESC_REF + c);
620 break;
621 }
622 ptr = oldptr; /* Put the pointer back and fall through */
623 }
624
625 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
626 generates a binary zero byte and treats the digit as a following literal.
627 Thus we have to pull back the pointer by one. */
628
629 if ((c = *ptr) >= '8')
630 {
631 ptr--;
632 c = 0;
633 break;
634 }
635
636 /* \0 always starts an octal number, but we may drop through to here with a
637 larger first octal digit. The original code used just to take the least
638 significant 8 bits of octal numbers (I think this is what early Perls used
639 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
640 than 3 octal digits. */
641
642 case '0':
643 c -= '0';
644 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
645 c = c * 8 + *(++ptr) - '0';
646 if (!utf8 && c > 255) *errorcodeptr = ERR51;
647 break;
648
649 /* \x is complicated. \x{ddd} is a character number which can be greater
650 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
651 treated as a data character. */
652
653 case 'x':
654 if (ptr[1] == '{')
655 {
656 const uschar *pt = ptr + 2;
657 int count = 0;
658
659 c = 0;
660 while ((digitab[*pt] & ctype_xdigit) != 0)
661 {
662 register int cc = *pt++;
663 if (c == 0 && cc == '0') continue; /* Leading zeroes */
664 count++;
665
666 #ifndef EBCDIC /* ASCII coding */
667 if (cc >= 'a') cc -= 32; /* Convert to upper case */
668 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
669 #else /* EBCDIC coding */
670 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
671 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
672 #endif
673 }
674
675 if (*pt == '}')
676 {
677 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
678 ptr = pt;
679 break;
680 }
681
682 /* If the sequence of hex digits does not end with '}', then we don't
683 recognize this construct; fall through to the normal \x handling. */
684 }
685
686 /* Read just a single-byte hex-defined char */
687
688 c = 0;
689 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
690 {
691 int cc; /* Some compilers don't like ++ */
692 cc = *(++ptr); /* in initializers */
693 #ifndef EBCDIC /* ASCII coding */
694 if (cc >= 'a') cc -= 32; /* Convert to upper case */
695 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
696 #else /* EBCDIC coding */
697 if (cc <= 'z') cc += 64; /* Convert to upper case */
698 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
699 #endif
700 }
701 break;
702
703 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
704 This coding is ASCII-specific, but then the whole concept of \cx is
705 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
706
707 case 'c':
708 c = *(++ptr);
709 if (c == 0)
710 {
711 *errorcodeptr = ERR2;
712 break;
713 }
714
715 #ifndef EBCDIC /* ASCII coding */
716 if (c >= 'a' && c <= 'z') c -= 32;
717 c ^= 0x40;
718 #else /* EBCDIC coding */
719 if (c >= 'a' && c <= 'z') c += 64;
720 c ^= 0xC0;
721 #endif
722 break;
723
724 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
725 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
726 for Perl compatibility, it is a literal. This code looks a bit odd, but
727 there used to be some cases other than the default, and there may be again
728 in future, so I haven't "optimized" it. */
729
730 default:
731 if ((options & PCRE_EXTRA) != 0) switch(c)
732 {
733 default:
734 *errorcodeptr = ERR3;
735 break;
736 }
737 break;
738 }
739 }
740
741 *ptrptr = ptr;
742 return c;
743 }
744
745
746
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, the pattern
pointer is on the P or p. On exit, it is on the final character of the escape
sequence. The property name is either the single character that follows, or a
name of up to 31 characters enclosed in {}, optionally preceded by ^ for
negation. The name is looked up in the _pcre_utt table.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence or ERR47 for an unknown property name

Returns:         the type field of the matching table entry,
                   or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *p = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++p);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

/* A single following character is the whole name */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* Otherwise there is a name in {}, optionally preceded by ^ for negation.
Running out of pattern, or out of room in the name buffer, before the closing
brace is found is an error. */

else
  {
  if (p[1] == '^')
    {
    *negptr = TRUE;
    p++;
    }

  len = 0;
  for (;;)
    {
    if (len >= (int)sizeof(name) - 1) goto MALFORMED;
    ch = *(++p);
    if (ch == 0) goto MALFORMED;
    if (ch == '}') break;
    name[len++] = ch;
    }
  name[len] = 0;
  }

*ptrptr = p;

/* Search for a recognized property name using binary chop */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The name is well formed, but is not in the table */

*errorcodeptr = ERR47;
*ptrptr = p;
return -1;

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = p;
return -1;
}
#endif
836
837
838
839
840 /*************************************************
841 * Check for counted repeat *
842 *************************************************/
843
844 /* This function is called when a '{' is encountered in a place where it might
845 start a quantifier. It looks ahead to see if it really is a quantifier or not.
846 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
847 where the ddds are digits.
848
849 Arguments:
850 p pointer to the first char after '{'
851
852 Returns: TRUE or FALSE
853 */
854
855 static BOOL
856 is_counted_repeat(const uschar *p)
857 {
858 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
859 while ((digitab[*p] & ctype_digit) != 0) p++;
860 if (*p == '}') return TRUE;
861
862 if (*p++ != ',') return FALSE;
863 if (*p == '}') return TRUE;
864
865 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
866 while ((digitab[*p] & ctype_digit) != 0) p++;
867
868 return (*p == '}');
869 }
870
871
872
873 /*************************************************
874 * Read repeat counts *
875 *************************************************/
876
877 /* Read an item of the form {n,m} and return the values. This is called only
878 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
879 so the syntax is guaranteed to be correct, but we need to check the values.
880
881 Arguments:
882 p pointer to first char after '{'
883 minp pointer to int for min
884 maxp pointer to int for max
885 returned as -1 if no max
886 errorcodeptr points to error code variable
887
888 Returns: pointer to '}' on success;
889 current ptr on error, with errorcodeptr set non-zero
890 */
891
892 static const uschar *
893 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
894 {
895 int min = 0;
896 int max = -1;
897
898 /* Read the minimum value and do a paranoid check: a negative value indicates
899 an integer overflow. */
900
901 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
902 if (min < 0 || min > 65535)
903 {
904 *errorcodeptr = ERR5;
905 return p;
906 }
907
908 /* Read the maximum value if there is one, and again do a paranoid on its size.
909 Also, max must not be less than min. */
910
911 if (*p == '}') max = min; else
912 {
913 if (*(++p) != '}')
914 {
915 max = 0;
916 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
917 if (max < 0 || max > 65535)
918 {
919 *errorcodeptr = ERR5;
920 return p;
921 }
922 if (max < min)
923 {
924 *errorcodeptr = ERR4;
925 return p;
926 }
927 }
928 }
929
930 /* Fill in the required variables, and pass back the pointer to the terminating
931 '}'. */
932
933 *minp = min;
934 *maxp = max;
935 return p;
936 }
937
938
939
940 /*************************************************
941 * Find forward referenced subpattern *
942 *************************************************/
943
944 /* This function scans along a pattern's text looking for capturing
945 subpatterns, and counting them. If it finds a named pattern that matches the
946 name it is given, it returns its number. Alternatively, if the name is NULL, it
947 returns when it reaches a given numbered subpattern. This is used for forward
948 references to subpatterns. We know that if (?P< is encountered, the name will
949 be terminated by '>' because that is checked in the first pass.
950
951 Arguments:
952 ptr current position in the pattern
953 count current count of capturing parens so far encountered
954 name name to seek, or NULL if seeking a numbered subpattern
955 lorn name length, or subpattern number if name is NULL
956 xmode TRUE if we are in /x mode
957
958 Returns: the number of the named subpattern, or -1 if not found
959 */
960
961 static int
962 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
963 BOOL xmode)
964 {
965 const uschar *thisname;
966
967 for (; *ptr != 0; ptr++)
968 {
969 int term;
970
971 /* Skip over backslashed characters and also entire \Q...\E */
972
973 if (*ptr == '\\')
974 {
975 if (*(++ptr) == 0) return -1;
976 if (*ptr == 'Q') for (;;)
977 {
978 while (*(++ptr) != 0 && *ptr != '\\');
979 if (*ptr == 0) return -1;
980 if (*(++ptr) == 'E') break;
981 }
982 continue;
983 }
984
985 /* Skip over character classes */
986
987 if (*ptr == '[')
988 {
989 while (*(++ptr) != ']')
990 {
991 if (*ptr == 0) return -1;
992 if (*ptr == '\\')
993 {
994 if (*(++ptr) == 0) return -1;
995 if (*ptr == 'Q') for (;;)
996 {
997 while (*(++ptr) != 0 && *ptr != '\\');
998 if (*ptr == 0) return -1;
999 if (*(++ptr) == 'E') break;
1000 }
1001 continue;
1002 }
1003 }
1004 continue;
1005 }
1006
1007 /* Skip comments in /x mode */
1008
1009 if (xmode && *ptr == '#')
1010 {
1011 while (*(++ptr) != 0 && *ptr != '\n');
1012 if (*ptr == 0) return -1;
1013 continue;
1014 }
1015
1016 /* An opening parens must now be a real metacharacter */
1017
1018 if (*ptr != '(') continue;
1019 if (ptr[1] != '?' && ptr[1] != '*')
1020 {
1021 count++;
1022 if (name == NULL && count == lorn) return count;
1023 continue;
1024 }
1025
1026 ptr += 2;
1027 if (*ptr == 'P') ptr++; /* Allow optional P */
1028
1029 /* We have to disambiguate (?<! and (?<= from (?<name> */
1030
1031 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1032 *ptr != '\'')
1033 continue;
1034
1035 count++;
1036
1037 if (name == NULL && count == lorn) return count;
1038 term = *ptr++;
1039 if (term == '<') term = '>';
1040 thisname = ptr;
1041 while (*ptr != term) ptr++;
1042 if (name != NULL && lorn == ptr - thisname &&
1043 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1044 return count;
1045 }
1046
1047 return -1;
1048 }
1049
1050
1051
1052 /*************************************************
1053 * Find first significant op code *
1054 *************************************************/
1055
1056 /* This is called by several functions that scan a compiled expression looking
1057 for a fixed first character, or an anchoring op code etc. It skips over things
1058 that do not influence this. For some calls, a change of option is important.
1059 For some calls, it makes sense to skip negative forward and all backward
1060 assertions, and also the \b assertion; for others it does not.
1061
1062 Arguments:
1063 code pointer to the start of the group
1064 options pointer to external options
1065 optbit the option bit whose changing is significant, or
1066 zero if none are
1067 skipassert TRUE if certain assertions are to be skipped
1068
1069 Returns: pointer to the first significant opcode
1070 */
1071
1072 static const uschar*
1073 first_significant_code(const uschar *code, int *options, int optbit,
1074 BOOL skipassert)
1075 {
1076 for (;;)
1077 {
1078 switch ((int)*code)
1079 {
1080 case OP_OPT:
1081 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1082 *options = (int)code[1];
1083 code += 2;
1084 break;
1085
1086 case OP_ASSERT_NOT:
1087 case OP_ASSERTBACK:
1088 case OP_ASSERTBACK_NOT:
1089 if (!skipassert) return code;
1090 do code += GET(code, 1); while (*code == OP_ALT);
1091 code += _pcre_OP_lengths[*code];
1092 break;
1093
1094 case OP_WORD_BOUNDARY:
1095 case OP_NOT_WORD_BOUNDARY:
1096 if (!skipassert) return code;
1097 /* Fall through */
1098
1099 case OP_CALLOUT:
1100 case OP_CREF:
1101 case OP_RREF:
1102 case OP_DEF:
1103 code += _pcre_OP_lengths[*code];
1104 break;
1105
1106 default:
1107 return code;
1108 }
1109 }
1110 /* Control never reaches here */
1111 }
1112
1113
1114
1115
1116 /*************************************************
1117 * Find the fixed length of a pattern *
1118 *************************************************/
1119
1120 /* Scan a pattern and compute the fixed length of subject that will match it,
1121 if the length is fixed. This is needed for dealing with backward assertions.
1122 In UTF8 mode, the result is in characters rather than bytes.
1123
1124 Arguments:
1125 code points to the start of the pattern (the bracket)
1126 options the compiling options
1127
1128 Returns: the fixed length, or -1 if there is no fixed length,
1129 or -2 if \C was encountered
1130 */
1131
1132 static int
1133 find_fixedlength(uschar *code, int options)
1134 {
1135 int length = -1;
1136
1137 register int branchlength = 0;
1138 register uschar *cc = code + 1 + LINK_SIZE;
1139
1140 /* Scan along the opcodes for this branch. If we get to the end of the
1141 branch, check the length against that of the other branches. */
1142
1143 for (;;)
1144 {
1145 int d;
1146 register int op = *cc;
1147 switch (op)
1148 {
1149 case OP_CBRA:
1150 case OP_BRA:
1151 case OP_ONCE:
1152 case OP_COND:
1153 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1154 if (d < 0) return d;
1155 branchlength += d;
1156 do cc += GET(cc, 1); while (*cc == OP_ALT);
1157 cc += 1 + LINK_SIZE;
1158 break;
1159
1160 /* Reached end of a branch; if it's a ket it is the end of a nested
1161 call. If it's ALT it is an alternation in a nested call. If it is
1162 END it's the end of the outer call. All can be handled by the same code. */
1163
1164 case OP_ALT:
1165 case OP_KET:
1166 case OP_KETRMAX:
1167 case OP_KETRMIN:
1168 case OP_END:
1169 if (length < 0) length = branchlength;
1170 else if (length != branchlength) return -1;
1171 if (*cc != OP_ALT) return length;
1172 cc += 1 + LINK_SIZE;
1173 branchlength = 0;
1174 break;
1175
1176 /* Skip over assertive subpatterns */
1177
1178 case OP_ASSERT:
1179 case OP_ASSERT_NOT:
1180 case OP_ASSERTBACK:
1181 case OP_ASSERTBACK_NOT:
1182 do cc += GET(cc, 1); while (*cc == OP_ALT);
1183 /* Fall through */
1184
1185 /* Skip over things that don't match chars */
1186
1187 case OP_REVERSE:
1188 case OP_CREF:
1189 case OP_RREF:
1190 case OP_DEF:
1191 case OP_OPT:
1192 case OP_CALLOUT:
1193 case OP_SOD:
1194 case OP_SOM:
1195 case OP_EOD:
1196 case OP_EODN:
1197 case OP_CIRC:
1198 case OP_DOLL:
1199 case OP_NOT_WORD_BOUNDARY:
1200 case OP_WORD_BOUNDARY:
1201 cc += _pcre_OP_lengths[*cc];
1202 break;
1203
1204 /* Handle literal characters */
1205
1206 case OP_CHAR:
1207 case OP_CHARNC:
1208 case OP_NOT:
1209 branchlength++;
1210 cc += 2;
1211 #ifdef SUPPORT_UTF8
1212 if ((options & PCRE_UTF8) != 0)
1213 {
1214 while ((*cc & 0xc0) == 0x80) cc++;
1215 }
1216 #endif
1217 break;
1218
1219 /* Handle exact repetitions. The count is already in characters, but we
1220 need to skip over a multibyte character in UTF8 mode. */
1221
1222 case OP_EXACT:
1223 branchlength += GET2(cc,1);
1224 cc += 4;
1225 #ifdef SUPPORT_UTF8
1226 if ((options & PCRE_UTF8) != 0)
1227 {
1228 while((*cc & 0x80) == 0x80) cc++;
1229 }
1230 #endif
1231 break;
1232
1233 case OP_TYPEEXACT:
1234 branchlength += GET2(cc,1);
1235 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1236 cc += 4;
1237 break;
1238
1239 /* Handle single-char matchers */
1240
1241 case OP_PROP:
1242 case OP_NOTPROP:
1243 cc += 2;
1244 /* Fall through */
1245
1246 case OP_NOT_DIGIT:
1247 case OP_DIGIT:
1248 case OP_NOT_WHITESPACE:
1249 case OP_WHITESPACE:
1250 case OP_NOT_WORDCHAR:
1251 case OP_WORDCHAR:
1252 case OP_ANY:
1253 branchlength++;
1254 cc++;
1255 break;
1256
1257 /* The single-byte matcher isn't allowed */
1258
1259 case OP_ANYBYTE:
1260 return -2;
1261
1262 /* Check a class for variable quantification */
1263
1264 #ifdef SUPPORT_UTF8
1265 case OP_XCLASS:
1266 cc += GET(cc, 1) - 33;
1267 /* Fall through */
1268 #endif
1269
1270 case OP_CLASS:
1271 case OP_NCLASS:
1272 cc += 33;
1273
1274 switch (*cc)
1275 {
1276 case OP_CRSTAR:
1277 case OP_CRMINSTAR:
1278 case OP_CRQUERY:
1279 case OP_CRMINQUERY:
1280 return -1;
1281
1282 case OP_CRRANGE:
1283 case OP_CRMINRANGE:
1284 if (GET2(cc,1) != GET2(cc,3)) return -1;
1285 branchlength += GET2(cc,1);
1286 cc += 5;
1287 break;
1288
1289 default:
1290 branchlength++;
1291 }
1292 break;
1293
1294 /* Anything else is variable length */
1295
1296 default:
1297 return -1;
1298 }
1299 }
1300 /* Control never gets here */
1301 }
1302
1303
1304
1305
1306 /*************************************************
1307 * Scan compiled regex for numbered bracket *
1308 *************************************************/
1309
1310 /* This little function scans through a compiled pattern until it finds a
1311 capturing bracket with the given number.
1312
1313 Arguments:
1314 code points to start of expression
1315 utf8 TRUE in UTF-8 mode
1316 number the required bracket number
1317
1318 Returns: pointer to the opcode for the bracket, or NULL if not found
1319 */
1320
1321 static const uschar *
1322 find_bracket(const uschar *code, BOOL utf8, int number)
1323 {
1324 for (;;)
1325 {
1326 register int c = *code;
1327 if (c == OP_END) return NULL;
1328
1329 /* XCLASS is used for classes that cannot be represented just by a bit
1330 map. This includes negated single high-valued characters. The length in
1331 the table is zero; the actual length is stored in the compiled code. */
1332
1333 if (c == OP_XCLASS) code += GET(code, 1);
1334
1335 /* Handle capturing bracket */
1336
1337 else if (c == OP_CBRA)
1338 {
1339 int n = GET2(code, 1+LINK_SIZE);
1340 if (n == number) return (uschar *)code;
1341 code += _pcre_OP_lengths[c];
1342 }
1343
1344 /* Otherwise, we can get the item's length from the table, except that for
1345 repeated character types, we have to test for \p and \P, which have an extra
1346 two bytes of parameters. */
1347
1348 else
1349 {
1350 switch(c)
1351 {
1352 case OP_TYPESTAR:
1353 case OP_TYPEMINSTAR:
1354 case OP_TYPEPLUS:
1355 case OP_TYPEMINPLUS:
1356 case OP_TYPEQUERY:
1357 case OP_TYPEMINQUERY:
1358 case OP_TYPEPOSSTAR:
1359 case OP_TYPEPOSPLUS:
1360 case OP_TYPEPOSQUERY:
1361 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1362 break;
1363
1364 case OP_TYPEUPTO:
1365 case OP_TYPEMINUPTO:
1366 case OP_TYPEEXACT:
1367 case OP_TYPEPOSUPTO:
1368 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1369 break;
1370 }
1371
1372 /* Add in the fixed length from the table */
1373
1374 code += _pcre_OP_lengths[c];
1375
1376 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1377 a multi-byte character. The length in the table is a minimum, so we have to
1378 arrange to skip the extra bytes. */
1379
1380 #ifdef SUPPORT_UTF8
1381 if (utf8) switch(c)
1382 {
1383 case OP_CHAR:
1384 case OP_CHARNC:
1385 case OP_EXACT:
1386 case OP_UPTO:
1387 case OP_MINUPTO:
1388 case OP_POSUPTO:
1389 case OP_STAR:
1390 case OP_MINSTAR:
1391 case OP_POSSTAR:
1392 case OP_PLUS:
1393 case OP_MINPLUS:
1394 case OP_POSPLUS:
1395 case OP_QUERY:
1396 case OP_MINQUERY:
1397 case OP_POSQUERY:
1398 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1399 break;
1400 }
1401 #endif
1402 }
1403 }
1404 }
1405
1406
1407
1408 /*************************************************
1409 * Scan compiled regex for recursion reference *
1410 *************************************************/
1411
1412 /* This little function scans through a compiled pattern until it finds an
1413 instance of OP_RECURSE.
1414
1415 Arguments:
1416 code points to start of expression
1417 utf8 TRUE in UTF-8 mode
1418
1419 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1420 */
1421
1422 static const uschar *
1423 find_recurse(const uschar *code, BOOL utf8)
1424 {
1425 for (;;)
1426 {
1427 register int c = *code;
1428 if (c == OP_END) return NULL;
1429 if (c == OP_RECURSE) return code;
1430
1431 /* XCLASS is used for classes that cannot be represented just by a bit
1432 map. This includes negated single high-valued characters. The length in
1433 the table is zero; the actual length is stored in the compiled code. */
1434
1435 if (c == OP_XCLASS) code += GET(code, 1);
1436
1437 /* Otherwise, we can get the item's length from the table, except that for
1438 repeated character types, we have to test for \p and \P, which have an extra
1439 two bytes of parameters. */
1440
1441 else
1442 {
1443 switch(c)
1444 {
1445 case OP_TYPESTAR:
1446 case OP_TYPEMINSTAR:
1447 case OP_TYPEPLUS:
1448 case OP_TYPEMINPLUS:
1449 case OP_TYPEQUERY:
1450 case OP_TYPEMINQUERY:
1451 case OP_TYPEPOSSTAR:
1452 case OP_TYPEPOSPLUS:
1453 case OP_TYPEPOSQUERY:
1454 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1455 break;
1456
1457 case OP_TYPEPOSUPTO:
1458 case OP_TYPEUPTO:
1459 case OP_TYPEMINUPTO:
1460 case OP_TYPEEXACT:
1461 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1462 break;
1463 }
1464
1465 /* Add in the fixed length from the table */
1466
1467 code += _pcre_OP_lengths[c];
1468
1469 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1470 by a multi-byte character. The length in the table is a minimum, so we have
1471 to arrange to skip the extra bytes. */
1472
1473 #ifdef SUPPORT_UTF8
1474 if (utf8) switch(c)
1475 {
1476 case OP_CHAR:
1477 case OP_CHARNC:
1478 case OP_EXACT:
1479 case OP_UPTO:
1480 case OP_MINUPTO:
1481 case OP_POSUPTO:
1482 case OP_STAR:
1483 case OP_MINSTAR:
1484 case OP_POSSTAR:
1485 case OP_PLUS:
1486 case OP_MINPLUS:
1487 case OP_POSPLUS:
1488 case OP_QUERY:
1489 case OP_MINQUERY:
1490 case OP_POSQUERY:
1491 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1492 break;
1493 }
1494 #endif
1495 }
1496 }
1497 }
1498
1499
1500
1501 /*************************************************
1502 * Scan compiled branch for non-emptiness *
1503 *************************************************/
1504
1505 /* This function scans through a branch of a compiled pattern to see whether it
1506 can match the empty string or not. It is called from could_be_empty()
1507 below and from compile_branch() when checking for an unlimited repeat of a
1508 group that can match nothing. Note that first_significant_code() skips over
1509 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1510 struck an inner bracket whose current branch will already have been scanned.
1511
1512 Arguments:
1513 code points to start of search
1514 endcode points to where to stop
1515 utf8 TRUE if in UTF8 mode
1516
1517 Returns: TRUE if what is matched could be empty
1518 */
1519
1520 static BOOL
1521 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1522 {
1523 register int c;
1524 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1525 code < endcode;
1526 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1527 {
1528 const uschar *ccode;
1529
1530 c = *code;
1531
1532 /* Groups with zero repeats can of course be empty; skip them. */
1533
1534 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1535 {
1536 code += _pcre_OP_lengths[c];
1537 do code += GET(code, 1); while (*code == OP_ALT);
1538 c = *code;
1539 continue;
1540 }
1541
1542 /* For other groups, scan the branches. */
1543
1544 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1545 {
1546 BOOL empty_branch;
1547 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1548
1549 /* Scan a closed bracket */
1550
1551 empty_branch = FALSE;
1552 do
1553 {
1554 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1555 empty_branch = TRUE;
1556 code += GET(code, 1);
1557 }
1558 while (*code == OP_ALT);
1559 if (!empty_branch) return FALSE; /* All branches are non-empty */
1560 c = *code;
1561 continue;
1562 }
1563
1564 /* Handle the other opcodes */
1565
1566 switch (c)
1567 {
1568 /* Check for quantifiers after a class. XCLASS is used for classes that
1569 cannot be represented just by a bit map. This includes negated single
1570 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1571 actual length is stored in the compiled code, so we must update "code"
1572 here. */
1573
1574 #ifdef SUPPORT_UTF8
1575 case OP_XCLASS:
1576 ccode = code += GET(code, 1);
1577 goto CHECK_CLASS_REPEAT;
1578 #endif
1579
1580 case OP_CLASS:
1581 case OP_NCLASS:
1582 ccode = code + 33;
1583
1584 #ifdef SUPPORT_UTF8
1585 CHECK_CLASS_REPEAT:
1586 #endif
1587
1588 switch (*ccode)
1589 {
1590 case OP_CRSTAR: /* These could be empty; continue */
1591 case OP_CRMINSTAR:
1592 case OP_CRQUERY:
1593 case OP_CRMINQUERY:
1594 break;
1595
1596 default: /* Non-repeat => class must match */
1597 case OP_CRPLUS: /* These repeats aren't empty */
1598 case OP_CRMINPLUS:
1599 return FALSE;
1600
1601 case OP_CRRANGE:
1602 case OP_CRMINRANGE:
1603 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1604 break;
1605 }
1606 break;
1607
1608 /* Opcodes that must match a character */
1609
1610 case OP_PROP:
1611 case OP_NOTPROP:
1612 case OP_EXTUNI:
1613 case OP_NOT_DIGIT:
1614 case OP_DIGIT:
1615 case OP_NOT_WHITESPACE:
1616 case OP_WHITESPACE:
1617 case OP_NOT_WORDCHAR:
1618 case OP_WORDCHAR:
1619 case OP_ANY:
1620 case OP_ANYBYTE:
1621 case OP_CHAR:
1622 case OP_CHARNC:
1623 case OP_NOT:
1624 case OP_PLUS:
1625 case OP_MINPLUS:
1626 case OP_POSPLUS:
1627 case OP_EXACT:
1628 case OP_NOTPLUS:
1629 case OP_NOTMINPLUS:
1630 case OP_NOTPOSPLUS:
1631 case OP_NOTEXACT:
1632 case OP_TYPEPLUS:
1633 case OP_TYPEMINPLUS:
1634 case OP_TYPEPOSPLUS:
1635 case OP_TYPEEXACT:
1636 return FALSE;
1637
1638 /* These are going to continue, as they may be empty, but we have to
1639 fudge the length for the \p and \P cases. */
1640
1641 case OP_TYPESTAR:
1642 case OP_TYPEMINSTAR:
1643 case OP_TYPEPOSSTAR:
1644 case OP_TYPEQUERY:
1645 case OP_TYPEMINQUERY:
1646 case OP_TYPEPOSQUERY:
1647 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1648 break;
1649
1650 /* Same for these */
1651
1652 case OP_TYPEUPTO:
1653 case OP_TYPEMINUPTO:
1654 case OP_TYPEPOSUPTO:
1655 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1656 break;
1657
1658 /* End of branch */
1659
1660 case OP_KET:
1661 case OP_KETRMAX:
1662 case OP_KETRMIN:
1663 case OP_ALT:
1664 return TRUE;
1665
1666 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1667 MINUPTO, and POSUPTO may be followed by a multibyte character */
1668
1669 #ifdef SUPPORT_UTF8
1670 case OP_STAR:
1671 case OP_MINSTAR:
1672 case OP_POSSTAR:
1673 case OP_QUERY:
1674 case OP_MINQUERY:
1675 case OP_POSQUERY:
1676 case OP_UPTO:
1677 case OP_MINUPTO:
1678 case OP_POSUPTO:
1679 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1680 break;
1681 #endif
1682 }
1683 }
1684
1685 return TRUE;
1686 }
1687
1688
1689
1690 /*************************************************
1691 * Scan compiled regex for non-emptiness *
1692 *************************************************/
1693
1694 /* This function is called to check for left recursive calls. We want to check
1695 the current branch of the current pattern to see if it could match the empty
1696 string. If it could, we must look outwards for branches at other levels,
1697 stopping when we pass beyond the bracket which is the subject of the recursion.
1698
1699 Arguments:
1700 code points to start of the recursion
1701 endcode points to where to stop (current RECURSE item)
1702 bcptr points to the chain of current (unclosed) branch starts
1703 utf8 TRUE if in UTF-8 mode
1704
1705 Returns: TRUE if what is matched could be empty
1706 */
1707
1708 static BOOL
1709 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1710 BOOL utf8)
1711 {
1712 while (bcptr != NULL && bcptr->current >= code)
1713 {
1714 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1715 bcptr = bcptr->outer;
1716 }
1717 return TRUE;
1718 }
1719
1720
1721
1722 /*************************************************
1723 * Check for POSIX class syntax *
1724 *************************************************/
1725
1726 /* This function is called when the sequence "[:" or "[." or "[=" is
1727 encountered in a character class. It checks whether this is followed by an
1728 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1729 ".]" or "=]".
1730
1731 Argument:
1732 ptr pointer to the initial [
1733 endptr where to return the end pointer
1734 cd pointer to compile data
1735
1736 Returns: TRUE or FALSE
1737 */
1738
1739 static BOOL
1740 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1741 {
1742 int terminator; /* Don't combine these lines; the Solaris cc */
1743 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1744 if (*(++ptr) == '^') ptr++;
1745 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1746 if (*ptr == terminator && ptr[1] == ']')
1747 {
1748 *endptr = ptr;
1749 return TRUE;
1750 }
1751 return FALSE;
1752 }
1753
1754
1755
1756
1757 /*************************************************
1758 * Check POSIX class name *
1759 *************************************************/
1760
1761 /* This function is called to check the name given in a POSIX-style class entry
1762 such as [:alnum:].
1763
1764 Arguments:
1765 ptr points to the first letter
1766 len the length of the name
1767
1768 Returns: a value representing the name, or -1 if unknown
1769 */
1770
1771 static int
1772 check_posix_name(const uschar *ptr, int len)
1773 {
1774 const char *pn = posix_names;
1775 register int yield = 0;
1776 while (posix_name_lengths[yield] != 0)
1777 {
1778 if (len == posix_name_lengths[yield] &&
1779 strncmp((const char *)ptr, pn, len) == 0) return yield;
1780 pn += posix_name_lengths[yield] + 1;
1781 yield++;
1782 }
1783 return -1;
1784 }
1785
1786
1787 /*************************************************
1788 * Adjust OP_RECURSE items in repeated group *
1789 *************************************************/
1790
1791 /* OP_RECURSE items contain an offset from the start of the regex to the group
1792 that is referenced. This means that groups can be replicated for fixed
1793 repetition simply by copying (because the recursion is allowed to refer to
1794 earlier groups that are outside the current group). However, when a group is
1795 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1796 it, after it has been compiled. This means that any OP_RECURSE items within it
1797 that refer to the group itself or any contained groups have to have their
1798 offsets adjusted. That is one of the jobs of this function. Before it is called,
1799 the partially compiled regex must be temporarily terminated with OP_END.
1800
1801 This function has been extended with the possibility of forward references for
1802 recursions and subroutine calls. It must also check the list of such references
1803 for the group we are dealing with. If it finds that one of the recursions in
1804 the current group is on this list, it adjusts the offset in the list, not the
1805 value in the reference (which is a group number).
1806
1807 Arguments:
1808 group points to the start of the group
1809 adjust the amount by which the group is to be moved
1810 utf8 TRUE in UTF-8 mode
1811 cd contains pointers to tables etc.
1812 save_hwm the hwm forward reference pointer at the start of the group
1813
1814 Returns: nothing
1815 */
1816
1817 static void
1818 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1819 uschar *save_hwm)
1820 {
1821 uschar *ptr = group;
1822
1823 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1824 {
1825 int offset;
1826 uschar *hc;
1827
1828 /* See if this recursion is on the forward reference list. If so, adjust the
1829 reference. */
1830
1831 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1832 {
1833 offset = GET(hc, 0);
1834 if (cd->start_code + offset == ptr + 1)
1835 {
1836 PUT(hc, 0, offset + adjust);
1837 break;
1838 }
1839 }
1840
1841 /* Otherwise, adjust the recursion offset if it's after the start of this
1842 group. */
1843
1844 if (hc >= cd->hwm)
1845 {
1846 offset = GET(ptr, 1);
1847 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1848 }
1849
1850 ptr += 1 + LINK_SIZE;
1851 }
1852 }
1853
1854
1855
1856 /*************************************************
1857 * Insert an automatic callout point *
1858 *************************************************/
1859
1860 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1861 callout points before each pattern item.
1862
1863 Arguments:
1864 code current code pointer
1865 ptr current pattern pointer
1866 cd pointers to tables etc
1867
1868 Returns: new code pointer
1869 */
1870
1871 static uschar *
1872 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1873 {
1874 *code++ = OP_CALLOUT;
1875 *code++ = 255;
1876 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1877 PUT(code, LINK_SIZE, 0); /* Default length */
1878 return code + 2*LINK_SIZE;
1879 }
1880
1881
1882
1883 /*************************************************
1884 * Complete a callout item *
1885 *************************************************/
1886
1887 /* A callout item contains the length of the next item in the pattern, which
1888 we can't fill in till after we have reached the relevant point. This is used
1889 for both automatic and manual callouts.
1890
1891 Arguments:
1892 previous_callout points to previous callout item
1893 ptr current pattern pointer
1894 cd pointers to tables etc
1895
1896 Returns: nothing
1897 */
1898
1899 static void
1900 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1901 {
1902 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1903 PUT(previous_callout, 2 + LINK_SIZE, length);
1904 }
1905
1906
1907
1908 #ifdef SUPPORT_UCP
1909 /*************************************************
1910 * Get othercase range *
1911 *************************************************/
1912
1913 /* This function is passed the start and end of a class range, in UTF-8 mode
1914 with UCP support. It searches up the characters, looking for internal ranges of
1915 characters in the "other" case. Each call returns the next one, updating the
1916 start address.
1917
1918 Arguments:
1919 cptr points to starting character value; updated
1920 d end value
1921 ocptr where to put start of othercase range
1922 odptr where to put end of othercase range
1923
1924 Yield: TRUE when range returned; FALSE when no more
1925 */
1926
1927 static BOOL
1928 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1929 unsigned int *odptr)
1930 {
1931 unsigned int c, othercase, next;
1932
1933 for (c = *cptr; c <= d; c++)
1934 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1935
1936 if (c > d) return FALSE;
1937
1938 *ocptr = othercase;
1939 next = othercase + 1;
1940
1941 for (++c; c <= d; c++)
1942 {
1943 if (_pcre_ucp_othercase(c) != next) break;
1944 next++;
1945 }
1946
1947 *odptr = next - 1;
1948 *cptr = c;
1949
1950 return TRUE;
1951 }
1952 #endif /* SUPPORT_UCP */
1953
1954
1955
1956 /*************************************************
1957 * Check if auto-possessifying is possible *
1958 *************************************************/
1959
1960 /* This function is called for unlimited repeats of certain items, to see
1961 whether the next thing could possibly match the repeated item. If not, it makes
1962 sense to automatically possessify the repeated item.
1963
1964 Arguments:
1965 op_code the repeated op code
1966 this data for this item, depends on the opcode
1967 utf8 TRUE in UTF-8 mode
1968 utf8_char used for utf8 character bytes, NULL if not relevant
1969 ptr next character in pattern
1970 options options bits
1971 cd contains pointers to tables etc.
1972
1973 Returns: TRUE if possessifying is wanted
1974 */
1975
1976 static BOOL
1977 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1978 const uschar *ptr, int options, compile_data *cd)
1979 {
1980 int next;
1981
1982 /* Skip whitespace and comments in extended mode */
1983
1984 if ((options & PCRE_EXTENDED) != 0)
1985 {
1986 for (;;)
1987 {
1988 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1989 if (*ptr == '#')
1990 {
1991 while (*(++ptr) != 0)
1992 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1993 }
1994 else break;
1995 }
1996 }
1997
1998 /* If the next item is one that we can handle, get its value. A non-negative
1999 value is a character, a negative value is an escape value. */
2000
2001 if (*ptr == '\\')
2002 {
2003 int temperrorcode = 0;
2004 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2005 if (temperrorcode != 0) return FALSE;
2006 ptr++; /* Point after the escape sequence */
2007 }
2008
2009 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2010 {
2011 #ifdef SUPPORT_UTF8
2012 if (utf8) { GETCHARINC(next, ptr); } else
2013 #endif
2014 next = *ptr++;
2015 }
2016
2017 else return FALSE;
2018
2019 /* Skip whitespace and comments in extended mode */
2020
2021 if ((options & PCRE_EXTENDED) != 0)
2022 {
2023 for (;;)
2024 {
2025 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2026 if (*ptr == '#')
2027 {
2028 while (*(++ptr) != 0)
2029 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2030 }
2031 else break;
2032 }
2033 }
2034
2035 /* If the next thing is itself optional, we have to give up. */
2036
2037 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2038 return FALSE;
2039
2040 /* Now compare the next item with the previous opcode. If the previous is a
2041 positive single character match, "item" either contains the character or, if
2042 "item" is greater than 127 in utf8 mode, the character's bytes are in
2043 utf8_char. */
2044
2045
2046 /* Handle cases when the next item is a character. */
2047
2048 if (next >= 0) switch(op_code)
2049 {
2050 case OP_CHAR:
2051 #ifdef SUPPORT_UTF8
2052 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2053 #endif
2054 return item != next;
2055
2056 /* For CHARNC (caseless character) we must check the other case. If we have
2057 Unicode property support, we can use it to test the other case of
2058 high-valued characters. */
2059
2060 case OP_CHARNC:
2061 #ifdef SUPPORT_UTF8
2062 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2063 #endif
2064 if (item == next) return FALSE;
2065 #ifdef SUPPORT_UTF8
2066 if (utf8)
2067 {
2068 unsigned int othercase;
2069 if (next < 128) othercase = cd->fcc[next]; else
2070 #ifdef SUPPORT_UCP
2071 othercase = _pcre_ucp_othercase((unsigned int)next);
2072 #else
2073 othercase = NOTACHAR;
2074 #endif
2075 return (unsigned int)item != othercase;
2076 }
2077 else
2078 #endif /* SUPPORT_UTF8 */
2079 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2080
2081 /* For OP_NOT, "item" must be a single-byte character. */
2082
2083 case OP_NOT:
2084 if (next < 0) return FALSE; /* Not a character */
2085 if (item == next) return TRUE;
2086 if ((options & PCRE_CASELESS) == 0) return FALSE;
2087 #ifdef SUPPORT_UTF8
2088 if (utf8)
2089 {
2090 unsigned int othercase;
2091 if (next < 128) othercase = cd->fcc[next]; else
2092 #ifdef SUPPORT_UCP
2093 othercase = _pcre_ucp_othercase(next);
2094 #else
2095 othercase = NOTACHAR;
2096 #endif
2097 return (unsigned int)item == othercase;
2098 }
2099 else
2100 #endif /* SUPPORT_UTF8 */
2101 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2102
2103 case OP_DIGIT:
2104 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2105
2106 case OP_NOT_DIGIT:
2107 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2108
2109 case OP_WHITESPACE:
2110 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2111
2112 case OP_NOT_WHITESPACE:
2113 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2114
2115 case OP_WORDCHAR:
2116 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2117
2118 case OP_NOT_WORDCHAR:
2119 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2120
2121 case OP_HSPACE:
2122 case OP_NOT_HSPACE:
2123 switch(next)
2124 {
2125 case 0x09:
2126 case 0x20:
2127 case 0xa0:
2128 case 0x1680:
2129 case 0x180e:
2130 case 0x2000:
2131 case 0x2001:
2132 case 0x2002:
2133 case 0x2003:
2134 case 0x2004:
2135 case 0x2005:
2136 case 0x2006:
2137 case 0x2007:
2138 case 0x2008:
2139 case 0x2009:
2140 case 0x200A:
2141 case 0x202f:
2142 case 0x205f:
2143 case 0x3000:
2144 return op_code != OP_HSPACE;
2145 default:
2146 return op_code == OP_HSPACE;
2147 }
2148
2149 case OP_VSPACE:
2150 case OP_NOT_VSPACE:
2151 switch(next)
2152 {
2153 case 0x0a:
2154 case 0x0b:
2155 case 0x0c:
2156 case 0x0d:
2157 case 0x85:
2158 case 0x2028:
2159 case 0x2029:
2160 return op_code != OP_VSPACE;
2161 default:
2162 return op_code == OP_VSPACE;
2163 }
2164
2165 default:
2166 return FALSE;
2167 }
2168
2169
2170 /* Handle the case when the next item is \d, \s, etc. */
2171
2172 switch(op_code)
2173 {
2174 case OP_CHAR:
2175 case OP_CHARNC:
2176 #ifdef SUPPORT_UTF8
2177 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2178 #endif
2179 switch(-next)
2180 {
2181 case ESC_d:
2182 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2183
2184 case ESC_D:
2185 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2186
2187 case ESC_s:
2188 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2189
2190 case ESC_S:
2191 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2192
2193 case ESC_w:
2194 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2195
2196 case ESC_W:
2197 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2198
2199 case ESC_h:
2200 case ESC_H:
2201 switch(item)
2202 {
2203 case 0x09:
2204 case 0x20:
2205 case 0xa0:
2206 case 0x1680:
2207 case 0x180e:
2208 case 0x2000:
2209 case 0x2001:
2210 case 0x2002:
2211 case 0x2003:
2212 case 0x2004:
2213 case 0x2005:
2214 case 0x2006:
2215 case 0x2007:
2216 case 0x2008:
2217 case 0x2009:
2218 case 0x200A:
2219 case 0x202f:
2220 case 0x205f:
2221 case 0x3000:
2222 return -next != ESC_h;
2223 default:
2224 return -next == ESC_h;
2225 }
2226
2227 case ESC_v:
2228 case ESC_V:
2229 switch(item)
2230 {
2231 case 0x0a:
2232 case 0x0b:
2233 case 0x0c:
2234 case 0x0d:
2235 case 0x85:
2236 case 0x2028:
2237 case 0x2029:
2238 return -next != ESC_v;
2239 default:
2240 return -next == ESC_v;
2241 }
2242
2243 default:
2244 return FALSE;
2245 }
2246
2247 case OP_DIGIT:
2248 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2249 next == -ESC_h || next == -ESC_v;
2250
2251 case OP_NOT_DIGIT:
2252 return next == -ESC_d;
2253
2254 case OP_WHITESPACE:
2255 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2256
2257 case OP_NOT_WHITESPACE:
2258 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2259
2260 case OP_HSPACE:
2261 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2262
2263 case OP_NOT_HSPACE:
2264 return next == -ESC_h;
2265
2266 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2267 case OP_VSPACE:
2268 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2269
2270 case OP_NOT_VSPACE:
2271 return next == -ESC_v;
2272
2273 case OP_WORDCHAR:
2274 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2275
2276 case OP_NOT_WORDCHAR:
2277 return next == -ESC_w || next == -ESC_d;
2278
2279 default:
2280 return FALSE;
2281 }
2282
2283 /* Control does not reach here */
2284 }
2285
2286
2287
2288 /*************************************************
2289 * Compile one branch *
2290 *************************************************/
2291
2292 /* Scan the pattern, compiling it into a vector. If the options are
2293 changed during the branch, the pointer is used to change the external options
2294 bits. This function is used during the pre-compile phase when we are trying
2295 to find out the amount of memory needed, as well as during the real compile
2296 phase. The value of lengthptr distinguishes the two phases.
2297
2298 Arguments:
2299 optionsptr pointer to the option bits
2300 codeptr points to the pointer to the current code point
2301 ptrptr points to the current pattern pointer
2302 errorcodeptr points to error code variable
2303 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2304 reqbyteptr set to the last literal character required, else < 0
2305 bcptr points to current branch chain
2306 cd contains pointers to tables etc.
2307 lengthptr NULL during the real compile phase
2308 points to length accumulator during pre-compile phase
2309
2310 Returns: TRUE on success
2311 FALSE, with *errorcodeptr set non-zero on error
2312 */
2313
2314 static BOOL
2315 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2316 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2317 compile_data *cd, int *lengthptr)
2318 {
2319 int repeat_type, op_type;
2320 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2321 int bravalue = 0;
2322 int greedy_default, greedy_non_default;
2323 int firstbyte, reqbyte;
2324 int zeroreqbyte, zerofirstbyte;
2325 int req_caseopt, reqvary, tempreqvary;
2326 int options = *optionsptr;
2327 int after_manual_callout = 0;
2328 int length_prevgroup = 0;
2329 register int c;
2330 register uschar *code = *codeptr;
2331 uschar *last_code = code;
2332 uschar *orig_code = code;
2333 uschar *tempcode;
2334 BOOL inescq = FALSE;
2335 BOOL groupsetfirstbyte = FALSE;
2336 const uschar *ptr = *ptrptr;
2337 const uschar *tempptr;
2338 uschar *previous = NULL;
2339 uschar *previous_callout = NULL;
2340 uschar *save_hwm = NULL;
2341 uschar classbits[32];
2342
2343 #ifdef SUPPORT_UTF8
2344 BOOL class_utf8;
2345 BOOL utf8 = (options & PCRE_UTF8) != 0;
2346 uschar *class_utf8data;
2347 uschar utf8_char[6];
2348 #else
2349 BOOL utf8 = FALSE;
2350 uschar *utf8_char = NULL;
2351 #endif
2352
2353 #ifdef DEBUG
2354 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2355 #endif
2356
2357 /* Set up the default and non-default settings for greediness */
2358
2359 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2360 greedy_non_default = greedy_default ^ 1;
2361
2362 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2363 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2364 matches a non-fixed char first char; reqbyte just remains unset if we never
2365 find one.
2366
2367 When we hit a repeat whose minimum is zero, we may have to adjust these values
2368 to take the zero repeat into account. This is implemented by setting them to
2369 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2370 item types that can be repeated set these backoff variables appropriately. */
2371
2372 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2373
2374 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2375 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2376 value > 255. It is added into the firstbyte or reqbyte variables to record the
2377 case status of the value. This is used only for ASCII characters. */
2378
2379 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2380
2381 /* Switch on next character until the end of the branch */
2382
2383 for (;; ptr++)
2384 {
2385 BOOL negate_class;
2386 BOOL should_flip_negation;
2387 BOOL possessive_quantifier;
2388 BOOL is_quantifier;
2389 BOOL is_recurse;
2390 BOOL reset_bracount;
2391 int class_charcount;
2392 int class_lastchar;
2393 int newoptions;
2394 int recno;
2395 int refsign;
2396 int skipbytes;
2397 int subreqbyte;
2398 int subfirstbyte;
2399 int terminator;
2400 int mclength;
2401 uschar mcbuffer[8];
2402
2403 /* Get next byte in the pattern */
2404
2405 c = *ptr;
2406
2407 /* If we are in the pre-compile phase, accumulate the length used for the
2408 previous cycle of this loop. */
2409
2410 if (lengthptr != NULL)
2411 {
2412 #ifdef DEBUG
2413 if (code > cd->hwm) cd->hwm = code; /* High water info */
2414 #endif
2415 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2416 {
2417 *errorcodeptr = ERR52;
2418 goto FAILED;
2419 }
2420
2421 /* There is at least one situation where code goes backwards: this is the
2422 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2423 the class is simply eliminated. However, it is created first, so we have to
2424 allow memory for it. Therefore, don't ever reduce the length at this point.
2425 */
2426
2427 if (code < last_code) code = last_code;
2428
2429 /* Paranoid check for integer overflow */
2430
2431 if (OFLOW_MAX - *lengthptr < code - last_code)
2432 {
2433 *errorcodeptr = ERR20;
2434 goto FAILED;
2435 }
2436
2437 *lengthptr += code - last_code;
2438 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2439
2440 /* If "previous" is set and it is not at the start of the work space, move
2441 it back to there, in order to avoid filling up the work space. Otherwise,
2442 if "previous" is NULL, reset the current code pointer to the start. */
2443
2444 if (previous != NULL)
2445 {
2446 if (previous > orig_code)
2447 {
2448 memmove(orig_code, previous, code - previous);
2449 code -= previous - orig_code;
2450 previous = orig_code;
2451 }
2452 }
2453 else code = orig_code;
2454
2455 /* Remember where this code item starts so we can pick up the length
2456 next time round. */
2457
2458 last_code = code;
2459 }
2460
2461 /* In the real compile phase, just check the workspace used by the forward
2462 reference list. */
2463
2464 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2465 {
2466 *errorcodeptr = ERR52;
2467 goto FAILED;
2468 }
2469
2470 /* If in \Q...\E, check for the end; if not, we have a literal */
2471
2472 if (inescq && c != 0)
2473 {
2474 if (c == '\\' && ptr[1] == 'E')
2475 {
2476 inescq = FALSE;
2477 ptr++;
2478 continue;
2479 }
2480 else
2481 {
2482 if (previous_callout != NULL)
2483 {
2484 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2485 complete_callout(previous_callout, ptr, cd);
2486 previous_callout = NULL;
2487 }
2488 if ((options & PCRE_AUTO_CALLOUT) != 0)
2489 {
2490 previous_callout = code;
2491 code = auto_callout(code, ptr, cd);
2492 }
2493 goto NORMAL_CHAR;
2494 }
2495 }
2496
2497 /* Fill in length of a previous callout, except when the next thing is
2498 a quantifier. */
2499
2500 is_quantifier = c == '*' || c == '+' || c == '?' ||
2501 (c == '{' && is_counted_repeat(ptr+1));
2502
2503 if (!is_quantifier && previous_callout != NULL &&
2504 after_manual_callout-- <= 0)
2505 {
2506 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2507 complete_callout(previous_callout, ptr, cd);
2508 previous_callout = NULL;
2509 }
2510
2511 /* In extended mode, skip white space and comments */
2512
2513 if ((options & PCRE_EXTENDED) != 0)
2514 {
2515 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2516 if (c == '#')
2517 {
2518 while (*(++ptr) != 0)
2519 {
2520 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2521 }
2522 if (*ptr != 0) continue;
2523
2524 /* Else fall through to handle end of string */
2525 c = 0;
2526 }
2527 }
2528
2529 /* No auto callout for quantifiers. */
2530
2531 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2532 {
2533 previous_callout = code;
2534 code = auto_callout(code, ptr, cd);
2535 }
2536
2537 switch(c)
2538 {
2539 /* ===================================================================*/
2540 case 0: /* The branch terminates at string end */
2541 case '|': /* or | or ) */
2542 case ')':
2543 *firstbyteptr = firstbyte;
2544 *reqbyteptr = reqbyte;
2545 *codeptr = code;
2546 *ptrptr = ptr;
2547 if (lengthptr != NULL)
2548 {
2549 if (OFLOW_MAX - *lengthptr < code - last_code)
2550 {
2551 *errorcodeptr = ERR20;
2552 goto FAILED;
2553 }
2554 *lengthptr += code - last_code; /* To include callout length */
2555 DPRINTF((">> end branch\n"));
2556 }
2557 return TRUE;
2558
2559
2560 /* ===================================================================*/
2561 /* Handle single-character metacharacters. In multiline mode, ^ disables
2562 the setting of any following char as a first character. */
2563
2564 case '^':
2565 if ((options & PCRE_MULTILINE) != 0)
2566 {
2567 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2568 }
2569 previous = NULL;
2570 *code++ = OP_CIRC;
2571 break;
2572
2573 case '$':
2574 previous = NULL;
2575 *code++ = OP_DOLL;
2576 break;
2577
2578 /* There can never be a first char if '.' is first, whatever happens about
2579 repeats. The value of reqbyte doesn't change either. */
2580
2581 case '.':
2582 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2583 zerofirstbyte = firstbyte;
2584 zeroreqbyte = reqbyte;
2585 previous = code;
2586 *code++ = OP_ANY;
2587 break;
2588
2589
2590 /* ===================================================================*/
2591 /* Character classes. If the included characters are all < 256, we build a
2592 32-byte bitmap of the permitted characters, except in the special case
2593 where there is only one such character. For negated classes, we build the
2594 map as usual, then invert it at the end. However, we use a different opcode
2595 so that data characters > 255 can be handled correctly.
2596
2597 If the class contains characters outside the 0-255 range, a different
2598 opcode is compiled. It may optionally have a bit map for characters < 256,
2599 but those above are explicitly listed afterwards. A flag byte tells
2600 whether the bitmap is present, and whether this is a negated class or not.
2601 */
2602
2603 case '[':
2604 previous = code;
2605
2606 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2607 they are encountered at the top level, so we'll do that too. */
2608
2609 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2610 check_posix_syntax(ptr, &tempptr, cd))
2611 {
2612 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2613 goto FAILED;
2614 }
2615
2616 /* If the first character is '^', set the negation flag and skip it. Also,
2617 if the first few characters (either before or after ^) are \Q\E or \E we
2618 skip them too. This makes for compatibility with Perl. */
2619
2620 negate_class = FALSE;
2621 for (;;)
2622 {
2623 c = *(++ptr);
2624 if (c == '\\')
2625 {
2626 if (ptr[1] == 'E') ptr++;
2627 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2628 else break;
2629 }
2630 else if (!negate_class && c == '^')
2631 negate_class = TRUE;
2632 else break;
2633 }
2634
2635 /* If a class contains a negative special such as \S, we need to flip the
2636 negation flag at the end, so that support for characters > 255 works
2637 correctly (they are all included in the class). */
2638
2639 should_flip_negation = FALSE;
2640
2641 /* Keep a count of chars with values < 256 so that we can optimize the case
2642 of just a single character (as long as it's < 256). However, for higher
2643 valued UTF-8 characters, we don't yet do any optimization. */
2644
2645 class_charcount = 0;
2646 class_lastchar = -1;
2647
2648 /* Initialize the 32-char bit map to all zeros. We build the map in a
2649 temporary bit of memory, in case the class contains only 1 character (less
2650 than 256), because in that case the compiled code doesn't use the bit map.
2651 */
2652
2653 memset(classbits, 0, 32 * sizeof(uschar));
2654
2655 #ifdef SUPPORT_UTF8
2656 class_utf8 = FALSE; /* No chars >= 256 */
2657 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2658 #endif
2659
2660 /* Process characters until ] is reached. By writing this as a "do" it
2661 means that an initial ] is taken as a data character. At the start of the
2662 loop, c contains the first byte of the character. */
2663
2664 if (c != 0) do
2665 {
2666 const uschar *oldptr;
2667
2668 #ifdef SUPPORT_UTF8
2669 if (utf8 && c > 127)
2670 { /* Braces are required because the */
2671 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2672 }
2673 #endif
2674
2675 /* Inside \Q...\E everything is literal except \E */
2676
2677 if (inescq)
2678 {
2679 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2680 {
2681 inescq = FALSE; /* Reset literal state */
2682 ptr++; /* Skip the 'E' */
2683 continue; /* Carry on with next */
2684 }
2685 goto CHECK_RANGE; /* Could be range if \E follows */
2686 }
2687
2688 /* Handle POSIX class names. Perl allows a negation extension of the
2689 form [:^name:]. A square bracket that doesn't match the syntax is
2690 treated as a literal. We also recognize the POSIX constructions
2691 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2692 5.6 and 5.8 do. */
2693
2694 if (c == '[' &&
2695 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2696 check_posix_syntax(ptr, &tempptr, cd))
2697 {
2698 BOOL local_negate = FALSE;
2699 int posix_class, taboffset, tabopt;
2700 register const uschar *cbits = cd->cbits;
2701 uschar pbits[32];
2702
2703 if (ptr[1] != ':')
2704 {
2705 *errorcodeptr = ERR31;
2706 goto FAILED;
2707 }
2708
2709 ptr += 2;
2710 if (*ptr == '^')
2711 {
2712 local_negate = TRUE;
2713 ptr++;
2714 }
2715
2716 posix_class = check_posix_name(ptr, tempptr - ptr);
2717 if (posix_class < 0)
2718 {
2719 *errorcodeptr = ERR30;
2720 goto FAILED;
2721 }
2722
2723 /* If matching is caseless, upper and lower are converted to
2724 alpha. This relies on the fact that the class table starts with
2725 alpha, lower, upper as the first 3 entries. */
2726
2727 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2728 posix_class = 0;
2729
2730 /* We build the bit map for the POSIX class in a chunk of local store
2731 because we may be adding and subtracting from it, and we don't want to
2732 subtract bits that may be in the main map already. At the end we or the
2733 result into the bit map that is being built. */
2734
2735 posix_class *= 3;
2736
2737 /* Copy in the first table (always present) */
2738
2739 memcpy(pbits, cbits + posix_class_maps[posix_class],
2740 32 * sizeof(uschar));
2741
2742 /* If there is a second table, add or remove it as required. */
2743
2744 taboffset = posix_class_maps[posix_class + 1];
2745 tabopt = posix_class_maps[posix_class + 2];
2746
2747 if (taboffset >= 0)
2748 {
2749 if (tabopt >= 0)
2750 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2751 else
2752 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2753 }
2754
2755 /* Now see if we need to remove any special characters. An option
2756 value of 1 removes vertical space and 2 removes underscore. */
2757
2758 if (tabopt < 0) tabopt = -tabopt;
2759 if (tabopt == 1) pbits[1] &= ~0x3c;
2760 else if (tabopt == 2) pbits[11] &= 0x7f;
2761
2762 /* Add the POSIX table or its complement into the main table that is
2763 being built and we are done. */
2764
2765 if (local_negate)
2766 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2767 else
2768 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2769
2770 ptr = tempptr + 1;
2771 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2772 continue; /* End of POSIX syntax handling */
2773 }
2774
2775 /* Backslash may introduce a single character, or it may introduce one
2776 of the specials, which just set a flag. The sequence \b is a special
2777 case. Inside a class (and only there) it is treated as backspace.
2778 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2779 to 'or' into the one we are building. We assume they have more than one
2780 character in them, so set class_charcount bigger than one. */
2781
2782 if (c == '\\')
2783 {
2784 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2785 if (*errorcodeptr != 0) goto FAILED;
2786
2787 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2788 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2789 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2790 else if (-c == ESC_Q) /* Handle start of quoted string */
2791 {
2792 if (ptr[1] == '\\' && ptr[2] == 'E')
2793 {
2794 ptr += 2; /* avoid empty string */
2795 }
2796 else inescq = TRUE;
2797 continue;
2798 }
2799 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2800
2801 if (c < 0)
2802 {
2803 register const uschar *cbits = cd->cbits;
2804 class_charcount += 2; /* Greater than 1 is what matters */
2805
2806 /* Save time by not doing this in the pre-compile phase. */
2807
2808 if (lengthptr == NULL) switch (-c)
2809 {
2810 case ESC_d:
2811 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2812 continue;
2813
2814 case ESC_D:
2815 should_flip_negation = TRUE;
2816 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2817 continue;
2818
2819 case ESC_w:
2820 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2821 continue;
2822
2823 case ESC_W:
2824 should_flip_negation = TRUE;
2825 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2826 continue;
2827
2828 case ESC_s:
2829 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2830 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2831 continue;
2832
2833 case ESC_S:
2834 should_flip_negation = TRUE;
2835 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2836 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2837 continue;
2838
2839 case ESC_E: /* Perl ignores an orphan \E */
2840 continue;
2841
2842 default: /* Not recognized; fall through */
2843 break; /* Need "default" setting to stop compiler warning. */
2844 }
2845
2846 /* In the pre-compile phase, just do the recognition. */
2847
2848 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2849 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2850
2851 /* We need to deal with \H, \h, \V, and \v in both phases because
2852 they use extra memory. */
2853
2854 if (-c == ESC_h)
2855 {
2856 SETBIT(classbits, 0x09); /* VT */
2857 SETBIT(classbits, 0x20); /* SPACE */
2858 SETBIT(classbits, 0xa0); /* NSBP */
2859 #ifdef SUPPORT_UTF8
2860 if (utf8)
2861 {
2862 class_utf8 = TRUE;
2863 *class_utf8data++ = XCL_SINGLE;
2864 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2865 *class_utf8data++ = XCL_SINGLE;
2866 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2867 *class_utf8data++ = XCL_RANGE;
2868 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2869 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2870 *class_utf8data++ = XCL_SINGLE;
2871 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2872 *class_utf8data++ = XCL_SINGLE;
2873 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2874 *class_utf8data++ = XCL_SINGLE;
2875 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2876 }
2877 #endif
2878 continue;
2879 }
2880
2881 if (-c == ESC_H)
2882 {
2883 for (c = 0; c < 32; c++)
2884 {
2885 int x = 0xff;
2886 switch (c)
2887 {
2888 case 0x09/8: x ^= 1 << (0x09%8); break;
2889 case 0x20/8: x ^= 1 << (0x20%8); break;
2890 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2891 default: break;
2892 }
2893 classbits[c] |= x;
2894 }
2895
2896 #ifdef SUPPORT_UTF8
2897 if (utf8)
2898 {
2899 class_utf8 = TRUE;
2900 *class_utf8data++ = XCL_RANGE;
2901 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2902 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2903 *class_utf8data++ = XCL_RANGE;
2904 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2905 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2906 *class_utf8data++ = XCL_RANGE;
2907 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2908 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2909 *class_utf8data++ = XCL_RANGE;
2910 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2911 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2912 *class_utf8data++ = XCL_RANGE;
2913 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2914 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2915 *class_utf8data++ = XCL_RANGE;
2916 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2917 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2918 *class_utf8data++ = XCL_RANGE;
2919 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2920 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2921 }
2922 #endif
2923 continue;
2924 }
2925
2926 if (-c == ESC_v)
2927 {
2928 SETBIT(classbits, 0x0a); /* LF */
2929 SETBIT(classbits, 0x0b); /* VT */
2930 SETBIT(classbits, 0x0c); /* FF */
2931 SETBIT(classbits, 0x0d); /* CR */
2932 SETBIT(classbits, 0x85); /* NEL */
2933 #ifdef SUPPORT_UTF8
2934 if (utf8)
2935 {
2936 class_utf8 = TRUE;
2937 *class_utf8data++ = XCL_RANGE;
2938 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2939 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2940 }
2941 #endif
2942 continue;
2943 }
2944
2945 if (-c == ESC_V)
2946 {
2947 for (c = 0; c < 32; c++)
2948 {
2949 int x = 0xff;
2950 switch (c)
2951 {
2952 case 0x0a/8: x ^= 1 << (0x0a%8);
2953 x ^= 1 << (0x0b%8);
2954 x ^= 1 << (0x0c%8);
2955 x ^= 1 << (0x0d%8);
2956 break;
2957 case 0x85/8: x ^= 1 << (0x85%8); break;
2958 default: break;
2959 }
2960 classbits[c] |= x;
2961 }
2962
2963 #ifdef SUPPORT_UTF8
2964 if (utf8)
2965 {
2966 class_utf8 = TRUE;
2967 *class_utf8data++ = XCL_RANGE;
2968 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2969 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2970 *class_utf8data++ = XCL_RANGE;
2971 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2972 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2973 }
2974 #endif
2975 continue;
2976 }
2977
2978 /* We need to deal with \P and \p in both phases. */
2979
2980 #ifdef SUPPORT_UCP
2981 if (-c == ESC_p || -c == ESC_P)
2982 {
2983 BOOL negated;
2984 int pdata;
2985 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2986 if (ptype < 0) goto FAILED;
2987 class_utf8 = TRUE;
2988 *class_utf8data++ = ((-c == ESC_p) != negated)?
2989 XCL_PROP : XCL_NOTPROP;
2990 *class_utf8data++ = ptype;
2991 *class_utf8data++ = pdata;
2992 class_charcount -= 2; /* Not a < 256 character */
2993 continue;
2994 }
2995 #endif
2996 /* Unrecognized escapes are faulted if PCRE is running in its
2997 strict mode. By default, for compatibility with Perl, they are
2998 treated as literals. */
2999
3000 if ((options & PCRE_EXTRA) != 0)
3001 {
3002 *errorcodeptr = ERR7;
3003 goto FAILED;
3004 }
3005
3006 class_charcount -= 2; /* Undo the default count from above */
3007 c = *ptr; /* Get the final character and fall through */
3008 }
3009
3010 /* Fall through if we have a single character (c >= 0). This may be
3011 greater than 256 in UTF-8 mode. */
3012
3013 } /* End of backslash handling */
3014
3015 /* A single character may be followed by '-' to form a range. However,
3016 Perl does not permit ']' to be the end of the range. A '-' character
3017 at the end is treated as a literal. Perl ignores orphaned \E sequences
3018 entirely. The code for handling \Q and \E is messy. */
3019
3020 CHECK_RANGE:
3021 while (ptr[1] == '\\' && ptr[2] == 'E')
3022 {
3023 inescq = FALSE;
3024 ptr += 2;
3025 }
3026
3027 oldptr = ptr;
3028
3029 /* Remember \r or \n */
3030
3031 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3032
3033 /* Check for range */
3034
3035 if (!inescq && ptr[1] == '-')
3036 {
3037 int d;
3038 ptr += 2;
3039 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3040
3041 /* If we hit \Q (not followed by \E) at this point, go into escaped
3042 mode. */
3043
3044 while (*ptr == '\\' && ptr[1] == 'Q')
3045 {
3046 ptr += 2;
3047 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3048 inescq = TRUE;
3049 break;
3050 }
3051
3052 if (*ptr == 0 || (!inescq && *ptr == ']'))
3053 {
3054 ptr = oldptr;
3055 goto LONE_SINGLE_CHARACTER;
3056 }
3057
3058 #ifdef SUPPORT_UTF8
3059 if (utf8)
3060 { /* Braces are required because the */
3061 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3062 }
3063 else
3064 #endif
3065 d = *ptr; /* Not UTF-8 mode */
3066
3067 /* The second part of a range can be a single-character escape, but
3068 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3069 in such circumstances. */
3070
3071 if (!inescq && d == '\\')
3072 {
3073 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3074 if (*errorcodeptr != 0) goto FAILED;
3075
3076 /* \b is backspace; \X is literal X; \R is literal R; any other
3077 special means the '-' was literal */
3078
3079 if (d < 0)
3080 {
3081 if (d == -ESC_b) d = '\b';
3082 else if (d == -ESC_X) d = 'X';
3083 else if (d == -ESC_R) d = 'R'; else
3084 {
3085 ptr = oldptr;
3086 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3087 }
3088 }
3089 }
3090
3091 /* Check that the two values are in the correct order. Optimize
3092 one-character ranges */
3093
3094 if (d < c)
3095 {
3096 *errorcodeptr = ERR8;
3097 goto FAILED;
3098 }
3099
3100 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3101
3102 /* Remember \r or \n */
3103
3104 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3105
3106 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3107 matching, we have to use an XCLASS with extra data items. Caseless
3108 matching for characters > 127 is available only if UCP support is
3109 available. */
3110
3111 #ifdef SUPPORT_UTF8
3112 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3113 {
3114 class_utf8 = TRUE;
3115
3116 /* With UCP support, we can find the other case equivalents of
3117 the relevant characters. There may be several ranges. Optimize how
3118 they fit with the basic range. */
3119
3120 #ifdef SUPPORT_UCP
3121 if ((options & PCRE_CASELESS) != 0)
3122 {
3123 unsigned int occ, ocd;
3124 unsigned int cc = c;
3125 unsigned int origd = d;
3126 while (get_othercase_range(&cc, origd, &occ, &ocd))
3127 {
3128 if (occ >= (unsigned int)c &&
3129 ocd <= (unsigned int)d)
3130 continue; /* Skip embedded ranges */
3131
3132 if (occ < (unsigned int)c &&
3133 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3134 { /* if there is overlap, */
3135 c = occ; /* noting that if occ < c */
3136 continue; /* we can't have ocd > d */
3137 } /* because a subrange is */
3138 if (ocd > (unsigned int)d &&
3139 occ <= (unsigned int)d + 1) /* always shorter than */
3140 { /* the basic range. */
3141 d = ocd;
3142 continue;
3143 }
3144
3145 if (occ == ocd)
3146 {
3147 *class_utf8data++ = XCL_SINGLE;
3148 }
3149 else
3150 {
3151 *class_utf8data++ = XCL_RANGE;
3152 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3153 }
3154 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3155 }
3156 }
3157 #endif /* SUPPORT_UCP */
3158
3159 /* Now record the original range, possibly modified for UCP caseless
3160 overlapping ranges. */
3161
3162 *class_utf8data++ = XCL_RANGE;
3163 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3164 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3165
3166 /* With UCP support, we are done. Without UCP support, there is no
3167 caseless matching for UTF-8 characters > 127; we can use the bit map
3168 for the smaller ones. */
3169
3170 #ifdef SUPPORT_UCP
3171 continue; /* With next character in the class */
3172 #else
3173 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3174
3175 /* Adjust upper limit and fall through to set up the map */
3176
3177 d = 127;
3178
3179 #endif /* SUPPORT_UCP */
3180 }
3181 #endif /* SUPPORT_UTF8 */
3182
3183 /* We use the bit map for all cases when not in UTF-8 mode; else
3184 ranges that lie entirely within 0-127 when there is UCP support; else
3185 for partial ranges without UCP support. */
3186
3187 class_charcount += d - c + 1;
3188 class_lastchar = d;
3189
3190 /* We can save a bit of time by skipping this in the pre-compile. */
3191
3192 if (lengthptr == NULL) for (; c <= d; c++)
3193 {
3194 classbits[c/8] |= (1 << (c&7));
3195 if ((options & PCRE_CASELESS) != 0)
3196 {
3197 int uc = cd->fcc[c]; /* flip case */
3198 classbits[uc/8] |= (1 << (uc&7));
3199 }
3200 }
3201
3202 continue; /* Go get the next char in the class */
3203 }
3204
3205 /* Handle a lone single character - we can get here for a normal
3206 non-escape char, or after \ that introduces a single character or for an
3207 apparent range that isn't. */
3208
3209 LONE_SINGLE_CHARACTER:
3210
3211 /* Handle a character that cannot go in the bit map */
3212
3213 #ifdef SUPPORT_UTF8
3214 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3215 {
3216 class_utf8 = TRUE;
3217 *class_utf8data++ = XCL_SINGLE;
3218 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3219
3220 #ifdef SUPPORT_UCP
3221 if ((options & PCRE_CASELESS) != 0)
3222 {
3223 unsigned int othercase;
3224 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3225 {
3226 *class_utf8data++ = XCL_SINGLE;
3227 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3228 }
3229 }
3230 #endif /* SUPPORT_UCP */
3231
3232 }
3233 else
3234 #endif /* SUPPORT_UTF8 */
3235
3236 /* Handle a single-byte character */
3237 {
3238 classbits[c/8] |= (1 << (c&7));
3239 if ((options & PCRE_CASELESS) != 0)
3240 {
3241 c = cd->fcc[c]; /* flip case */
3242 classbits[c/8] |= (1 << (c&7));
3243 }
3244 class_charcount++;
3245 class_lastchar = c;
3246 }
3247 }
3248
3249 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3250
3251 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3252
3253 if (c == 0) /* Missing terminating ']' */
3254 {
3255 *errorcodeptr = ERR6;
3256 goto FAILED;
3257 }
3258
3259
3260 /* This code has been disabled because it would mean that \s counts as
3261 an explicit \r or \n reference, and that's not really what is wanted. Now
3262 we set the flag only if there is a literal "\r" or "\n" in the class. */
3263
3264 #if 0
3265 /* Remember whether \r or \n are in this class */
3266
3267 if (negate_class)
3268 {
3269 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3270 }
3271 else
3272 {
3273 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3274 }
3275 #endif
3276
3277
3278 /* If class_charcount is 1, we saw precisely one character whose value is
3279 less than 256. As long as there were no characters >= 128 and there was no
3280 use of \p or \P, in other words, no use of any XCLASS features, we can
3281 optimize.
3282
3283 In UTF-8 mode, we can optimize the negative case only if there were no
3284 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3285 operate on single-bytes only. This is an historical hangover. Maybe one day
3286 we can tidy these opcodes to handle multi-byte characters.
3287
3288 The optimization throws away the bit map. We turn the item into a
3289 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3290 that OP_NOT does not support multibyte characters. In the positive case, it
3291 can cause firstbyte to be set. Otherwise, there can be no first char if
3292 this item is first, whatever repeat count may follow. In the case of
3293 reqbyte, save the previous value for reinstating. */
3294
3295 #ifdef SUPPORT_UTF8
3296 if (class_charcount == 1 && !class_utf8 &&
3297 (!utf8 || !negate_class || class_lastchar < 128))
3298 #else
3299 if (class_charcount == 1)
3300 #endif
3301 {
3302 zeroreqbyte = reqbyte;
3303
3304 /* The OP_NOT opcode works on one-byte characters only. */
3305
3306 if (negate_class)
3307 {
3308 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3309 zerofirstbyte = firstbyte;
3310 *code++ = OP_NOT;
3311 *code++ = class_lastchar;
3312 break;
3313 }
3314
3315 /* For a single, positive character, get the value into mcbuffer, and
3316 then we can handle this with the normal one-character code. */
3317
3318 #ifdef SUPPORT_UTF8
3319 if (utf8 && class_lastchar > 127)
3320 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3321 else
3322 #endif
3323 {
3324 mcbuffer[0] = class_lastchar;
3325 mclength = 1;
3326 }
3327 goto ONE_CHAR;
3328 } /* End of 1-char optimization */
3329
3330 /* The general case - not the one-char optimization. If this is the first
3331 thing in the branch, there can be no first char setting, whatever the
3332 repeat count. Any reqbyte setting must remain unchanged after any kind of
3333 repeat. */
3334
3335 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3336 zerofirstbyte = firstbyte;
3337 zeroreqbyte = reqbyte;
3338
3339 /* If there are characters with values > 255, we have to compile an
3340 extended class, with its own opcode, unless there was a negated special
3341 such as \S in the class, because in that case all characters > 255 are in
3342 the class, so any that were explicitly given as well can be ignored. If
3343 (when there are explicit characters > 255 that must be listed) there are no
3344 characters < 256, we can omit the bitmap in the actual compiled code. */
3345
3346 #ifdef SUPPORT_UTF8
3347 if (class_utf8 && !should_flip_negation)
3348 {
3349 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3350 *code++ = OP_XCLASS;
3351 code += LINK_SIZE;
3352 *code = negate_class? XCL_NOT : 0;
3353
3354 /* If the map is required, move up the extra data to make room for it;
3355 otherwise just move the code pointer to the end of the extra data. */
3356
3357 if (class_charcount > 0)
3358 {
3359 *code++ |= XCL_MAP;
3360 memmove(code + 32, code, class_utf8data - code);
3361 memcpy(code, classbits, 32);
3362 code = class_utf8data + 32;
3363 }
3364 else code = class_utf8data;
3365
3366 /* Now fill in the complete length of the item */
3367
3368 PUT(previous, 1, code - previous);
3369 break; /* End of class handling */
3370 }
3371 #endif
3372
3373 /* If there are no characters > 255, set the opcode to OP_CLASS or
3374 OP_NCLASS, depending on whether the whole class was negated and whether
3375 there were negative specials such as \S in the class. Then copy the 32-byte
3376 map into the code vector, negating it if necessary. */
3377
3378 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3379 if (negate_class)
3380 {
3381 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3382 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3383 }
3384 else
3385 {
3386 memcpy(code, classbits, 32);
3387 }
3388 code += 32;
3389 break;
3390
3391
3392 /* ===================================================================*/
3393 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3394 has been tested above. */
3395
3396 case '{':
3397 if (!is_quantifier) goto NORMAL_CHAR;
3398 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3399 if (*errorcodeptr != 0) goto FAILED;
3400 goto REPEAT;
3401
3402 case '*':
3403 repeat_min = 0;
3404 repeat_max = -1;
3405 goto REPEAT;
3406
3407 case '+':
3408 repeat_min = 1;
3409 repeat_max = -1;
3410 goto REPEAT;
3411
3412 case '?':
3413 repeat_min = 0;
3414 repeat_max = 1;
3415
3416 REPEAT:
3417 if (previous == NULL)
3418 {
3419 *errorcodeptr = ERR9;
3420 goto FAILED;
3421 }
3422
3423 if (repeat_min == 0)
3424 {
3425 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3426 reqbyte = zeroreqbyte; /* Ditto */
3427 }
3428
3429 /* Remember whether this is a variable length repeat */
3430
3431 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3432
3433 op_type = 0; /* Default single-char op codes */
3434 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3435
3436 /* Save start of previous item, in case we have to move it up to make space
3437 for an inserted OP_ONCE for the additional '+' extension. */
3438
3439 tempcode = previous;
3440
3441 /* If the next character is '+', we have a possessive quantifier. This
3442 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3443 If the next character is '?' this is a minimizing repeat, by default,
3444 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3445 repeat type to the non-default. */
3446
3447 if (ptr[1] == '+')
3448 {
3449 repeat_type = 0; /* Force greedy */
3450 possessive_quantifier = TRUE;
3451 ptr++;
3452 }
3453 else if (ptr[1] == '?')
3454 {
3455 repeat_type = greedy_non_default;
3456 ptr++;
3457 }
3458 else repeat_type = greedy_default;
3459
3460 /* If previous was a character match, abolish the item and generate a
3461 repeat item instead. If a char item has a minimum of more than one, ensure
3462 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3463 the first thing in a branch because the x will have gone into firstbyte
3464 instead. */
3465
3466 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3467 {
3468 /* Deal with UTF-8 characters that take up more than one byte. It's
3469 easier to write this out separately than try to macrify it. Use c to
3470 hold the length of the character in bytes, plus 0x80 to flag that it's a
3471 length rather than a small character. */
3472
3473 #ifdef SUPPORT_UTF8
3474 if (utf8 && (code[-1] & 0x80) != 0)
3475 {
3476 uschar *lastchar = code - 1;
3477 while((*lastchar & 0xc0) == 0x80) lastchar--;
3478 c = code - lastchar; /* Length of UTF-8 character */
3479 memcpy(utf8_char, lastchar, c); /* Save the char */
3480 c |= 0x80; /* Flag c as a length */
3481 }
3482 else
3483 #endif
3484
3485 /* Handle the case of a single byte - either with no UTF8 support, or
3486 with UTF-8 disabled, or for a UTF-8 character < 128. */
3487
3488 {
3489 c = code[-1];
3490 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3491 }
3492
3493 /* If the repetition is unlimited, it pays to see if the next thing on
3494 the line is something that cannot possibly match this character. If so,
3495 automatically possessifying this item gains some performance in the case
3496 where the match fails. */
3497
3498 if (!possessive_quantifier &&
3499 repeat_max < 0 &&
3500 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3501 options, cd))
3502 {
3503 repeat_type = 0; /* Force greedy */
3504 possessive_quantifier = TRUE;
3505 }
3506
3507 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3508 }
3509
3510 /* If previous was a single negated character ([^a] or similar), we use
3511 one of the special opcodes, replacing it. The code is shared with single-
3512 character repeats by setting op_type to add a suitable offset into
3513 repeat_type. We can also test for auto-possessification. OP_NOT is
3514 currently used only for single-byte chars. */
3515
3516 else if (*previous == OP_NOT)
3517 {
3518 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3519 c = previous[1];
3520 if (!possessive_quantifier &&
3521 repeat_max < 0 &&
3522 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3523 {
3524 repeat_type = 0; /* Force greedy */
3525 possessive_quantifier = TRUE;
3526 }
3527 goto OUTPUT_SINGLE_REPEAT;
3528 }
3529
3530 /* If previous was a character type match (\d or similar), abolish it and
3531 create a suitable repeat item. The code is shared with single-character
3532 repeats by setting op_type to add a suitable offset into repeat_type. Note
3533 that the Unicode property types will be present only when SUPPORT_UCP is
3534 defined, but we don't wrap the little bits of code here because it just
3535 makes it horribly messy. */
3536
3537 else if (*previous < OP_EODN)
3538 {
3539 uschar *oldcode;
3540 int prop_type, prop_value;
3541 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3542 c = *previous;
3543
3544 if (!possessive_quantifier &&
3545 repeat_max < 0 &&
3546 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3547 {
3548 repeat_type = 0; /* Force greedy */
3549 possessive_quantifier = TRUE;
3550 }
3551
3552 OUTPUT_SINGLE_REPEAT:
3553 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3554 {
3555 prop_type = previous[1];
3556 prop_value = previous[2];
3557 }
3558 else prop_type = prop_value = -1;
3559
3560 oldcode = code;
3561 code = previous; /* Usually overwrite previous item */
3562
3563 /* If the maximum is zero then the minimum must also be zero; Perl allows
3564 this case, so we do too - by simply omitting the item altogether. */
3565
3566 if (repeat_max == 0) goto END_REPEAT;
3567
3568 /* All real repeats make it impossible to handle partial matching (maybe
3569 one day we will be able to remove this restriction). */
3570
3571 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3572
3573 /* Combine the op_type with the repeat_type */
3574
3575 repeat_type += op_type;
3576
3577 /* A minimum of zero is handled either as the special case * or ?, or as
3578 an UPTO, with the maximum given. */
3579
3580 if (repeat_min == 0)
3581 {
3582 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3583 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3584 else
3585 {
3586 *code++ = OP_UPTO + repeat_type;
3587 PUT2INC(code, 0, repeat_max);
3588 }
3589 }
3590
3591 /* A repeat minimum of 1 is optimized into some special cases. If the
3592 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3593 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3594 one less than the maximum. */
3595
3596 else if (repeat_min == 1)
3597 {
3598 if (repeat_max == -1)
3599 *code++ = OP_PLUS + repeat_type;
3600 else
3601 {
3602 code = oldcode; /* leave previous item in place */
3603 if (repeat_max == 1) goto END_REPEAT;
3604 *code++ = OP_UPTO + repeat_type;
3605 PUT2INC(code, 0, repeat_max - 1);
3606 }
3607 }
3608
3609 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3610 handled as an EXACT followed by an UPTO. */
3611
3612 else
3613 {
3614 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3615 PUT2INC(code, 0, repeat_min);
3616
3617 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3618 we have to insert the character for the previous code. For a repeated
3619 Unicode property match, there are two extra bytes that define the
3620 required property. In UTF-8 mode, long characters have their length in
3621 c, with the 0x80 bit as a flag. */
3622
3623 if (repeat_max < 0)
3624 {
3625 #ifdef SUPPORT_UTF8
3626 if (utf8 && c >= 128)
3627 {
3628 memcpy(code, utf8_char, c & 7);
3629 code += c & 7;
3630 }
3631 else
3632 #endif
3633 {
3634 *code++ = c;
3635 if (prop_type >= 0)
3636 {
3637 *code++ = prop_type;
3638 *code++ = prop_value;
3639 }
3640 }
3641 *code++ = OP_STAR + repeat_type;
3642 }
3643
3644 /* Else insert an UPTO if the max is greater than the min, again
3645 preceded by the character, for the previously inserted code. If the
3646 UPTO is just for 1 instance, we can use QUERY instead. */
3647
3648 else if (repeat_max != repeat_min)
3649 {
3650 #ifdef SUPPORT_UTF8
3651 if (utf8 && c >= 128)
3652 {
3653 memcpy(code, utf8_char, c & 7);
3654 code += c & 7;
3655 }
3656 else
3657 #endif
3658 *code++ = c;
3659 if (prop_type >= 0)
3660 {
3661 *code++ = prop_type;
3662 *code++ = prop_value;
3663 }
3664 repeat_max -= repeat_min;
3665
3666 if (repeat_max == 1)
3667 {
3668 *code++ = OP_QUERY + repeat_type;
3669 }
3670 else
3671 {
3672 *code++ = OP_UPTO + repeat_type;
3673 PUT2INC(code, 0, repeat_max);
3674 }
3675 }
3676 }
3677
3678 /* The character or character type itself comes last in all cases. */
3679
3680 #ifdef SUPPORT_UTF8
3681 if (utf8 && c >= 128)
3682 {
3683 memcpy(code, utf8_char, c & 7);
3684 code += c & 7;
3685 }
3686 else
3687 #endif
3688 *code++ = c;
3689
3690 /* For a repeated Unicode property match, there are two extra bytes that
3691 define the required property. */
3692
3693 #ifdef SUPPORT_UCP
3694 if (prop_type >= 0)
3695 {
3696 *code++ = prop_type;
3697 *code++ = prop_value;
3698 }
3699 #endif
3700 }
3701
3702 /* If previous was a character class or a back reference, we put the repeat
3703 stuff after it, but just skip the item if the repeat was {0,0}. */
3704
3705 else if (*previous == OP_CLASS ||
3706 *previous == OP_NCLASS ||
3707 #ifdef SUPPORT_UTF8
3708 *previous == OP_XCLASS ||
3709 #endif
3710 *previous == OP_REF)
3711 {
3712 if (repeat_max == 0)
3713 {
3714 code = previous;
3715 goto END_REPEAT;
3716 }
3717
3718 /* All real repeats make it impossible to handle partial matching (maybe
3719 one day we will be able to remove this restriction). */
3720
3721 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3722
3723 if (repeat_min == 0 && repeat_max == -1)
3724 *code++ = OP_CRSTAR + repeat_type;
3725 else if (repeat_min == 1 && repeat_max == -1)
3726 *code++ = OP_CRPLUS + repeat_type;
3727 else if (repeat_min == 0 && repeat_max == 1)
3728 *code++ = OP_CRQUERY + repeat_type;
3729 else
3730 {
3731 *code++ = OP_CRRANGE + repeat_type;
3732 PUT2INC(code, 0, repeat_min);
3733 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3734 PUT2INC(code, 0, repeat_max);
3735 }
3736 }
3737
3738 /* If previous was a bracket group, we may have to replicate it in certain
3739 cases. */
3740
3741 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3742 *previous == OP_ONCE || *previous == OP_COND)
3743 {
3744 register int i;
3745 int ketoffset = 0;
3746 int len = code - previous;
3747 uschar *bralink = NULL;
3748
3749 /* Repeating a DEFINE group is pointless */
3750
3751 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3752 {
3753 *errorcodeptr = ERR55;
3754 goto FAILED;
3755 }
3756
3757 /* If the maximum repeat count is unlimited, find the end of the bracket
3758 by scanning through from the start, and compute the offset back to it
3759 from the current code pointer. There may be an OP_OPT setting following
3760 the final KET, so we can't find the end just by going back from the code
3761 pointer. */
3762
3763 if (repeat_max == -1)
3764 {
3765 register uschar *ket = previous;
3766 do ket += GET(ket, 1); while (*ket != OP_KET);
3767 ketoffset = code - ket;
3768 }
3769
3770 /* The case of a zero minimum is special because of the need to stick
3771 OP_BRAZERO in front of it, and because the group appears once in the
3772 data, whereas in other cases it appears the minimum number of times. For
3773 this reason, it is simplest to treat this case separately, as otherwise
3774 the code gets far too messy. There are several special subcases when the
3775 minimum is zero. */
3776
3777 if (repeat_min == 0)
3778 {
3779 /* If the maximum is also zero, we just omit the group from the output
3780 altogether. */
3781
3782 if (repeat_max == 0)
3783 {
3784 code = previous;
3785 goto END_REPEAT;
3786 }
3787
3788 /* If the maximum is 1 or unlimited, we just have to stick in the
3789 BRAZERO and do no more at this point. However, we do need to adjust
3790 any OP_RECURSE calls inside the group that refer to the group itself or
3791 any internal or forward referenced group, because the offset is from
3792 the start of the whole regex. Temporarily terminate the pattern while
3793 doing this. */
3794
3795 if (repeat_max <= 1)
3796 {
3797 *code = OP_END;
3798 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3799 memmove(previous+1, previous, len);
3800 code++;
3801 *previous++ = OP_BRAZERO + repeat_type;
3802 }
3803
3804 /* If the maximum is greater than 1 and limited, we have to replicate
3805 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3806 The first one has to be handled carefully because it's the original
3807 copy, which has to be moved up. The remainder can be handled by code
3808 that is common with the non-zero minimum case below. We have to
3809 adjust the value of repeat_max, since one less copy is required. Once
3810 again, we may have to adjust any OP_RECURSE calls inside the group. */
3811
3812 else
3813 {
3814 int offset;
3815 *code = OP_END;
3816 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3817 memmove(previous + 2 + LINK_SIZE, previous, len);
3818 code += 2 + LINK_SIZE;
3819 *previous++ = OP_BRAZERO + repeat_type;
3820 *previous++ = OP_BRA;
3821
3822 /* We chain together the bracket offset fields that have to be
3823 filled in later when the ends of the brackets are reached. */
3824
3825 offset = (bralink == NULL)? 0 : previous - bralink;
3826 bralink = previous;
3827 PUTINC(previous, 0, offset);
3828 }
3829
3830 repeat_max--;
3831 }
3832
3833 /* If the minimum is greater than zero, replicate the group as many
3834 times as necessary, and adjust the maximum to the number of subsequent
3835 copies that we need. If we set a first char from the group, and didn't
3836 set a required char, copy the latter from the former. If there are any
3837 forward reference subroutine calls in the group, there will be entries on
3838 the workspace list; replicate these with an appropriate increment. */
3839
3840 else
3841 {
3842 if (repeat_min > 1)
3843 {
3844 /* In the pre-compile phase, we don't actually do the replication. We
3845 just adjust the length as if we had. Do some paranoid checks for
3846 potential integer overflow. */
3847
3848 if (lengthptr != NULL)
3849 {
3850 int delta = (repeat_min - 1)*length_prevgroup;
3851 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3852 (double)INT_MAX ||
3853 OFLOW_MAX - *lengthptr < delta)
3854 {
3855 *errorcodeptr = ERR20;
3856 goto FAILED;
3857 }
3858 *lengthptr += delta;
3859 }
3860
3861 /* This is compiling for real */
3862
3863 else
3864 {
3865 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3866 for (i = 1; i < repeat_min; i++)
3867 {
3868 uschar *hc;
3869 uschar *this_hwm = cd->hwm;
3870 memcpy(code, previous, len);
3871 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3872 {
3873 PUT(cd->hwm, 0, GET(hc, 0) + len);
3874 cd->hwm += LINK_SIZE;
3875 }
3876 save_hwm = this_hwm;
3877 code += len;
3878 }
3879 }
3880 }
3881
3882 if (repeat_max > 0) repeat_max -= repeat_min;
3883 }
3884
3885 /* This code is common to both the zero and non-zero minimum cases. If
3886 the maximum is limited, it replicates the group in a nested fashion,
3887 remembering the bracket starts on a stack. In the case of a zero minimum,
3888 the first one was set up above. In all cases the repeat_max now specifies
3889 the number of additional copies needed. Again, we must remember to
3890 replicate entries on the forward reference list. */
3891
3892 if (repeat_max >= 0)
3893 {
3894 /* In the pre-compile phase, we don't actually do the replication. We
3895 just adjust the length as if we had. For each repetition we must add 1
3896 to the length for BRAZERO and for all but the last repetition we must
3897 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3898 paranoid checks to avoid integer overflow. */
3899
3900 if (lengthptr != NULL && repeat_max > 0)
3901 {
3902 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3903 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3904 if ((double)repeat_max *
3905 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3906 > (double)INT_MAX ||
3907 OFLOW_MAX - *lengthptr < delta)
3908 {
3909 *errorcodeptr = ERR20;
3910 goto FAILED;
3911 }
3912 *lengthptr += delta;
3913 }
3914
3915 /* This is compiling for real */
3916
3917 else for (i = repeat_max - 1; i >= 0; i--)
3918 {
3919 uschar *hc;
3920 uschar *this_hwm = cd->hwm;
3921
3922 *code++ = OP_BRAZERO + repeat_type;
3923
3924 /* All but the final copy start a new nesting, maintaining the
3925 chain of brackets outstanding. */
3926
3927 if (i != 0)
3928 {
3929 int offset;
3930 *code++ = OP_BRA;
3931 offset = (bralink == NULL)? 0 : code - bralink;
3932 bralink = code;
3933 PUTINC(code, 0, offset);
3934 }
3935
3936 memcpy(code, previous, len);
3937 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3938 {
3939 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3940 cd->hwm += LINK_SIZE;
3941 }
3942 save_hwm = this_hwm;
3943 code += len;
3944 }
3945
3946 /* Now chain through the pending brackets, and fill in their length
3947 fields (which are holding the chain links pro tem). */
3948
3949 while (bralink != NULL)
3950 {
3951 int oldlinkoffset;
3952 int offset = code - bralink + 1;
3953 uschar *bra = code - offset;
3954 oldlinkoffset = GET(bra, 1);
3955 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3956 *code++ = OP_KET;
3957 PUTINC(code, 0, offset);
3958 PUT(bra, 1, offset);
3959 }
3960 }
3961
3962 /* If the maximum is unlimited, set a repeater in the final copy. We
3963 can't just offset backwards from the current code point, because we
3964 don't know if there's been an options resetting after the ket. The
3965 correct offset was computed above.
3966
3967 Then, when we are doing the actual compile phase, check to see whether
3968 this group is a non-atomic one that could match an empty string. If so,
3969 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3970 that runtime checking can be done. [This check is also applied to
3971 atomic groups at runtime, but in a different way.] */
3972
3973 else
3974 {
3975 uschar *ketcode = code - ketoffset;
3976 uschar *bracode = ketcode - GET(ketcode, 1);
3977 *ketcode = OP_KETRMAX + repeat_type;
3978 if (lengthptr == NULL && *bracode != OP_ONCE)
3979 {
3980 uschar *scode = bracode;
3981 do
3982 {
3983 if (could_be_empty_branch(scode, ketcode, utf8))
3984 {
3985 *bracode += OP_SBRA - OP_BRA;
3986 break;
3987 }
3988 scode += GET(scode, 1);
3989 }
3990 while (*scode == OP_ALT);
3991 }
3992 }
3993 }
3994
3995 /* Else there's some kind of shambles */
3996
3997 else
3998 {
3999 *errorcodeptr = ERR11;
4000 goto FAILED;
4001 }
4002
4003 /* If the character following a repeat is '+', or if certain optimization
4004 tests above succeeded, possessive_quantifier is TRUE. For some of the
4005 simpler opcodes, there is a special alternative opcode for this. For
4006 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4007 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4008 but the special opcodes can optimize it a bit. The repeated item starts at
4009 tempcode, not at previous, which might be the first part of a string whose
4010 (former) last char we repeated.
4011
4012 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4013 an 'upto' may follow. We skip over an 'exact' item, and then test the
4014 length of what remains before proceeding. */
4015
4016 if (possessive_quantifier)
4017 {
4018 int len;
4019 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4020 *tempcode == OP_NOTEXACT)
4021 tempcode += _pcre_OP_lengths[*tempcode];
4022 len = code - tempcode;
4023 if (len > 0) switch (*tempcode)
4024 {
4025 case OP_STAR: *tempcode = OP_POSSTAR; break;
4026 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4027 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4028 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4029
4030 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4031 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4032 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4033 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4034
4035 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4036 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4037 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4038 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4039
4040 default:
4041 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4042 code += 1 + LINK_SIZE;
4043 len += 1 + LINK_SIZE;
4044 tempcode[0] = OP_ONCE;
4045 *code++ = OP_KET;
4046 PUTINC(code, 0, len);
4047 PUT(tempcode, 1, len);
4048 break;
4049 }
4050 }
4051
4052 /* In all cases we no longer have a previous item. We also set the
4053 "follows varying string" flag for subsequently encountered reqbytes if
4054 it isn't already set and we have just passed a varying length item. */
4055
4056 END_REPEAT:
4057 previous = NULL;
4058 cd->req_varyopt |= reqvary;
4059 break;
4060
4061
4062 /* ===================================================================*/
4063 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4064 lookbehind or option setting or condition or all the other extended
4065 parenthesis forms. */
4066
4067 case '(':
4068 newoptions = options;
4069 skipbytes = 0;
4070 bravalue = OP_CBRA;
4071 save_hwm = cd->hwm;
4072 reset_bracount = FALSE;
4073
4074 /* First deal with various "verbs" that can be introduced by '*'. */
4075
4076 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4077 {
4078 int i, namelen;
4079 const char *vn = verbnames;
4080 const uschar *name = ++ptr;
4081 previous = NULL;
4082 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4083 if (*ptr == ':')
4084 {
4085 *errorcodeptr = ERR59; /* Not supported */
4086 goto FAILED;
4087 }
4088 if (*ptr != ')')
4089 {
4090 *errorcodeptr = ERR60;
4091 goto FAILED;
4092 }
4093 namelen = ptr - name;
4094 for (i = 0; i < verbcount; i++)
4095 {
4096 if (namelen == verbs[i].len &&
4097 strncmp((char *)name, vn, namelen) == 0)
4098 {
4099 *code = verbs[i].op;
4100 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4101 break;
4102 }
4103 vn += verbs[i].len + 1;
4104 }
4105 if (i < verbcount) continue;
4106 *errorcodeptr = ERR60;
4107 goto FAILED;
4108 }
4109
4110 /* Deal with the extended parentheses; all are introduced by '?', and the
4111 appearance of any of them means that this is not a capturing group. */
4112
4113 else if (*ptr == '?')
4114 {
4115 int i, set, unset, namelen;
4116 int *optset;
4117 const uschar *name;
4118 uschar *slot;
4119
4120 switch (*(++ptr))
4121 {
4122 case '#': /* Comment; skip to ket */
4123 ptr++;
4124 while (*ptr != 0 && *ptr != ')') ptr++;
4125 if (*ptr == 0)
4126 {
4127 *errorcodeptr = ERR18;
4128 goto FAILED;
4129 }
4130 continue;
4131
4132
4133 /* ------------------------------------------------------------ */
4134 case '|': /* Reset capture count for each branch */
4135 reset_bracount = TRUE;
4136 /* Fall through */
4137
4138 /* ------------------------------------------------------------ */
4139 case ':': /* Non-capturing bracket */
4140 bravalue = OP_BRA;
4141 ptr++;
4142 break;
4143
4144
4145 /* ------------------------------------------------------------ */
4146 case '(':
4147 bravalue = OP_COND; /* Conditional group */
4148
4149 /* A condition can be an assertion, a number (referring to a numbered
4150 group), a name (referring to a named group), or 'R', referring to
4151 recursion. R<digits> and R&name are also permitted for recursion tests.
4152
4153 There are several syntaxes for testing a named group: (?(name)) is used
4154 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4155
4156 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4157 be the recursive thing or the name 'R' (and similarly for 'R' followed
4158 by digits), and (b) a number could be a name that consists of digits.
4159 In both cases, we look for a name first; if not found, we try the other
4160 cases. */
4161
4162 /* For conditions that are assertions, check the syntax, and then exit
4163 the switch. This will take control down to where bracketed groups,
4164 including assertions, are processed. */
4165
4166 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4167 break;
4168
4169 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4170 below), and all need to skip 3 bytes at the start of the group. */
4171
4172 code[1+LINK_SIZE] = OP_CREF;
4173 skipbytes = 3;
4174 refsign = -1;
4175
4176 /* Check for a test for recursion in a named group. */
4177
4178 if (ptr[1] == 'R' && ptr[2] == '&')
4179 {
4180 terminator = -1;
4181 ptr += 2;
4182 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4183 }
4184
4185 /* Check for a test for a named group's having been set, using the Perl
4186 syntax (?(<name>) or (?('name') */
4187
4188 else if (ptr[1] == '<')
4189 {
4190 terminator = '>';
4191 ptr++;
4192 }
4193 else if (ptr[1] == '\'')
4194 {
4195 terminator = '\'';
4196 ptr++;
4197 }
4198 else
4199 {
4200 terminator = 0;
4201 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4202 }
4203
4204 /* We now expect to read a name; anything else is an error */
4205
4206 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4207 {
4208 ptr += 1; /* To get the right offset */
4209 *errorcodeptr = ERR28;
4210 goto FAILED;
4211 }
4212
4213 /* Read the name, but also get it as a number if it's all digits */
4214
4215 recno = 0;
4216 name = ++ptr;
4217 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4218 {
4219 if (recno >= 0)
4220 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4221 recno * 10 + *ptr - '0' : -1;
4222 ptr++;
4223 }
4224 namelen = ptr - name;
4225
4226 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4227 {
4228 ptr--; /* Error offset */
4229 *errorcodeptr = ERR26;
4230 goto FAILED;
4231 }
4232
4233 /* Do no further checking in the pre-compile phase. */
4234
4235 if (lengthptr != NULL) break;
4236
4237 /* In the real compile we do the work of looking for the actual
4238 reference. If the string started with "+" or "-" we require the rest to
4239 be digits, in which case recno will be set. */
4240
4241 if (refsign > 0)
4242 {
4243 if (recno <= 0)
4244 {
4245 *errorcodeptr = ERR58;
4246 goto FAILED;
4247 }
4248 if (refsign == '-')
4249 {
4250 recno = cd->bracount - recno + 1;
4251 if (recno <= 0)
4252 {
4253 *errorcodeptr = ERR15;
4254 goto FAILED;
4255 }
4256 }
4257 else recno += cd->bracount;
4258 PUT2(code, 2+LINK_SIZE, recno);
4259 break;
4260 }
4261
4262 /* Otherwise (did not start with "+" or "-"), start by looking for the
4263 name. */
4264
4265 slot = cd->name_table;
4266 for (i = 0; i < cd->names_found; i++)
4267 {
4268 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4269 slot += cd->name_entry_size;
4270 }
4271
4272 /* Found a previous named subpattern */
4273
4274 if (i < cd->names_found)
4275 {
4276 recno = GET2(slot, 0);
4277 PUT2(code, 2+LINK_SIZE, recno);
4278 }
4279
4280 /* Search the pattern for a forward reference */
4281
4282 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4283 (options & PCRE_EXTENDED) != 0)) > 0)
4284 {
4285 PUT2(code, 2+LINK_SIZE, i);
4286 }
4287
4288 /* If terminator == 0 it means that the name followed directly after
4289 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4290 some further alternatives to try. For the cases where terminator != 0
4291 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4292 now checked all the possibilities, so give an error. */
4293
4294 else if (terminator != 0)
4295 {
4296 *errorcodeptr = ERR15;
4297 goto FAILED;
4298 }
4299
4300 /* Check for (?(R) for recursion. Allow digits after R to specify a
4301 specific group number. */
4302
4303 else if (*name == 'R')
4304 {
4305 recno = 0;
4306 for (i = 1; i < namelen; i++)
4307 {
4308 if ((digitab[name[i]] & ctype_digit) == 0)
4309 {
4310 *errorcodeptr = ERR15;
4311 goto FAILED;
4312 }
4313 recno = recno * 10 + name[i] - '0';
4314 }
4315 if (recno == 0) recno = RREF_ANY;
4316 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4317 PUT2(code, 2+LINK_SIZE, recno);
4318 }
4319
4320 /* Similarly, check for the (?(DEFINE) "condition", which is always
4321 false. */
4322
4323 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4324 {
4325 code[1+LINK_SIZE] = OP_DEF;
4326 skipbytes = 1;
4327 }
4328
4329 /* Check for the "name" actually being a subpattern number. */
4330
4331 else if (recno > 0)
4332 {
4333 PUT2(code, 2+LINK_SIZE, recno);
4334 }
4335
4336 /* Either an unidentified subpattern, or a reference to (?(0) */
4337
4338 else
4339 {
4340 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4341 goto FAILED;
4342 }
4343 break;
4344
4345
4346 /* ------------------------------------------------------------ */
4347 case '=': /* Positive lookahead */
4348 bravalue = OP_ASSERT;
4349 ptr++;
4350 break;
4351
4352
4353 /* ------------------------------------------------------------ */
4354 case '!': /* Negative lookahead */
4355 ptr++;
4356 if (*ptr == ')') /* Optimize (?!) */
4357 {
4358 *code++ = OP_FAIL;
4359 previous = NULL;
4360 continue;
4361 }
4362 bravalue = OP_ASSERT_NOT;
4363 break;
4364
4365
4366 /* ------------------------------------------------------------ */
4367 case '<': /* Lookbehind or named define */
4368 switch (ptr[1])
4369 {
4370 case '=': /* Positive lookbehind */
4371 bravalue = OP_ASSERTBACK;
4372 ptr += 2;
4373 break;
4374
4375 case '!': /* Negative lookbehind */
4376 bravalue = OP_ASSERTBACK_NOT;
4377 ptr += 2;
4378 break;
4379
4380 default: /* Could be name define, else bad */
4381 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4382 ptr++; /* Correct offset for error */
4383 *errorcodeptr = ERR24;
4384 goto FAILED;
4385 }
4386 break;
4387
4388
4389 /* ------------------------------------------------------------ */
4390 case '>': /* One-time brackets */
4391 bravalue = OP_ONCE;
4392 ptr++;
4393 break;
4394
4395
4396 /* ------------------------------------------------------------ */
4397 case 'C': /* Callout - may be followed by digits; */
4398 previous_callout = code; /* Save for later completion */
4399 after_manual_callout = 1; /* Skip one item before completing */
4400 *code++ = OP_CALLOUT;
4401 {
4402 int n = 0;
4403 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4404 n = n * 10 + *ptr - '0';
4405 if (*ptr != ')')
4406 {
4407 *errorcodeptr = ERR39;
4408 goto FAILED;
4409 }
4410 if (n > 255)
4411 {
4412 *errorcodeptr = ERR38;
4413 goto FAILED;
4414 }
4415 *code++ = n;
4416 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4417 PUT(code, LINK_SIZE, 0); /* Default length */
4418 code += 2 * LINK_SIZE;
4419 }
4420 previous = NULL;
4421 continue;
4422
4423
4424 /* ------------------------------------------------------------ */
4425 case 'P': /* Python-style named subpattern handling */
4426 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4427 {
4428 is_recurse = *ptr == '>';
4429 terminator = ')';
4430 goto NAMED_REF_OR_RECURSE;
4431 }
4432 else if (*ptr != '<') /* Test for Python-style definition */
4433 {
4434 *errorcodeptr = ERR41;
4435 goto FAILED;
4436 }
4437 /* Fall through to handle (?P< as (?< is handled */
4438
4439
4440 /* ------------------------------------------------------------ */
4441 DEFINE_NAME: /* Come here from (?< handling */
4442 case '\'':
4443 {
4444 terminator = (*ptr == '<')? '>' : '\'';
4445 name = ++ptr;
4446
4447 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4448 namelen = ptr - name;
4449
4450 /* In the pre-compile phase, just do a syntax check. */
4451
4452 if (lengthptr != NULL)
4453 {
4454 if (*ptr != terminator)
4455 {
4456 *errorcodeptr = ERR42;
4457 goto FAILED;
4458 }
4459 if (cd->names_found >= MAX_NAME_COUNT)
4460 {
4461 *errorcodeptr = ERR49;
4462 goto FAILED;
4463 }
4464 if (namelen + 3 > cd->name_entry_size)
4465 {
4466 cd->name_entry_size = namelen + 3;
4467 if (namelen > MAX_NAME_SIZE)
4468 {
4469 *errorcodeptr = ERR48;
4470 goto FAILED;
4471 }
4472 }
4473 }
4474
4475 /* In the real compile, create the entry in the table */
4476
4477 else
4478 {
4479 slot = cd->name_table;
4480 for (i = 0; i < cd->names_found; i++)
4481 {
4482 int crc = memcmp(name, slot+2, namelen);
4483 if (crc == 0)
4484 {
4485 if (slot[2+namelen] == 0)
4486 {
4487 if ((options & PCRE_DUPNAMES) == 0)
4488 {
4489 *errorcodeptr = ERR43;
4490 goto FAILED;
4491 }
4492 }
4493 else crc = -1; /* Current name is substring */
4494 }
4495 if (crc < 0)
4496 {
4497 memmove(slot + cd->name_entry_size, slot,
4498 (cd->names_found - i) * cd->name_entry_size);
4499 break;
4500 }
4501 slot += cd->name_entry_size;
4502 }
4503
4504 PUT2(slot, 0, cd->bracount + 1);
4505 memcpy(slot + 2, name, namelen);
4506 slot[2+namelen] = 0;
4507 }
4508 }
4509
4510 /* In both cases, count the number of names we've encountered. */
4511
4512 ptr++; /* Move past > or ' */
4513 cd->names_found++;
4514 goto NUMBERED_GROUP;
4515
4516
4517 /* ------------------------------------------------------------ */
4518 case '&': /* Perl recursion/subroutine syntax */
4519 terminator = ')';
4520 is_recurse = TRUE;
4521 /* Fall through */
4522
4523 /* We come here from the Python syntax above that handles both
4524 references (?P=name) and recursion (?P>name), as well as falling
4525 through from the Perl recursion syntax (?&name). */
4526
4527 NAMED_REF_OR_RECURSE:
4528 name = ++ptr;
4529 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4530 namelen = ptr - name;
4531
4532 /* In the pre-compile phase, do a syntax check and set a dummy
4533 reference number. */
4534
4535 if (lengthptr != NULL)
4536 {
4537 if (*ptr != terminator)
4538 {
4539 *errorcodeptr = ERR42;
4540 goto FAILED;
4541 }
4542 if (namelen > MAX_NAME_SIZE)
4543 {
4544 *errorcodeptr = ERR48;
4545 goto FAILED;
4546 }
4547 recno = 0;
4548 }
4549
4550 /* In the real compile, seek the name in the table */
4551
4552 else
4553 {
4554 slot = cd->name_table;
4555 for (i = 0; i < cd->names_found; i++)
4556 {
4557 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4558 slot += cd->name_entry_size;
4559 }
4560
4561 if (i < cd->names_found) /* Back reference */
4562 {
4563 recno = GET2(slot, 0);
4564 }
4565 else if ((recno = /* Forward back reference */
4566 find_parens(ptr, cd->bracount, name, namelen,
4567 (options & PCRE_EXTENDED) != 0)) <= 0)
4568 {
4569 *errorcodeptr = ERR15;
4570 goto FAILED;
4571 }
4572 }
4573
4574 /* In both phases, we can now go to the code that handles numerical
4575 recursion or backreferences. */
4576
4577 if (is_recurse) goto HANDLE_RECURSION;
4578 else goto HANDLE_REFERENCE;
4579
4580
4581 /* ------------------------------------------------------------ */
4582 case 'R': /* Recursion */
4583 ptr++; /* Same as (?0) */
4584 /* Fall through */
4585
4586
4587 /* ------------------------------------------------------------ */
4588 case '-': case '+':
4589 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4590 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4591 {
4592 const uschar *called;
4593
4594 if ((refsign = *ptr) == '+') ptr++;
4595 else if (refsign == '-')
4596 {
4597 if ((digitab[ptr[1]] & ctype_digit) == 0)
4598 goto OTHER_CHAR_AFTER_QUERY;
4599 ptr++;
4600 }
4601
4602 recno = 0;
4603 while((digitab[*ptr] & ctype_digit) != 0)
4604 recno = recno * 10 + *ptr++ - '0';
4605
4606 if (*ptr != ')')
4607 {
4608 *errorcodeptr = ERR29;
4609 goto FAILED;
4610 }
4611
4612 if (refsign == '-')
4613 {
4614 if (recno == 0)
4615 {
4616 *errorcodeptr = ERR58;
4617 goto FAILED;
4618 }
4619 recno = cd->bracount - recno + 1;
4620 if (recno <= 0)
4621 {
4622 *errorcodeptr = ERR15;
4623 goto FAILED;
4624 }
4625 }
4626 else if (refsign == '+')
4627 {
4628 if (recno == 0)
4629 {
4630 *errorcodeptr = ERR58;
4631 goto FAILED;
4632 }
4633 recno += cd->bracount;
4634 }
4635
4636 /* Come here from code above that handles a named recursion */
4637
4638 HANDLE_RECURSION:
4639
4640 previous = code;
4641 called = cd->start_code;
4642
4643 /* When we are actually compiling, find the bracket that is being
4644 referenced. Temporarily end the regex in case it doesn't exist before
4645 this point. If we end up with a forward reference, first check that
4646 the bracket does occur later so we can give the error (and position)
4647 now. Then remember this forward reference in the workspace so it can
4648 be filled in at the end. */
4649
4650 if (lengthptr == NULL)
4651 {
4652 *code = OP_END;
4653 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4654
4655 /* Forward reference */
4656
4657 if (called == NULL)
4658 {
4659 if (find_parens(ptr, cd->bracount, NULL, recno,
4660 (options & PCRE_EXTENDED) != 0) < 0)
4661 {
4662 *errorcodeptr = ERR15;
4663 goto FAILED;
4664 }
4665 called = cd->start_code + recno;
4666 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4667 }
4668
4669 /* If not a forward reference, and the subpattern is still open,
4670 this is a recursive call. We check to see if this is a left
4671 recursion that could loop for ever, and diagnose that case. */
4672
4673 else if (GET(called, 1) == 0 &&
4674 could_be_empty(called, code, bcptr, utf8))
4675 {
4676 *errorcodeptr = ERR40;
4677 goto FAILED;
4678 }
4679 }
4680
4681 /* Insert the recursion/subroutine item, automatically wrapped inside
4682 "once" brackets. Set up a "previous group" length so that a
4683 subsequent quantifier will work. */
4684
4685 *code = OP_ONCE;
4686 PUT(code, 1, 2 + 2*LINK_SIZE);
4687 code += 1 + LINK_SIZE;
4688
4689 *code = OP_RECURSE;
4690 PUT(code, 1, called - cd->start_code);
4691 code += 1 + LINK_SIZE;
4692
4693 *code = OP_KET;
4694 PUT(code, 1, 2 + 2*LINK_SIZE);
4695 code += 1 + LINK_SIZE;
4696
4697 length_prevgroup = 3 + 3*LINK_SIZE;
4698 }
4699
4700 /* Can't determine a first byte now */
4701
4702 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4703 continue;
4704
4705
4706 /* ------------------------------------------------------------ */
4707 default: /* Other characters: check option setting */
4708 OTHER_CHAR_AFTER_QUERY:
4709 set = unset = 0;
4710 optset = &set;
4711
4712 while (*ptr != ')' && *ptr != ':')
4713 {
4714 switch (*ptr++)
4715 {
4716 case '-': optset = &unset; break;
4717
4718 case 'J': /* Record that it changed in the external options */
4719 *optset |= PCRE_DUPNAMES;
4720 cd->external_flags |= PCRE_JCHANGED;
4721 break;
4722
4723 case 'i': *optset |= PCRE_CASELESS; break;
4724 case 'm': *optset |= PCRE_MULTILINE; break;
4725 case 's': *optset |= PCRE_DOTALL; break;
4726 case 'x': *optset |= PCRE_EXTENDED; break;
4727 case 'U': *optset |= PCRE_UNGREEDY; break;
4728 case 'X': *optset |= PCRE_EXTRA; break;
4729
4730 default: *errorcodeptr = ERR12;
4731 ptr--; /* Correct the offset */
4732 goto FAILED;
4733 }
4734 }
4735
4736 /* Set up the changed option bits, but don't change anything yet. */
4737
4738 newoptions = (options | set) & (~unset);
4739
4740 /* If the options ended with ')' this is not the start of a nested
4741 group with option changes, so the options change at this level. If this
4742 item is right at the start of the pattern, the options can be
4743 abstracted and made external in the pre-compile phase, and ignored in
4744 the compile phase. This can be helpful when matching -- for instance in
4745 caseless checking of required bytes.
4746
4747 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4748 definitely *not* at the start of the pattern because something has been
4749 compiled. In the pre-compile phase, however, the code pointer can have
4750 that value after the start, because it gets reset as code is discarded
4751 during the pre-compile. However, this can happen only at top level - if
4752 we are within parentheses, the starting BRA will still be present. At
4753 any parenthesis level, the length value can be used to test if anything
4754 has been compiled at that level. Thus, a test for both these conditions
4755 is necessary to ensure we correctly detect the start of the pattern in
4756 both phases.
4757
4758 If we are not at the pattern start, compile code to change the ims
4759 options if this setting actually changes any of them. We also pass the
4760 new setting back so that it can be put at the start of any following
4761 branches, and when this group ends (if we are in a group), a resetting
4762 item can be compiled. */
4763
4764 if (*ptr == ')')
4765 {
4766 if (code == cd->start_code + 1 + LINK_SIZE &&
4767 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4768 {
4769 cd->external_options = newoptions;
4770 options = newoptions;
4771 }
4772 else
4773 {
4774 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4775 {
4776 *code++ = OP_OPT;
4777 *code++ = newoptions & PCRE_IMS;
4778 }
4779
4780 /* Change options at this level, and pass them back for use
4781 in subsequent branches. Reset the greedy defaults and the case
4782 value for firstbyte and reqbyte. */
4783
4784 *optionsptr = options = newoptions;
4785 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4786 greedy_non_default = greedy_default ^ 1;
4787 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4788 }
4789
4790 previous = NULL; /* This item can't be repeated */
4791 continue; /* It is complete */
4792 }
4793
4794 /* If the options ended with ':' we are heading into a nested group
4795 with possible change of options. Such groups are non-capturing and are
4796 not assertions of any kind. All we need to do is skip over the ':';
4797 the newoptions value is handled below. */
4798
4799 bravalue = OP_BRA;
4800 ptr++;
4801 } /* End of switch for character following (? */
4802 } /* End of (? handling */
4803
4804 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4805 all unadorned brackets become non-capturing and behave like (?:...)
4806 brackets. */
4807
4808 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4809 {
4810 bravalue = OP_BRA;
4811 }
4812
4813 /* Else we have a capturing group. */
4814
4815 else
4816 {
4817 NUMBERED_GROUP:
4818 cd->bracount += 1;
4819 PUT2(code, 1+LINK_SIZE, cd->bracount);
4820 skipbytes = 2;
4821 }
4822
4823 /* Process nested bracketed regex. Assertions may not be repeated, but
4824 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4825 non-register variable in order to be able to pass its address because some
4826 compilers complain otherwise. Pass in a new setting for the ims options if
4827 they have changed. */
4828
4829 previous = (bravalue >= OP_ONCE)? code : NULL;
4830 *code = bravalue;
4831 tempcode = code;
4832 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4833 length_prevgroup = 0; /* Initialize for pre-compile phase */
4834
4835 if (!compile_regex(
4836 newoptions, /* The complete new option state */
4837 options & PCRE_IMS, /* The previous ims option state */
4838 &tempcode, /* Where to put code (updated) */
4839 &ptr, /* Input pointer (updated) */
4840 errorcodeptr, /* Where to put an error message */
4841 (bravalue == OP_ASSERTBACK ||
4842 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4843 reset_bracount, /* True if (?| group */
4844 skipbytes, /* Skip over bracket number */
4845 &subfirstbyte, /* For possible first char */
4846 &subreqbyte, /* For possible last char */
4847 bcptr, /* Current branch chain */
4848 cd, /* Tables block */
4849 (lengthptr == NULL)? NULL : /* Actual compile phase */
4850 &length_prevgroup /* Pre-compile phase */
4851 ))
4852 goto FAILED;
4853
4854 /* At the end of compiling, code is still pointing to the start of the
4855 group, while tempcode has been updated to point past the end of the group
4856 and any option resetting that may follow it. The pattern pointer (ptr)
4857 is on the bracket. */
4858
4859 /* If this is a conditional bracket, check that there are no more than
4860 two branches in the group, or just one if it's a DEFINE group. We do this
4861 in the real compile phase, not in the pre-pass, where the whole group may
4862 not be available. */
4863
4864 if (bravalue == OP_COND && lengthptr == NULL)
4865 {
4866 uschar *tc = code;
4867 int condcount = 0;
4868
4869 do {
4870 condcount++;
4871 tc += GET(tc,1);
4872 }
4873 while (*tc != OP_KET);
4874
4875 /* A DEFINE group is never obeyed inline (the "condition" is always
4876 false). It must have only one branch. */
4877
4878 if (code[LINK_SIZE+1] == OP_DEF)
4879 {
4880 if (condcount > 1)
4881 {
4882 *errorcodeptr = ERR54;
4883 goto FAILED;
4884 }
4885 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4886 }
4887
4888 /* A "normal" conditional group. If there is just one branch, we must not
4889 make use of its firstbyte or reqbyte, because this is equivalent to an
4890 empty second branch. */
4891
4892 else
4893 {
4894 if (condcount > 2)
4895 {
4896 *errorcodeptr = ERR27;
4897 goto FAILED;
4898 }
4899 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4900 }
4901 }
4902
4903 /* Error if hit end of pattern */
4904
4905 if (*ptr != ')')
4906 {
4907 *errorcodeptr = ERR14;
4908 goto FAILED;
4909 }
4910
4911 /* In the pre-compile phase, update the length by the length of the group,
4912 less the brackets at either end. Then reduce the compiled code to just a
4913 set of non-capturing brackets so that it doesn't use much memory if it is
4914 duplicated by a quantifier.*/
4915
4916 if (lengthptr != NULL)
4917 {
4918 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4919 {
4920 *errorcodeptr = ERR20;
4921 goto FAILED;
4922 }
4923 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4924 *code++ = OP_BRA;
4925 PUTINC(code, 0, 1 + LINK_SIZE);
4926 *code++ = OP_KET;
4927 PUTINC(code, 0, 1 + LINK_SIZE);
4928 break; /* No need to waste time with special character handling */
4929 }
4930
4931 /* Otherwise update the main code pointer to the end of the group. */
4932
4933 code = tempcode;
4934
4935 /* For a DEFINE group, required and first character settings are not
4936 relevant. */
4937
4938 if (bravalue == OP_DEF) break;
4939
4940 /* Handle updating of the required and first characters for other types of
4941 group. Update for normal brackets of all kinds, and conditions with two
4942 branches (see code above). If the bracket is followed by a quantifier with
4943 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4944 zerofirstbyte outside the main loop so that they can be accessed for the
4945 back off. */
4946
4947 zeroreqbyte = reqbyte;
4948 zerofirstbyte = firstbyte;
4949 groupsetfirstbyte = FALSE;
4950
4951 if (bravalue >= OP_ONCE)
4952 {
4953 /* If we have not yet set a firstbyte in this branch, take it from the
4954 subpattern, remembering that it was set here so that a repeat of more
4955 than one can replicate it as reqbyte if necessary. If the subpattern has
4956 no firstbyte, set "none" for the whole branch. In both cases, a zero
4957 repeat forces firstbyte to "none". */
4958
4959 if (firstbyte == REQ_UNSET)
4960 {
4961 if (subfirstbyte >= 0)
4962 {
4963 firstbyte = subfirstbyte;
4964 groupsetfirstbyte = TRUE;
4965 }
4966 else firstbyte = REQ_NONE;
4967 zerofirstbyte = REQ_NONE;
4968 }
4969
4970 /* If firstbyte was previously set, convert the subpattern's firstbyte
4971 into reqbyte if there wasn't one, using the vary flag that was in
4972 existence beforehand. */
4973
4974 else if (subfirstbyte >= 0 && subreqbyte < 0)
4975 subreqbyte = subfirstbyte | tempreqvary;
4976
4977 /* If the subpattern set a required byte (or set a first byte that isn't
4978 really the first byte - see above), set it. */
4979
4980 if (subreqbyte >= 0) reqbyte = subreqbyte;
4981 }
4982
4983 /* For a forward assertion, we take the reqbyte, if set. This can be
4984 helpful if the pattern that follows the assertion doesn't set a different
4985 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4986 for an assertion, however, because it leads to incorrect effects for patterns
4987 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4988 of a firstbyte. This is overcome by a scan at the end if there's no
4989 firstbyte, looking for an asserted first char. */
4990
4991 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4992 break; /* End of processing '(' */
4993
4994
4995 /* ===================================================================*/
4996 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4997 are arranged to be the negation of the corresponding OP_values. For the
4998 back references, the values are ESC_REF plus the reference number. Only
4999 back references and those types that consume a character may be repeated.
5000 We can test for values between ESC_b and ESC_Z for the latter; this may
5001 have to change if any new ones are ever created. */
5002
5003 case '\\':
5004 tempptr = ptr;
5005 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5006 if (*errorcodeptr != 0) goto FAILED;
5007
5008 if (c < 0)
5009 {
5010 if (-c == ESC_Q) /* Handle start of quoted string */
5011 {
5012 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5013 else inescq = TRUE;
5014 continue;
5015 }
5016
5017 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5018
5019 /* For metasequences that actually match a character, we disable the
5020 setting of a first character if it hasn't already been set. */
5021
5022 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5023 firstbyte = REQ_NONE;
5024
5025 /* Set values to reset to if this is followed by a zero repeat. */
5026
5027 zerofirstbyte = firstbyte;
5028 zeroreqbyte = reqbyte;
5029
5030 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5031 We also support \k{name} (.NET syntax) */
5032
5033 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5034 {
5035 is_recurse = FALSE;
5036 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5037 goto NAMED_REF_OR_RECURSE;
5038 }
5039
5040 /* Back references are handled specially; must disable firstbyte if
5041 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5042 ':' later. */
5043
5044 if (-c >= ESC_REF)
5045 {
5046 recno = -c - ESC_REF;
5047
5048 HANDLE_REFERENCE: /* Come here from named backref handling */
5049 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5050 previous = code;
5051 *code++ = OP_REF;
5052 PUT2INC(code, 0, recno);
5053 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5054 if (recno > cd->top_backref) cd->top_backref = recno;
5055 }
5056
5057 /* So are Unicode property matches, if supported. */
5058
5059 #ifdef SUPPORT_UCP
5060 else if (-c == ESC_P || -c == ESC_p)
5061 {
5062 BOOL negated;
5063 int pdata;
5064 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5065 if (ptype < 0) goto FAILED;
5066 previous = code;
5067 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5068 *code++ = ptype;
5069 *code++ = pdata;
5070 }
5071 #else
5072
5073 /* If Unicode properties are not supported, \X, \P, and \p are not
5074 allowed. */
5075
5076 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5077 {
5078 *errorcodeptr = ERR45;
5079 goto FAILED;
5080 }
5081 #endif
5082
5083 /* For the rest (including \X when Unicode properties are supported), we
5084 can obtain the OP value by negating the escape value. */
5085
5086 else
5087 {
5088 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5089 *code++ = -c;
5090 }
5091 continue;
5092 }
5093
5094 /* We have a data character whose value is in c. In UTF-8 mode it may have
5095 a value > 127. We set its representation in the length/buffer, and then
5096 handle it as a data character. */
5097
5098 #ifdef SUPPORT_UTF8
5099 if (utf8 && c > 127)
5100 mclength = _pcre_ord2utf8(c, mcbuffer);
5101 else
5102 #endif
5103
5104 {
5105 mcbuffer[0] = c;
5106 mclength = 1;
5107 }
5108 goto ONE_CHAR;
5109
5110
5111 /* ===================================================================*/
5112 /* Handle a literal character. It is guaranteed not to be whitespace or #
5113 when the extended flag is set. If we are in UTF-8 mode, it may be a
5114 multi-byte literal character. */
5115
5116 default:
5117 NORMAL_CHAR:
5118 mclength = 1;
5119 mcbuffer[0] = c;
5120
5121 #ifdef SUPPORT_UTF8
5122 if (utf8 && c >= 0xc0)
5123 {
5124 while ((ptr[1] & 0xc0) == 0x80)
5125 mcbuffer[mclength++] = *(++ptr);
5126 }
5127 #endif
5128
5129 /* At this point we have the character's bytes in mcbuffer, and the length
5130 in mclength. When not in UTF-8 mode, the length is always 1. */
5131
5132 ONE_CHAR:
5133 previous = code;
5134 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5135 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5136
5137 /* Remember if \r or \n were seen */
5138
5139 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5140 cd->external_flags |= PCRE_HASCRORLF;
5141
5142 /* Set the first and required bytes appropriately. If no previous first
5143 byte, set it from this character, but revert to none on a zero repeat.
5144 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5145 repeat. */
5146
5147 if (firstbyte == REQ_UNSET)
5148 {
5149 zerofirstbyte = REQ_NONE;
5150 zeroreqbyte = reqbyte;
5151
5152 /* If the character is more than one byte long, we can set firstbyte
5153 only if it is not to be matched caselessly. */
5154
5155 if (mclength == 1 || req_caseopt == 0)
5156 {
5157 firstbyte = mcbuffer[0] | req_caseopt;
5158 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5159 }
5160 else firstbyte = reqbyte = REQ_NONE;
5161 }
5162
5163 /* firstbyte was previously set; we can set reqbyte only if the length is
5164 1 or the matching is caseful. */
5165
5166 else
5167 {
5168 zerofirstbyte = firstbyte;
5169 zeroreqbyte = reqbyte;
5170 if (mclength == 1 || req_caseopt == 0)
5171 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5172 }
5173
5174 break; /* End of literal character handling */
5175 }
5176 } /* end of big loop */
5177
5178
5179 /* Control never reaches here by falling through, only by a goto for all the
5180 error states. Pass back the position in the pattern so that it can be displayed
5181 to the user for diagnosing the error. */
5182
5183 FAILED:
5184 *ptrptr = ptr;
5185 return FALSE;
5186 }
5187
5188
5189
5190
5191 /*************************************************
5192 * Compile sequence of alternatives *
5193 *************************************************/
5194
5195 /* On entry, ptr is pointing past the bracket character, but on return it
5196 points to the closing bracket, or vertical bar, or end of string. The code
5197 variable is pointing at the byte into which the BRA operator has been stored.
5198 If the ims options are changed at the start (for a (?ims: group) or during any
5199 branch, we need to insert an OP_OPT item at the start of every following branch
5200 to ensure they get set correctly at run time, and also pass the new options
5201 into every subsequent branch compile.
5202
5203 This function is used during the pre-compile phase when we are trying to find
5204 out the amount of memory needed, as well as during the real compile phase. The
5205 value of lengthptr distinguishes the two phases.
5206
5207 Arguments:
5208 options option bits, including any changes for this subpattern
5209 oldims previous settings of ims option bits
5210 codeptr -> the address of the current code pointer
5211 ptrptr -> the address of the current pattern pointer
5212 errorcodeptr -> pointer to error code variable
5213 lookbehind TRUE if this is a lookbehind assertion
5214 reset_bracount TRUE to reset the count for each branch
5215 skipbytes skip this many bytes at start (for brackets and OP_COND)
5216 firstbyteptr place to put the first required character, or a negative number
5217 reqbyteptr place to put the last required character, or a negative number
5218 bcptr pointer to the chain of currently open branches
5219 cd points to the data block with tables pointers etc.
5220 lengthptr NULL during the real compile phase
5221 points to length accumulator during pre-compile phase
5222
5223 Returns: TRUE on success
5224 */
5225
5226 static BOOL
5227 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5228 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5229 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5230 int *lengthptr)
5231 {
5232 const uschar *ptr = *ptrptr;
5233 uschar *code = *codeptr;
5234 uschar *last_branch = code;
5235 uschar *start_bracket = code;
5236 uschar *reverse_count = NULL;
5237 int firstbyte, reqbyte;
5238 int branchfirstbyte, branchreqbyte;
5239 int length;
5240 int orig_bracount;
5241 int max_bracount;
5242 branch_chain bc;
5243
5244 bc.outer = bcptr;
5245 bc.current = code;
5246
5247 firstbyte = reqbyte = REQ_UNSET;
5248
5249 /* Accumulate the length for use in the pre-compile phase. Start with the
5250 length of the BRA and KET and any extra bytes that are required at the
5251 beginning. We accumulate in a local variable to save frequent testing of
5252 lenthptr for NULL. We cannot do this by looking at the value of code at the
5253 start and end of each alternative, because compiled items are discarded during
5254 the pre-compile phase so that the work space is not exceeded. */
5255
5256 length = 2 + 2*LINK_SIZE + skipbytes;
5257
5258 /* WARNING: If the above line is changed for any reason, you must also change
5259 the code that abstracts option settings at the start of the pattern and makes
5260 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5261 pre-compile phase to find out whether anything has yet been compiled or not. */
5262
5263 /* Offset is set zero to mark that this bracket is still open */
5264
5265 PUT(code, 1, 0);
5266 code += 1 + LINK_SIZE + skipbytes;
5267
5268 /* Loop for each alternative branch */
5269
5270 orig_bracount = max_bracount = cd->bracount;
5271 for (;;)
5272 {
5273 /* For a (?| group, reset the capturing bracket count so that each branch
5274 uses the same numbers. */
5275
5276 if (reset_bracount) cd->bracount = orig_bracount;
5277
5278 /* Handle a change of ims options at the start of the branch */
5279
5280 if ((options & PCRE_IMS) != oldims)
5281 {
5282 *code++ = OP_OPT;
5283 *code++ = options & PCRE_IMS;
5284 length += 2;
5285 }
5286
5287 /* Set up dummy OP_REVERSE if lookbehind assertion */
5288
5289 if (lookbehind)
5290 {
5291 *code++ = OP_REVERSE;
5292 reverse_count = code;
5293 PUTINC(code, 0, 0);
5294 length += 1 + LINK_SIZE;
5295 }
5296
5297 /* Now compile the branch; in the pre-compile phase its length gets added
5298 into the length. */
5299
5300 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5301 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5302 {
5303 *ptrptr = ptr;
5304 return FALSE;
5305 }
5306
5307 /* Keep the highest bracket count in case (?| was used and some branch
5308 has fewer than the rest. */
5309
5310 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5311
5312 /* In the real compile phase, there is some post-processing to be done. */
5313
5314 if (lengthptr == NULL)
5315 {
5316 /* If this is the first branch, the firstbyte and reqbyte values for the
5317 branch become the values for the regex. */
5318
5319 if (*last_branch != OP_ALT)
5320 {
5321 firstbyte = branchfirstbyte;
5322 reqbyte = branchreqbyte;
5323 }
5324
5325 /* If this is not the first branch, the first char and reqbyte have to
5326 match the values from all the previous branches, except that if the
5327 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5328 and we set REQ_VARY for the regex. */
5329
5330 else
5331 {
5332 /* If we previously had a firstbyte, but it doesn't match the new branch,
5333 we have to abandon the firstbyte for the regex, but if there was
5334 previously no reqbyte, it takes on the value of the old firstbyte. */
5335
5336 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5337 {
5338 if (reqbyte < 0) reqbyte = firstbyte;
5339 firstbyte = REQ_NONE;
5340 }
5341
5342 /* If we (now or from before) have no firstbyte, a firstbyte from the
5343 branch becomes a reqbyte if there isn't a branch reqbyte. */
5344
5345 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5346 branchreqbyte = branchfirstbyte;
5347
5348 /* Now ensure that the reqbytes match */
5349
5350 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5351 reqbyte = REQ_NONE;
5352 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5353 }
5354
5355 /* If lookbehind, check that this branch matches a fixed-length string, and
5356 put the length into the OP_REVERSE item. Temporarily mark the end of the
5357 branch with OP_END. */
5358
5359 if (lookbehind)
5360 {
5361 int fixed_length;
5362 *code = OP_END;
5363 fixed_length = find_fixedlength(last_branch, options);
5364 DPRINTF(("fixed length = %d\n", fixed_length));
5365 if (fixed_length < 0)
5366 {
5367 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5368 *ptrptr = ptr;
5369 return FALSE;
5370 }
5371 PUT(reverse_count, 0, fixed_length);
5372 }
5373 }
5374
5375 /* Reached end of expression, either ')' or end of pattern. In the real
5376 compile phase, go back through the alternative branches and reverse the chain
5377 of offsets, with the field in the BRA item now becoming an offset to the
5378 first alternative. If there are no alternatives, it points to the end of the
5379 group. The length in the terminating ket is always the length of the whole
5380 bracketed item. If any of the ims options were changed inside the group,
5381 compile a resetting op-code following, except at the very end of the pattern.
5382 Return leaving the pointer at the terminating char. */
5383
5384 if (*ptr != '|')
5385 {
5386 if (lengthptr == NULL)
5387 {
5388 int branch_length = code - last_branch;
5389 do
5390 {
5391 int prev_length = GET(last_branch, 1);
5392 PUT(last_branch, 1, branch_length);
5393 branch_length = prev_length;
5394 last_branch -= branch_length;
5395 }
5396 while (branch_length > 0);
5397 }
5398
5399 /* Fill in the ket */
5400
5401 *code = OP_KET;
5402 PUT(code, 1, code - start_bracket);
5403 code += 1 + LINK_SIZE;
5404
5405 /* Resetting option if needed */
5406
5407 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5408 {
5409 *code++ = OP_OPT;
5410 *code++ = oldims;
5411 length += 2;
5412 }
5413
5414 /* Retain the highest bracket number, in case resetting was used. */
5415
5416 cd->bracount = max_bracount;
5417
5418 /* Set values to pass back */
5419
5420 *codeptr = code;
5421 *ptrptr = ptr;
5422 *firstbyteptr = firstbyte;
5423 *reqbyteptr = reqbyte;
5424 if (lengthptr != NULL)
5425 {
5426 if (OFLOW_MAX - *lengthptr < length)
5427 {
5428 *errorcodeptr = ERR20;
5429 return FALSE;
5430 }
5431 *lengthptr += length;
5432 }
5433 return TRUE;
5434 }
5435
5436 /* Another branch follows. In the pre-compile phase, we can move the code
5437 pointer back to where it was for the start of the first branch. (That is,
5438 pretend that each branch is the only one.)
5439
5440 In the real compile phase, insert an ALT node. Its length field points back
5441 to the previous branch while the bracket remains open. At the end the chain
5442 is reversed. It's done like this so that the start of the bracket has a
5443 zero offset until it is closed, making it possible to detect recursion. */
5444
5445 if (lengthptr != NULL)
5446 {
5447 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5448 length += 1 + LINK_SIZE;
5449 }
5450 else
5451 {
5452 *code = OP_ALT;
5453 PUT(code, 1, code - last_branch);
5454 bc.current = last_branch = code;
5455 code += 1 + LINK_SIZE;
5456 }
5457
5458 ptr++;
5459 }
5460 /* Control never reaches here */
5461 }
5462
5463
5464
5465
5466 /*************************************************
5467 * Check for anchored expression *
5468 *************************************************/
5469
5470 /* Try to find out if this is an anchored regular expression. Consider each
5471 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5472 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5473 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5474 counts, since OP_CIRC can match in the middle.
5475
5476 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5477 This is the code for \G, which means "match at start of match position, taking
5478 into account the match offset".
5479
5480 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5481 because that will try the rest of the pattern at all possible matching points,
5482 so there is no point trying again.... er ....
5483
5484 .... except when the .* appears inside capturing parentheses, and there is a
5485 subsequent back reference to those parentheses. We haven't enough information
5486 to catch that case precisely.
5487
5488 At first, the best we could do was to detect when .* was in capturing brackets
5489 and the highest back reference was greater than or equal to that level.
5490 However, by keeping a bitmap of the first 31 back references, we can catch some
5491 of the more common cases more precisely.
5492
5493 Arguments:
5494 code points to start of expression (the bracket)
5495 options points to the options setting
5496 bracket_map a bitmap of which brackets we are inside while testing; this
5497 handles up to substring 31; after that we just have to take
5498 the less precise approach
5499 backref_map the back reference bitmap
5500
5501 Returns: TRUE or FALSE
5502 */
5503
5504 static BOOL
5505 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5506 unsigned int backref_map)
5507 {
5508 do {
5509 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5510 options, PCRE_MULTILINE, FALSE);
5511 register int op = *scode;
5512
5513 /* Non-capturing brackets */
5514
5515 if (op == OP_BRA)
5516 {
5517 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5518 }
5519
5520 /* Capturing brackets */
5521
5522 else if (op == OP_CBRA)
5523 {
5524 int n = GET2(scode, 1+LINK_SIZE);
5525 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5526 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5527 }
5528
5529 /* Other brackets */
5530
5531 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5532 {
5533 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5534 }
5535
5536 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5537 are or may be referenced. */
5538
5539 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5540 op == OP_TYPEPOSSTAR) &&
5541 (*options & PCRE_DOTALL) != 0)
5542 {
5543 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5544 }
5545
5546 /* Check for explicit anchoring */
5547
5548 else if (op != OP_SOD && op != OP_SOM &&
5549 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5550 return FALSE;
5551 code += GET(code, 1);
5552 }
5553 while (*code == OP_ALT); /* Loop for each alternative */
5554 return TRUE;
5555 }
5556
5557
5558
5559 /*************************************************
5560 * Check for starting with ^ or .* *
5561 *************************************************/
5562
5563 /* This is called to find out if every branch starts with ^ or .* so that
5564 "first char" processing can be done to speed things up in multiline
5565 matching and for non-DOTALL patterns that start with .* (which must start at
5566 the beginning or after \n). As in the case of is_anchored() (see above), we
5567 have to take account of back references to capturing brackets that contain .*
5568 because in that case we can't make the assumption.
5569
5570 Arguments:
5571 code points to start of expression (the bracket)
5572 bracket_map a bitmap of which brackets we are inside while testing; this
5573 handles up to substring 31; after that we just have to take
5574 the less precise approach
5575 backref_map the back reference bitmap
5576
5577 Returns: TRUE or FALSE
5578 */
5579
5580 static BOOL
5581 is_startline(const uschar *code, unsigned int bracket_map,
5582 unsigned int backref_map)
5583 {
5584 do {
5585 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5586 NULL, 0, FALSE);
5587 register int op = *scode;
5588
5589 /* Non-capturing brackets */
5590
5591 if (op == OP_BRA)
5592 {
5593 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5594 }
5595
5596 /* Capturing brackets */
5597
5598 else if (op == OP_CBRA)
5599 {
5600 int n = GET2(scode, 1+LINK_SIZE);
5601 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5602 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5603 }
5604
5605 /* Other brackets */
5606
5607 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5608 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5609
5610 /* .* means "start at start or after \n" if it isn't in brackets that
5611 may be referenced. */
5612
5613 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5614 {
5615 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5616 }
5617
5618 /* Check for explicit circumflex */
5619
5620 else if (op != OP_CIRC) return FALSE;
5621
5622 /* Move on to the next alternative */
5623
5624 code += GET(code, 1);
5625 }
5626 while (*code == OP_ALT); /* Loop for each alternative */
5627 return TRUE;
5628 }
5629
5630
5631
5632 /*************************************************
5633 * Check for asserted fixed first char *
5634 *************************************************/
5635
5636 /* During compilation, the "first char" settings from forward assertions are
5637 discarded, because they can cause conflicts with actual literals that follow.
5638 However, if we end up without a first char setting for an unanchored pattern,
5639 it is worth scanning the regex to see if there is an initial asserted first
5640 char. If all branches start with the same asserted char, or with a bracket all
5641 of whose alternatives start with the same asserted char (recurse ad lib), then
5642 we return that char, otherwise -1.
5643
5644 Arguments:
5645 code points to start of expression (the bracket)
5646 options pointer to the options (used to check casing changes)
5647 inassert TRUE if in an assertion
5648
5649 Returns: -1 or the fixed first char
5650 */
5651
5652 static int
5653 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5654 {
5655 register int c = -1;
5656 do {
5657 int d;
5658 const uschar *scode =
5659 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5660 register int op = *scode;
5661
5662 switch(op)
5663 {
5664 default:
5665 return -1;
5666
5667 case OP_BRA:
5668 case OP_CBRA:
5669 case OP_ASSERT:
5670 case OP_ONCE:
5671 case OP_COND:
5672 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5673 return -1;
5674 if (c < 0) c = d; else if (c != d) return -1;
5675 break;
5676
5677 case OP_EXACT: /* Fall through */
5678 scode += 2;
5679
5680 case OP_CHAR:
5681 case OP_CHARNC:
5682 case OP_PLUS:
5683 case OP_MINPLUS:
5684 case OP_POSPLUS:
5685 if (!inassert) return -1;
5686 if (c < 0)
5687 {
5688 c = scode[1];
5689 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5690 }
5691 else if (c != scode[1]) return -1;
5692 break;
5693 }
5694
5695 code += GET(code, 1);
5696 }
5697 while (*code == OP_ALT);
5698 return c;
5699 }
5700
5701
5702
5703 /*************************************************
5704 * Compile a Regular Expression *
5705 *************************************************/
5706
5707 /* This function takes a string and returns a pointer to a block of store
5708 holding a compiled version of the expression. The original API for this
5709 function had no error code return variable; it is retained for backwards
5710 compatibility. The new function is given a new name.
5711
5712 Arguments:
5713 pattern the regular expression
5714 options various option bits
5715 errorcodeptr pointer to error code variable (pcre_compile2() only)
5716 can be NULL if you don't want a code value
5717 errorptr pointer to pointer to error text
5718 erroroffset ptr offset in pattern where error was detected
5719 tables pointer to character tables or NULL
5720
5721 Returns: pointer to compiled data block, or NULL on error,
5722 with errorptr and erroroffset set
5723 */
5724
5725 PCRE_EXP_DEFN pcre *
5726 pcre_compile(const char *pattern, int options, const char **errorptr,
5727 int *erroroffset, const unsigned char *tables)
5728 {
5729 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5730 }
5731
5732
5733 PCRE_EXP_DEFN pcre *
5734 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5735 const char **errorptr, int *erroroffset, const unsigned char *tables)
5736 {
5737 real_pcre *re;
5738 int length = 1; /* For final END opcode */
5739 int firstbyte, reqbyte, newline;
5740 int errorcode = 0;
5741 int skipatstart = 0;
5742 #ifdef SUPPORT_UTF8
5743 BOOL utf8;
5744 #endif
5745 size_t size;
5746 uschar *code;
5747 const uschar *codestart;
5748 const uschar *ptr;
5749 compile_data compile_block;
5750 compile_data *cd = &compile_block;
5751
5752 /* This space is used for "compiling" into during the first phase, when we are
5753 computing the amount of memory that is needed. Compiled items are thrown away
5754 as soon as possible, so that a fairly large buffer should be sufficient for
5755 this purpose. The same space is used in the second phase for remembering where
5756 to fill in forward references to subpatterns. */
5757
5758 uschar cworkspace[COMPILE_WORK_SIZE];
5759
5760
5761 /* Set this early so that early errors get offset 0. */
5762
5763 ptr = (const uschar *)pattern;
5764
5765 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5766 can do is just return NULL, but we can set a code value if there is a code
5767 pointer. */
5768
5769 if (errorptr == NULL)
5770 {
5771 if (errorcodeptr != NULL) *errorcodeptr = 99;
5772 return NULL;
5773 }
5774
5775 *errorptr = NULL;
5776 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5777
5778 /* However, we can give a message for this error */
5779
5780 if (erroroffset == NULL)
5781 {
5782 errorcode = ERR16;
5783 goto PCRE_EARLY_ERROR_RETURN2;
5784 }
5785
5786 *erroroffset = 0;
5787
5788 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5789
5790 #ifdef SUPPORT_UTF8
5791 utf8 = (options & PCRE_UTF8) != 0;
5792 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5793 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5794 {
5795 errorcode = ERR44;
5796 goto PCRE_EARLY_ERROR_RETURN2;
5797 }
5798 #else
5799 if ((options & PCRE_UTF8) != 0)
5800 {
5801 errorcode = ERR32;
5802 goto PCRE_EARLY_ERROR_RETURN;
5803 }
5804 #endif
5805
5806 if ((options & ~PUBLIC_OPTIONS) != 0)
5807 {
5808 errorcode = ERR17;
5809 goto PCRE_EARLY_ERROR_RETURN;
5810 }
5811
5812 /* Set up pointers to the individual character tables */
5813
5814 if (tables == NULL) tables = _pcre_default_tables;
5815 cd->lcc = tables + lcc_offset;
5816 cd->fcc = tables + fcc_offset;
5817 cd->cbits = tables + cbits_offset;
5818 cd->ctypes = tables + ctypes_offset;
5819
5820 /* Check for global one-time settings at the start of the pattern, and remember
5821 the offset for later. */
5822
5823 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5824 {
5825 int newnl = 0;
5826 int newbsr = 0;
5827
5828 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5829 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5830 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5831 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5832 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5833 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5834 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5835 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5836 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
5837 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5838
5839 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5840 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5841 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5842 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5843
5844 if (newnl != 0)
5845 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5846 else if (newbsr != 0)
5847 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5848 else break;
5849 }
5850
5851 /* Check validity of \R options. */
5852
5853 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5854 {
5855 case 0:
5856 case PCRE_BSR_ANYCRLF:
5857 case PCRE_BSR_UNICODE:
5858 break;
5859 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5860 }
5861
5862 /* Handle different types of newline. The three bits give seven cases. The
5863 current code allows for fixed one- or two-byte sequences, plus "any" and
5864 "anycrlf". */
5865
5866 switch (options & PCRE_NEWLINE_BITS)
5867 {
5868 case 0: newline = NEWLINE; break; /* Build-time default */
5869 case PCRE_NEWLINE_CR: newline = '\r'; break;
5870 case PCRE_NEWLINE_LF: newline = '\n'; break;
5871 case PCRE_NEWLINE_CR+
5872 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5873 case PCRE_NEWLINE_ANY: newline = -1; break;
5874 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5875 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5876 }
5877
5878 if (newline == -2)
5879 {
5880 cd->nltype = NLTYPE_ANYCRLF;
5881 }
5882 else if (newline < 0)
5883 {
5884 cd->nltype = NLTYPE_ANY;
5885 }
5886 else
5887 {
5888 cd->nltype = NLTYPE_FIXED;
5889 if (newline > 255)
5890 {
5891 cd->nllen = 2;
5892 cd->nl[0] = (newline >> 8) & 255;
5893 cd->nl[1] = newline & 255;
5894 }
5895 else
5896 {
5897 cd->nllen = 1;
5898 cd->nl[0] = newline;
5899 }
5900 }
5901
5902 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5903 references to help in deciding whether (.*) can be treated as anchored or not.
5904 */
5905
5906 cd->top_backref = 0;
5907 cd->backref_map = 0;
5908
5909 /* Reflect pattern for debugging output */
5910
5911 DPRINTF(("------------------------------------------------------------------\n"));
5912 DPRINTF(("%s\n", pattern));
5913
5914 /* Pretend to compile the pattern while actually just accumulating the length
5915 of memory required. This behaviour is triggered by passing a non-NULL final
5916 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5917 to compile parts of the pattern into; the compiled code is discarded when it is
5918 no longer needed, so hopefully this workspace will never overflow, though there
5919 is a test for its doing so. */
5920
5921 cd->bracount = 0;
5922 cd->names_found = 0;
5923 cd->name_entry_size = 0;
5924 cd->name_table = NULL;
5925 cd->start_workspace = cworkspace;
5926 cd->start_code = cworkspace;
5927 cd->hwm = cworkspace;
5928 cd->start_pattern = (const uschar *)pattern;
5929 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5930 cd->req_varyopt = 0;
5931 cd->external_options = options;
5932 cd->external_flags = 0;
5933
5934 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5935 don't need to look at the result of the function here. The initial options have
5936 been put into the cd block so that they can be changed if an option setting is
5937 found within the regex right at the beginning. Bringing initial option settings
5938 outside can help speed up starting point checks. */
5939
5940 ptr += skipatstart;
5941 code = cworkspace;
5942 *code = OP_BRA;
5943 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5944 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5945 &length);
5946 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5947
5948 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5949 cd->hwm - cworkspace));
5950
5951 if (length > MAX_PATTERN_SIZE)
5952 {
5953 errorcode = ERR20;
5954 goto PCRE_EARLY_ERROR_RETURN;
5955 }
5956
5957 /* Compute the size of data block needed and get it, either from malloc or
5958 externally provided function. Integer overflow should no longer be possible
5959 because nowadays we limit the maximum value of cd->names_found and
5960 cd->name_entry_size. */
5961
5962 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5963 re = (real_pcre *)(pcre_malloc)(size);
5964
5965 if (re == NULL)
5966 {
5967 errorcode = ERR21;
5968 goto PCRE_EARLY_ERROR_RETURN;
5969 }
5970
5971 /* Put in the magic number, and save the sizes, initial options, internal
5972 flags, and character table pointer. NULL is used for the default character
5973 tables. The nullpad field is at the end; it's there to help in the case when a
5974 regex compiled on a system with 4-byte pointers is run on another with 8-byte
5975 pointers. */
5976
5977 re->magic_number = MAGIC_NUMBER;
5978 re->size = size;
5979 re->options = cd->external_options;
5980 re->flags = cd->external_flags;
5981 re->dummy1 = 0;
5982 re->first_byte = 0;
5983 re->req_byte = 0;
5984 re->name_table_offset = sizeof(real_pcre);
5985 re->name_entry_size = cd->name_entry_size;
5986 re->name_count = cd->names_found;
5987 re->ref_count = 0;
5988 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5989 re->nullpad = NULL;
5990
5991 /* The starting points of the name/number translation table and of the code are
5992 passed around in the compile data block. The start/end pattern and initial
5993 options are already set from the pre-compile phase, as is the name_entry_size
5994 field. Reset the bracket count and the names_found field. Also reset the hwm
5995 field; this time it's used for remembering forward references to subpatterns.
5996 */
5997
5998 cd->bracount = 0;
5999 cd->names_found = 0;
6000 cd->name_table = (uschar *)re + re->name_table_offset;
6001 codestart = cd->name_table + re->name_entry_size * re->name_count;
6002 cd->start_code = codestart;
6003 cd->hwm = cworkspace;
6004 cd->req_varyopt = 0;
6005 cd->had_accept = FALSE;
6006
6007 /* Set up a starting, non-extracting bracket, then compile the expression. On
6008 error, errorcode will be set non-zero, so we don't need to look at the result
6009 of the function here. */
6010
6011 ptr = (const uschar *)pattern + skipatstart;
6012 code = (uschar *)codestart;
6013 *code = OP_BRA;
6014 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6015 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6016 re->top_bracket = cd->bracount;
6017 re->top_backref = cd->top_backref;
6018 re->flags = cd->external_flags;
6019
6020 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6021
6022 /* If not reached end of pattern on success, there's an excess bracket. */
6023
6024 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6025
6026 /* Fill in the terminating state and check for disastrous overflow, but
6027 if debugging, leave the test till after things are printed out. */
6028
6029 *code++ = OP_END;
6030
6031 #ifndef DEBUG
6032 if (code - codestart > length) errorcode = ERR23;
6033 #endif
6034
6035 /* Fill in any forward references that are required. */
6036
6037 while (errorcode == 0 && cd->hwm > cworkspace)
6038 {
6039 int offset, recno;
6040 const uschar *groupptr;
6041 cd->hwm -= LINK_SIZE;
6042 offset = GET(cd->hwm, 0);
6043 recno = GET(codestart, offset);
6044 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6045 if (groupptr == NULL) errorcode = ERR53;
6046 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6047 }
6048
6049 /* Give an error if there's back reference to a non-existent capturing
6050 subpattern. */
6051
6052 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6053
6054 /* Failed to compile, or error while post-processing */
6055
6056 if (errorcode != 0)
6057 {
6058 (pcre_free)(re);
6059 PCRE_EARLY_ERROR_RETURN:
6060 *erroroffset = ptr - (const uschar *)pattern;
6061 PCRE_EARLY_ERROR_RETURN2:
6062 *errorptr = find_error_text(errorcode);
6063 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6064 return NULL;
6065 }
6066
6067 /* If the anchored option was not passed, set the flag if we can determine that
6068 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6069 as starting with .* when DOTALL is set).
6070
6071 Otherwise, if we know what the first byte has to be, save it, because that
6072 speeds up unanchored matches no end. If not, see if we can set the
6073 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6074 start with ^. and also when all branches start with .* for non-DOTALL matches.
6075 */
6076
6077 if ((re->options & PCRE_ANCHORED) == 0)
6078 {
6079 int temp_options = re->options; /* May get changed during these scans */
6080 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6081 re->options |= PCRE_ANCHORED;
6082 else
6083 {
6084 if (firstbyte < 0)
6085 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6086 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6087 {
6088 int ch = firstbyte & 255;
6089 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6090 cd->fcc[ch] == ch)? ch : firstbyte;
6091 re->flags |= PCRE_FIRSTSET;
6092 }
6093 else if (is_startline(codestart, 0, cd->backref_map))
6094 re->flags |= PCRE_STARTLINE;
6095 }
6096 }
6097
6098 /* For an anchored pattern, we use the "required byte" only if it follows a
6099 variable length item in the regex. Remove the caseless flag for non-caseable
6100 bytes. */
6101
6102 if (reqbyte >= 0 &&
6103 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6104 {
6105 int ch = reqbyte & 255;
6106 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6107 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6108 re->flags |= PCRE_REQCHSET;
6109 }
6110
6111 /* Print out the compiled data if debugging is enabled. This is never the
6112 case when building a production library. */
6113
6114 #ifdef DEBUG
6115
6116 printf("Length = %d top_bracket = %d top_backref = %d\n",
6117 length, re->top_bracket, re->top_backref);
6118
6119 printf("Options=%08x\n", re->options);
6120
6121 if ((re->flags & PCRE_FIRSTSET) != 0)
6122 {
6123 int ch = re->first_byte & 255;
6124 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6125 "" : " (caseless)";
6126 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6127 else printf("First char = \\x%02x%s\n", ch, caseless);
6128 }
6129
6130 if ((re->flags & PCRE_REQCHSET) != 0)
6131 {
6132 int ch = re->req_byte & 255;
6133 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6134 "" : " (caseless)";
6135 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6136 else printf("Req char = \\x%02x%s\n", ch, caseless);
6137 }
6138
6139 pcre_printint(re, stdout, TRUE);
6140
6141 /* This check is done here in the debugging case so that the code that
6142 was compiled can be seen. */
6143
6144 if (code - codestart > length)
6145 {
6146 (pcre_free)(re);
6147 *errorptr = find_error_text(ERR23);
6148 *erroroffset = ptr - (uschar *)pattern;
6149 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6150 return NULL;
6151 }
6152 #endif /* DEBUG */
6153
6154 return (pcre *)re;
6155 }
6156
6157 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12