/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 323 - (show annotations) (download)
Wed Mar 5 17:23:42 2008 UTC (6 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 197102 byte(s)
Remove a line of dead code, identified by coverity and reported by Nuno Lopes.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a, using 8 bits per byte with bit 0 of each byte first. Both
arguments and the whole expansion are parenthesized, so that an expression
argument such as SETBIT(map, base + n) indexes and shifts by the complete value
rather than by part of it. The argument b is still evaluated twice, so do not
pass an expression with side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
146
147 typedef struct verbitem {
148 int len;
149 int op;
150 } verbitem;
151
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
160
161 static verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
169 };
170
171 static int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence. */
179
180 static const char posix_names[] =
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
184
185 static const uschar posix_name_lengths[] = {
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
197
198 static const int posix_class_maps[] = {
199 cbit_word, cbit_digit, -2, /* alpha */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii */
204 cbit_space, -1, 1, /* blank - a GNU extension */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
213 };
214
215
216 #define STRING(a) # a
217 #define XSTRING(s) STRING(s)
218
219 /* The texts of compile-time error messages. These are "char *" because they
220 are passed to the outside world. Do not ever re-use any error number, because
221 they are documented. Always add a new error instead. Messages marked DEAD below
222 are no longer used. This used to be a table of strings, but in order to reduce
223 the number of relocations needed when a shared library is loaded dynamically,
224 it is now one long string. We cannot use a table of offsets, because the
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226 simply count through to the one we want - this isn't a performance issue
227 because these strings are used only when there is a compilation error. */
228
229 static const char error_texts[] =
230 "no error\0"
231 "\\ at end of pattern\0"
232 "\\c at end of pattern\0"
233 "unrecognized character follows \\\0"
234 "numbers out of order in {} quantifier\0"
235 /* 5 */
236 "number too big in {} quantifier\0"
237 "missing terminating ] for character class\0"
238 "invalid escape sequence in character class\0"
239 "range out of order in character class\0"
240 "nothing to repeat\0"
241 /* 10 */
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243 "internal error: unexpected repeat\0"
244 "unrecognized character after (? or (?-\0"
245 "POSIX named classes are supported only within a class\0"
246 "missing )\0"
247 /* 15 */
248 "reference to non-existent subpattern\0"
249 "erroffset passed as NULL\0"
250 "unknown option bit(s) set\0"
251 "missing ) after comment\0"
252 "parentheses nested too deeply\0" /** DEAD **/
253 /* 20 */
254 "regular expression is too large\0"
255 "failed to get memory\0"
256 "unmatched parentheses\0"
257 "internal error: code overflow\0"
258 "unrecognized character after (?<\0"
259 /* 25 */
260 "lookbehind assertion is not fixed length\0"
261 "malformed number or name after (?(\0"
262 "conditional group contains more than two branches\0"
263 "assertion expected after (?(\0"
264 "(?R or (?[+-]digits must be followed by )\0"
265 /* 30 */
266 "unknown POSIX class name\0"
267 "POSIX collating elements are not supported\0"
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269 "spare error\0" /** DEAD **/
270 "character value in \\x{...} sequence is too large\0"
271 /* 35 */
272 "invalid condition (?(0)\0"
273 "\\C not allowed in lookbehind assertion\0"
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275 "number after (?C is > 255\0"
276 "closing ) for (?C expected\0"
277 /* 40 */
278 "recursive call could loop indefinitely\0"
279 "unrecognized character after (?P\0"
280 "syntax error in subpattern name (missing terminator)\0"
281 "two named subpatterns have the same name\0"
282 "invalid UTF-8 string\0"
283 /* 45 */
284 "support for \\P, \\p, and \\X has not been compiled\0"
285 "malformed \\P or \\p sequence\0"
286 "unknown property name after \\P or \\p\0"
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 /* 50 */
290 "repeated subpattern is too long\0" /** DEAD **/
291 "octal value is greater than \\377 (not in UTF-8 mode)\0"
292 "internal error: overran compiling workspace\0"
293 "internal error: previously-checked referenced subpattern not found\0"
294 "DEFINE group contains more than one branch\0"
295 /* 55 */
296 "repeating a DEFINE group is not allowed\0"
297 "inconsistent NEWLINE options\0"
298 "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300 "(*VERB) with an argument is not supported\0"
301 /* 60 */
302 "(*VERB) not recognized\0"
303 "number is too big\0"
304 "subpattern name expected\0"
305 "digit expected after (?+";
306
307
308 /* Table to identify digits and hex digits. This is used when compiling
309 patterns. Note that the tables in chartables are dependent on the locale, and
310 may mark arbitrary characters as digits - but the PCRE compiling code expects
311 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
312 a private table here. It costs 256 bytes, but it is a lot faster than doing
313 character value tests (at least in some simple cases I timed), and in some
314 applications one wants PCRE to compile efficiently as well as match
315 efficiently.
316
317 For convenience, we use the same bit definitions as in chartables:
318
319 0x04 decimal digit
320 0x08 hexadecimal digit
321
322 Then we can use ctype_digit and ctype_xdigit in the code. */
323
324 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
325 static const unsigned char digitab[] =
326 {
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
333 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
334 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
335 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
339 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
359
360 #else /* This is the "abnormal" case, for EBCDIC systems */
361 static const unsigned char digitab[] =
362 {
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
379 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
387 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
393 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
394 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
395
396 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
397 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
398 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
399 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
401 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
405 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
406 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
408 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
410 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
413 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
414 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
415 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
416 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
417 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
418 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
419 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
420 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
421 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
422 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
423 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
424 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
425 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
426 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
427 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
428 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
429 #endif
430
431
432 /* Definition to allow mutual recursion */
433
434 static BOOL
435 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 int *, int *, branch_chain *, compile_data *, int *);
437
438
439
440 /*************************************************
441 * Find an error text *
442 *************************************************/
443
444 /* The error texts are now all in one long string, to save on relocations. As
445 some of the text is of unknown length, we can't use a table of offsets.
446 Instead, just count through the strings. This is not a performance issue
447 because it happens only when there has been a compilation error.
448
449 Argument: the error number
450 Returns: pointer to the error string
451 */
452
453 static const char *
454 find_error_text(int n)
455 {
456 const char *s = error_texts;
457 for (; n > 0; n--) while (*s++ != 0);
458 return s;
459 }
460
461
462 /*************************************************
463 * Handle escapes *
464 *************************************************/
465
466 /* This function is called when a \ has been encountered. It either returns a
467 positive value for a simple escape such as \n, or a negative value which
468 encodes one of the more complicated things such as \d. A backreference to group
469 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471 ptr is pointing at the \. On exit, it is on the final character of the escape
472 sequence.
473
474 Arguments:
475 ptrptr points to the pattern position pointer
476 errorcodeptr points to the errorcode variable
477 bracount number of previous extracting brackets
478 options the options bits
479 isclass TRUE if inside a character class
480
481 Returns: zero or positive => a data character
482 negative => a special escape sequence
483 on error, errorcodeptr is set
484 */
485
486 static int
487 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488 int options, BOOL isclass)
489 {
490 BOOL utf8 = (options & PCRE_UTF8) != 0;
491 const uschar *ptr = *ptrptr + 1;
492 int c, i;
493
494 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495 ptr--; /* Set pointer back to the last byte */
496
497 /* If backslash is at the end of the pattern, it's an error. */
498
499 if (c == 0) *errorcodeptr = ERR1;
500
501 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502 in a table. A non-zero result is something that can be returned immediately.
503 Otherwise further processing may be required. */
504
505 #ifndef EBCDIC /* ASCII coding */
506 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
507 else if ((i = escapes[c - '0']) != 0) c = i;
508
509 #else /* EBCDIC coding */
510 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
511 else if ((i = escapes[c - 0x48]) != 0) c = i;
512 #endif
513
514 /* Escapes that need further processing, or are illegal. */
515
516 else
517 {
518 const uschar *oldptr;
519 BOOL braced, negated;
520
521 switch (c)
522 {
523 /* A number of Perl escapes are not handled by PCRE. We give an explicit
524 error. */
525
526 case 'l':
527 case 'L':
528 case 'N':
529 case 'u':
530 case 'U':
531 *errorcodeptr = ERR37;
532 break;
533
534 /* \g must be followed by a number, either plain or braced. If positive, it
535 is an absolute backreference. If negative, it is a relative backreference.
536 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
537 reference to a named group. This is part of Perl's movement towards a
538 unified syntax for back references. As this is synonymous with \k{name}, we
539 fudge it up by pretending it really was \k. */
540
541 case 'g':
542 if (ptr[1] == '{')
543 {
544 const uschar *p;
545 for (p = ptr+2; *p != 0 && *p != '}'; p++)
546 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
547 if (*p != 0 && *p != '}')
548 {
549 c = -ESC_k;
550 break;
551 }
552 braced = TRUE;
553 ptr++;
554 }
555 else braced = FALSE;
556
557 if (ptr[1] == '-')
558 {
559 negated = TRUE;
560 ptr++;
561 }
562 else negated = FALSE;
563
564 c = 0;
565 while ((digitab[ptr[1]] & ctype_digit) != 0)
566 c = c * 10 + *(++ptr) - '0';
567
568 if (c < 0)
569 {
570 *errorcodeptr = ERR61;
571 break;
572 }
573
574 if (c == 0 || (braced && *(++ptr) != '}'))
575 {
576 *errorcodeptr = ERR57;
577 break;
578 }
579
580 if (negated)
581 {
582 if (c > bracount)
583 {
584 *errorcodeptr = ERR15;
585 break;
586 }
587 c = bracount - (c - 1);
588 }
589
590 c = -(ESC_REF + c);
591 break;
592
593 /* The handling of escape sequences consisting of a string of digits
594 starting with one that is not zero is not straightforward. By experiment,
595 the way Perl works seems to be as follows:
596
597 Outside a character class, the digits are read as a decimal number. If the
598 number is less than 10, or if there are that many previous extracting
599 left brackets, then it is a back reference. Otherwise, up to three octal
600 digits are read to form an escaped byte. Thus \123 is likely to be octal
601 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
602 value is greater than 377, the least significant 8 bits are taken. Inside a
603 character class, \ followed by a digit is always an octal number. */
604
605 case '1': case '2': case '3': case '4': case '5':
606 case '6': case '7': case '8': case '9':
607
608 if (!isclass)
609 {
610 oldptr = ptr;
611 c -= '0';
612 while ((digitab[ptr[1]] & ctype_digit) != 0)
613 c = c * 10 + *(++ptr) - '0';
614 if (c < 0)
615 {
616 *errorcodeptr = ERR61;
617 break;
618 }
619 if (c < 10 || c <= bracount)
620 {
621 c = -(ESC_REF + c);
622 break;
623 }
624 ptr = oldptr; /* Put the pointer back and fall through */
625 }
626
627 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
628 generates a binary zero byte and treats the digit as a following literal.
629 Thus we have to pull back the pointer by one. */
630
631 if ((c = *ptr) >= '8')
632 {
633 ptr--;
634 c = 0;
635 break;
636 }
637
638 /* \0 always starts an octal number, but we may drop through to here with a
639 larger first octal digit. The original code used just to take the least
640 significant 8 bits of octal numbers (I think this is what early Perls used
641 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
642 than 3 octal digits. */
643
644 case '0':
645 c -= '0';
646 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
647 c = c * 8 + *(++ptr) - '0';
648 if (!utf8 && c > 255) *errorcodeptr = ERR51;
649 break;
650
651 /* \x is complicated. \x{ddd} is a character number which can be greater
652 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
653 treated as a data character. */
654
655 case 'x':
656 if (ptr[1] == '{')
657 {
658 const uschar *pt = ptr + 2;
659 int count = 0;
660
661 c = 0;
662 while ((digitab[*pt] & ctype_xdigit) != 0)
663 {
664 register int cc = *pt++;
665 if (c == 0 && cc == '0') continue; /* Leading zeroes */
666 count++;
667
668 #ifndef EBCDIC /* ASCII coding */
669 if (cc >= 'a') cc -= 32; /* Convert to upper case */
670 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
671 #else /* EBCDIC coding */
672 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
673 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
674 #endif
675 }
676
677 if (*pt == '}')
678 {
679 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
680 ptr = pt;
681 break;
682 }
683
684 /* If the sequence of hex digits does not end with '}', then we don't
685 recognize this construct; fall through to the normal \x handling. */
686 }
687
688 /* Read just a single-byte hex-defined char */
689
690 c = 0;
691 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
692 {
693 int cc; /* Some compilers don't like ++ */
694 cc = *(++ptr); /* in initializers */
695 #ifndef EBCDIC /* ASCII coding */
696 if (cc >= 'a') cc -= 32; /* Convert to upper case */
697 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
698 #else /* EBCDIC coding */
699 if (cc <= 'z') cc += 64; /* Convert to upper case */
700 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
701 #endif
702 }
703 break;
704
705 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
706 This coding is ASCII-specific, but then the whole concept of \cx is
707 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
708
709 case 'c':
710 c = *(++ptr);
711 if (c == 0)
712 {
713 *errorcodeptr = ERR2;
714 break;
715 }
716
717 #ifndef EBCDIC /* ASCII coding */
718 if (c >= 'a' && c <= 'z') c -= 32;
719 c ^= 0x40;
720 #else /* EBCDIC coding */
721 if (c >= 'a' && c <= 'z') c += 64;
722 c ^= 0xC0;
723 #endif
724 break;
725
726 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
727 other alphanumeric following \ is an error if PCRE_EXTRA was set;
728 otherwise, for Perl compatibility, it is a literal. This code looks a bit
729 odd, but there used to be some cases other than the default, and there may
730 be again in future, so I haven't "optimized" it. */
731
732 default:
733 if ((options & PCRE_EXTRA) != 0) switch(c)
734 {
735 default:
736 *errorcodeptr = ERR3;
737 break;
738 }
739 break;
740 }
741 }
742
743 *ptrptr = ptr;
744 return c;
745 }
746
747
748
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

The property name is either the single character that follows, or a name in
braces of at most 31 characters, optionally preceded by ^ for negation. The
name is looked up in the _pcre_utt table by binary chop.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching _pcre_utt entry, or -1 for an
                   invalid type, with the error code set to ERR46 for a
                   malformed sequence or ERR47 for an unknown name
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr + 1;
char name[32];
int ch = *ptr;
int lo, hi;
int failure = ERR46;            /* Assume malformed until the name is read */

if (ch == 0) goto FAILED;

*negptr = FALSE;

/* The simple case is a single following character. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* Otherwise there is a name in {}, optionally preceded by ^ for negation. The
name must be terminated by } before the name buffer is full. */

else
  {
  int len = 0;

  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  while (len < (int)sizeof(name) - 1)
    {
    ch = *(++ptr);
    if (ch == 0) goto FAILED;
    if (ch == '}') break;
    name[len++] = ch;
    }

  if (ch != '}') goto FAILED;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

failure = ERR47;                /* From here on, the only error is "unknown" */
lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

FAILED:
*errorcodeptr = failure;
*ptrptr = ptr;
return -1;
}
#endif
838
839
840
841
842 /*************************************************
843 * Check for counted repeat *
844 *************************************************/
845
846 /* This function is called when a '{' is encountered in a place where it might
847 start a quantifier. It looks ahead to see if it really is a quantifier or not.
848 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
849 where the ddds are digits.
850
851 Arguments:
852 p pointer to the first char after '{'
853
854 Returns: TRUE or FALSE
855 */
856
857 static BOOL
858 is_counted_repeat(const uschar *p)
859 {
860 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
861 while ((digitab[*p] & ctype_digit) != 0) p++;
862 if (*p == '}') return TRUE;
863
864 if (*p++ != ',') return FALSE;
865 if (*p == '}') return TRUE;
866
867 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
868 while ((digitab[*p] & ctype_digit) != 0) p++;
869
870 return (*p == '}');
871 }
872
873
874
875 /*************************************************
876 * Read repeat counts *
877 *************************************************/
878
879 /* Read an item of the form {n,m} and return the values. This is called only
880 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
881 so the syntax is guaranteed to be correct, but we need to check the values.
882
883 Arguments:
884 p pointer to first char after '{'
885 minp pointer to int for min
886 maxp pointer to int for max
887 returned as -1 if no max
888 errorcodeptr points to error code variable
889
890 Returns: pointer to '}' on success;
891 current ptr on error, with errorcodeptr set non-zero
892 */
893
894 static const uschar *
895 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
896 {
897 int min = 0;
898 int max = -1;
899
900 /* Read the minimum value and do a paranoid check: a negative value indicates
901 an integer overflow. */
902
903 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
904 if (min < 0 || min > 65535)
905 {
906 *errorcodeptr = ERR5;
907 return p;
908 }
909
910 /* Read the maximum value if there is one, and again do a paranoid on its size.
911 Also, max must not be less than min. */
912
913 if (*p == '}') max = min; else
914 {
915 if (*(++p) != '}')
916 {
917 max = 0;
918 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919 if (max < 0 || max > 65535)
920 {
921 *errorcodeptr = ERR5;
922 return p;
923 }
924 if (max < min)
925 {
926 *errorcodeptr = ERR4;
927 return p;
928 }
929 }
930 }
931
932 /* Fill in the required variables, and pass back the pointer to the terminating
933 '}'. */
934
935 *minp = min;
936 *maxp = max;
937 return p;
938 }
939
940
941
942 /*************************************************
943 * Find forward referenced subpattern *
944 *************************************************/
945
946 /* This function scans along a pattern's text looking for capturing
947 subpatterns, and counting them. If it finds a named pattern that matches the
948 name it is given, it returns its number. Alternatively, if the name is NULL, it
949 returns when it reaches a given numbered subpattern. This is used for forward
950 references to subpatterns. We know that if (?P< is encountered, the name will
951 be terminated by '>' because that is checked in the first pass.
952
953 Arguments:
954 ptr current position in the pattern
955 count current count of capturing parens so far encountered
956 name name to seek, or NULL if seeking a numbered subpattern
957 lorn name length, or subpattern number if name is NULL
958 xmode TRUE if we are in /x mode
959
960 Returns: the number of the named subpattern, or -1 if not found
961 */
962
963 static int
964 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
965 BOOL xmode)
966 {
967 const uschar *thisname;
968
969 for (; *ptr != 0; ptr++)
970 {
971 int term;
972
973 /* Skip over backslashed characters and also entire \Q...\E */
974
975 if (*ptr == '\\')
976 {
977 if (*(++ptr) == 0) return -1;
978 if (*ptr == 'Q') for (;;)
979 {
980 while (*(++ptr) != 0 && *ptr != '\\');
981 if (*ptr == 0) return -1;
982 if (*(++ptr) == 'E') break;
983 }
984 continue;
985 }
986
987 /* Skip over character classes */
988
989 if (*ptr == '[')
990 {
991 while (*(++ptr) != ']')
992 {
993 if (*ptr == 0) return -1;
994 if (*ptr == '\\')
995 {
996 if (*(++ptr) == 0) return -1;
997 if (*ptr == 'Q') for (;;)
998 {
999 while (*(++ptr) != 0 && *ptr != '\\');
1000 if (*ptr == 0) return -1;
1001 if (*(++ptr) == 'E') break;
1002 }
1003 continue;
1004 }
1005 }
1006 continue;
1007 }
1008
1009 /* Skip comments in /x mode */
1010
1011 if (xmode && *ptr == '#')
1012 {
1013 while (*(++ptr) != 0 && *ptr != '\n');
1014 if (*ptr == 0) return -1;
1015 continue;
1016 }
1017
1018 /* An opening parens must now be a real metacharacter */
1019
1020 if (*ptr != '(') continue;
1021 if (ptr[1] != '?' && ptr[1] != '*')
1022 {
1023 count++;
1024 if (name == NULL && count == lorn) return count;
1025 continue;
1026 }
1027
1028 ptr += 2;
1029 if (*ptr == 'P') ptr++; /* Allow optional P */
1030
1031 /* We have to disambiguate (?<! and (?<= from (?<name> */
1032
1033 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1034 *ptr != '\'')
1035 continue;
1036
1037 count++;
1038
1039 if (name == NULL && count == lorn) return count;
1040 term = *ptr++;
1041 if (term == '<') term = '>';
1042 thisname = ptr;
1043 while (*ptr != term) ptr++;
1044 if (name != NULL && lorn == ptr - thisname &&
1045 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1046 return count;
1047 }
1048
1049 return -1;
1050 }
1051
1052
1053
1054 /*************************************************
1055 * Find first significant op code *
1056 *************************************************/
1057
1058 /* This is called by several functions that scan a compiled expression looking
1059 for a fixed first character, or an anchoring op code etc. It skips over things
1060 that do not influence this. For some calls, a change of option is important.
1061 For some calls, it makes sense to skip negative forward and all backward
1062 assertions, and also the \b assertion; for others it does not.
1063
1064 Arguments:
1065 code pointer to the start of the group
1066 options pointer to external options
1067 optbit the option bit whose changing is significant, or
1068 zero if none are
1069 skipassert TRUE if certain assertions are to be skipped
1070
1071 Returns: pointer to the first significant opcode
1072 */
1073
1074 static const uschar*
1075 first_significant_code(const uschar *code, int *options, int optbit,
1076 BOOL skipassert)
1077 {
1078 for (;;)
1079 {
1080 switch ((int)*code)
1081 {
1082 case OP_OPT:
1083 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084 *options = (int)code[1];
1085 code += 2;
1086 break;
1087
1088 case OP_ASSERT_NOT:
1089 case OP_ASSERTBACK:
1090 case OP_ASSERTBACK_NOT:
1091 if (!skipassert) return code;
1092 do code += GET(code, 1); while (*code == OP_ALT);
1093 code += _pcre_OP_lengths[*code];
1094 break;
1095
1096 case OP_WORD_BOUNDARY:
1097 case OP_NOT_WORD_BOUNDARY:
1098 if (!skipassert) return code;
1099 /* Fall through */
1100
1101 case OP_CALLOUT:
1102 case OP_CREF:
1103 case OP_RREF:
1104 case OP_DEF:
1105 code += _pcre_OP_lengths[*code];
1106 break;
1107
1108 default:
1109 return code;
1110 }
1111 }
1112 /* Control never reaches here */
1113 }
1114
1115
1116
1117
1118 /*************************************************
1119 * Find the fixed length of a pattern *
1120 *************************************************/
1121
1122 /* Scan a pattern and compute the fixed length of subject that will match it,
1123 if the length is fixed. This is needed for dealing with backward assertions.
1124 In UTF8 mode, the result is in characters rather than bytes.
1125
1126 Arguments:
1127 code points to the start of the pattern (the bracket)
1128 options the compiling options
1129
1130 Returns: the fixed length, or -1 if there is no fixed length,
1131 or -2 if \C was encountered
1132 */
1133
1134 static int
1135 find_fixedlength(uschar *code, int options)
1136 {
1137 int length = -1;
1138
1139 register int branchlength = 0;
1140 register uschar *cc = code + 1 + LINK_SIZE;
1141
1142 /* Scan along the opcodes for this branch. If we get to the end of the
1143 branch, check the length against that of the other branches. */
1144
1145 for (;;)
1146 {
1147 int d;
1148 register int op = *cc;
1149 switch (op)
1150 {
1151 case OP_CBRA:
1152 case OP_BRA:
1153 case OP_ONCE:
1154 case OP_COND:
1155 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1156 if (d < 0) return d;
1157 branchlength += d;
1158 do cc += GET(cc, 1); while (*cc == OP_ALT);
1159 cc += 1 + LINK_SIZE;
1160 break;
1161
1162 /* Reached end of a branch; if it's a ket it is the end of a nested
1163 call. If it's ALT it is an alternation in a nested call. If it is
1164 END it's the end of the outer call. All can be handled by the same code. */
1165
1166 case OP_ALT:
1167 case OP_KET:
1168 case OP_KETRMAX:
1169 case OP_KETRMIN:
1170 case OP_END:
1171 if (length < 0) length = branchlength;
1172 else if (length != branchlength) return -1;
1173 if (*cc != OP_ALT) return length;
1174 cc += 1 + LINK_SIZE;
1175 branchlength = 0;
1176 break;
1177
1178 /* Skip over assertive subpatterns */
1179
1180 case OP_ASSERT:
1181 case OP_ASSERT_NOT:
1182 case OP_ASSERTBACK:
1183 case OP_ASSERTBACK_NOT:
1184 do cc += GET(cc, 1); while (*cc == OP_ALT);
1185 /* Fall through */
1186
1187 /* Skip over things that don't match chars */
1188
1189 case OP_REVERSE:
1190 case OP_CREF:
1191 case OP_RREF:
1192 case OP_DEF:
1193 case OP_OPT:
1194 case OP_CALLOUT:
1195 case OP_SOD:
1196 case OP_SOM:
1197 case OP_EOD:
1198 case OP_EODN:
1199 case OP_CIRC:
1200 case OP_DOLL:
1201 case OP_NOT_WORD_BOUNDARY:
1202 case OP_WORD_BOUNDARY:
1203 cc += _pcre_OP_lengths[*cc];
1204 break;
1205
1206 /* Handle literal characters */
1207
1208 case OP_CHAR:
1209 case OP_CHARNC:
1210 case OP_NOT:
1211 branchlength++;
1212 cc += 2;
1213 #ifdef SUPPORT_UTF8
1214 if ((options & PCRE_UTF8) != 0)
1215 {
1216 while ((*cc & 0xc0) == 0x80) cc++;
1217 }
1218 #endif
1219 break;
1220
1221 /* Handle exact repetitions. The count is already in characters, but we
1222 need to skip over a multibyte character in UTF8 mode. */
1223
1224 case OP_EXACT:
1225 branchlength += GET2(cc,1);
1226 cc += 4;
1227 #ifdef SUPPORT_UTF8
1228 if ((options & PCRE_UTF8) != 0)
1229 {
1230 while((*cc & 0x80) == 0x80) cc++;
1231 }
1232 #endif
1233 break;
1234
1235 case OP_TYPEEXACT:
1236 branchlength += GET2(cc,1);
1237 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1238 cc += 4;
1239 break;
1240
1241 /* Handle single-char matchers */
1242
1243 case OP_PROP:
1244 case OP_NOTPROP:
1245 cc += 2;
1246 /* Fall through */
1247
1248 case OP_NOT_DIGIT:
1249 case OP_DIGIT:
1250 case OP_NOT_WHITESPACE:
1251 case OP_WHITESPACE:
1252 case OP_NOT_WORDCHAR:
1253 case OP_WORDCHAR:
1254 case OP_ANY:
1255 branchlength++;
1256 cc++;
1257 break;
1258
1259 /* The single-byte matcher isn't allowed */
1260
1261 case OP_ANYBYTE:
1262 return -2;
1263
1264 /* Check a class for variable quantification */
1265
1266 #ifdef SUPPORT_UTF8
1267 case OP_XCLASS:
1268 cc += GET(cc, 1) - 33;
1269 /* Fall through */
1270 #endif
1271
1272 case OP_CLASS:
1273 case OP_NCLASS:
1274 cc += 33;
1275
1276 switch (*cc)
1277 {
1278 case OP_CRSTAR:
1279 case OP_CRMINSTAR:
1280 case OP_CRQUERY:
1281 case OP_CRMINQUERY:
1282 return -1;
1283
1284 case OP_CRRANGE:
1285 case OP_CRMINRANGE:
1286 if (GET2(cc,1) != GET2(cc,3)) return -1;
1287 branchlength += GET2(cc,1);
1288 cc += 5;
1289 break;
1290
1291 default:
1292 branchlength++;
1293 }
1294 break;
1295
1296 /* Anything else is variable length */
1297
1298 default:
1299 return -1;
1300 }
1301 }
1302 /* Control never gets here */
1303 }
1304
1305
1306
1307
1308 /*************************************************
1309 * Scan compiled regex for numbered bracket *
1310 *************************************************/
1311
1312 /* This little function scans through a compiled pattern until it finds a
1313 capturing bracket with the given number.
1314
1315 Arguments:
1316 code points to start of expression
1317 utf8 TRUE in UTF-8 mode
1318 number the required bracket number
1319
1320 Returns: pointer to the opcode for the bracket, or NULL if not found
1321 */
1322
1323 static const uschar *
1324 find_bracket(const uschar *code, BOOL utf8, int number)
1325 {
1326 for (;;)
1327 {
1328 register int c = *code;
1329 if (c == OP_END) return NULL;
1330
1331 /* XCLASS is used for classes that cannot be represented just by a bit
1332 map. This includes negated single high-valued characters. The length in
1333 the table is zero; the actual length is stored in the compiled code. */
1334
1335 if (c == OP_XCLASS) code += GET(code, 1);
1336
1337 /* Handle capturing bracket */
1338
1339 else if (c == OP_CBRA)
1340 {
1341 int n = GET2(code, 1+LINK_SIZE);
1342 if (n == number) return (uschar *)code;
1343 code += _pcre_OP_lengths[c];
1344 }
1345
1346 /* Otherwise, we can get the item's length from the table, except that for
1347 repeated character types, we have to test for \p and \P, which have an extra
1348 two bytes of parameters. */
1349
1350 else
1351 {
1352 switch(c)
1353 {
1354 case OP_TYPESTAR:
1355 case OP_TYPEMINSTAR:
1356 case OP_TYPEPLUS:
1357 case OP_TYPEMINPLUS:
1358 case OP_TYPEQUERY:
1359 case OP_TYPEMINQUERY:
1360 case OP_TYPEPOSSTAR:
1361 case OP_TYPEPOSPLUS:
1362 case OP_TYPEPOSQUERY:
1363 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1364 break;
1365
1366 case OP_TYPEUPTO:
1367 case OP_TYPEMINUPTO:
1368 case OP_TYPEEXACT:
1369 case OP_TYPEPOSUPTO:
1370 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1371 break;
1372 }
1373
1374 /* Add in the fixed length from the table */
1375
1376 code += _pcre_OP_lengths[c];
1377
1378 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379 a multi-byte character. The length in the table is a minimum, so we have to
1380 arrange to skip the extra bytes. */
1381
1382 #ifdef SUPPORT_UTF8
1383 if (utf8) switch(c)
1384 {
1385 case OP_CHAR:
1386 case OP_CHARNC:
1387 case OP_EXACT:
1388 case OP_UPTO:
1389 case OP_MINUPTO:
1390 case OP_POSUPTO:
1391 case OP_STAR:
1392 case OP_MINSTAR:
1393 case OP_POSSTAR:
1394 case OP_PLUS:
1395 case OP_MINPLUS:
1396 case OP_POSPLUS:
1397 case OP_QUERY:
1398 case OP_MINQUERY:
1399 case OP_POSQUERY:
1400 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1401 break;
1402 }
1403 #endif
1404 }
1405 }
1406 }
1407
1408
1409
1410 /*************************************************
1411 * Scan compiled regex for recursion reference *
1412 *************************************************/
1413
1414 /* This little function scans through a compiled pattern until it finds an
1415 instance of OP_RECURSE.
1416
1417 Arguments:
1418 code points to start of expression
1419 utf8 TRUE in UTF-8 mode
1420
1421 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1422 */
1423
1424 static const uschar *
1425 find_recurse(const uschar *code, BOOL utf8)
1426 {
1427 for (;;)
1428 {
1429 register int c = *code;
1430 if (c == OP_END) return NULL;
1431 if (c == OP_RECURSE) return code;
1432
1433 /* XCLASS is used for classes that cannot be represented just by a bit
1434 map. This includes negated single high-valued characters. The length in
1435 the table is zero; the actual length is stored in the compiled code. */
1436
1437 if (c == OP_XCLASS) code += GET(code, 1);
1438
1439 /* Otherwise, we can get the item's length from the table, except that for
1440 repeated character types, we have to test for \p and \P, which have an extra
1441 two bytes of parameters. */
1442
1443 else
1444 {
1445 switch(c)
1446 {
1447 case OP_TYPESTAR:
1448 case OP_TYPEMINSTAR:
1449 case OP_TYPEPLUS:
1450 case OP_TYPEMINPLUS:
1451 case OP_TYPEQUERY:
1452 case OP_TYPEMINQUERY:
1453 case OP_TYPEPOSSTAR:
1454 case OP_TYPEPOSPLUS:
1455 case OP_TYPEPOSQUERY:
1456 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1457 break;
1458
1459 case OP_TYPEPOSUPTO:
1460 case OP_TYPEUPTO:
1461 case OP_TYPEMINUPTO:
1462 case OP_TYPEEXACT:
1463 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1464 break;
1465 }
1466
1467 /* Add in the fixed length from the table */
1468
1469 code += _pcre_OP_lengths[c];
1470
1471 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472 by a multi-byte character. The length in the table is a minimum, so we have
1473 to arrange to skip the extra bytes. */
1474
1475 #ifdef SUPPORT_UTF8
1476 if (utf8) switch(c)
1477 {
1478 case OP_CHAR:
1479 case OP_CHARNC:
1480 case OP_EXACT:
1481 case OP_UPTO:
1482 case OP_MINUPTO:
1483 case OP_POSUPTO:
1484 case OP_STAR:
1485 case OP_MINSTAR:
1486 case OP_POSSTAR:
1487 case OP_PLUS:
1488 case OP_MINPLUS:
1489 case OP_POSPLUS:
1490 case OP_QUERY:
1491 case OP_MINQUERY:
1492 case OP_POSQUERY:
1493 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1494 break;
1495 }
1496 #endif
1497 }
1498 }
1499 }
1500
1501
1502
1503 /*************************************************
1504 * Scan compiled branch for non-emptiness *
1505 *************************************************/
1506
1507 /* This function scans through a branch of a compiled pattern to see whether it
1508 can match the empty string or not. It is called from could_be_empty()
1509 below and from compile_branch() when checking for an unlimited repeat of a
1510 group that can match nothing. Note that first_significant_code() skips over
1511 backward and negative forward assertions when its final argument is TRUE. If we
1512 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1513 bracket whose current branch will already have been scanned.
1514
1515 Arguments:
1516 code points to start of search
1517 endcode points to where to stop
1518 utf8 TRUE if in UTF8 mode
1519
1520 Returns: TRUE if what is matched could be empty
1521 */
1522
1523 static BOOL
1524 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1525 {
1526 register int c;
1527 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1528 code < endcode;
1529 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1530 {
1531 const uschar *ccode;
1532
1533 c = *code;
1534
1535 /* Skip over forward assertions; the other assertions are skipped by
1536 first_significant_code() with a TRUE final argument. */
1537
1538 if (c == OP_ASSERT)
1539 {
1540 do code += GET(code, 1); while (*code == OP_ALT);
1541 c = *code;
1542 continue;
1543 }
1544
1545 /* Groups with zero repeats can of course be empty; skip them. */
1546
1547 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1548 {
1549 code += _pcre_OP_lengths[c];
1550 do code += GET(code, 1); while (*code == OP_ALT);
1551 c = *code;
1552 continue;
1553 }
1554
1555 /* For other groups, scan the branches. */
1556
1557 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1558 {
1559 BOOL empty_branch;
1560 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1561
1562 /* Scan a closed bracket */
1563
1564 empty_branch = FALSE;
1565 do
1566 {
1567 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1568 empty_branch = TRUE;
1569 code += GET(code, 1);
1570 }
1571 while (*code == OP_ALT);
1572 if (!empty_branch) return FALSE; /* All branches are non-empty */
1573 c = *code;
1574 continue;
1575 }
1576
1577 /* Handle the other opcodes */
1578
1579 switch (c)
1580 {
1581 /* Check for quantifiers after a class. XCLASS is used for classes that
1582 cannot be represented just by a bit map. This includes negated single
1583 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1584 actual length is stored in the compiled code, so we must update "code"
1585 here. */
1586
1587 #ifdef SUPPORT_UTF8
1588 case OP_XCLASS:
1589 ccode = code += GET(code, 1);
1590 goto CHECK_CLASS_REPEAT;
1591 #endif
1592
1593 case OP_CLASS:
1594 case OP_NCLASS:
1595 ccode = code + 33;
1596
1597 #ifdef SUPPORT_UTF8
1598 CHECK_CLASS_REPEAT:
1599 #endif
1600
1601 switch (*ccode)
1602 {
1603 case OP_CRSTAR: /* These could be empty; continue */
1604 case OP_CRMINSTAR:
1605 case OP_CRQUERY:
1606 case OP_CRMINQUERY:
1607 break;
1608
1609 default: /* Non-repeat => class must match */
1610 case OP_CRPLUS: /* These repeats aren't empty */
1611 case OP_CRMINPLUS:
1612 return FALSE;
1613
1614 case OP_CRRANGE:
1615 case OP_CRMINRANGE:
1616 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1617 break;
1618 }
1619 break;
1620
1621 /* Opcodes that must match a character */
1622
1623 case OP_PROP:
1624 case OP_NOTPROP:
1625 case OP_EXTUNI:
1626 case OP_NOT_DIGIT:
1627 case OP_DIGIT:
1628 case OP_NOT_WHITESPACE:
1629 case OP_WHITESPACE:
1630 case OP_NOT_WORDCHAR:
1631 case OP_WORDCHAR:
1632 case OP_ANY:
1633 case OP_ANYBYTE:
1634 case OP_CHAR:
1635 case OP_CHARNC:
1636 case OP_NOT:
1637 case OP_PLUS:
1638 case OP_MINPLUS:
1639 case OP_POSPLUS:
1640 case OP_EXACT:
1641 case OP_NOTPLUS:
1642 case OP_NOTMINPLUS:
1643 case OP_NOTPOSPLUS:
1644 case OP_NOTEXACT:
1645 case OP_TYPEPLUS:
1646 case OP_TYPEMINPLUS:
1647 case OP_TYPEPOSPLUS:
1648 case OP_TYPEEXACT:
1649 return FALSE;
1650
1651 /* These are going to continue, as they may be empty, but we have to
1652 fudge the length for the \p and \P cases. */
1653
1654 case OP_TYPESTAR:
1655 case OP_TYPEMINSTAR:
1656 case OP_TYPEPOSSTAR:
1657 case OP_TYPEQUERY:
1658 case OP_TYPEMINQUERY:
1659 case OP_TYPEPOSQUERY:
1660 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1661 break;
1662
1663 /* Same for these */
1664
1665 case OP_TYPEUPTO:
1666 case OP_TYPEMINUPTO:
1667 case OP_TYPEPOSUPTO:
1668 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1669 break;
1670
1671 /* End of branch */
1672
1673 case OP_KET:
1674 case OP_KETRMAX:
1675 case OP_KETRMIN:
1676 case OP_ALT:
1677 return TRUE;
1678
1679 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1680 MINUPTO, and POSUPTO may be followed by a multibyte character */
1681
1682 #ifdef SUPPORT_UTF8
1683 case OP_STAR:
1684 case OP_MINSTAR:
1685 case OP_POSSTAR:
1686 case OP_QUERY:
1687 case OP_MINQUERY:
1688 case OP_POSQUERY:
1689 case OP_UPTO:
1690 case OP_MINUPTO:
1691 case OP_POSUPTO:
1692 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1693 break;
1694 #endif
1695 }
1696 }
1697
1698 return TRUE;
1699 }
1700
1701
1702
1703 /*************************************************
1704 * Scan compiled regex for non-emptiness *
1705 *************************************************/
1706
1707 /* This function is called to check for left recursive calls. We want to check
1708 the current branch of the current pattern to see if it could match the empty
1709 string. If it could, we must look outwards for branches at other levels,
1710 stopping when we pass beyond the bracket which is the subject of the recursion.
1711
1712 Arguments:
1713 code points to start of the recursion
1714 endcode points to where to stop (current RECURSE item)
1715 bcptr points to the chain of current (unclosed) branch starts
1716 utf8 TRUE if in UTF-8 mode
1717
1718 Returns: TRUE if what is matched could be empty
1719 */
1720
1721 static BOOL
1722 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1723 BOOL utf8)
1724 {
1725 while (bcptr != NULL && bcptr->current >= code)
1726 {
1727 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1728 bcptr = bcptr->outer;
1729 }
1730 return TRUE;
1731 }
1732
1733
1734
1735 /*************************************************
1736 * Check for POSIX class syntax *
1737 *************************************************/
1738
1739 /* This function is called when the sequence "[:" or "[." or "[=" is
1740 encountered in a character class. It checks whether this is followed by a
1741 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1742 reach an unescaped ']' without the special preceding character, return FALSE.
1743
1744 Originally, this function only recognized a sequence of letters between the
1745 terminators, but it seems that Perl recognizes any sequence of characters,
1746 though of course unknown POSIX names are subsequently rejected. Perl gives an
1747 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1748 didn't consider this to be a POSIX class. Likewise for [:1234:].
1749
1750 The problem in trying to be exactly like Perl is in the handling of escapes. We
1751 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1752 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1753 below handles the special case of \], but does not try to do any other escape
1754 processing. This makes it different from Perl for cases such as [:l\ower:]
1755 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1756 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1757 I think.
1758
1759 Arguments:
1760 ptr pointer to the initial [
1761 endptr where to return the end pointer
1762
1763 Returns: TRUE or FALSE
1764 */
1765
1766 static BOOL
1767 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1768 {
1769 int terminator; /* Don't combine these lines; the Solaris cc */
1770 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1771 for (++ptr; *ptr != 0; ptr++)
1772 {
1773 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1774 {
1775 if (*ptr == ']') return FALSE;
1776 if (*ptr == terminator && ptr[1] == ']')
1777 {
1778 *endptr = ptr;
1779 return TRUE;
1780 }
1781 }
1782 }
1783 return FALSE;
1784 }
1785
1786
1787
1788
1789 /*************************************************
1790 * Check POSIX class name *
1791 *************************************************/
1792
1793 /* This function is called to check the name given in a POSIX-style class entry
1794 such as [:alnum:].
1795
1796 Arguments:
1797 ptr points to the first letter
1798 len the length of the name
1799
1800 Returns: a value representing the name, or -1 if unknown
1801 */
1802
1803 static int
1804 check_posix_name(const uschar *ptr, int len)
1805 {
1806 const char *pn = posix_names;
1807 register int yield = 0;
1808 while (posix_name_lengths[yield] != 0)
1809 {
1810 if (len == posix_name_lengths[yield] &&
1811 strncmp((const char *)ptr, pn, len) == 0) return yield;
1812 pn += posix_name_lengths[yield] + 1;
1813 yield++;
1814 }
1815 return -1;
1816 }
1817
1818
1819 /*************************************************
1820 * Adjust OP_RECURSE items in repeated group *
1821 *************************************************/
1822
1823 /* OP_RECURSE items contain an offset from the start of the regex to the group
1824 that is referenced. This means that groups can be replicated for fixed
1825 repetition simply by copying (because the recursion is allowed to refer to
1826 earlier groups that are outside the current group). However, when a group is
1827 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1828 it, after it has been compiled. This means that any OP_RECURSE items within it
1829 that refer to the group itself or any contained groups have to have their
1830 offsets adjusted. That is one of the jobs of this function. Before it is called,
1831 the partially compiled regex must be temporarily terminated with OP_END.
1832
1833 This function has been extended with the possibility of forward references for
1834 recursions and subroutine calls. It must also check the list of such references
1835 for the group we are dealing with. If it finds that one of the recursions in
1836 the current group is on this list, it adjusts the offset in the list, not the
1837 value in the reference (which is a group number).
1838
1839 Arguments:
1840 group points to the start of the group
1841 adjust the amount by which the group is to be moved
1842 utf8 TRUE in UTF-8 mode
1843 cd contains pointers to tables etc.
1844 save_hwm the hwm forward reference pointer at the start of the group
1845
1846 Returns: nothing
1847 */
1848
1849 static void
1850 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1851 uschar *save_hwm)
1852 {
1853 uschar *ptr = group;
1854
1855 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1856 {
1857 int offset;
1858 uschar *hc;
1859
1860 /* See if this recursion is on the forward reference list. If so, adjust the
1861 reference. */
1862
1863 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1864 {
1865 offset = GET(hc, 0);
1866 if (cd->start_code + offset == ptr + 1)
1867 {
1868 PUT(hc, 0, offset + adjust);
1869 break;
1870 }
1871 }
1872
1873 /* Otherwise, adjust the recursion offset if it's after the start of this
1874 group. */
1875
1876 if (hc >= cd->hwm)
1877 {
1878 offset = GET(ptr, 1);
1879 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1880 }
1881
1882 ptr += 1 + LINK_SIZE;
1883 }
1884 }
1885
1886
1887
1888 /*************************************************
1889 * Insert an automatic callout point *
1890 *************************************************/
1891
1892 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1893 callout points before each pattern item.
1894
1895 Arguments:
1896 code current code pointer
1897 ptr current pattern pointer
1898 cd pointers to tables etc
1899
1900 Returns: new code pointer
1901 */
1902
1903 static uschar *
1904 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1905 {
1906 *code++ = OP_CALLOUT;
1907 *code++ = 255;
1908 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1909 PUT(code, LINK_SIZE, 0); /* Default length */
1910 return code + 2*LINK_SIZE;
1911 }
1912
1913
1914
1915 /*************************************************
1916 * Complete a callout item *
1917 *************************************************/
1918
1919 /* A callout item contains the length of the next item in the pattern, which
1920 we can't fill in till after we have reached the relevant point. This is used
1921 for both automatic and manual callouts.
1922
1923 Arguments:
1924 previous_callout points to previous callout item
1925 ptr current pattern pointer
1926 cd pointers to tables etc
1927
1928 Returns: nothing
1929 */
1930
1931 static void
1932 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1933 {
1934 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1935 PUT(previous_callout, 2 + LINK_SIZE, length);
1936 }
1937
1938
1939
1940 #ifdef SUPPORT_UCP
1941 /*************************************************
1942 * Get othercase range *
1943 *************************************************/
1944
1945 /* This function is passed the start and end of a class range, in UTF-8 mode
1946 with UCP support. It searches up the characters, looking for internal ranges of
1947 characters in the "other" case. Each call returns the next one, updating the
1948 start address.
1949
1950 Arguments:
1951 cptr points to starting character value; updated
1952 d end value
1953 ocptr where to put start of othercase range
1954 odptr where to put end of othercase range
1955
1956 Yield: TRUE when range returned; FALSE when no more
1957 */
1958
1959 static BOOL
1960 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1961 unsigned int *odptr)
1962 {
1963 unsigned int c, othercase, next;
1964
1965 for (c = *cptr; c <= d; c++)
1966 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1967
1968 if (c > d) return FALSE;
1969
1970 *ocptr = othercase;
1971 next = othercase + 1;
1972
1973 for (++c; c <= d; c++)
1974 {
1975 if (_pcre_ucp_othercase(c) != next) break;
1976 next++;
1977 }
1978
1979 *odptr = next - 1;
1980 *cptr = c;
1981
1982 return TRUE;
1983 }
1984 #endif /* SUPPORT_UCP */
1985
1986
1987
1988 /*************************************************
1989 * Check if auto-possessifying is possible *
1990 *************************************************/
1991
1992 /* This function is called for unlimited repeats of certain items, to see
1993 whether the next thing could possibly match the repeated item. If not, it makes
1994 sense to automatically possessify the repeated item.
1995
1996 Arguments:
1997 op_code the repeated op code
1998 item data for this item, depends on the opcode
1999 utf8 TRUE in UTF-8 mode
2000 utf8_char used for utf8 character bytes, NULL if not relevant
2001 ptr next character in pattern
2002 options options bits
2003 cd contains pointers to tables etc.
2004
2005 Returns: TRUE if possessifying is wanted
2006 */
2007
2008 static BOOL
2009 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2010 const uschar *ptr, int options, compile_data *cd)
2011 {
2012 int next;
2013
2014 /* Skip whitespace and comments in extended mode */
2015
2016 if ((options & PCRE_EXTENDED) != 0)
2017 {
2018 for (;;)
2019 {
2020 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2021 if (*ptr == '#')
2022 {
2023 while (*(++ptr) != 0)
2024 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2025 }
2026 else break;
2027 }
2028 }
2029
2030 /* If the next item is one that we can handle, get its value. A non-negative
2031 value is a character, a negative value is an escape value. */
2032
2033 if (*ptr == '\\')
2034 {
2035 int temperrorcode = 0;
2036 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2037 if (temperrorcode != 0) return FALSE;
2038 ptr++; /* Point after the escape sequence */
2039 }
2040
2041 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2042 {
2043 #ifdef SUPPORT_UTF8
2044 if (utf8) { GETCHARINC(next, ptr); } else
2045 #endif
2046 next = *ptr++;
2047 }
2048
2049 else return FALSE;
2050
2051 /* Skip whitespace and comments in extended mode */
2052
2053 if ((options & PCRE_EXTENDED) != 0)
2054 {
2055 for (;;)
2056 {
2057 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2058 if (*ptr == '#')
2059 {
2060 while (*(++ptr) != 0)
2061 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2062 }
2063 else break;
2064 }
2065 }
2066
2067 /* If the next thing is itself optional, we have to give up. */
2068
2069 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2070 return FALSE;
2071
2072 /* Now compare the next item with the previous opcode. If the previous is a
2073 positive single character match, "item" either contains the character or, if
2074 "item" is greater than 127 in utf8 mode, the character's bytes are in
2075 utf8_char. */
2076
2077
2078 /* Handle cases when the next item is a character. */
2079
2080 if (next >= 0) switch(op_code)
2081 {
2082 case OP_CHAR:
2083 #ifdef SUPPORT_UTF8
2084 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2085 #endif
2086 return item != next;
2087
2088 /* For CHARNC (caseless character) we must check the other case. If we have
2089 Unicode property support, we can use it to test the other case of
2090 high-valued characters. */
2091
2092 case OP_CHARNC:
2093 #ifdef SUPPORT_UTF8
2094 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2095 #endif
2096 if (item == next) return FALSE;
2097 #ifdef SUPPORT_UTF8
2098 if (utf8)
2099 {
2100 unsigned int othercase;
2101 if (next < 128) othercase = cd->fcc[next]; else
2102 #ifdef SUPPORT_UCP
2103 othercase = _pcre_ucp_othercase((unsigned int)next);
2104 #else
2105 othercase = NOTACHAR;
2106 #endif
2107 return (unsigned int)item != othercase;
2108 }
2109 else
2110 #endif /* SUPPORT_UTF8 */
2111 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2112
2113 /* For OP_NOT, "item" must be a single-byte character. */
2114
2115 case OP_NOT:
2116 if (item == next) return TRUE;
2117 if ((options & PCRE_CASELESS) == 0) return FALSE;
2118 #ifdef SUPPORT_UTF8
2119 if (utf8)
2120 {
2121 unsigned int othercase;
2122 if (next < 128) othercase = cd->fcc[next]; else
2123 #ifdef SUPPORT_UCP
2124 othercase = _pcre_ucp_othercase(next);
2125 #else
2126 othercase = NOTACHAR;
2127 #endif
2128 return (unsigned int)item == othercase;
2129 }
2130 else
2131 #endif /* SUPPORT_UTF8 */
2132 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2133
2134 case OP_DIGIT:
2135 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2136
2137 case OP_NOT_DIGIT:
2138 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2139
2140 case OP_WHITESPACE:
2141 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2142
2143 case OP_NOT_WHITESPACE:
2144 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2145
2146 case OP_WORDCHAR:
2147 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2148
2149 case OP_NOT_WORDCHAR:
2150 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2151
2152 case OP_HSPACE:
2153 case OP_NOT_HSPACE:
2154 switch(next)
2155 {
2156 case 0x09:
2157 case 0x20:
2158 case 0xa0:
2159 case 0x1680:
2160 case 0x180e:
2161 case 0x2000:
2162 case 0x2001:
2163 case 0x2002:
2164 case 0x2003:
2165 case 0x2004:
2166 case 0x2005:
2167 case 0x2006:
2168 case 0x2007:
2169 case 0x2008:
2170 case 0x2009:
2171 case 0x200A:
2172 case 0x202f:
2173 case 0x205f:
2174 case 0x3000:
2175 return op_code != OP_HSPACE;
2176 default:
2177 return op_code == OP_HSPACE;
2178 }
2179
2180 case OP_VSPACE:
2181 case OP_NOT_VSPACE:
2182 switch(next)
2183 {
2184 case 0x0a:
2185 case 0x0b:
2186 case 0x0c:
2187 case 0x0d:
2188 case 0x85:
2189 case 0x2028:
2190 case 0x2029:
2191 return op_code != OP_VSPACE;
2192 default:
2193 return op_code == OP_VSPACE;
2194 }
2195
2196 default:
2197 return FALSE;
2198 }
2199
2200
2201 /* Handle the case when the next item is \d, \s, etc. */
2202
2203 switch(op_code)
2204 {
2205 case OP_CHAR:
2206 case OP_CHARNC:
2207 #ifdef SUPPORT_UTF8
2208 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2209 #endif
2210 switch(-next)
2211 {
2212 case ESC_d:
2213 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2214
2215 case ESC_D:
2216 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2217
2218 case ESC_s:
2219 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2220
2221 case ESC_S:
2222 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2223
2224 case ESC_w:
2225 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2226
2227 case ESC_W:
2228 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2229
2230 case ESC_h:
2231 case ESC_H:
2232 switch(item)
2233 {
2234 case 0x09:
2235 case 0x20:
2236 case 0xa0:
2237 case 0x1680:
2238 case 0x180e:
2239 case 0x2000:
2240 case 0x2001:
2241 case 0x2002:
2242 case 0x2003:
2243 case 0x2004:
2244 case 0x2005:
2245 case 0x2006:
2246 case 0x2007:
2247 case 0x2008:
2248 case 0x2009:
2249 case 0x200A:
2250 case 0x202f:
2251 case 0x205f:
2252 case 0x3000:
2253 return -next != ESC_h;
2254 default:
2255 return -next == ESC_h;
2256 }
2257
2258 case ESC_v:
2259 case ESC_V:
2260 switch(item)
2261 {
2262 case 0x0a:
2263 case 0x0b:
2264 case 0x0c:
2265 case 0x0d:
2266 case 0x85:
2267 case 0x2028:
2268 case 0x2029:
2269 return -next != ESC_v;
2270 default:
2271 return -next == ESC_v;
2272 }
2273
2274 default:
2275 return FALSE;
2276 }
2277
2278 case OP_DIGIT:
2279 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2280 next == -ESC_h || next == -ESC_v;
2281
2282 case OP_NOT_DIGIT:
2283 return next == -ESC_d;
2284
2285 case OP_WHITESPACE:
2286 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2287
2288 case OP_NOT_WHITESPACE:
2289 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2290
2291 case OP_HSPACE:
2292 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2293
2294 case OP_NOT_HSPACE:
2295 return next == -ESC_h;
2296
2297 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2298 case OP_VSPACE:
2299 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2300
2301 case OP_NOT_VSPACE:
2302 return next == -ESC_v;
2303
2304 case OP_WORDCHAR:
2305 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2306
2307 case OP_NOT_WORDCHAR:
2308 return next == -ESC_w || next == -ESC_d;
2309
2310 default:
2311 return FALSE;
2312 }
2313
2314 /* Control does not reach here */
2315 }
2316
2317
2318
2319 /*************************************************
2320 * Compile one branch *
2321 *************************************************/
2322
2323 /* Scan the pattern, compiling it into a vector. If the options are
2324 changed during the branch, the pointer is used to change the external options
2325 bits. This function is used during the pre-compile phase when we are trying
2326 to find out the amount of memory needed, as well as during the real compile
2327 phase. The value of lengthptr distinguishes the two phases.
2328
2329 Arguments:
2330 optionsptr pointer to the option bits
2331 codeptr points to the pointer to the current code point
2332 ptrptr points to the current pattern pointer
2333 errorcodeptr points to error code variable
2334 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2335 reqbyteptr set to the last literal character required, else < 0
2336 bcptr points to current branch chain
2337 cd contains pointers to tables etc.
2338 lengthptr NULL during the real compile phase
2339 points to length accumulator during pre-compile phase
2340
2341 Returns: TRUE on success
2342 FALSE, with *errorcodeptr set non-zero on error
2343 */
2344
2345 static BOOL
2346 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2347 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2348 compile_data *cd, int *lengthptr)
2349 {
2350 int repeat_type, op_type;
2351 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2352 int bravalue = 0;
2353 int greedy_default, greedy_non_default;
2354 int firstbyte, reqbyte;
2355 int zeroreqbyte, zerofirstbyte;
2356 int req_caseopt, reqvary, tempreqvary;
2357 int options = *optionsptr;
2358 int after_manual_callout = 0;
2359 int length_prevgroup = 0;
2360 register int c;
2361 register uschar *code = *codeptr;
2362 uschar *last_code = code;
2363 uschar *orig_code = code;
2364 uschar *tempcode;
2365 BOOL inescq = FALSE;
2366 BOOL groupsetfirstbyte = FALSE;
2367 const uschar *ptr = *ptrptr;
2368 const uschar *tempptr;
2369 uschar *previous = NULL;
2370 uschar *previous_callout = NULL;
2371 uschar *save_hwm = NULL;
2372 uschar classbits[32];
2373
2374 #ifdef SUPPORT_UTF8
2375 BOOL class_utf8;
2376 BOOL utf8 = (options & PCRE_UTF8) != 0;
2377 uschar *class_utf8data;
2378 uschar *class_utf8data_base;
2379 uschar utf8_char[6];
2380 #else
2381 BOOL utf8 = FALSE;
2382 uschar *utf8_char = NULL;
2383 #endif
2384
2385 #ifdef DEBUG
2386 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2387 #endif
2388
2389 /* Set up the default and non-default settings for greediness */
2390
2391 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2392 greedy_non_default = greedy_default ^ 1;
2393
2394 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2395 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2396 matches a non-fixed char first char; reqbyte just remains unset if we never
2397 find one.
2398
2399 When we hit a repeat whose minimum is zero, we may have to adjust these values
2400 to take the zero repeat into account. This is implemented by setting them to
2401 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2402 item types that can be repeated set these backoff variables appropriately. */
2403
2404 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2405
2406 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2407 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2408 value > 255. It is added into the firstbyte or reqbyte variables to record the
2409 case status of the value. This is used only for ASCII characters. */
2410
2411 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2412
2413 /* Switch on next character until the end of the branch */
2414
2415 for (;; ptr++)
2416 {
2417 BOOL negate_class;
2418 BOOL should_flip_negation;
2419 BOOL possessive_quantifier;
2420 BOOL is_quantifier;
2421 BOOL is_recurse;
2422 BOOL reset_bracount;
2423 int class_charcount;
2424 int class_lastchar;
2425 int newoptions;
2426 int recno;
2427 int refsign;
2428 int skipbytes;
2429 int subreqbyte;
2430 int subfirstbyte;
2431 int terminator;
2432 int mclength;
2433 uschar mcbuffer[8];
2434
2435 /* Get next byte in the pattern */
2436
2437 c = *ptr;
2438
2439 /* If we are in the pre-compile phase, accumulate the length used for the
2440 previous cycle of this loop. */
2441
2442 if (lengthptr != NULL)
2443 {
2444 #ifdef DEBUG
2445 if (code > cd->hwm) cd->hwm = code; /* High water info */
2446 #endif
2447 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2448 {
2449 *errorcodeptr = ERR52;
2450 goto FAILED;
2451 }
2452
2453 /* There is at least one situation where code goes backwards: this is the
2454 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2455 the class is simply eliminated. However, it is created first, so we have to
2456 allow memory for it. Therefore, don't ever reduce the length at this point.
2457 */
2458
2459 if (code < last_code) code = last_code;
2460
2461 /* Paranoid check for integer overflow */
2462
2463 if (OFLOW_MAX - *lengthptr < code - last_code)
2464 {
2465 *errorcodeptr = ERR20;
2466 goto FAILED;
2467 }
2468
2469 *lengthptr += code - last_code;
2470 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2471
2472 /* If "previous" is set and it is not at the start of the work space, move
2473 it back to there, in order to avoid filling up the work space. Otherwise,
2474 if "previous" is NULL, reset the current code pointer to the start. */
2475
2476 if (previous != NULL)
2477 {
2478 if (previous > orig_code)
2479 {
2480 memmove(orig_code, previous, code - previous);
2481 code -= previous - orig_code;
2482 previous = orig_code;
2483 }
2484 }
2485 else code = orig_code;
2486
2487 /* Remember where this code item starts so we can pick up the length
2488 next time round. */
2489
2490 last_code = code;
2491 }
2492
2493 /* In the real compile phase, just check the workspace used by the forward
2494 reference list. */
2495
2496 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2497 {
2498 *errorcodeptr = ERR52;
2499 goto FAILED;
2500 }
2501
2502 /* If in \Q...\E, check for the end; if not, we have a literal */
2503
2504 if (inescq && c != 0)
2505 {
2506 if (c == '\\' && ptr[1] == 'E')
2507 {
2508 inescq = FALSE;
2509 ptr++;
2510 continue;
2511 }
2512 else
2513 {
2514 if (previous_callout != NULL)
2515 {
2516 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2517 complete_callout(previous_callout, ptr, cd);
2518 previous_callout = NULL;
2519 }
2520 if ((options & PCRE_AUTO_CALLOUT) != 0)
2521 {
2522 previous_callout = code;
2523 code = auto_callout(code, ptr, cd);
2524 }
2525 goto NORMAL_CHAR;
2526 }
2527 }
2528
2529 /* Fill in length of a previous callout, except when the next thing is
2530 a quantifier. */
2531
2532 is_quantifier = c == '*' || c == '+' || c == '?' ||
2533 (c == '{' && is_counted_repeat(ptr+1));
2534
2535 if (!is_quantifier && previous_callout != NULL &&
2536 after_manual_callout-- <= 0)
2537 {
2538 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2539 complete_callout(previous_callout, ptr, cd);
2540 previous_callout = NULL;
2541 }
2542
2543 /* In extended mode, skip white space and comments */
2544
2545 if ((options & PCRE_EXTENDED) != 0)
2546 {
2547 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2548 if (c == '#')
2549 {
2550 while (*(++ptr) != 0)
2551 {
2552 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2553 }
2554 if (*ptr != 0) continue;
2555
2556 /* Else fall through to handle end of string */
2557 c = 0;
2558 }
2559 }
2560
2561 /* No auto callout for quantifiers. */
2562
2563 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2564 {
2565 previous_callout = code;
2566 code = auto_callout(code, ptr, cd);
2567 }
2568
2569 switch(c)
2570 {
2571 /* ===================================================================*/
2572 case 0: /* The branch terminates at string end */
2573 case '|': /* or | or ) */
2574 case ')':
2575 *firstbyteptr = firstbyte;
2576 *reqbyteptr = reqbyte;
2577 *codeptr = code;
2578 *ptrptr = ptr;
2579 if (lengthptr != NULL)
2580 {
2581 if (OFLOW_MAX - *lengthptr < code - last_code)
2582 {
2583 *errorcodeptr = ERR20;
2584 goto FAILED;
2585 }
2586 *lengthptr += code - last_code; /* To include callout length */
2587 DPRINTF((">> end branch\n"));
2588 }
2589 return TRUE;
2590
2591
2592 /* ===================================================================*/
2593 /* Handle single-character metacharacters. In multiline mode, ^ disables
2594 the setting of any following char as a first character. */
2595
2596 case '^':
2597 if ((options & PCRE_MULTILINE) != 0)
2598 {
2599 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2600 }
2601 previous = NULL;
2602 *code++ = OP_CIRC;
2603 break;
2604
2605 case '$':
2606 previous = NULL;
2607 *code++ = OP_DOLL;
2608 break;
2609
2610 /* There can never be a first char if '.' is first, whatever happens about
2611 repeats. The value of reqbyte doesn't change either. */
2612
2613 case '.':
2614 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2615 zerofirstbyte = firstbyte;
2616 zeroreqbyte = reqbyte;
2617 previous = code;
2618 *code++ = OP_ANY;
2619 break;
2620
2621
2622 /* ===================================================================*/
2623 /* Character classes. If the included characters are all < 256, we build a
2624 32-byte bitmap of the permitted characters, except in the special case
2625 where there is only one such character. For negated classes, we build the
2626 map as usual, then invert it at the end. However, we use a different opcode
2627 so that data characters > 255 can be handled correctly.
2628
2629 If the class contains characters outside the 0-255 range, a different
2630 opcode is compiled. It may optionally have a bit map for characters < 256,
2631 but those above are explicitly listed afterwards. A flag byte tells
2632 whether the bitmap is present, and whether this is a negated class or not.
2633 */
2634
2635 case '[':
2636 previous = code;
2637
2638 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2639 they are encountered at the top level, so we'll do that too. */
2640
2641 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2642 check_posix_syntax(ptr, &tempptr))
2643 {
2644 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2645 goto FAILED;
2646 }
2647
2648 /* If the first character is '^', set the negation flag and skip it. Also,
2649 if the first few characters (either before or after ^) are \Q\E or \E we
2650 skip them too. This makes for compatibility with Perl. */
2651
2652 negate_class = FALSE;
2653 for (;;)
2654 {
2655 c = *(++ptr);
2656 if (c == '\\')
2657 {
2658 if (ptr[1] == 'E') ptr++;
2659 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2660 else break;
2661 }
2662 else if (!negate_class && c == '^')
2663 negate_class = TRUE;
2664 else break;
2665 }
2666
2667 /* If a class contains a negative special such as \S, we need to flip the
2668 negation flag at the end, so that support for characters > 255 works
2669 correctly (they are all included in the class). */
2670
2671 should_flip_negation = FALSE;
2672
2673 /* Keep a count of chars with values < 256 so that we can optimize the case
2674 of just a single character (as long as it's < 256). However, For higher
2675 valued UTF-8 characters, we don't yet do any optimization. */
2676
2677 class_charcount = 0;
2678 class_lastchar = -1;
2679
2680 /* Initialize the 32-char bit map to all zeros. We build the map in a
2681 temporary bit of memory, in case the class contains only 1 character (less
2682 than 256), because in that case the compiled code doesn't use the bit map.
2683 */
2684
2685 memset(classbits, 0, 32 * sizeof(uschar));
2686
2687 #ifdef SUPPORT_UTF8
2688 class_utf8 = FALSE; /* No chars >= 256 */
2689 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2690 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2691 #endif
2692
2693 /* Process characters until ] is reached. By writing this as a "do" it
2694 means that an initial ] is taken as a data character. At the start of the
2695 loop, c contains the first byte of the character. */
2696
2697 if (c != 0) do
2698 {
2699 const uschar *oldptr;
2700
2701 #ifdef SUPPORT_UTF8
2702 if (utf8 && c > 127)
2703 { /* Braces are required because the */
2704 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2705 }
2706
2707 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2708 data and reset the pointer. This is so that very large classes that
2709 contain a zillion UTF-8 characters no longer overwrite the work space
2710 (which is on the stack). */
2711
2712 if (lengthptr != NULL)
2713 {
2714 *lengthptr += class_utf8data - class_utf8data_base;
2715 class_utf8data = class_utf8data_base;
2716 }
2717
2718 #endif
2719
2720 /* Inside \Q...\E everything is literal except \E */
2721
2722 if (inescq)
2723 {
2724 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2725 {
2726 inescq = FALSE; /* Reset literal state */
2727 ptr++; /* Skip the 'E' */
2728 continue; /* Carry on with next */
2729 }
2730 goto CHECK_RANGE; /* Could be range if \E follows */
2731 }
2732
2733 /* Handle POSIX class names. Perl allows a negation extension of the
2734 form [:^name:]. A square bracket that doesn't match the syntax is
2735 treated as a literal. We also recognize the POSIX constructions
2736 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2737 5.6 and 5.8 do. */
2738
2739 if (c == '[' &&
2740 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2741 check_posix_syntax(ptr, &tempptr))
2742 {
2743 BOOL local_negate = FALSE;
2744 int posix_class, taboffset, tabopt;
2745 register const uschar *cbits = cd->cbits;
2746 uschar pbits[32];
2747
2748 if (ptr[1] != ':')
2749 {
2750 *errorcodeptr = ERR31;
2751 goto FAILED;
2752 }
2753
2754 ptr += 2;
2755 if (*ptr == '^')
2756 {
2757 local_negate = TRUE;
2758 should_flip_negation = TRUE; /* Note negative special */
2759 ptr++;
2760 }
2761
2762 posix_class = check_posix_name(ptr, tempptr - ptr);
2763 if (posix_class < 0)
2764 {
2765 *errorcodeptr = ERR30;
2766 goto FAILED;
2767 }
2768
2769 /* If matching is caseless, upper and lower are converted to
2770 alpha. This relies on the fact that the class table starts with
2771 alpha, lower, upper as the first 3 entries. */
2772
2773 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2774 posix_class = 0;
2775
2776 /* We build the bit map for the POSIX class in a chunk of local store
2777 because we may be adding and subtracting from it, and we don't want to
2778 subtract bits that may be in the main map already. At the end we or the
2779 result into the bit map that is being built. */
2780
2781 posix_class *= 3;
2782
2783 /* Copy in the first table (always present) */
2784
2785 memcpy(pbits, cbits + posix_class_maps[posix_class],
2786 32 * sizeof(uschar));
2787
2788 /* If there is a second table, add or remove it as required. */
2789
2790 taboffset = posix_class_maps[posix_class + 1];
2791 tabopt = posix_class_maps[posix_class + 2];
2792
2793 if (taboffset >= 0)
2794 {
2795 if (tabopt >= 0)
2796 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2797 else
2798 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2799 }
2800
2801 /* Now see if we need to remove any special characters. An option
2802 value of 1 removes vertical space and 2 removes underscore. */
2803
2804 if (tabopt < 0) tabopt = -tabopt;
2805 if (tabopt == 1) pbits[1] &= ~0x3c;
2806 else if (tabopt == 2) pbits[11] &= 0x7f;
2807
2808 /* Add the POSIX table or its complement into the main table that is
2809 being built and we are done. */
2810
2811 if (local_negate)
2812 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2813 else
2814 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2815
2816 ptr = tempptr + 1;
2817 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2818 continue; /* End of POSIX syntax handling */
2819 }
2820
2821 /* Backslash may introduce a single character, or it may introduce one
2822 of the specials, which just set a flag. The sequence \b is a special
2823 case. Inside a class (and only there) it is treated as backspace.
2824 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2825 to 'or' into the one we are building. We assume they have more than one
2826 character in them, so set class_charcount bigger than one. */
2827
2828 if (c == '\\')
2829 {
2830 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2831 if (*errorcodeptr != 0) goto FAILED;
2832
2833 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2834 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2835 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2836 else if (-c == ESC_Q) /* Handle start of quoted string */
2837 {
2838 if (ptr[1] == '\\' && ptr[2] == 'E')
2839 {
2840 ptr += 2; /* avoid empty string */
2841 }
2842 else inescq = TRUE;
2843 continue;
2844 }
2845 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2846
2847 if (c < 0)
2848 {
2849 register const uschar *cbits = cd->cbits;
2850 class_charcount += 2; /* Greater than 1 is what matters */
2851
2852 /* Save time by not doing this in the pre-compile phase. */
2853
2854 if (lengthptr == NULL) switch (-c)
2855 {
2856 case ESC_d:
2857 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2858 continue;
2859
2860 case ESC_D:
2861 should_flip_negation = TRUE;
2862 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2863 continue;
2864
2865 case ESC_w:
2866 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2867 continue;
2868
2869 case ESC_W:
2870 should_flip_negation = TRUE;
2871 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2872 continue;
2873
2874 case ESC_s:
2875 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2876 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2877 continue;
2878
2879 case ESC_S:
2880 should_flip_negation = TRUE;
2881 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2882 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2883 continue;
2884
2885 default: /* Not recognized; fall through */
2886 break; /* Need "default" setting to stop compiler warning. */
2887 }
2888
2889 /* In the pre-compile phase, just do the recognition. */
2890
2891 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2892 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2893
2894 /* We need to deal with \H, \h, \V, and \v in both phases because
2895 they use extra memory. */
2896
2897 if (-c == ESC_h)
2898 {
2899 SETBIT(classbits, 0x09); /* VT */
2900 SETBIT(classbits, 0x20); /* SPACE */
2901 SETBIT(classbits, 0xa0); /* NSBP */
2902 #ifdef SUPPORT_UTF8
2903 if (utf8)
2904 {
2905 class_utf8 = TRUE;
2906 *class_utf8data++ = XCL_SINGLE;
2907 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2908 *class_utf8data++ = XCL_SINGLE;
2909 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2910 *class_utf8data++ = XCL_RANGE;
2911 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2912 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2913 *class_utf8data++ = XCL_SINGLE;
2914 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2915 *class_utf8data++ = XCL_SINGLE;
2916 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2917 *class_utf8data++ = XCL_SINGLE;
2918 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2919 }
2920 #endif
2921 continue;
2922 }
2923
2924 if (-c == ESC_H)
2925 {
2926 for (c = 0; c < 32; c++)
2927 {
2928 int x = 0xff;
2929 switch (c)
2930 {
2931 case 0x09/8: x ^= 1 << (0x09%8); break;
2932 case 0x20/8: x ^= 1 << (0x20%8); break;
2933 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2934 default: break;
2935 }
2936 classbits[c] |= x;
2937 }
2938
2939 #ifdef SUPPORT_UTF8
2940 if (utf8)
2941 {
2942 class_utf8 = TRUE;
2943 *class_utf8data++ = XCL_RANGE;
2944 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2945 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2946 *class_utf8data++ = XCL_RANGE;
2947 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2948 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2949 *class_utf8data++ = XCL_RANGE;
2950 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2951 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2952 *class_utf8data++ = XCL_RANGE;
2953 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2954 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2955 *class_utf8data++ = XCL_RANGE;
2956 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2957 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2958 *class_utf8data++ = XCL_RANGE;
2959 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2960 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2961 *class_utf8data++ = XCL_RANGE;
2962 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2963 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2964 }
2965 #endif
2966 continue;
2967 }
2968
2969 if (-c == ESC_v)
2970 {
2971 SETBIT(classbits, 0x0a); /* LF */
2972 SETBIT(classbits, 0x0b); /* VT */
2973 SETBIT(classbits, 0x0c); /* FF */
2974 SETBIT(classbits, 0x0d); /* CR */
2975 SETBIT(classbits, 0x85); /* NEL */
2976 #ifdef SUPPORT_UTF8
2977 if (utf8)
2978 {
2979 class_utf8 = TRUE;
2980 *class_utf8data++ = XCL_RANGE;
2981 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2982 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2983 }
2984 #endif
2985 continue;
2986 }
2987
2988 if (-c == ESC_V)
2989 {
2990 for (c = 0; c < 32; c++)
2991 {
2992 int x = 0xff;
2993 switch (c)
2994 {
2995 case 0x0a/8: x ^= 1 << (0x0a%8);
2996 x ^= 1 << (0x0b%8);
2997 x ^= 1 << (0x0c%8);
2998 x ^= 1 << (0x0d%8);
2999 break;
3000 case 0x85/8: x ^= 1 << (0x85%8); break;
3001 default: break;
3002 }
3003 classbits[c] |= x;
3004 }
3005
3006 #ifdef SUPPORT_UTF8
3007 if (utf8)
3008 {
3009 class_utf8 = TRUE;
3010 *class_utf8data++ = XCL_RANGE;
3011 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3012 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3013 *class_utf8data++ = XCL_RANGE;
3014 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3015 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3016 }
3017 #endif
3018 continue;
3019 }
3020
3021 /* We need to deal with \P and \p in both phases. */
3022
3023 #ifdef SUPPORT_UCP
3024 if (-c == ESC_p || -c == ESC_P)
3025 {
3026 BOOL negated;
3027 int pdata;
3028 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3029 if (ptype < 0) goto FAILED;
3030 class_utf8 = TRUE;
3031 *class_utf8data++ = ((-c == ESC_p) != negated)?
3032 XCL_PROP : XCL_NOTPROP;
3033 *class_utf8data++ = ptype;
3034 *class_utf8data++ = pdata;
3035 class_charcount -= 2; /* Not a < 256 character */
3036 continue;
3037 }
3038 #endif
3039 /* Unrecognized escapes are faulted if PCRE is running in its
3040 strict mode. By default, for compatibility with Perl, they are
3041 treated as literals. */
3042
3043 if ((options & PCRE_EXTRA) != 0)
3044 {
3045 *errorcodeptr = ERR7;
3046 goto FAILED;
3047 }
3048
3049 class_charcount -= 2; /* Undo the default count from above */
3050 c = *ptr; /* Get the final character and fall through */
3051 }
3052
3053 /* Fall through if we have a single character (c >= 0). This may be
3054 greater than 256 in UTF-8 mode. */
3055
3056 } /* End of backslash handling */
3057
3058 /* A single character may be followed by '-' to form a range. However,
3059 Perl does not permit ']' to be the end of the range. A '-' character
3060 at the end is treated as a literal. Perl ignores orphaned \E sequences
3061 entirely. The code for handling \Q and \E is messy. */
3062
3063 CHECK_RANGE:
3064 while (ptr[1] == '\\' && ptr[2] == 'E')
3065 {
3066 inescq = FALSE;
3067 ptr += 2;
3068 }
3069
3070 oldptr = ptr;
3071
3072 /* Remember \r or \n */
3073
3074 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3075
3076 /* Check for range */
3077
3078 if (!inescq && ptr[1] == '-')
3079 {
3080 int d;
3081 ptr += 2;
3082 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3083
3084 /* If we hit \Q (not followed by \E) at this point, go into escaped
3085 mode. */
3086
3087 while (*ptr == '\\' && ptr[1] == 'Q')
3088 {
3089 ptr += 2;
3090 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3091 inescq = TRUE;
3092 break;
3093 }
3094
3095 if (*ptr == 0 || (!inescq && *ptr == ']'))
3096 {
3097 ptr = oldptr;
3098 goto LONE_SINGLE_CHARACTER;
3099 }
3100
3101 #ifdef SUPPORT_UTF8
3102 if (utf8)
3103 { /* Braces are required because the */
3104 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3105 }
3106 else
3107 #endif
3108 d = *ptr; /* Not UTF-8 mode */
3109
3110 /* The second part of a range can be a single-character escape, but
3111 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3112 in such circumstances. */
3113
3114 if (!inescq && d == '\\')
3115 {
3116 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3117 if (*errorcodeptr != 0) goto FAILED;
3118
3119 /* \b is backspace; \X is literal X; \R is literal R; any other
3120 special means the '-' was literal */
3121
3122 if (d < 0)
3123 {
3124 if (d == -ESC_b) d = '\b';
3125 else if (d == -ESC_X) d = 'X';
3126 else if (d == -ESC_R) d = 'R'; else
3127 {
3128 ptr = oldptr;
3129 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3130 }
3131 }
3132 }
3133
3134 /* Check that the two values are in the correct order. Optimize
3135 one-character ranges */
3136
3137 if (d < c)
3138 {
3139 *errorcodeptr = ERR8;
3140 goto FAILED;
3141 }
3142
3143 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3144
3145 /* Remember \r or \n */
3146
3147 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3148
3149 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3150 matching, we have to use an XCLASS with extra data items. Caseless
3151 matching for characters > 127 is available only if UCP support is
3152 available. */
3153
3154 #ifdef SUPPORT_UTF8
3155 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3156 {
3157 class_utf8 = TRUE;
3158
3159 /* With UCP support, we can find the other case equivalents of
3160 the relevant characters. There may be several ranges. Optimize how
3161 they fit with the basic range. */
3162
3163 #ifdef SUPPORT_UCP
3164 if ((options & PCRE_CASELESS) != 0)
3165 {
3166 unsigned int occ, ocd;
3167 unsigned int cc = c;
3168 unsigned int origd = d;
3169 while (get_othercase_range(&cc, origd, &occ, &ocd))
3170 {
3171 if (occ >= (unsigned int)c &&
3172 ocd <= (unsigned int)d)
3173 continue; /* Skip embedded ranges */
3174
3175 if (occ < (unsigned int)c &&
3176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3177 { /* if there is overlap, */
3178 c = occ; /* noting that if occ < c */
3179 continue; /* we can't have ocd > d */
3180 } /* because a subrange is */
3181 if (ocd > (unsigned int)d &&
3182 occ <= (unsigned int)d + 1) /* always shorter than */
3183 { /* the basic range. */
3184 d = ocd;
3185 continue;
3186 }
3187
3188 if (occ == ocd)
3189 {
3190 *class_utf8data++ = XCL_SINGLE;
3191 }
3192 else
3193 {
3194 *class_utf8data++ = XCL_RANGE;
3195 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3196 }
3197 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3198 }
3199 }
3200 #endif /* SUPPORT_UCP */
3201
3202 /* Now record the original range, possibly modified for UCP caseless
3203 overlapping ranges. */
3204
3205 *class_utf8data++ = XCL_RANGE;
3206 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3207 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3208
3209 /* With UCP support, we are done. Without UCP support, there is no
3210 caseless matching for UTF-8 characters > 127; we can use the bit map
3211 for the smaller ones. */
3212
3213 #ifdef SUPPORT_UCP
3214 continue; /* With next character in the class */
3215 #else
3216 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3217
3218 /* Adjust upper limit and fall through to set up the map */
3219
3220 d = 127;
3221
3222 #endif /* SUPPORT_UCP */
3223 }
3224 #endif /* SUPPORT_UTF8 */
3225
3226 /* We use the bit map for all cases when not in UTF-8 mode; else
3227 ranges that lie entirely within 0-127 when there is UCP support; else
3228 for partial ranges without UCP support. */
3229
3230 class_charcount += d - c + 1;
3231 class_lastchar = d;
3232
3233 /* We can save a bit of time by skipping this in the pre-compile. */
3234
3235 if (lengthptr == NULL) for (; c <= d; c++)
3236 {
3237 classbits[c/8] |= (1 << (c&7));
3238 if ((options & PCRE_CASELESS) != 0)
3239 {
3240 int uc = cd->fcc[c]; /* flip case */
3241 classbits[uc/8] |= (1 << (uc&7));
3242 }
3243 }
3244
3245 continue; /* Go get the next char in the class */
3246 }
3247
3248 /* Handle a lone single character - we can get here for a normal
3249 non-escape char, or after \ that introduces a single character or for an
3250 apparent range that isn't. */
3251
3252 LONE_SINGLE_CHARACTER:
3253
3254 /* Handle a character that cannot go in the bit map */
3255
3256 #ifdef SUPPORT_UTF8
3257 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3258 {
3259 class_utf8 = TRUE;
3260 *class_utf8data++ = XCL_SINGLE;
3261 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3262
3263 #ifdef SUPPORT_UCP
3264 if ((options & PCRE_CASELESS) != 0)
3265 {
3266 unsigned int othercase;
3267 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3268 {
3269 *class_utf8data++ = XCL_SINGLE;
3270 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3271 }
3272 }
3273 #endif /* SUPPORT_UCP */
3274
3275 }
3276 else
3277 #endif /* SUPPORT_UTF8 */
3278
3279 /* Handle a single-byte character */
3280 {
3281 classbits[c/8] |= (1 << (c&7));
3282 if ((options & PCRE_CASELESS) != 0)
3283 {
3284 c = cd->fcc[c]; /* flip case */
3285 classbits[c/8] |= (1 << (c&7));
3286 }
3287 class_charcount++;
3288 class_lastchar = c;
3289 }
3290 }
3291
3292 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3293
3294 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3295
3296 if (c == 0) /* Missing terminating ']' */
3297 {
3298 *errorcodeptr = ERR6;
3299 goto FAILED;
3300 }
3301
3302
3303 /* This code has been disabled because it would mean that \s counts as
3304 an explicit \r or \n reference, and that's not really what is wanted. Now
3305 we set the flag only if there is a literal "\r" or "\n" in the class. */
3306
3307 #if 0
3308 /* Remember whether \r or \n are in this class */
3309
3310 if (negate_class)
3311 {
3312 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3313 }
3314 else
3315 {
3316 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3317 }
3318 #endif
3319
3320
3321 /* If class_charcount is 1, we saw precisely one character whose value is
3322 less than 256. As long as there were no characters >= 128 and there was no
3323 use of \p or \P, in other words, no use of any XCLASS features, we can
3324 optimize.
3325
3326 In UTF-8 mode, we can optimize the negative case only if there were no
3327 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3328 operate on single-bytes only. This is an historical hangover. Maybe one day
3329 we can tidy these opcodes to handle multi-byte characters.
3330
3331 The optimization throws away the bit map. We turn the item into a
3332 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3333 that OP_NOT does not support multibyte characters. In the positive case, it
3334 can cause firstbyte to be set. Otherwise, there can be no first char if
3335 this item is first, whatever repeat count may follow. In the case of
3336 reqbyte, save the previous value for reinstating. */
3337
3338 #ifdef SUPPORT_UTF8
3339 if (class_charcount == 1 && !class_utf8 &&
3340 (!utf8 || !negate_class || class_lastchar < 128))
3341 #else
3342 if (class_charcount == 1)
3343 #endif
3344 {
3345 zeroreqbyte = reqbyte;
3346
3347 /* The OP_NOT opcode works on one-byte characters only. */
3348
3349 if (negate_class)
3350 {
3351 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3352 zerofirstbyte = firstbyte;
3353 *code++ = OP_NOT;
3354 *code++ = class_lastchar;
3355 break;
3356 }
3357
3358 /* For a single, positive character, get the value into mcbuffer, and
3359 then we can handle this with the normal one-character code. */
3360
3361 #ifdef SUPPORT_UTF8
3362 if (utf8 && class_lastchar > 127)
3363 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3364 else
3365 #endif
3366 {
3367 mcbuffer[0] = class_lastchar;
3368 mclength = 1;
3369 }
3370 goto ONE_CHAR;
3371 } /* End of 1-char optimization */
3372
3373 /* The general case - not the one-char optimization. If this is the first
3374 thing in the branch, there can be no first char setting, whatever the
3375 repeat count. Any reqbyte setting must remain unchanged after any kind of
3376 repeat. */
3377
3378 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3379 zerofirstbyte = firstbyte;
3380 zeroreqbyte = reqbyte;
3381
3382 /* If there are characters with values > 255, we have to compile an
3383 extended class, with its own opcode, unless there was a negated special
3384 such as \S in the class, because in that case all characters > 255 are in
3385 the class, so any that were explicitly given as well can be ignored. If
3386 (when there are explicit characters > 255 that must be listed) there are no
3387 characters < 256, we can omit the bitmap in the actual compiled code. */
3388
3389 #ifdef SUPPORT_UTF8
3390 if (class_utf8 && !should_flip_negation)
3391 {
3392 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3393 *code++ = OP_XCLASS;
3394 code += LINK_SIZE;
3395 *code = negate_class? XCL_NOT : 0;
3396
3397 /* If the map is required, move up the extra data to make room for it;
3398 otherwise just move the code pointer to the end of the extra data. */
3399
3400 if (class_charcount > 0)
3401 {
3402 *code++ |= XCL_MAP;
3403 memmove(code + 32, code, class_utf8data - code);
3404 memcpy(code, classbits, 32);
3405 code = class_utf8data + 32;
3406 }
3407 else code = class_utf8data;
3408
3409 /* Now fill in the complete length of the item */
3410
3411 PUT(previous, 1, code - previous);
3412 break; /* End of class handling */
3413 }
3414 #endif
3415
3416 /* If there are no characters > 255, set the opcode to OP_CLASS or
3417 OP_NCLASS, depending on whether the whole class was negated and whether
3418 there were negative specials such as \S in the class. Then copy the 32-byte
3419 map into the code vector, negating it if necessary. */
3420
3421 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3422 if (negate_class)
3423 {
3424 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3425 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3426 }
3427 else
3428 {
3429 memcpy(code, classbits, 32);
3430 }
3431 code += 32;
3432 break;
3433
3434
3435 /* ===================================================================*/
3436 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3437 has been tested above. */
3438
3439 case '{':
3440 if (!is_quantifier) goto NORMAL_CHAR;
3441 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3442 if (*errorcodeptr != 0) goto FAILED;
3443 goto REPEAT;
3444
3445 case '*':
3446 repeat_min = 0;
3447 repeat_max = -1;
3448 goto REPEAT;
3449
3450 case '+':
3451 repeat_min = 1;
3452 repeat_max = -1;
3453 goto REPEAT;
3454
3455 case '?':
3456 repeat_min = 0;
3457 repeat_max = 1;
3458
3459 REPEAT:
3460 if (previous == NULL)
3461 {
3462 *errorcodeptr = ERR9;
3463 goto FAILED;
3464 }
3465
3466 if (repeat_min == 0)
3467 {
3468 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3469 reqbyte = zeroreqbyte; /* Ditto */
3470 }
3471
3472 /* Remember whether this is a variable length repeat */
3473
3474 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3475
3476 op_type = 0; /* Default single-char op codes */
3477 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3478
3479 /* Save start of previous item, in case we have to move it up to make space
3480 for an inserted OP_ONCE for the additional '+' extension. */
3481
3482 tempcode = previous;
3483
3484 /* If the next character is '+', we have a possessive quantifier. This
3485 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3486 If the next character is '?' this is a minimizing repeat, by default,
3487 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3488 repeat type to the non-default. */
3489
3490 if (ptr[1] == '+')
3491 {
3492 repeat_type = 0; /* Force greedy */
3493 possessive_quantifier = TRUE;
3494 ptr++;
3495 }
3496 else if (ptr[1] == '?')
3497 {
3498 repeat_type = greedy_non_default;
3499 ptr++;
3500 }
3501 else repeat_type = greedy_default;
3502
3503 /* If previous was a character match, abolish the item and generate a
3504 repeat item instead. If a char item has a minimum of more than one, ensure
3505 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3506 the first thing in a branch because the x will have gone into firstbyte
3507 instead. */
3508
3509 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3510 {
3511 /* Deal with UTF-8 characters that take up more than one byte. It's
3512 easier to write this out separately than try to macrify it. Use c to
3513 hold the length of the character in bytes, plus 0x80 to flag that it's a
3514 length rather than a small character. */
3515
3516 #ifdef SUPPORT_UTF8
3517 if (utf8 && (code[-1] & 0x80) != 0)
3518 {
3519 uschar *lastchar = code - 1;
3520 while((*lastchar & 0xc0) == 0x80) lastchar--;
3521 c = code - lastchar; /* Length of UTF-8 character */
3522 memcpy(utf8_char, lastchar, c); /* Save the char */
3523 c |= 0x80; /* Flag c as a length */
3524 }
3525 else
3526 #endif
3527
3528 /* Handle the case of a single byte - either with no UTF8 support, or
3529 with UTF-8 disabled, or for a UTF-8 character < 128. */
3530
3531 {
3532 c = code[-1];
3533 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3534 }
3535
3536 /* If the repetition is unlimited, it pays to see if the next thing on
3537 the line is something that cannot possibly match this character. If so,
3538 automatically possessifying this item gains some performance in the case
3539 where the match fails. */
3540
3541 if (!possessive_quantifier &&
3542 repeat_max < 0 &&
3543 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3544 options, cd))
3545 {
3546 repeat_type = 0; /* Force greedy */
3547 possessive_quantifier = TRUE;
3548 }
3549
3550 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3551 }
3552
3553 /* If previous was a single negated character ([^a] or similar), we use
3554 one of the special opcodes, replacing it. The code is shared with single-
3555 character repeats by setting op_type to add a suitable offset into
3556 repeat_type. We can also test for auto-possessification. OP_NOT is
3557 currently used only for single-byte chars. */
3558
3559 else if (*previous == OP_NOT)
3560 {
3561 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3562 c = previous[1];
3563 if (!possessive_quantifier &&
3564 repeat_max < 0 &&
3565 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3566 {
3567 repeat_type = 0; /* Force greedy */
3568 possessive_quantifier = TRUE;
3569 }
3570 goto OUTPUT_SINGLE_REPEAT;
3571 }
3572
3573 /* If previous was a character type match (\d or similar), abolish it and
3574 create a suitable repeat item. The code is shared with single-character
3575 repeats by setting op_type to add a suitable offset into repeat_type. Note
3576 that the Unicode property types will be present only when SUPPORT_UCP is
3577 defined, but we don't wrap the little bits of code here because it just
3578 makes it horribly messy. */
3579
3580 else if (*previous < OP_EODN)
3581 {
3582 uschar *oldcode;
3583 int prop_type, prop_value;
3584 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3585 c = *previous;
3586
3587 if (!possessive_quantifier &&
3588 repeat_max < 0 &&
3589 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3590 {
3591 repeat_type = 0; /* Force greedy */
3592 possessive_quantifier = TRUE;
3593 }
3594
3595 OUTPUT_SINGLE_REPEAT:
3596 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3597 {
3598 prop_type = previous[1];
3599 prop_value = previous[2];
3600 }
3601 else prop_type = prop_value = -1;
3602
3603 oldcode = code;
3604 code = previous; /* Usually overwrite previous item */
3605
3606 /* If the maximum is zero then the minimum must also be zero; Perl allows
3607 this case, so we do too - by simply omitting the item altogether. */
3608
3609 if (repeat_max == 0) goto END_REPEAT;
3610
3611 /* All real repeats make it impossible to handle partial matching (maybe
3612 one day we will be able to remove this restriction). */
3613
3614 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3615
3616 /* Combine the op_type with the repeat_type */
3617
3618 repeat_type += op_type;
3619
3620 /* A minimum of zero is handled either as the special case * or ?, or as
3621 an UPTO, with the maximum given. */
3622
3623 if (repeat_min == 0)
3624 {
3625 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3626 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3627 else
3628 {
3629 *code++ = OP_UPTO + repeat_type;
3630 PUT2INC(code, 0, repeat_max);
3631 }
3632 }
3633
3634 /* A repeat minimum of 1 is optimized into some special cases. If the
3635 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3636 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3637 one less than the maximum. */
3638
3639 else if (repeat_min == 1)
3640 {
3641 if (repeat_max == -1)
3642 *code++ = OP_PLUS + repeat_type;
3643 else
3644 {
3645 code = oldcode; /* leave previous item in place */
3646 if (repeat_max == 1) goto END_REPEAT;
3647 *code++ = OP_UPTO + repeat_type;
3648 PUT2INC(code, 0, repeat_max - 1);
3649 }
3650 }
3651
3652 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3653 handled as an EXACT followed by an UPTO. */
3654
3655 else
3656 {
3657 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3658 PUT2INC(code, 0, repeat_min);
3659
3660 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3661 we have to insert the character for the previous code. For a repeated
3662 Unicode property match, there are two extra bytes that define the
3663 required property. In UTF-8 mode, long characters have their length in
3664 c, with the 0x80 bit as a flag. */
3665
3666 if (repeat_max < 0)
3667 {
3668 #ifdef SUPPORT_UTF8
3669 if (utf8 && c >= 128)
3670 {
3671 memcpy(code, utf8_char, c & 7);
3672 code += c & 7;
3673 }
3674 else
3675 #endif
3676 {
3677 *code++ = c;
3678 if (prop_type >= 0)
3679 {
3680 *code++ = prop_type;
3681 *code++ = prop_value;
3682 }
3683 }
3684 *code++ = OP_STAR + repeat_type;
3685 }
3686
3687 /* Else insert an UPTO if the max is greater than the min, again
3688 preceded by the character, for the previously inserted code. If the
3689 UPTO is just for 1 instance, we can use QUERY instead. */
3690
3691 else if (repeat_max != repeat_min)
3692 {
3693 #ifdef SUPPORT_UTF8
3694 if (utf8 && c >= 128)
3695 {
3696 memcpy(code, utf8_char, c & 7);
3697 code += c & 7;
3698 }
3699 else
3700 #endif
3701 *code++ = c;
3702 if (prop_type >= 0)
3703 {
3704 *code++ = prop_type;
3705 *code++ = prop_value;
3706 }
3707 repeat_max -= repeat_min;
3708
3709 if (repeat_max == 1)
3710 {
3711 *code++ = OP_QUERY + repeat_type;
3712 }
3713 else
3714 {
3715 *code++ = OP_UPTO + repeat_type;
3716 PUT2INC(code, 0, repeat_max);
3717 }
3718 }
3719 }
3720
3721 /* The character or character type itself comes last in all cases. */
3722
3723 #ifdef SUPPORT_UTF8
3724 if (utf8 && c >= 128)
3725 {
3726 memcpy(code, utf8_char, c & 7);
3727 code += c & 7;
3728 }
3729 else
3730 #endif
3731 *code++ = c;
3732
3733 /* For a repeated Unicode property match, there are two extra bytes that
3734 define the required property. */
3735
3736 #ifdef SUPPORT_UCP
3737 if (prop_type >= 0)
3738 {
3739 *code++ = prop_type;
3740 *code++ = prop_value;
3741 }
3742 #endif
3743 }
3744
3745 /* If previous was a character class or a back reference, we put the repeat
3746 stuff after it, but just skip the item if the repeat was {0,0}. */
3747
3748 else if (*previous == OP_CLASS ||
3749 *previous == OP_NCLASS ||
3750 #ifdef SUPPORT_UTF8
3751 *previous == OP_XCLASS ||
3752 #endif
3753 *previous == OP_REF)
3754 {
3755 if (repeat_max == 0)
3756 {
3757 code = previous;
3758 goto END_REPEAT;
3759 }
3760
3761 /* All real repeats make it impossible to handle partial matching (maybe
3762 one day we will be able to remove this restriction). */
3763
3764 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3765
3766 if (repeat_min == 0 && repeat_max == -1)
3767 *code++ = OP_CRSTAR + repeat_type;
3768 else if (repeat_min == 1 && repeat_max == -1)
3769 *code++ = OP_CRPLUS + repeat_type;
3770 else if (repeat_min == 0 && repeat_max == 1)
3771 *code++ = OP_CRQUERY + repeat_type;
3772 else
3773 {
3774 *code++ = OP_CRRANGE + repeat_type;
3775 PUT2INC(code, 0, repeat_min);
3776 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3777 PUT2INC(code, 0, repeat_max);
3778 }
3779 }
3780
3781 /* If previous was a bracket group, we may have to replicate it in certain
3782 cases. */
3783
3784 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3785 *previous == OP_ONCE || *previous == OP_COND)
3786 {
3787 register int i;
3788 int ketoffset = 0;
3789 int len = code - previous;
3790 uschar *bralink = NULL;
3791
3792 /* Repeating a DEFINE group is pointless */
3793
3794 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3795 {
3796 *errorcodeptr = ERR55;
3797 goto FAILED;
3798 }
3799
3800 /* If the maximum repeat count is unlimited, find the end of the bracket
3801 by scanning through from the start, and compute the offset back to it
3802 from the current code pointer. There may be an OP_OPT setting following
3803 the final KET, so we can't find the end just by going back from the code
3804 pointer. */
3805
3806 if (repeat_max == -1)
3807 {
3808 register uschar *ket = previous;
3809 do ket += GET(ket, 1); while (*ket != OP_KET);
3810 ketoffset = code - ket;
3811 }
3812
3813 /* The case of a zero minimum is special because of the need to stick
3814 OP_BRAZERO in front of it, and because the group appears once in the
3815 data, whereas in other cases it appears the minimum number of times. For
3816 this reason, it is simplest to treat this case separately, as otherwise
3817 the code gets far too messy. There are several special subcases when the
3818 minimum is zero. */
3819
3820 if (repeat_min == 0)
3821 {
3822 /* If the maximum is also zero, we just omit the group from the output
3823 altogether. */
3824
3825 if (repeat_max == 0)
3826 {
3827 code = previous;
3828 goto END_REPEAT;
3829 }
3830
3831 /* If the maximum is 1 or unlimited, we just have to stick in the
3832 BRAZERO and do no more at this point. However, we do need to adjust
3833 any OP_RECURSE calls inside the group that refer to the group itself or
3834 any internal or forward referenced group, because the offset is from
3835 the start of the whole regex. Temporarily terminate the pattern while
3836 doing this. */
3837
3838 if (repeat_max <= 1)
3839 {
3840 *code = OP_END;
3841 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3842 memmove(previous+1, previous, len);
3843 code++;
3844 *previous++ = OP_BRAZERO + repeat_type;
3845 }
3846
3847 /* If the maximum is greater than 1 and limited, we have to replicate
3848 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3849 The first one has to be handled carefully because it's the original
3850 copy, which has to be moved up. The remainder can be handled by code
3851 that is common with the non-zero minimum case below. We have to
3852 adjust the value of repeat_max, since one less copy is required. Once
3853 again, we may have to adjust any OP_RECURSE calls inside the group. */
3854
3855 else
3856 {
3857 int offset;
3858 *code = OP_END;
3859 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3860 memmove(previous + 2 + LINK_SIZE, previous, len);
3861 code += 2 + LINK_SIZE;
3862 *previous++ = OP_BRAZERO + repeat_type;
3863 *previous++ = OP_BRA;
3864
3865 /* We chain together the bracket offset fields that have to be
3866 filled in later when the ends of the brackets are reached. */
3867
3868 offset = (bralink == NULL)? 0 : previous - bralink;
3869 bralink = previous;
3870 PUTINC(previous, 0, offset);
3871 }
3872
3873 repeat_max--;
3874 }
3875
3876 /* If the minimum is greater than zero, replicate the group as many
3877 times as necessary, and adjust the maximum to the number of subsequent
3878 copies that we need. If we set a first char from the group, and didn't
3879 set a required char, copy the latter from the former. If there are any
3880 forward reference subroutine calls in the group, there will be entries on
3881 the workspace list; replicate these with an appropriate increment. */
3882
3883 else
3884 {
3885 if (repeat_min > 1)
3886 {
3887 /* In the pre-compile phase, we don't actually do the replication. We
3888 just adjust the length as if we had. Do some paranoid checks for
3889 potential integer overflow. */
3890
3891 if (lengthptr != NULL)
3892 {
3893 int delta = (repeat_min - 1)*length_prevgroup;
3894 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3895 (double)INT_MAX ||
3896 OFLOW_MAX - *lengthptr < delta)
3897 {
3898 *errorcodeptr = ERR20;
3899 goto FAILED;
3900 }
3901 *lengthptr += delta;
3902 }
3903
3904 /* This is compiling for real */
3905
3906 else
3907 {
3908 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3909 for (i = 1; i < repeat_min; i++)
3910 {
3911 uschar *hc;
3912 uschar *this_hwm = cd->hwm;
3913 memcpy(code, previous, len);
3914 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3915 {
3916 PUT(cd->hwm, 0, GET(hc, 0) + len);
3917 cd->hwm += LINK_SIZE;
3918 }
3919 save_hwm = this_hwm;
3920 code += len;
3921 }
3922 }
3923 }
3924
3925 if (repeat_max > 0) repeat_max -= repeat_min;
3926 }
3927
3928 /* This code is common to both the zero and non-zero minimum cases. If
3929 the maximum is limited, it replicates the group in a nested fashion,
3930 remembering the bracket starts on a stack. In the case of a zero minimum,
3931 the first one was set up above. In all cases the repeat_max now specifies
3932 the number of additional copies needed. Again, we must remember to
3933 replicate entries on the forward reference list. */
3934
3935 if (repeat_max >= 0)
3936 {
3937 /* In the pre-compile phase, we don't actually do the replication. We
3938 just adjust the length as if we had. For each repetition we must add 1
3939 to the length for BRAZERO and for all but the last repetition we must
3940 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3941 paranoid checks to avoid integer overflow. */
3942
3943 if (lengthptr != NULL && repeat_max > 0)
3944 {
3945 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3946 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3947 if ((double)repeat_max *
3948 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3949 > (double)INT_MAX ||
3950 OFLOW_MAX - *lengthptr < delta)
3951 {
3952 *errorcodeptr = ERR20;
3953 goto FAILED;
3954 }
3955 *lengthptr += delta;
3956 }
3957
3958 /* This is compiling for real */
3959
3960 else for (i = repeat_max - 1; i >= 0; i--)
3961 {
3962 uschar *hc;
3963 uschar *this_hwm = cd->hwm;
3964
3965 *code++ = OP_BRAZERO + repeat_type;
3966
3967 /* All but the final copy start a new nesting, maintaining the
3968 chain of brackets outstanding. */
3969
3970 if (i != 0)
3971 {
3972 int offset;
3973 *code++ = OP_BRA;
3974 offset = (bralink == NULL)? 0 : code - bralink;
3975 bralink = code;
3976 PUTINC(code, 0, offset);
3977 }
3978
3979 memcpy(code, previous, len);
3980 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3981 {
3982 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3983 cd->hwm += LINK_SIZE;
3984 }
3985 save_hwm = this_hwm;
3986 code += len;
3987 }
3988
3989 /* Now chain through the pending brackets, and fill in their length
3990 fields (which are holding the chain links pro tem). */
3991
3992 while (bralink != NULL)
3993 {
3994 int oldlinkoffset;
3995 int offset = code - bralink + 1;
3996 uschar *bra = code - offset;
3997 oldlinkoffset = GET(bra, 1);
3998 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3999 *code++ = OP_KET;
4000 PUTINC(code, 0, offset);
4001 PUT(bra, 1, offset);
4002 }
4003 }
4004
4005 /* If the maximum is unlimited, set a repeater in the final copy. We
4006 can't just offset backwards from the current code point, because we
4007 don't know if there's been an options resetting after the ket. The
4008 correct offset was computed above.
4009
4010 Then, when we are doing the actual compile phase, check to see whether
4011 this group is a non-atomic one that could match an empty string. If so,
4012 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4013 that runtime checking can be done. [This check is also applied to
4014 atomic groups at runtime, but in a different way.] */
4015
4016 else
4017 {
4018 uschar *ketcode = code - ketoffset;
4019 uschar *bracode = ketcode - GET(ketcode, 1);
4020 *ketcode = OP_KETRMAX + repeat_type;
4021 if (lengthptr == NULL && *bracode != OP_ONCE)
4022 {
4023 uschar *scode = bracode;
4024 do
4025 {
4026 if (could_be_empty_branch(scode, ketcode, utf8))
4027 {
4028 *bracode += OP_SBRA - OP_BRA;
4029 break;
4030 }
4031 scode += GET(scode, 1);
4032 }
4033 while (*scode == OP_ALT);
4034 }
4035 }
4036 }
4037
4038 /* Else there's some kind of shambles */
4039
4040 else
4041 {
4042 *errorcodeptr = ERR11;
4043 goto FAILED;
4044 }
4045
4046 /* If the character following a repeat is '+', or if certain optimization
4047 tests above succeeded, possessive_quantifier is TRUE. For some of the
4048 simpler opcodes, there is a special alternative opcode for this. For
4049 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4050 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4051 but the special opcodes can optimize it a bit. The repeated item starts at
4052 tempcode, not at previous, which might be the first part of a string whose
4053 (former) last char we repeated.
4054
4055 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4056 an 'upto' may follow. We skip over an 'exact' item, and then test the
4057 length of what remains before proceeding. */
4058
4059 if (possessive_quantifier)
4060 {
4061 int len;
4062 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4063 *tempcode == OP_NOTEXACT)
4064 tempcode += _pcre_OP_lengths[*tempcode] +
4065 ((*tempcode == OP_TYPEEXACT &&
4066 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4067 len = code - tempcode;
4068 if (len > 0) switch (*tempcode)
4069 {
4070 case OP_STAR: *tempcode = OP_POSSTAR; break;
4071 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4072 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4073 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4074
4075 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4076 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4077 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4078 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4079
4080 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4081 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4082 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4083 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4084
4085 default:
4086 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4087 code += 1 + LINK_SIZE;
4088 len += 1 + LINK_SIZE;
4089 tempcode[0] = OP_ONCE;
4090 *code++ = OP_KET;
4091 PUTINC(code, 0, len);
4092 PUT(tempcode, 1, len);
4093 break;
4094 }
4095 }
4096
4097 /* In all cases we no longer have a previous item. We also set the
4098 "follows varying string" flag for subsequently encountered reqbytes if
4099 it isn't already set and we have just passed a varying length item. */
4100
4101 END_REPEAT:
4102 previous = NULL;
4103 cd->req_varyopt |= reqvary;
4104 break;
4105
4106
4107 /* ===================================================================*/
4108 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4109 lookbehind or option setting or condition or all the other extended
4110 parenthesis forms. */
4111
4112 case '(':
4113 newoptions = options;
4114 skipbytes = 0;
4115 bravalue = OP_CBRA;
4116 save_hwm = cd->hwm;
4117 reset_bracount = FALSE;
4118
4119 /* First deal with various "verbs" that can be introduced by '*'. */
4120
4121 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4122 {
4123 int i, namelen;
4124 const char *vn = verbnames;
4125 const uschar *name = ++ptr;
4126 previous = NULL;
4127 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4128 if (*ptr == ':')
4129 {
4130 *errorcodeptr = ERR59; /* Not supported */
4131 goto FAILED;
4132 }
4133 if (*ptr != ')')
4134 {
4135 *errorcodeptr = ERR60;
4136 goto FAILED;
4137 }
4138 namelen = ptr - name;
4139 for (i = 0; i < verbcount; i++)
4140 {
4141 if (namelen == verbs[i].len &&
4142 strncmp((char *)name, vn, namelen) == 0)
4143 {
4144 *code = verbs[i].op;
4145 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4146 break;
4147 }
4148 vn += verbs[i].len + 1;
4149 }
4150 if (i < verbcount) continue;
4151 *errorcodeptr = ERR60;
4152 goto FAILED;
4153 }
4154
4155 /* Deal with the extended parentheses; all are introduced by '?', and the
4156 appearance of any of them means that this is not a capturing group. */
4157
4158 else if (*ptr == '?')
4159 {
4160 int i, set, unset, namelen;
4161 int *optset;
4162 const uschar *name;
4163 uschar *slot;
4164
4165 switch (*(++ptr))
4166 {
4167 case '#': /* Comment; skip to ket */
4168 ptr++;
4169 while (*ptr != 0 && *ptr != ')') ptr++;
4170 if (*ptr == 0)
4171 {
4172 *errorcodeptr = ERR18;
4173 goto FAILED;
4174 }
4175 continue;
4176
4177
4178 /* ------------------------------------------------------------ */
4179 case '|': /* Reset capture count for each branch */
4180 reset_bracount = TRUE;
4181 /* Fall through */
4182
4183 /* ------------------------------------------------------------ */
4184 case ':': /* Non-capturing bracket */
4185 bravalue = OP_BRA;
4186 ptr++;
4187 break;
4188
4189
4190 /* ------------------------------------------------------------ */
4191 case '(':
4192 bravalue = OP_COND; /* Conditional group */
4193
4194 /* A condition can be an assertion, a number (referring to a numbered
4195 group), a name (referring to a named group), or 'R', referring to
4196 recursion. R<digits> and R&name are also permitted for recursion tests.
4197
4198 There are several syntaxes for testing a named group: (?(name)) is used
4199 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4200
4201 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4202 be the recursive thing or the name 'R' (and similarly for 'R' followed
4203 by digits), and (b) a number could be a name that consists of digits.
4204 In both cases, we look for a name first; if not found, we try the other
4205 cases. */
4206
4207 /* For conditions that are assertions, check the syntax, and then exit
4208 the switch. This will take control down to where bracketed groups,
4209 including assertions, are processed. */
4210
4211 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4212 break;
4213
4214 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4215 below), and all need to skip 3 bytes at the start of the group. */
4216
4217 code[1+LINK_SIZE] = OP_CREF;
4218 skipbytes = 3;
4219 refsign = -1;
4220
4221 /* Check for a test for recursion in a named group. */
4222
4223 if (ptr[1] == 'R' && ptr[2] == '&')
4224 {
4225 terminator = -1;
4226 ptr += 2;
4227 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4228 }
4229
4230 /* Check for a test for a named group's having been set, using the Perl
4231 syntax (?(<name>) or (?('name') */
4232
4233 else if (ptr[1] == '<')
4234 {
4235 terminator = '>';
4236 ptr++;
4237 }
4238 else if (ptr[1] == '\'')
4239 {
4240 terminator = '\'';
4241 ptr++;
4242 }
4243 else
4244 {
4245 terminator = 0;
4246 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4247 }
4248
4249 /* We now expect to read a name; anything else is an error */
4250
4251 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4252 {
4253 ptr += 1; /* To get the right offset */
4254 *errorcodeptr = ERR28;
4255 goto FAILED;
4256 }
4257
4258 /* Read the name, but also get it as a number if it's all digits */
4259
4260 recno = 0;
4261 name = ++ptr;
4262 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4263 {
4264 if (recno >= 0)
4265 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4266 recno * 10 + *ptr - '0' : -1;
4267 ptr++;
4268 }
4269 namelen = ptr - name;
4270
4271 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4272 {
4273 ptr--; /* Error offset */
4274 *errorcodeptr = ERR26;
4275 goto FAILED;
4276 }
4277
4278 /* Do no further checking in the pre-compile phase. */
4279
4280 if (lengthptr != NULL) break;
4281
4282 /* In the real compile we do the work of looking for the actual
4283 reference. If the string started with "+" or "-" we require the rest to
4284 be digits, in which case recno will be set. */
4285
4286 if (refsign > 0)
4287 {
4288 if (recno <= 0)
4289 {
4290 *errorcodeptr = ERR58;
4291 goto FAILED;
4292 }
4293 recno = (refsign == '-')?
4294 cd->bracount - recno + 1 : recno +cd->bracount;
4295 if (recno <= 0 || recno > cd->final_bracount)
4296 {
4297 *errorcodeptr = ERR15;
4298 goto FAILED;
4299 }
4300 PUT2(code, 2+LINK_SIZE, recno);
4301 break;
4302 }
4303
4304 /* Otherwise (did not start with "+" or "-"), start by looking for the
4305 name. */
4306
4307 slot = cd->name_table;
4308 for (i = 0; i < cd->names_found; i++)
4309 {
4310 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4311 slot += cd->name_entry_size;
4312 }
4313
4314 /* Found a previous named subpattern */
4315
4316 if (i < cd->names_found)
4317 {
4318 recno = GET2(slot, 0);
4319 PUT2(code, 2+LINK_SIZE, recno);
4320 }
4321
4322 /* Search the pattern for a forward reference */
4323
4324 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4325 (options & PCRE_EXTENDED) != 0)) > 0)
4326 {
4327 PUT2(code, 2+LINK_SIZE, i);
4328 }
4329
4330 /* If terminator == 0 it means that the name followed directly after
4331 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4332 some further alternatives to try. For the cases where terminator != 0
4333 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4334 now checked all the possibilities, so give an error. */
4335
4336 else if (terminator != 0)
4337 {
4338 *errorcodeptr = ERR15;
4339 goto FAILED;
4340 }
4341
4342 /* Check for (?(R) for recursion. Allow digits after R to specify a
4343 specific group number. */
4344
4345 else if (*name == 'R')
4346 {
4347 recno = 0;
4348 for (i = 1; i < namelen; i++)
4349 {
4350 if ((digitab[name[i]] & ctype_digit) == 0)
4351 {
4352 *errorcodeptr = ERR15;
4353 goto FAILED;
4354 }
4355 recno = recno * 10 + name[i] - '0';
4356 }
4357 if (recno == 0) recno = RREF_ANY;
4358 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4359 PUT2(code, 2+LINK_SIZE, recno);
4360 }
4361
4362 /* Similarly, check for the (?(DEFINE) "condition", which is always
4363 false. */
4364
4365 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4366 {
4367 code[1+LINK_SIZE] = OP_DEF;
4368 skipbytes = 1;
4369 }
4370
4371 /* Check for the "name" actually being a subpattern number. We are
4372 in the second pass here, so final_bracount is set. */
4373
4374 else if (recno > 0 && recno <= cd->final_bracount)
4375 {
4376 PUT2(code, 2+LINK_SIZE, recno);
4377 }
4378
4379 /* Either an unidentified subpattern, or a reference to (?(0) */
4380
4381 else
4382 {
4383 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4384 goto FAILED;
4385 }
4386 break;
4387
4388
4389 /* ------------------------------------------------------------ */
4390 case '=': /* Positive lookahead */
4391 bravalue = OP_ASSERT;
4392 ptr++;
4393 break;
4394
4395
4396 /* ------------------------------------------------------------ */
4397 case '!': /* Negative lookahead */
4398 ptr++;
4399 if (*ptr == ')') /* Optimize (?!) */
4400 {
4401 *code++ = OP_FAIL;
4402 previous = NULL;
4403 continue;
4404 }
4405 bravalue = OP_ASSERT_NOT;
4406 break;
4407
4408
4409 /* ------------------------------------------------------------ */
4410 case '<': /* Lookbehind or named define */
4411 switch (ptr[1])
4412 {
4413 case '=': /* Positive lookbehind */
4414 bravalue = OP_ASSERTBACK;
4415 ptr += 2;
4416 break;
4417
4418 case '!': /* Negative lookbehind */
4419 bravalue = OP_ASSERTBACK_NOT;
4420 ptr += 2;
4421 break;
4422
4423 default: /* Could be name define, else bad */
4424 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4425 ptr++; /* Correct offset for error */
4426 *errorcodeptr = ERR24;
4427 goto FAILED;
4428 }
4429 break;
4430
4431
4432 /* ------------------------------------------------------------ */
4433 case '>': /* One-time brackets */
4434 bravalue = OP_ONCE;
4435 ptr++;
4436 break;
4437
4438
4439 /* ------------------------------------------------------------ */
4440 case 'C': /* Callout - may be followed by digits; */
4441 previous_callout = code; /* Save for later completion */
4442 after_manual_callout = 1; /* Skip one item before completing */
4443 *code++ = OP_CALLOUT;
4444 {
4445 int n = 0;
4446 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4447 n = n * 10 + *ptr - '0';
4448 if (*ptr != ')')
4449 {
4450 *errorcodeptr = ERR39;
4451 goto FAILED;
4452 }
4453 if (n > 255)
4454 {
4455 *errorcodeptr = ERR38;
4456 goto FAILED;
4457 }
4458 *code++ = n;
4459 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4460 PUT(code, LINK_SIZE, 0); /* Default length */
4461 code += 2 * LINK_SIZE;
4462 }
4463 previous = NULL;
4464 continue;
4465
4466
4467 /* ------------------------------------------------------------ */
4468 case 'P': /* Python-style named subpattern handling */
4469 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4470 {
4471 is_recurse = *ptr == '>';
4472 terminator = ')';
4473 goto NAMED_REF_OR_RECURSE;
4474 }
4475 else if (*ptr != '<') /* Test for Python-style definition */
4476 {
4477 *errorcodeptr = ERR41;
4478 goto FAILED;
4479 }
4480 /* Fall through to handle (?P< as (?< is handled */
4481
4482
4483 /* ------------------------------------------------------------ */
4484 DEFINE_NAME: /* Come here from (?< handling */
4485 case '\'':
4486 {
4487 terminator = (*ptr == '<')? '>' : '\'';
4488 name = ++ptr;
4489
4490 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4491 namelen = ptr - name;
4492
4493 /* In the pre-compile phase, just do a syntax check. */
4494
4495 if (lengthptr != NULL)
4496 {
4497 if (*ptr != terminator)
4498 {
4499 *errorcodeptr = ERR42;
4500 goto FAILED;
4501 }
4502 if (cd->names_found >= MAX_NAME_COUNT)
4503 {
4504 *errorcodeptr = ERR49;
4505 goto FAILED;
4506 }
4507 if (namelen + 3 > cd->name_entry_size)
4508 {
4509 cd->name_entry_size = namelen + 3;
4510 if (namelen > MAX_NAME_SIZE)
4511 {
4512 *errorcodeptr = ERR48;
4513 goto FAILED;
4514 }
4515 }
4516 }
4517
4518 /* In the real compile, create the entry in the table */
4519
4520 else
4521 {
4522 slot = cd->name_table;
4523 for (i = 0; i < cd->names_found; i++)
4524 {
4525 int crc = memcmp(name, slot+2, namelen);
4526 if (crc == 0)
4527 {
4528 if (slot[2+namelen] == 0)
4529 {
4530 if ((options & PCRE_DUPNAMES) == 0)
4531 {
4532 *errorcodeptr = ERR43;
4533 goto FAILED;
4534 }
4535 }
4536 else crc = -1; /* Current name is substring */
4537 }
4538 if (crc < 0)
4539 {
4540 memmove(slot + cd->name_entry_size, slot,
4541 (cd->names_found - i) * cd->name_entry_size);
4542 break;
4543 }
4544 slot += cd->name_entry_size;
4545 }
4546
4547 PUT2(slot, 0, cd->bracount + 1);
4548 memcpy(slot + 2, name, namelen);
4549 slot[2+namelen] = 0;
4550 }
4551 }
4552
4553 /* In both cases, count the number of names we've encountered. */
4554
4555 ptr++; /* Move past > or ' */
4556 cd->names_found++;
4557 goto NUMBERED_GROUP;
4558
4559
4560 /* ------------------------------------------------------------ */
4561 case '&': /* Perl recursion/subroutine syntax */
4562 terminator = ')';
4563 is_recurse = TRUE;
4564 /* Fall through */
4565
4566 /* We come here from the Python syntax above that handles both
4567 references (?P=name) and recursion (?P>name), as well as falling
4568 through from the Perl recursion syntax (?&name). We also come here from
4569 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4570 .NET syntax. */
4571
4572 NAMED_REF_OR_RECURSE:
4573 name = ++ptr;
4574 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4575 namelen = ptr - name;
4576
4577 /* In the pre-compile phase, do a syntax check and set a dummy
4578 reference number. */
4579
4580 if (lengthptr != NULL)
4581 {
4582 if (namelen == 0)
4583 {
4584 *errorcodeptr = ERR62;
4585 goto FAILED;
4586 }
4587 if (*ptr != terminator)
4588 {
4589 *errorcodeptr = ERR42;
4590 goto FAILED;
4591 }
4592 if (namelen > MAX_NAME_SIZE)
4593 {
4594 *errorcodeptr = ERR48;
4595 goto FAILED;
4596 }
4597 recno = 0;
4598 }
4599
4600 /* In the real compile, seek the name in the table. We check the name
4601 first, and then check that we have reached the end of the name in the
4602 table. That way, if the name is longer than any in the table,
4603 the comparison will fail without reading beyond the table entry. */
4604
4605 else
4606 {
4607 slot = cd->name_table;
4608 for (i = 0; i < cd->names_found; i++)
4609 {
4610 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4611 slot[2+namelen] == 0)
4612 break;
4613 slot += cd->name_entry_size;
4614 }
4615
4616 if (i < cd->names_found) /* Back reference */
4617 {
4618 recno = GET2(slot, 0);
4619 }
4620 else if ((recno = /* Forward back reference */
4621 find_parens(ptr, cd->bracount, name, namelen,
4622 (options & PCRE_EXTENDED) != 0)) <= 0)
4623 {
4624 *errorcodeptr = ERR15;
4625 goto FAILED;
4626 }
4627 }
4628
4629 /* In both phases, we can now go to the code that handles numerical
4630 recursion or backreferences. */
4631
4632 if (is_recurse) goto HANDLE_RECURSION;
4633 else goto HANDLE_REFERENCE;
4634
4635
4636 /* ------------------------------------------------------------ */
4637 case 'R': /* Recursion */
4638 ptr++; /* Same as (?0) */
4639 /* Fall through */
4640
4641
4642 /* ------------------------------------------------------------ */
4643 case '-': case '+':
4644 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4645 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4646 {
4647 const uschar *called;
4648
4649 if ((refsign = *ptr) == '+')
4650 {
4651 ptr++;
4652 if ((digitab[*ptr] & ctype_digit) == 0)
4653 {
4654 *errorcodeptr = ERR63;
4655 goto FAILED;
4656 }
4657 }
4658 else if (refsign == '-')
4659 {
4660 if ((digitab[ptr[1]] & ctype_digit) == 0)
4661 goto OTHER_CHAR_AFTER_QUERY;
4662 ptr++;
4663 }
4664
4665 recno = 0;
4666 while((digitab[*ptr] & ctype_digit) != 0)
4667 recno = recno * 10 + *ptr++ - '0';
4668
4669 if (*ptr != ')')
4670 {
4671 *errorcodeptr = ERR29;
4672 goto FAILED;
4673 }
4674
4675 if (refsign == '-')
4676 {
4677 if (recno == 0)
4678 {
4679 *errorcodeptr = ERR58;
4680 goto FAILED;
4681 }
4682 recno = cd->bracount - recno + 1;
4683 if (recno <= 0)
4684 {
4685 *errorcodeptr = ERR15;
4686 goto FAILED;
4687 }
4688 }
4689 else if (refsign == '+')
4690 {
4691 if (recno == 0)
4692 {
4693 *errorcodeptr = ERR58;
4694 goto FAILED;
4695 }
4696 recno += cd->bracount;
4697 }
4698
4699 /* Come here from code above that handles a named recursion */
4700
4701 HANDLE_RECURSION:
4702
4703 previous = code;
4704 called = cd->start_code;
4705
4706 /* When we are actually compiling, find the bracket that is being
4707 referenced. Temporarily end the regex in case it doesn't exist before
4708 this point. If we end up with a forward reference, first check that
4709 the bracket does occur later so we can give the error (and position)
4710 now. Then remember this forward reference in the workspace so it can
4711 be filled in at the end. */
4712
4713 if (lengthptr == NULL)
4714 {
4715 *code = OP_END;
4716 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4717
4718 /* Forward reference */
4719
4720 if (called == NULL)
4721 {
4722 if (find_parens(ptr, cd->bracount, NULL, recno,
4723 (options & PCRE_EXTENDED) != 0) < 0)
4724 {
4725 *errorcodeptr = ERR15;
4726 goto FAILED;
4727 }
4728 called = cd->start_code + recno;
4729 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4730 }
4731
4732 /* If not a forward reference, and the subpattern is still open,
4733 this is a recursive call. We check to see if this is a left
4734 recursion that could loop for ever, and diagnose that case. */
4735
4736 else if (GET(called, 1) == 0 &&
4737 could_be_empty(called, code, bcptr, utf8))
4738 {
4739 *errorcodeptr = ERR40;
4740 goto FAILED;
4741 }
4742 }
4743
4744 /* Insert the recursion/subroutine item, automatically wrapped inside
4745 "once" brackets. Set up a "previous group" length so that a
4746 subsequent quantifier will work. */
4747
4748 *code = OP_ONCE;
4749 PUT(code, 1, 2 + 2*LINK_SIZE);
4750 code += 1 + LINK_SIZE;
4751
4752 *code = OP_RECURSE;
4753 PUT(code, 1, called - cd->start_code);
4754 code += 1 + LINK_SIZE;
4755
4756 *code = OP_KET;
4757 PUT(code, 1, 2 + 2*LINK_SIZE);
4758 code += 1 + LINK_SIZE;
4759
4760 length_prevgroup = 3 + 3*LINK_SIZE;
4761 }
4762
4763 /* Can't determine a first byte now */
4764
4765 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4766 continue;
4767
4768
4769 /* ------------------------------------------------------------ */
4770 default: /* Other characters: check option setting */
4771 OTHER_CHAR_AFTER_QUERY:
4772 set = unset = 0;
4773 optset = &set;
4774
4775 while (*ptr != ')' && *ptr != ':')
4776 {
4777 switch (*ptr++)
4778 {
4779 case '-': optset = &unset; break;
4780
4781 case 'J': /* Record that it changed in the external options */
4782 *optset |= PCRE_DUPNAMES;
4783 cd->external_flags |= PCRE_JCHANGED;
4784 break;
4785
4786 case 'i': *optset |= PCRE_CASELESS; break;
4787 case 'm': *optset |= PCRE_MULTILINE; break;
4788 case 's': *optset |= PCRE_DOTALL; break;
4789 case 'x': *optset |= PCRE_EXTENDED; break;
4790 case 'U': *optset |= PCRE_UNGREEDY; break;
4791 case 'X': *optset |= PCRE_EXTRA; break;
4792
4793 default: *errorcodeptr = ERR12;
4794 ptr--; /* Correct the offset */
4795 goto FAILED;
4796 }
4797 }
4798
4799 /* Set up the changed option bits, but don't change anything yet. */
4800
4801 newoptions = (options | set) & (~unset);
4802
4803 /* If the options ended with ')' this is not the start of a nested
4804 group with option changes, so the options change at this level. If this
4805 item is right at the start of the pattern, the options can be
4806 abstracted and made external in the pre-compile phase, and ignored in
4807 the compile phase. This can be helpful when matching -- for instance in
4808 caseless checking of required bytes.
4809
4810 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4811 definitely *not* at the start of the pattern because something has been
4812 compiled. In the pre-compile phase, however, the code pointer can have
4813 that value after the start, because it gets reset as code is discarded
4814 during the pre-compile. However, this can happen only at top level - if
4815 we are within parentheses, the starting BRA will still be present. At
4816 any parenthesis level, the length value can be used to test if anything
4817 has been compiled at that level. Thus, a test for both these conditions
4818 is necessary to ensure we correctly detect the start of the pattern in
4819 both phases.
4820
4821 If we are not at the pattern start, compile code to change the ims
4822 options if this setting actually changes any of them. We also pass the
4823 new setting back so that it can be put at the start of any following
4824 branches, and when this group ends (if we are in a group), a resetting
4825 item can be compiled. */
4826
4827 if (*ptr == ')')
4828 {
4829 if (code == cd->start_code + 1 + LINK_SIZE &&
4830 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4831 {
4832 cd->external_options = newoptions;
4833 options = newoptions;
4834 }
4835 else
4836 {
4837 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4838 {
4839 *code++ = OP_OPT;
4840 *code++ = newoptions & PCRE_IMS;
4841 }
4842
4843 /* Change options at this level, and pass them back for use
4844 in subsequent branches. Reset the greedy defaults and the case
4845 value for firstbyte and reqbyte. */
4846
4847 *optionsptr = options = newoptions;
4848 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4849 greedy_non_default = greedy_default ^ 1;
4850 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4851 }
4852
4853 previous = NULL; /* This item can't be repeated */
4854 continue; /* It is complete */
4855 }
4856
4857 /* If the options ended with ':' we are heading into a nested group
4858 with possible change of options. Such groups are non-capturing and are
4859 not assertions of any kind. All we need to do is skip over the ':';
4860 the newoptions value is handled below. */
4861
4862 bravalue = OP_BRA;
4863 ptr++;
4864 } /* End of switch for character following (? */
4865 } /* End of (? handling */
4866
4867 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4868 all unadorned brackets become non-capturing and behave like (?:...)
4869 brackets. */
4870
4871 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4872 {
4873 bravalue = OP_BRA;
4874 }
4875
4876 /* Else we have a capturing group. */
4877
4878 else
4879 {
4880 NUMBERED_GROUP:
4881 cd->bracount += 1;
4882 PUT2(code, 1+LINK_SIZE, cd->bracount);
4883 skipbytes = 2;
4884 }
4885
4886 /* Process nested bracketed regex. Assertions may not be repeated, but
4887 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4888 non-register variable in order to be able to pass its address because some
4889 compilers complain otherwise. Pass in a new setting for the ims options if
4890 they have changed. */
4891
4892 previous = (bravalue >= OP_ONCE)? code : NULL;
4893 *code = bravalue;
4894 tempcode = code;
4895 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4896 length_prevgroup = 0; /* Initialize for pre-compile phase */
4897
4898 if (!compile_regex(
4899 newoptions, /* The complete new option state */
4900 options & PCRE_IMS, /* The previous ims option state */
4901 &tempcode, /* Where to put code (updated) */
4902 &ptr, /* Input pointer (updated) */
4903 errorcodeptr, /* Where to put an error message */
4904 (bravalue == OP_ASSERTBACK ||
4905 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4906 reset_bracount, /* True if (?| group */
4907 skipbytes, /* Skip over bracket number */
4908 &subfirstbyte, /* For possible first char */
4909 &subreqbyte, /* For possible last char */
4910 bcptr, /* Current branch chain */
4911 cd, /* Tables block */
4912 (lengthptr == NULL)? NULL : /* Actual compile phase */
4913 &length_prevgroup /* Pre-compile phase */
4914 ))
4915 goto FAILED;
4916
4917 /* At the end of compiling, code is still pointing to the start of the
4918 group, while tempcode has been updated to point past the end of the group
4919 and any option resetting that may follow it. The pattern pointer (ptr)
4920 is on the bracket. */
4921
4922 /* If this is a conditional bracket, check that there are no more than
4923 two branches in the group, or just one if it's a DEFINE group. We do this
4924 in the real compile phase, not in the pre-pass, where the whole group may
4925 not be available. */
4926
4927 if (bravalue == OP_COND && lengthptr == NULL)
4928 {
4929 uschar *tc = code;
4930 int condcount = 0;
4931
4932 do {
4933 condcount++;
4934 tc += GET(tc,1);
4935 }
4936 while (*tc != OP_KET);
4937
4938 /* A DEFINE group is never obeyed inline (the "condition" is always
4939 false). It must have only one branch. */
4940
4941 if (code[LINK_SIZE+1] == OP_DEF)
4942 {
4943 if (condcount > 1)
4944 {
4945 *errorcodeptr = ERR54;
4946 goto FAILED;
4947 }
4948 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4949 }
4950
4951 /* A "normal" conditional group. If there is just one branch, we must not
4952 make use of its firstbyte or reqbyte, because this is equivalent to an
4953 empty second branch. */
4954
4955 else
4956 {
4957 if (condcount > 2)
4958 {
4959 *errorcodeptr = ERR27;
4960 goto FAILED;
4961 }
4962 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4963 }
4964 }
4965
4966 /* Error if hit end of pattern */
4967
4968 if (*ptr != ')')
4969 {
4970 *errorcodeptr = ERR14;
4971 goto FAILED;
4972 }
4973
4974 /* In the pre-compile phase, update the length by the length of the group,
4975 less the brackets at either end. Then reduce the compiled code to just a
4976 set of non-capturing brackets so that it doesn't use much memory if it is
4977 duplicated by a quantifier.*/
4978
4979 if (lengthptr != NULL)
4980 {
4981 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4982 {
4983 *errorcodeptr = ERR20;
4984 goto FAILED;
4985 }
4986 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4987 *code++ = OP_BRA;
4988 PUTINC(code, 0, 1 + LINK_SIZE);
4989 *code++ = OP_KET;
4990 PUTINC(code, 0, 1 + LINK_SIZE);
4991 break; /* No need to waste time with special character handling */
4992 }
4993
4994 /* Otherwise update the main code pointer to the end of the group. */
4995
4996 code = tempcode;
4997
4998 /* For a DEFINE group, required and first character settings are not
4999 relevant. */
5000
5001 if (bravalue == OP_DEF) break;
5002
5003 /* Handle updating of the required and first characters for other types of
5004 group. Update for normal brackets of all kinds, and conditions with two
5005 branches (see code above). If the bracket is followed by a quantifier with
5006 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5007 zerofirstbyte outside the main loop so that they can be accessed for the
5008 back off. */
5009
5010 zeroreqbyte = reqbyte;
5011 zerofirstbyte = firstbyte;
5012 groupsetfirstbyte = FALSE;
5013
5014 if (bravalue >= OP_ONCE)
5015 {
5016 /* If we have not yet set a firstbyte in this branch, take it from the
5017 subpattern, remembering that it was set here so that a repeat of more
5018 than one can replicate it as reqbyte if necessary. If the subpattern has
5019 no firstbyte, set "none" for the whole branch. In both cases, a zero
5020 repeat forces firstbyte to "none". */
5021
5022 if (firstbyte == REQ_UNSET)
5023 {
5024 if (subfirstbyte >= 0)
5025 {
5026 firstbyte = subfirstbyte;
5027 groupsetfirstbyte = TRUE;
5028 }
5029 else firstbyte = REQ_NONE;
5030 zerofirstbyte = REQ_NONE;
5031 }
5032
5033 /* If firstbyte was previously set, convert the subpattern's firstbyte
5034 into reqbyte if there wasn't one, using the vary flag that was in
5035 existence beforehand. */
5036
5037 else if (subfirstbyte >= 0 && subreqbyte < 0)
5038 subreqbyte = subfirstbyte | tempreqvary;
5039
5040 /* If the subpattern set a required byte (or set a first byte that isn't
5041 really the first byte - see above), set it. */
5042
5043 if (subreqbyte >= 0) reqbyte = subreqbyte;
5044 }
5045
5046 /* For a forward assertion, we take the reqbyte, if set. This can be
5047 helpful if the pattern that follows the assertion doesn't set a different
5048 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5049 for an assertion, however, because it leads to an incorrect effect for patterns
5050 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5051 of a firstbyte. This is overcome by a scan at the end if there's no
5052 firstbyte, looking for an asserted first char. */
5053
5054 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5055 break; /* End of processing '(' */
5056
5057
5058 /* ===================================================================*/
5059 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5060 are arranged to be the negation of the corresponding OP_values. For the
5061 back references, the values are ESC_REF plus the reference number. Only
5062 back references and those types that consume a character may be repeated.
5063 We can test for values between ESC_b and ESC_Z for the latter; this may
5064 have to change if any new ones are ever created. */
5065
5066 case '\\':
5067 tempptr = ptr;
5068 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5069 if (*errorcodeptr != 0) goto FAILED;
5070
5071 if (c < 0)
5072 {
5073 if (-c == ESC_Q) /* Handle start of quoted string */
5074 {
5075 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5076 else inescq = TRUE;
5077 continue;
5078 }
5079
5080 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5081
5082 /* For metasequences that actually match a character, we disable the
5083 setting of a first character if it hasn't already been set. */
5084
5085 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5086 firstbyte = REQ_NONE;
5087
5088 /* Set values to reset to if this is followed by a zero repeat. */
5089
5090 zerofirstbyte = firstbyte;
5091 zeroreqbyte = reqbyte;
5092
5093 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5094 We also support \k{name} (.NET syntax) */
5095
5096 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5097 {
5098 is_recurse = FALSE;
5099 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5100 goto NAMED_REF_OR_RECURSE;
5101 }
5102
5103 /* Back references are handled specially; must disable firstbyte if
5104 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5105 ':' later. */
5106
5107 if (-c >= ESC_REF)
5108 {
5109 recno = -c - ESC_REF;
5110
5111 HANDLE_REFERENCE: /* Come here from named backref handling */
5112 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5113 previous = code;
5114 *code++ = OP_REF;
5115 PUT2INC(code, 0, recno);
5116 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5117 if (recno > cd->top_backref) cd->top_backref = recno;
5118 }
5119
5120 /* So are Unicode property matches, if supported. */
5121
5122 #ifdef SUPPORT_UCP
5123 else if (-c == ESC_P || -c == ESC_p)
5124 {
5125 BOOL negated;
5126 int pdata;
5127 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5128 if (ptype < 0) goto FAILED;
5129 previous = code;
5130 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5131 *code++ = ptype;
5132 *code++ = pdata;
5133 }
5134 #else
5135
5136 /* If Unicode properties are not supported, \X, \P, and \p are not
5137 allowed. */
5138
5139 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5140 {
5141 *errorcodeptr = ERR45;
5142 goto FAILED;
5143 }
5144 #endif
5145
5146 /* For the rest (including \X when Unicode properties are supported), we
5147 can obtain the OP value by negating the escape value. */
5148
5149 else
5150 {
5151 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5152 *code++ = -c;
5153 }
5154 continue;
5155 }
5156
5157 /* We have a data character whose value is in c. In UTF-8 mode it may have
5158 a value > 127. We set its representation in the length/buffer, and then
5159 handle it as a data character. */
5160
5161 #ifdef SUPPORT_UTF8
5162 if (utf8 && c > 127)
5163 mclength = _pcre_ord2utf8(c, mcbuffer);
5164 else
5165 #endif
5166
5167 {
5168 mcbuffer[0] = c;
5169 mclength = 1;
5170 }
5171 goto ONE_CHAR;
5172
5173
5174 /* ===================================================================*/
5175 /* Handle a literal character. It is guaranteed not to be whitespace or #
5176 when the extended flag is set. If we are in UTF-8 mode, it may be a
5177 multi-byte literal character. */
5178
5179 default:
5180 NORMAL_CHAR:
5181 mclength = 1;
5182 mcbuffer[0] = c;
5183
5184 #ifdef SUPPORT_UTF8
5185 if (utf8 && c >= 0xc0)
5186 {
5187 while ((ptr[1] & 0xc0) == 0x80)
5188 mcbuffer[mclength++] = *(++ptr);
5189 }
5190 #endif
5191
5192 /* At this point we have the character's bytes in mcbuffer, and the length
5193 in mclength. When not in UTF-8 mode, the length is always 1. */
5194
5195 ONE_CHAR:
5196 previous = code;
5197 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5198 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5199
5200 /* Remember if \r or \n were seen */
5201
5202 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5203 cd->external_flags |= PCRE_HASCRORLF;
5204
5205 /* Set the first and required bytes appropriately. If no previous first
5206 byte, set it from this character, but revert to none on a zero repeat.
5207 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5208 repeat. */
5209
5210 if (firstbyte == REQ_UNSET)
5211 {
5212 zerofirstbyte = REQ_NONE;
5213 zeroreqbyte = reqbyte;
5214
5215 /* If the character is more than one byte long, we can set firstbyte
5216 only if it is not to be matched caselessly. */
5217
5218 if (mclength == 1 || req_caseopt == 0)
5219 {
5220 firstbyte = mcbuffer[0] | req_caseopt;
5221 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5222 }
5223 else firstbyte = reqbyte = REQ_NONE;
5224 }
5225
5226 /* firstbyte was previously set; we can set reqbyte only if the length is
5227 1 or the matching is caseful. */
5228
5229 else
5230 {
5231 zerofirstbyte = firstbyte;
5232 zeroreqbyte = reqbyte;
5233 if (mclength == 1 || req_caseopt == 0)
5234 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5235 }
5236
5237 break; /* End of literal character handling */
5238 }
5239 } /* end of big loop */
5240
5241
5242 /* Control never reaches here by falling through, only by a goto for all the
5243 error states. Pass back the position in the pattern so that it can be displayed
5244 to the user for diagnosing the error. */
5245
5246 FAILED:
5247 *ptrptr = ptr;
5248 return FALSE;
5249 }
5250
5251
5252
5253
5254 /*************************************************
5255 * Compile sequence of alternatives *
5256 *************************************************/
5257
5258 /* On entry, ptr is pointing past the bracket character, but on return it
5259 points to the closing bracket, or vertical bar, or end of string. The code
5260 variable is pointing at the byte into which the BRA operator has been stored.
5261 If the ims options are changed at the start (for a (?ims: group) or during any
5262 branch, we need to insert an OP_OPT item at the start of every following branch
5263 to ensure they get set correctly at run time, and also pass the new options
5264 into every subsequent branch compile.
5265
5266 This function is used during the pre-compile phase when we are trying to find
5267 out the amount of memory needed, as well as during the real compile phase. The
5268 value of lengthptr distinguishes the two phases.
5269
5270 Arguments:
5271 options option bits, including any changes for this subpattern
5272 oldims previous settings of ims option bits
5273 codeptr -> the address of the current code pointer
5274 ptrptr -> the address of the current pattern pointer
5275 errorcodeptr -> pointer to error code variable
5276 lookbehind TRUE if this is a lookbehind assertion
5277 reset_bracount TRUE to reset the count for each branch
5278 skipbytes skip this many bytes at start (for brackets and OP_COND)
5279 firstbyteptr place to put the first required character, or a negative number
5280 reqbyteptr place to put the last required character, or a negative number
5281 bcptr pointer to the chain of currently open branches
5282 cd points to the data block with tables pointers etc.
5283 lengthptr NULL during the real compile phase
5284 points to length accumulator during pre-compile phase
5285
5286 Returns: TRUE on success
5287 */
5288
/* compile_regex(): compile one bracketed group by calling compile_branch()
once per alternative until a branch ends on something other than '|'. When
lengthptr is non-NULL (pre-compile phase) only the required length is
accumulated and the code pointer is rewound after every branch; when it is NULL
(real compile) the BRA/ALT/KET link fields are filled in and the per-branch
first/required byte values are merged. Returns TRUE on success; every failure
path returns FALSE after storing the current pattern position in *ptrptr. */
5289 static BOOL
5290 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5291 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5292 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5293 int *lengthptr)
5294 {
5295 const uschar *ptr = *ptrptr;
5296 uschar *code = *codeptr;
5297 uschar *last_branch = code;
5298 uschar *start_bracket = code;
5299 uschar *reverse_count = NULL;
5300 int firstbyte, reqbyte;
5301 int branchfirstbyte, branchreqbyte;
5302 int length;
5303 int orig_bracount;
5304 int max_bracount;
5305 branch_chain bc;
5306
/* Chain this group's entry onto the caller's branch chain; its current branch
starts at the opening bracket opcode. The entry is handed to compile_branch(). */
5307 bc.outer = bcptr;
5308 bc.current = code;
5309
5310 firstbyte = reqbyte = REQ_UNSET;
5311
5312 /* Accumulate the length for use in the pre-compile phase. Start with the
5313 length of the BRA and KET and any extra bytes that are required at the
5314 beginning. We accumulate in a local variable to save frequent testing of
5315 lengthptr for NULL. We cannot do this by looking at the value of code at the
5316 start and end of each alternative, because compiled items are discarded during
5317 the pre-compile phase so that the work space is not exceeded. */
5318
5319 length = 2 + 2*LINK_SIZE + skipbytes;
5320
5321 /* WARNING: If the above line is changed for any reason, you must also change
5322 the code that abstracts option settings at the start of the pattern and makes
5323 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5324 pre-compile phase to find out whether anything has yet been compiled or not. */
5325
5326 /* Offset is set zero to mark that this bracket is still open */
5327
5328 PUT(code, 1, 0);
5329 code += 1 + LINK_SIZE + skipbytes;
5330
5331 /* Loop for each alternative branch */
5332
5333 orig_bracount = max_bracount = cd->bracount;
5334 for (;;)
5335 {
5336 /* For a (?| group, reset the capturing bracket count so that each branch
5337 uses the same numbers. */
5338
5339 if (reset_bracount) cd->bracount = orig_bracount;
5340
5341 /* Handle a change of ims options at the start of the branch */
5342
5343 if ((options & PCRE_IMS) != oldims)
5344 {
5345 *code++ = OP_OPT;
5346 *code++ = options & PCRE_IMS;
5347 length += 2;
5348 }
5349
5350 /* Set up dummy OP_REVERSE if lookbehind assertion */
5351
5352 if (lookbehind)
5353 {
5354 *code++ = OP_REVERSE;
/* Remember where the count lives; it is filled in after the branch has been
compiled (real compile phase only). */
5355 reverse_count = code;
5356 PUTINC(code, 0, 0);
5357 length += 1 + LINK_SIZE;
5358 }
5359
5360 /* Now compile the branch; in the pre-compile phase its length gets added
5361 into the length. */
5362
5363 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5364 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5365 {
/* Pass the failure position back to the caller. */
5366 *ptrptr = ptr;
5367 return FALSE;
5368 }
5369
5370 /* Keep the highest bracket count in case (?| was used and some branch
5371 has fewer than the rest. */
5372
5373 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5374
5375 /* In the real compile phase, there is some post-processing to be done. */
5376
5377 if (lengthptr == NULL)
5378 {
5379 /* If this is the first branch, the firstbyte and reqbyte values for the
5380 branch become the values for the regex. */
5381
/* last_branch is moved onto an OP_ALT item only when a further alternative is
started (bottom of this loop), so "not OP_ALT" identifies the first branch. */
5382 if (*last_branch != OP_ALT)
5383 {
5384 firstbyte = branchfirstbyte;
5385 reqbyte = branchreqbyte;
5386 }
5387
5388 /* If this is not the first branch, the first char and reqbyte have to
5389 match the values from all the previous branches, except that if the
5390 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5391 and we set REQ_VARY for the regex. */
5392
5393 else
5394 {
5395 /* If we previously had a firstbyte, but it doesn't match the new branch,
5396 we have to abandon the firstbyte for the regex, but if there was
5397 previously no reqbyte, it takes on the value of the old firstbyte. */
5398
5399 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5400 {
5401 if (reqbyte < 0) reqbyte = firstbyte;
5402 firstbyte = REQ_NONE;
5403 }
5404
5405 /* If we (now or from before) have no firstbyte, a firstbyte from the
5406 branch becomes a reqbyte if there isn't a branch reqbyte. */
5407
5408 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5409 branchreqbyte = branchfirstbyte;
5410
5411 /* Now ensure that the reqbytes match */
5412
5413 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5414 reqbyte = REQ_NONE;
5415 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5416 }
5417
5418 /* If lookbehind, check that this branch matches a fixed-length string, and
5419 put the length into the OP_REVERSE item. Temporarily mark the end of the
5420 branch with OP_END. */
5421
5422 if (lookbehind)
5423 {
5424 int fixed_length;
5425 *code = OP_END;
5426 fixed_length = find_fixedlength(last_branch, options);
5427 DPRINTF(("fixed length = %d\n", fixed_length));
5428 if (fixed_length < 0)
5429 {
/* A result of -2 is reported as ERR36, any other negative result as ERR25. */
5430 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5431 *ptrptr = ptr;
5432 return FALSE;
5433 }
5434 PUT(reverse_count, 0, fixed_length);
5435 }
5436 }
5437
5438 /* Reached end of expression, either ')' or end of pattern. In the real
5439 compile phase, go back through the alternative branches and reverse the chain
5440 of offsets, with the field in the BRA item now becoming an offset to the
5441 first alternative. If there are no alternatives, it points to the end of the
5442 group. The length in the terminating ket is always the length of the whole
5443 bracketed item. If any of the ims options were changed inside the group,
5444 compile a resetting op-code following, except at the very end of the pattern.
5445 Return leaving the pointer at the terminating char. */
5446
5447 if (*ptr != '|')
5448 {
5449 if (lengthptr == NULL)
5450 {
5451 int branch_length = code - last_branch;
/* Each link currently holds the distance back to the previous branch (zero in
the opening bracket). Overwrite it with the forward length, then step back by
the old value; the zero stored in the opening bracket ends the walk. */
5452 do
5453 {
5454 int prev_length = GET(last_branch, 1);
5455 PUT(last_branch, 1, branch_length);
5456 branch_length = prev_length;
5457 last_branch -= branch_length;
5458 }
5459 while (branch_length > 0);
5460 }
5461
5462 /* Fill in the ket */
5463
5464 *code = OP_KET;
5465 PUT(code, 1, code - start_bracket);
5466 code += 1 + LINK_SIZE;
5467
5468 /* Resetting option if needed */
5469
5470 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5471 {
5472 *code++ = OP_OPT;
5473 *code++ = oldims;
5474 length += 2;
5475 }
5476
5477 /* Retain the highest bracket number, in case resetting was used. */
5478
5479 cd->bracount = max_bracount;
5480
5481 /* Set values to pass back */
5482
5483 *codeptr = code;
5484 *ptrptr = ptr;
5485 *firstbyteptr = firstbyte;
5486 *reqbyteptr = reqbyte;
5487 if (lengthptr != NULL)
5488 {
/* Check before adding so that the caller's accumulated length cannot go past
OFLOW_MAX; the output values above have already been stored at this point. */
5489 if (OFLOW_MAX - *lengthptr < length)
5490 {
5491 *errorcodeptr = ERR20;
5492 return FALSE;
5493 }
5494 *lengthptr += length;
5495 }
5496 return TRUE;
5497 }
5498
5499 /* Another branch follows. In the pre-compile phase, we can move the code
5500 pointer back to where it was for the start of the first branch. (That is,
5501 pretend that each branch is the only one.)
5502
5503 In the real compile phase, insert an ALT node. Its length field points back
5504 to the previous branch while the bracket remains open. At the end the chain
5505 is reversed. It's done like this so that the start of the bracket has a
5506 zero offset until it is closed, making it possible to detect recursion. */
5507
5508 if (lengthptr != NULL)
5509 {
5510 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5511 length += 1 + LINK_SIZE;
5512 }
5513 else
5514 {
5515 *code = OP_ALT;
5516 PUT(code, 1, code - last_branch);
5517 bc.current = last_branch = code;
5518 code += 1 + LINK_SIZE;
5519 }
5520
/* Step over the '|' that ended the branch just compiled. */
5521 ptr++;
5522 }
5523 /* Control never reaches here */
5524 }
5525
5526
5527
5528
5529 /*************************************************
5530 * Check for anchored expression *
5531 *************************************************/
5532
5533 /* Try to find out if this is an anchored regular expression. Consider each
5534 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5535 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5536 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5537 counts, since OP_CIRC can match in the middle.
5538
5539 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5540 This is the code for \G, which means "match at start of match position, taking
5541 into account the match offset".
5542
5543 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5544 because that will try the rest of the pattern at all possible matching points,
5545 so there is no point trying again.... er ....
5546
5547 .... except when the .* appears inside capturing parentheses, and there is a
5548 subsequent back reference to those parentheses. We haven't enough information
5549 to catch that case precisely.
5550
5551 At first, the best we could do was to detect when .* was in capturing brackets
5552 and the highest back reference was greater than or equal to that level.
5553 However, by keeping a bitmap of the first 31 back references, we can catch some
5554 of the more common cases more precisely.
5555
5556 Arguments:
5557 code points to start of expression (the bracket)
5558 options points to the options setting
5559 bracket_map a bitmap of which brackets we are inside while testing; this
5560 handles up to substring 31; after that we just have to take
5561 the less precise approach
5562 backref_map the back reference bitmap
5563
5564 Returns: TRUE or FALSE
5565 */
5566
5567 static BOOL
5568 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5569 unsigned int backref_map)
5570 {
5571 do {
5572 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5573 options, PCRE_MULTILINE, FALSE);
5574 register int op = *scode;
5575
5576 /* Non-capturing brackets */
5577
5578 if (op == OP_BRA)
5579 {
5580 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5581 }
5582
5583 /* Capturing brackets */
5584
5585 else if (op == OP_CBRA)
5586 {
5587 int n = GET2(scode, 1+LINK_SIZE);
5588 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5589 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5590 }
5591
5592 /* Other brackets */
5593
5594 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5595 {
5596 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5597 }
5598
5599 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5600 are or may be referenced. */
5601
5602 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5603 op == OP_TYPEPOSSTAR) &&
5604 (*options & PCRE_DOTALL) != 0)
5605 {
5606 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5607 }
5608
5609 /* Check for explicit anchoring */
5610
5611 else if (op != OP_SOD && op != OP_SOM &&
5612 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5613 return FALSE;
5614 code += GET(code, 1);
5615 }
5616 while (*code == OP_ALT); /* Loop for each alternative */
5617 return TRUE;
5618 }
5619
5620
5621
5622 /*************************************************
5623 * Check for starting with ^ or .* *
5624 *************************************************/
5625
5626 /* This is called to find out if every branch starts with ^ or .* so that
5627 "first char" processing can be done to speed things up in multiline
5628 matching and for non-DOTALL patterns that start with .* (which must start at
5629 the beginning or after \n). As in the case of is_anchored() (see above), we
5630 have to take account of back references to capturing brackets that contain .*
5631 because in that case we can't make the assumption.
5632
5633 Arguments:
5634 code points to start of expression (the bracket)
5635 bracket_map a bitmap of which brackets we are inside while testing; this
5636 handles up to substring 31; after that we just have to take
5637 the less precise approach
5638 backref_map the back reference bitmap
5639
5640 Returns: TRUE or FALSE
5641 */
5642
5643 static BOOL
5644 is_startline(const uschar *code, unsigned int bracket_map,
5645 unsigned int backref_map)
5646 {
5647 do {
5648 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5649 NULL, 0, FALSE);
5650 register int op = *scode;
5651
5652 /* Non-capturing brackets */
5653
5654 if (op == OP_BRA)
5655 {
5656 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5657 }
5658
5659 /* Capturing brackets */
5660
5661 else if (op == OP_CBRA)
5662 {
5663 int n = GET2(scode, 1+LINK_SIZE);
5664 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5665 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5666 }
5667
5668 /* Other brackets */
5669
5670 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5671 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5672
5673 /* .* means "start at start or after \n" if it isn't in brackets that
5674 may be referenced. */
5675
5676 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5677 {
5678 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5679 }
5680
5681 /* Check for explicit circumflex */
5682
5683 else if (op != OP_CIRC) return FALSE;
5684
5685 /* Move on to the next alternative */
5686
5687 code += GET(code, 1);
5688 }
5689 while (*code == OP_ALT); /* Loop for each alternative */
5690 return TRUE;
5691 }
5692
5693
5694
5695 /*************************************************
5696 * Check for asserted fixed first char *
5697 *************************************************/
5698
5699 /* During compilation, the "first char" settings from forward assertions are
5700 discarded, because they can cause conflicts with actual literals that follow.
5701 However, if we end up without a first char setting for an unanchored pattern,
5702 it is worth scanning the regex to see if there is an initial asserted first
5703 char. If all branches start with the same asserted char, or with a bracket all
5704 of whose alternatives start with the same asserted char (recurse ad lib), then
5705 we return that char, otherwise -1.
5706
5707 Arguments:
5708 code points to start of expression (the bracket)
5709 options pointer to the options (used to check casing changes)
5710 inassert TRUE if in an assertion
5711
5712 Returns: -1 or the fixed first char
5713 */
5714
5715 static int
5716 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5717 {
5718 register int c = -1;
5719 do {
5720 int d;
5721 const uschar *scode =
5722 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5723 register int op = *scode;
5724
5725 switch(op)
5726 {
5727 default:
5728 return -1;
5729
5730 case OP_BRA:
5731 case OP_CBRA:
5732 case OP_ASSERT:
5733 case OP_ONCE:
5734 case OP_COND:
5735 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5736 return -1;
5737 if (c < 0) c = d; else if (c != d) return -1;
5738 break;
5739
5740 case OP_EXACT: /* Fall through */
5741 scode += 2;
5742
5743 case OP_CHAR:
5744 case OP_CHARNC:
5745 case OP_PLUS:
5746 case OP_MINPLUS:
5747 case OP_POSPLUS:
5748 if (!inassert) return -1;
5749 if (c < 0)
5750 {
5751 c = scode[1];
5752 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5753 }
5754 else if (c != scode[1]) return -1;
5755 break;
5756 }
5757
5758 code += GET(code, 1);
5759 }
5760 while (*code == OP_ALT);
5761 return c;
5762 }
5763
5764
5765
5766 /*************************************************
5767 * Compile a Regular Expression *
5768 *************************************************/
5769
5770 /* This function takes a string and returns a pointer to a block of store
5771 holding a compiled version of the expression. The original API for this
5772 function had no error code return variable; it is retained for backwards
5773 compatibility. The new function is given a new name.
5774
5775 Arguments:
5776 pattern the regular expression
5777 options various option bits
5778 errorcodeptr pointer to error code variable (pcre_compile2() only)
5779 can be NULL if you don't want a code value
5780 errorptr pointer to pointer to error text
5781 erroroffset ptr offset in pattern where error was detected
5782 tables pointer to character tables or NULL
5783
5784 Returns: pointer to compiled data block, or NULL on error,
5785 with errorptr and erroroffset set
5786 */
5787
5788 PCRE_EXP_DEFN pcre *
5789 pcre_compile(const char *pattern, int options, const char **errorptr,
5790 int *erroroffset, const unsigned char *tables)
5791 {
5792 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5793 }
5794
5795
5796 PCRE_EXP_DEFN pcre *
5797 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5798 const char **errorptr, int *erroroffset, const unsigned char *tables)
5799 {
5800 real_pcre *re;
5801 int length = 1; /* For final END opcode */
5802 int firstbyte, reqbyte, newline;
5803 int errorcode = 0;
5804 int skipatstart = 0;
5805 #ifdef SUPPORT_UTF8
5806 BOOL utf8;
5807 #endif
5808 size_t size;
5809 uschar *code;
5810 const uschar *codestart;
5811 const uschar *ptr;
5812 compile_data compile_block;
5813 compile_data *cd = &compile_block;
5814
5815 /* This space is used for "compiling" into during the first phase, when we are
5816 computing the amount of memory that is needed. Compiled items are thrown away
5817 as soon as possible, so that a fairly large buffer should be sufficient for
5818 this purpose. The same space is used in the second phase for remembering where
5819 to fill in forward references to subpatterns. */
5820
5821 uschar cworkspace[COMPILE_WORK_SIZE];
5822
5823 /* Set this early so that early errors get offset 0. */
5824
5825 ptr = (const uschar *)pattern;
5826
5827 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5828 can do is just return NULL, but we can set a code value if there is a code
5829 pointer. */
5830
5831 if (errorptr == NULL)
5832 {
5833 if (errorcodeptr != NULL) *errorcodeptr = 99;
5834 return NULL;
5835 }
5836
5837 *errorptr = NULL;
5838 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5839
5840 /* However, we can give a message for this error */
5841
5842 if (erroroffset == NULL)
5843 {
5844 errorcode = ERR16;
5845 goto PCRE_EARLY_ERROR_RETURN2;
5846 }
5847
5848 *erroroffset = 0;
5849
5850 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5851
5852 #ifdef SUPPORT_UTF8
5853 utf8 = (options & PCRE_UTF8) != 0;
5854 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5855 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5856 {
5857 errorcode = ERR44;
5858 goto PCRE_EARLY_ERROR_RETURN2;
5859 }
5860 #else
5861 if ((options & PCRE_UTF8) != 0)
5862 {
5863 errorcode = ERR32;
5864 goto PCRE_EARLY_ERROR_RETURN;
5865 }
5866 #endif
5867
5868 if ((options & ~PUBLIC_OPTIONS) != 0)
5869 {
5870 errorcode = ERR17;
5871 goto PCRE_EARLY_ERROR_RETURN;
5872 }
5873
5874 /* Set up pointers to the individual character tables */
5875
5876 if (tables == NULL) tables = _pcre_default_tables;
5877 cd->lcc = tables + lcc_offset;
5878 cd->fcc = tables + fcc_offset;
5879 cd->cbits = tables + cbits_offset;
5880 cd->ctypes = tables + ctypes_offset;
5881
5882 /* Check for global one-time settings at the start of the pattern, and remember
5883 the offset for later. */
5884
5885 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5886 {
5887 int newnl = 0;
5888 int newbsr = 0;
5889
5890 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5891 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5892 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5893 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5894 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5895 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5896 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5897 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5898 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
5899 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5900
5901 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5902 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5903 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5904 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5905
5906 if (newnl != 0)
5907 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5908 else if (newbsr != 0)
5909 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5910 else break;
5911 }
5912
5913 /* Check validity of \R options. */
5914
5915 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5916 {
5917 case 0:
5918 case PCRE_BSR_ANYCRLF:
5919 case PCRE_BSR_UNICODE:
5920 break;
5921 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5922 }
5923
5924 /* Handle different types of newline. The three bits give seven cases. The
5925 current code allows for fixed one- or two-byte sequences, plus "any" and
5926 "anycrlf". */
5927
5928 switch (options & PCRE_NEWLINE_BITS)
5929 {
5930 case 0: newline = NEWLINE; break; /* Build-time default */
5931 case PCRE_NEWLINE_CR: newline = '\r'; break;
5932 case PCRE_NEWLINE_LF: newline = '\n'; break;
5933 case PCRE_NEWLINE_CR+
5934 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5935 case PCRE_NEWLINE_ANY: newline = -1; break;
5936 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5937 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5938 }
5939
5940 if (newline == -2)
5941 {
5942 cd->nltype = NLTYPE_ANYCRLF;
5943 }
5944 else if (newline < 0)
5945 {
5946 cd->nltype = NLTYPE_ANY;
5947 }
5948 else
5949 {
5950 cd->nltype = NLTYPE_FIXED;
5951 if (newline > 255)
5952 {
5953 cd->nllen = 2;
5954 cd->nl[0] = (newline >> 8) & 255;
5955 cd->nl[1] = newline & 255;
5956 }
5957 else
5958 {
5959 cd->nllen = 1;
5960 cd->nl[0] = newline;
5961 }
5962 }
5963
5964 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5965 references to help in deciding whether (.*) can be treated as anchored or not.
5966 */
5967
5968 cd->top_backref = 0;
5969 cd->backref_map = 0;
5970
5971 /* Reflect pattern for debugging output */
5972
5973 DPRINTF(("------------------------------------------------------------------\n"));
5974 DPRINTF(("%s\n", pattern));
5975
5976 /* Pretend to compile the pattern while actually just accumulating the length
5977 of memory required. This behaviour is triggered by passing a non-NULL final
5978 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5979 to compile parts of the pattern into; the compiled code is discarded when it is
5980 no longer needed, so hopefully this workspace will never overflow, though there
5981 is a test for its doing so. */
5982
5983 cd->bracount = cd->final_bracount = 0;
5984 cd->names_found = 0;
5985 cd->name_entry_size = 0;
5986 cd->name_table = NULL;
5987 cd->start_workspace = cworkspace;
5988 cd->start_code = cworkspace;
5989 cd->hwm = cworkspace;
5990 cd->start_pattern = (const uschar *)pattern;
5991 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5992 cd->req_varyopt = 0;
5993 cd->external_options = options;
5994 cd->external_flags = 0;
5995
5996 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5997 don't need to look at the result of the function here. The initial options have
5998 been put into the cd block so that they can be changed if an option setting is
5999 found within the regex right at the beginning. Bringing initial option settings
6000 outside can help speed up starting point checks. */
6001
6002 ptr += skipatstart;
6003 code = cworkspace;
6004 *code = OP_BRA;
6005 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6006 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6007 &length);
6008 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6009
6010 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6011 cd->hwm - cworkspace));
6012
6013 if (length > MAX_PATTERN_SIZE)
6014 {
6015 errorcode = ERR20;
6016 goto PCRE_EARLY_ERROR_RETURN;
6017 }
6018
6019 /* Compute the size of data block needed and get it, either from malloc or
6020 externally provided function. Integer overflow should no longer be possible
6021 because nowadays we limit the maximum value of cd->names_found and
6022 cd->name_entry_size. */
6023
6024 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6025 re = (real_pcre *)(pcre_malloc)(size);
6026
6027 if (re == NULL)
6028 {
6029 errorcode = ERR21;
6030 goto PCRE_EARLY_ERROR_RETURN;
6031 }
6032
6033 /* Put in the magic number, and save the sizes, initial options, internal
6034 flags, and character table pointer. NULL is used for the default character
6035 tables. The nullpad field is at the end; it's there to help in the case when a
6036 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6037 pointers. */
6038
6039 re->magic_number = MAGIC_NUMBER;
6040 re->size = size;
6041 re->options = cd->external_options;
6042 re->flags = cd->external_flags;
6043 re->dummy1 = 0;
6044 re->first_byte = 0;
6045 re->req_byte = 0;
6046 re->name_table_offset = sizeof(real_pcre);
6047 re->name_entry_size = cd->name_entry_size;
6048 re->name_count = cd->names_found;
6049 re->ref_count = 0;
6050 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6051 re->nullpad = NULL;
6052
6053 /* The starting points of the name/number translation table and of the code are
6054 passed around in the compile data block. The start/end pattern and initial
6055 options are already set from the pre-compile phase, as is the name_entry_size
6056 field. Reset the bracket count and the names_found field. Also reset the hwm
6057 field; this time it's used for remembering forward references to subpatterns.
6058 */
6059
6060 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6061 cd->bracount = 0;
6062 cd->names_found = 0;
6063 cd->name_table = (uschar *)re + re->name_table_offset;
6064 codestart = cd->name_table + re->name_entry_size * re->name_count;
6065 cd->start_code = codestart;
6066 cd->hwm = cworkspace;
6067 cd->req_varyopt = 0;
6068 cd->had_accept = FALSE;
6069
6070 /* Set up a starting, non-extracting bracket, then compile the expression. On
6071 error, errorcode will be set non-zero, so we don't need to look at the result
6072 of the function here. */
6073
6074 ptr = (const uschar *)pattern + skipatstart;
6075 code = (uschar *)codestart;
6076 *code = OP_BRA;
6077 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6078 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6079 re->top_bracket = cd->bracount;
6080 re->top_backref = cd->top_backref;
6081 re->flags = cd->external_flags;
6082
6083 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6084
6085 /* If not reached end of pattern on success, there's an excess bracket. */
6086
6087 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6088
6089 /* Fill in the terminating state and check for disastrous overflow, but
6090 if debugging, leave the test till after things are printed out. */
6091
6092 *code++ = OP_END;
6093
6094 #ifndef DEBUG
6095 if (code - codestart > length) errorcode = ERR23;
6096 #endif
6097
6098 /* Fill in any forward references that are required. */
6099
6100 while (errorcode == 0 && cd->hwm > cworkspace)
6101 {
6102 int offset, recno;
6103 const uschar *groupptr;
6104 cd->hwm -= LINK_SIZE;
6105 offset = GET(cd->hwm, 0);
6106 recno = GET(codestart, offset);
6107 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6108 if (groupptr == NULL) errorcode = ERR53;
6109 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6110 }
6111
6112 /* Give an error if there's a back reference to a non-existent capturing
6113 subpattern. */
6114
6115 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6116
6117 /* Failed to compile, or error while post-processing */
6118
6119 if (errorcode != 0)
6120 {
6121 (pcre_free)(re);
6122 PCRE_EARLY_ERROR_RETURN:
6123 *erroroffset = ptr - (const uschar *)pattern;
6124 PCRE_EARLY_ERROR_RETURN2:
6125 *errorptr = find_error_text(errorcode);
6126 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6127 return NULL;
6128 }
6129
6130 /* If the anchored option was not passed, set the flag if we can determine that
6131 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6132 as starting with .* when DOTALL is set).
6133
6134 Otherwise, if we know what the first byte has to be, save it, because that
6135 speeds up unanchored matches no end. If not, see if we can set the
6136 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6137 start with ^, and also when all branches start with .* for non-DOTALL matches.
6138 */
6139
6140 if ((re->options & PCRE_ANCHORED) == 0)
6141 {
6142 int temp_options = re->options; /* May get changed during these scans */
6143 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6144 re->options |= PCRE_ANCHORED;
6145 else
6146 {
6147 if (firstbyte < 0)
6148 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6149 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6150 {
6151 int ch = firstbyte & 255;
6152 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6153 cd->fcc[ch] == ch)? ch : firstbyte;
6154 re->flags |= PCRE_FIRSTSET;
6155 }
6156 else if (is_startline(codestart, 0, cd->backref_map))
6157 re->flags |= PCRE_STARTLINE;
6158 }
6159 }
6160
6161 /* For an anchored pattern, we use the "required byte" only if it follows a
6162 variable length item in the regex. Remove the caseless flag for non-caseable
6163 bytes. */
6164
6165 if (reqbyte >= 0 &&
6166 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6167 {
6168 int ch = reqbyte & 255;
6169 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6170 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6171 re->flags |= PCRE_REQCHSET;
6172 }
6173
6174 /* Print out the compiled data if debugging is enabled. This is never the
6175 case when building a production library. */
6176
6177 #ifdef DEBUG
6178
6179 printf("Length = %d top_bracket = %d top_backref = %d\n",
6180 length, re->top_bracket, re->top_backref);
6181
6182 printf("Options=%08x\n", re->options);
6183
6184 if ((re->flags & PCRE_FIRSTSET) != 0)
6185 {
6186 int ch = re->first_byte & 255;
6187 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6188 "" : " (caseless)";
6189 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6190 else printf("First char = \\x%02x%s\n", ch, caseless);
6191 }
6192
6193 if ((re->flags & PCRE_REQCHSET) != 0)
6194 {
6195 int ch = re->req_byte & 255;
6196 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6197 "" : " (caseless)";
6198 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6199 else printf("Req char = \\x%02x%s\n", ch, caseless);
6200 }
6201
6202 pcre_printint(re, stdout, TRUE);
6203
6204 /* This check is done here in the debugging case so that the code that
6205 was compiled can be seen. */
6206
6207 if (code - codestart > length)
6208 {
6209 (pcre_free)(re);
6210 *errorptr = find_error_text(ERR23);
6211 *erroroffset = ptr - (uschar *)pattern;
6212 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6213 return NULL;
6214 }
6215 #endif /* DEBUG */
6216
6217 return (pcre *)re;
6218 }
6219
6220 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12