/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory | Revision Log


Revision 335 - (show annotations) (download)
Sat Apr 12 14:36:14 2008 UTC (6 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 200753 byte(s)
Do not discard subpatterns with {0} quantifiers, as they may be called as 
subroutines.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
64 /* Macro for setting individual bits in class bitmaps. */
65
/* Set bit number b in the byte-array bitmap a. The arguments and the whole
expansion are parenthesised so that expression arguments such as
SETBIT(map, base + 1) select the intended byte and bit. Note that b is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
146
147 typedef struct verbitem {
148 int len;
149 int op;
150 } verbitem;
151
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
160
161 static const verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
169 };
170
171 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence. */
179
180 static const char posix_names[] =
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
184
185 static const uschar posix_name_lengths[] = {
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
197
198 static const int posix_class_maps[] = {
199 cbit_word, cbit_digit, -2, /* alpha */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii */
204 cbit_space, -1, 1, /* blank - a GNU extension */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
213 };
214
215
216 #define STRING(a) # a
217 #define XSTRING(s) STRING(s)
218
219 /* The texts of compile-time error messages. These are "char *" because they
220 are passed to the outside world. Do not ever re-use any error number, because
221 they are documented. Always add a new error instead. Messages marked DEAD below
222 are no longer used. This used to be a table of strings, but in order to reduce
223 the number of relocations needed when a shared library is loaded dynamically,
224 it is now one long string. We cannot use a table of offsets, because the
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226 simply count through to the one we want - this isn't a performance issue
227 because these strings are used only when there is a compilation error. */
228
229 static const char error_texts[] =
230 "no error\0"
231 "\\ at end of pattern\0"
232 "\\c at end of pattern\0"
233 "unrecognized character follows \\\0"
234 "numbers out of order in {} quantifier\0"
235 /* 5 */
236 "number too big in {} quantifier\0"
237 "missing terminating ] for character class\0"
238 "invalid escape sequence in character class\0"
239 "range out of order in character class\0"
240 "nothing to repeat\0"
241 /* 10 */
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243 "internal error: unexpected repeat\0"
244 "unrecognized character after (? or (?-\0"
245 "POSIX named classes are supported only within a class\0"
246 "missing )\0"
247 /* 15 */
248 "reference to non-existent subpattern\0"
249 "erroffset passed as NULL\0"
250 "unknown option bit(s) set\0"
251 "missing ) after comment\0"
252 "parentheses nested too deeply\0" /** DEAD **/
253 /* 20 */
254 "regular expression is too large\0"
255 "failed to get memory\0"
256 "unmatched parentheses\0"
257 "internal error: code overflow\0"
258 "unrecognized character after (?<\0"
259 /* 25 */
260 "lookbehind assertion is not fixed length\0"
261 "malformed number or name after (?(\0"
262 "conditional group contains more than two branches\0"
263 "assertion expected after (?(\0"
264 "(?R or (?[+-]digits must be followed by )\0"
265 /* 30 */
266 "unknown POSIX class name\0"
267 "POSIX collating elements are not supported\0"
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269 "spare error\0" /** DEAD **/
270 "character value in \\x{...} sequence is too large\0"
271 /* 35 */
272 "invalid condition (?(0)\0"
273 "\\C not allowed in lookbehind assertion\0"
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275 "number after (?C is > 255\0"
276 "closing ) for (?C expected\0"
277 /* 40 */
278 "recursive call could loop indefinitely\0"
279 "unrecognized character after (?P\0"
280 "syntax error in subpattern name (missing terminator)\0"
281 "two named subpatterns have the same name\0"
282 "invalid UTF-8 string\0"
283 /* 45 */
284 "support for \\P, \\p, and \\X has not been compiled\0"
285 "malformed \\P or \\p sequence\0"
286 "unknown property name after \\P or \\p\0"
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 /* 50 */
290 "repeated subpattern is too long\0" /** DEAD **/
291 "octal value is greater than \\377 (not in UTF-8 mode)\0"
292 "internal error: overran compiling workspace\0"
293 "internal error: previously-checked referenced subpattern not found\0"
294 "DEFINE group contains more than one branch\0"
295 /* 55 */
296 "repeating a DEFINE group is not allowed\0"
297 "inconsistent NEWLINE options\0"
298 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299 "a numbered reference must not be zero\0"
300 "(*VERB) with an argument is not supported\0"
301 /* 60 */
302 "(*VERB) not recognized\0"
303 "number is too big\0"
304 "subpattern name expected\0"
305 "digit expected after (?+";
306
307
308 /* Table to identify digits and hex digits. This is used when compiling
309 patterns. Note that the tables in chartables are dependent on the locale, and
310 may mark arbitrary characters as digits - but the PCRE compiling code expects
311 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
312 a private table here. It costs 256 bytes, but it is a lot faster than doing
313 character value tests (at least in some simple cases I timed), and in some
314 applications one wants PCRE to compile efficiently as well as match
315 efficiently.
316
317 For convenience, we use the same bit definitions as in chartables:
318
319 0x04 decimal digit
320 0x08 hexadecimal digit
321
322 Then we can use ctype_digit and ctype_xdigit in the code. */
323
324 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
325 static const unsigned char digitab[] =
326 {
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
333 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
334 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
335 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
339 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
359
360 #else /* This is the "abnormal" case, for EBCDIC systems */
361 static const unsigned char digitab[] =
362 {
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
379 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
387 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
393 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
394 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
395
396 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
397 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
398 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
399 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
401 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
405 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
406 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
408 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
410 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
413 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
414 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
415 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
416 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
417 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
418 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
419 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
420 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
421 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
422 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
423 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
424 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
425 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
426 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
427 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
428 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
429 #endif
430
431
432 /* Definition to allow mutual recursion */
433
434 static BOOL
435 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 int *, int *, branch_chain *, compile_data *, int *);
437
438
439
440 /*************************************************
441 * Find an error text *
442 *************************************************/
443
444 /* The error texts are now all in one long string, to save on relocations. As
445 some of the text is of unknown length, we can't use a table of offsets.
446 Instead, just count through the strings. This is not a performance issue
447 because it happens only when there has been a compilation error.
448
449 Argument: the error number
450 Returns: pointer to the error string
451 */
452
453 static const char *
454 find_error_text(int n)
455 {
456 const char *s = error_texts;
457 for (; n > 0; n--) while (*s++ != 0);
458 return s;
459 }
460
461
462 /*************************************************
463 * Handle escapes *
464 *************************************************/
465
466 /* This function is called when a \ has been encountered. It either returns a
467 positive value for a simple escape such as \n, or a negative value which
468 encodes one of the more complicated things such as \d. A backreference to group
469 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471 ptr is pointing at the \. On exit, it is on the final character of the escape
472 sequence.
473
474 Arguments:
475 ptrptr points to the pattern position pointer
476 errorcodeptr points to the errorcode variable
477 bracount number of previous extracting brackets
478 options the options bits
479 isclass TRUE if inside a character class
480
481 Returns: zero or positive => a data character
482 negative => a special escape sequence
483 on error, errorcodeptr is set
484 */
485
486 static int
487 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488 int options, BOOL isclass)
489 {
490 BOOL utf8 = (options & PCRE_UTF8) != 0;
491 const uschar *ptr = *ptrptr + 1;
492 int c, i;
493
494 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495 ptr--; /* Set pointer back to the last byte */
496
497 /* If backslash is at the end of the pattern, it's an error. */
498
499 if (c == 0) *errorcodeptr = ERR1;
500
501 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502 in a table. A non-zero result is something that can be returned immediately.
503 Otherwise further processing may be required. */
504
505 #ifndef EBCDIC /* ASCII coding */
506 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
507 else if ((i = escapes[c - '0']) != 0) c = i;
508
509 #else /* EBCDIC coding */
510 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
511 else if ((i = escapes[c - 0x48]) != 0) c = i;
512 #endif
513
514 /* Escapes that need further processing, or are illegal. */
515
516 else
517 {
518 const uschar *oldptr;
519 BOOL braced, negated;
520
521 switch (c)
522 {
523 /* A number of Perl escapes are not handled by PCRE. We give an explicit
524 error. */
525
526 case 'l':
527 case 'L':
528 case 'N':
529 case 'u':
530 case 'U':
531 *errorcodeptr = ERR37;
532 break;
533
534 /* \g must be followed by one of a number of specific things:
535
536 (1) A number, either plain or braced. If positive, it is an absolute
537 backreference. If negative, it is a relative backreference. This is a Perl
538 5.10 feature.
539
540 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
541 is part of Perl's movement towards a unified syntax for back references. As
542 this is synonymous with \k{name}, we fudge it up by pretending it really
543 was \k.
544
545 (3) For Oniguruma compatibility we also support \g followed by a name or a
546 number either in angle brackets or in single quotes. However, these are
547 (possibly recursive) subroutine calls, _not_ backreferences. Just return
548 the -ESC_g code (cf \k). */
549
550 case 'g':
551 if (ptr[1] == '<' || ptr[1] == '\'')
552 {
553 c = -ESC_g;
554 break;
555 }
556
557 /* Handle the Perl-compatible cases */
558
559 if (ptr[1] == '{')
560 {
561 const uschar *p;
562 for (p = ptr+2; *p != 0 && *p != '}'; p++)
563 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
564 if (*p != 0 && *p != '}')
565 {
566 c = -ESC_k;
567 break;
568 }
569 braced = TRUE;
570 ptr++;
571 }
572 else braced = FALSE;
573
574 if (ptr[1] == '-')
575 {
576 negated = TRUE;
577 ptr++;
578 }
579 else negated = FALSE;
580
581 c = 0;
582 while ((digitab[ptr[1]] & ctype_digit) != 0)
583 c = c * 10 + *(++ptr) - '0';
584
585 if (c < 0) /* Integer overflow */
586 {
587 *errorcodeptr = ERR61;
588 break;
589 }
590
591 if (braced && *(++ptr) != '}')
592 {
593 *errorcodeptr = ERR57;
594 break;
595 }
596
597 if (c == 0)
598 {
599 *errorcodeptr = ERR58;
600 break;
601 }
602
603 if (negated)
604 {
605 if (c > bracount)
606 {
607 *errorcodeptr = ERR15;
608 break;
609 }
610 c = bracount - (c - 1);
611 }
612
613 c = -(ESC_REF + c);
614 break;
615
616 /* The handling of escape sequences consisting of a string of digits
617 starting with one that is not zero is not straightforward. By experiment,
618 the way Perl works seems to be as follows:
619
620 Outside a character class, the digits are read as a decimal number. If the
621 number is less than 10, or if there are that many previous extracting
622 left brackets, then it is a back reference. Otherwise, up to three octal
623 digits are read to form an escaped byte. Thus \123 is likely to be octal
624 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
625 value is greater than 377, the least significant 8 bits are taken. Inside a
626 character class, \ followed by a digit is always an octal number. */
627
628 case '1': case '2': case '3': case '4': case '5':
629 case '6': case '7': case '8': case '9':
630
631 if (!isclass)
632 {
633 oldptr = ptr;
634 c -= '0';
635 while ((digitab[ptr[1]] & ctype_digit) != 0)
636 c = c * 10 + *(++ptr) - '0';
637 if (c < 0) /* Integer overflow */
638 {
639 *errorcodeptr = ERR61;
640 break;
641 }
642 if (c < 10 || c <= bracount)
643 {
644 c = -(ESC_REF + c);
645 break;
646 }
647 ptr = oldptr; /* Put the pointer back and fall through */
648 }
649
650 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
651 generates a binary zero byte and treats the digit as a following literal.
652 Thus we have to pull back the pointer by one. */
653
654 if ((c = *ptr) >= '8')
655 {
656 ptr--;
657 c = 0;
658 break;
659 }
660
661 /* \0 always starts an octal number, but we may drop through to here with a
662 larger first octal digit. The original code used just to take the least
663 significant 8 bits of octal numbers (I think this is what early Perls used
664 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
665 than 3 octal digits. */
666
667 case '0':
668 c -= '0';
669 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
670 c = c * 8 + *(++ptr) - '0';
671 if (!utf8 && c > 255) *errorcodeptr = ERR51;
672 break;
673
674 /* \x is complicated. \x{ddd} is a character number which can be greater
675 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
676 treated as a data character. */
677
678 case 'x':
679 if (ptr[1] == '{')
680 {
681 const uschar *pt = ptr + 2;
682 int count = 0;
683
684 c = 0;
685 while ((digitab[*pt] & ctype_xdigit) != 0)
686 {
687 register int cc = *pt++;
688 if (c == 0 && cc == '0') continue; /* Leading zeroes */
689 count++;
690
691 #ifndef EBCDIC /* ASCII coding */
692 if (cc >= 'a') cc -= 32; /* Convert to upper case */
693 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
694 #else /* EBCDIC coding */
695 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
696 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
697 #endif
698 }
699
700 if (*pt == '}')
701 {
702 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
703 ptr = pt;
704 break;
705 }
706
707 /* If the sequence of hex digits does not end with '}', then we don't
708 recognize this construct; fall through to the normal \x handling. */
709 }
710
711 /* Read just a single-byte hex-defined char */
712
713 c = 0;
714 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
715 {
716 int cc; /* Some compilers don't like ++ */
717 cc = *(++ptr); /* in initializers */
718 #ifndef EBCDIC /* ASCII coding */
719 if (cc >= 'a') cc -= 32; /* Convert to upper case */
720 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
721 #else /* EBCDIC coding */
722 if (cc <= 'z') cc += 64; /* Convert to upper case */
723 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
724 #endif
725 }
726 break;
727
728 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
729 This coding is ASCII-specific, but then the whole concept of \cx is
730 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
731
732 case 'c':
733 c = *(++ptr);
734 if (c == 0)
735 {
736 *errorcodeptr = ERR2;
737 break;
738 }
739
740 #ifndef EBCDIC /* ASCII coding */
741 if (c >= 'a' && c <= 'z') c -= 32;
742 c ^= 0x40;
743 #else /* EBCDIC coding */
744 if (c >= 'a' && c <= 'z') c += 64;
745 c ^= 0xC0;
746 #endif
747 break;
748
749 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
750 other alphanumeric following \ is an error if PCRE_EXTRA was set;
751 otherwise, for Perl compatibility, it is a literal. This code looks a bit
752 odd, but there used to be some cases other than the default, and there may
753 be again in future, so I haven't "optimized" it. */
754
755 default:
756 if ((options & PCRE_EXTRA) != 0) switch(c)
757 {
758 default:
759 *errorcodeptr = ERR3;
760 break;
761 }
762 break;
763 }
764 }
765
766 *ptrptr = ptr;
767 return c;
768 }
769
770
771
#ifdef SUPPORT_UCP
/*************************************************
*              Handle \P and \p                  *
*************************************************/

/* This function is called once \P or \p has been seen, and only exists when
PCRE is compiled with support for Unicode properties. On entry, *ptrptr
addresses the P or p. On exit, it addresses the last character that was
consumed.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching _pcre_utt entry, or -1 on
                 failure, with the error code set to ERR46 for a malformed
                 sequence or ERR47 for an unknown property name
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

/* The short form is a single character that is itself the property name. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The long form is a name in {}, optionally preceded by ^ for negation. The
name must be terminated by } before the buffer is full. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }
  for (len = 0; ; len++)
    {
    if (len >= (int)sizeof(name) - 1) goto MALFORMED;
    ch = *(++ptr);
    if (ch == 0) goto MALFORMED;
    if (ch == '}') break;
    name[len] = ch;
    }
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop over the table of property names. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);
  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The name is well-formed but is not in the table. */

*errorcodeptr = ERR47;
return -1;

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
861
862
863
864
865 /*************************************************
866 * Check for counted repeat *
867 *************************************************/
868
869 /* This function is called when a '{' is encountered in a place where it might
870 start a quantifier. It looks ahead to see if it really is a quantifier or not.
871 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
872 where the ddds are digits.
873
874 Arguments:
875 p pointer to the first char after '{'
876
877 Returns: TRUE or FALSE
878 */
879
880 static BOOL
881 is_counted_repeat(const uschar *p)
882 {
883 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
884 while ((digitab[*p] & ctype_digit) != 0) p++;
885 if (*p == '}') return TRUE;
886
887 if (*p++ != ',') return FALSE;
888 if (*p == '}') return TRUE;
889
890 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
891 while ((digitab[*p] & ctype_digit) != 0) p++;
892
893 return (*p == '}');
894 }
895
896
897
898 /*************************************************
899 * Read repeat counts *
900 *************************************************/
901
902 /* Read an item of the form {n,m} and return the values. This is called only
903 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
904 so the syntax is guaranteed to be correct, but we need to check the values.
905
906 Arguments:
907 p pointer to first char after '{'
908 minp pointer to int for min
909 maxp pointer to int for max
910 returned as -1 if no max
911 errorcodeptr points to error code variable
912
913 Returns: pointer to '}' on success;
914 current ptr on error, with errorcodeptr set non-zero
915 */
916
917 static const uschar *
918 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
919 {
920 int min = 0;
921 int max = -1;
922
923 /* Read the minimum value and do a paranoid check: a negative value indicates
924 an integer overflow. */
925
926 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
927 if (min < 0 || min > 65535)
928 {
929 *errorcodeptr = ERR5;
930 return p;
931 }
932
933 /* Read the maximum value if there is one, and again do a paranoid on its size.
934 Also, max must not be less than min. */
935
936 if (*p == '}') max = min; else
937 {
938 if (*(++p) != '}')
939 {
940 max = 0;
941 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
942 if (max < 0 || max > 65535)
943 {
944 *errorcodeptr = ERR5;
945 return p;
946 }
947 if (max < min)
948 {
949 *errorcodeptr = ERR4;
950 return p;
951 }
952 }
953 }
954
955 /* Fill in the required variables, and pass back the pointer to the terminating
956 '}'. */
957
958 *minp = min;
959 *maxp = max;
960 return p;
961 }
962
963
964
965 /*************************************************
966 * Find forward referenced subpattern *
967 *************************************************/
968
969 /* This function scans along a pattern's text looking for capturing
970 subpatterns, and counting them. If it finds a named pattern that matches the
971 name it is given, it returns its number. Alternatively, if the name is NULL, it
972 returns when it reaches a given numbered subpattern. This is used for forward
973 references to subpatterns. We know that if (?P< is encountered, the name will
974 be terminated by '>' because that is checked in the first pass.
975
976 Arguments:
977 ptr current position in the pattern
978 count current count of capturing parens so far encountered
979 name name to seek, or NULL if seeking a numbered subpattern
980 lorn name length, or subpattern number if name is NULL
981 xmode TRUE if we are in /x mode
982
983 Returns: the number of the named subpattern, or -1 if not found
984 */
985
986 static int
987 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
988 BOOL xmode)
989 {
990 const uschar *thisname;
991
992 for (; *ptr != 0; ptr++)
993 {
994 int term;
995
996 /* Skip over backslashed characters and also entire \Q...\E */
997
998 if (*ptr == '\\')
999 {
1000 if (*(++ptr) == 0) return -1;
1001 if (*ptr == 'Q') for (;;)
1002 {
1003 while (*(++ptr) != 0 && *ptr != '\\');
1004 if (*ptr == 0) return -1;
1005 if (*(++ptr) == 'E') break;
1006 }
1007 continue;
1008 }
1009
1010 /* Skip over character classes */
1011
1012 if (*ptr == '[')
1013 {
1014 while (*(++ptr) != ']')
1015 {
1016 if (*ptr == 0) return -1;
1017 if (*ptr == '\\')
1018 {
1019 if (*(++ptr) == 0) return -1;
1020 if (*ptr == 'Q') for (;;)
1021 {
1022 while (*(++ptr) != 0 && *ptr != '\\');
1023 if (*ptr == 0) return -1;
1024 if (*(++ptr) == 'E') break;
1025 }
1026 continue;
1027 }
1028 }
1029 continue;
1030 }
1031
1032 /* Skip comments in /x mode */
1033
1034 if (xmode && *ptr == '#')
1035 {
1036 while (*(++ptr) != 0 && *ptr != '\n');
1037 if (*ptr == 0) return -1;
1038 continue;
1039 }
1040
1041 /* An opening parens must now be a real metacharacter */
1042
1043 if (*ptr != '(') continue;
1044 if (ptr[1] != '?' && ptr[1] != '*')
1045 {
1046 count++;
1047 if (name == NULL && count == lorn) return count;
1048 continue;
1049 }
1050
1051 ptr += 2;
1052 if (*ptr == 'P') ptr++; /* Allow optional P */
1053
1054 /* We have to disambiguate (?<! and (?<= from (?<name> */
1055
1056 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1057 *ptr != '\'')
1058 continue;
1059
1060 count++;
1061
1062 if (name == NULL && count == lorn) return count;
1063 term = *ptr++;
1064 if (term == '<') term = '>';
1065 thisname = ptr;
1066 while (*ptr != term) ptr++;
1067 if (name != NULL && lorn == ptr - thisname &&
1068 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1069 return count;
1070 }
1071
1072 return -1;
1073 }
1074
1075
1076
1077 /*************************************************
1078 * Find first significant op code *
1079 *************************************************/
1080
1081 /* This is called by several functions that scan a compiled expression looking
1082 for a fixed first character, or an anchoring op code etc. It skips over things
1083 that do not influence this. For some calls, a change of option is important.
1084 For some calls, it makes sense to skip negative forward and all backward
1085 assertions, and also the \b assertion; for others it does not.
1086
1087 Arguments:
1088 code pointer to the start of the group
1089 options pointer to external options
1090 optbit the option bit whose changing is significant, or
1091 zero if none are
1092 skipassert TRUE if certain assertions are to be skipped
1093
1094 Returns: pointer to the first significant opcode
1095 */
1096
1097 static const uschar*
1098 first_significant_code(const uschar *code, int *options, int optbit,
1099 BOOL skipassert)
1100 {
1101 for (;;)
1102 {
1103 switch ((int)*code)
1104 {
1105 case OP_OPT:
1106 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1107 *options = (int)code[1];
1108 code += 2;
1109 break;
1110
1111 case OP_ASSERT_NOT:
1112 case OP_ASSERTBACK:
1113 case OP_ASSERTBACK_NOT:
1114 if (!skipassert) return code;
1115 do code += GET(code, 1); while (*code == OP_ALT);
1116 code += _pcre_OP_lengths[*code];
1117 break;
1118
1119 case OP_WORD_BOUNDARY:
1120 case OP_NOT_WORD_BOUNDARY:
1121 if (!skipassert) return code;
1122 /* Fall through */
1123
1124 case OP_CALLOUT:
1125 case OP_CREF:
1126 case OP_RREF:
1127 case OP_DEF:
1128 code += _pcre_OP_lengths[*code];
1129 break;
1130
1131 default:
1132 return code;
1133 }
1134 }
1135 /* Control never reaches here */
1136 }
1137
1138
1139
1140
1141 /*************************************************
1142 * Find the fixed length of a pattern *
1143 *************************************************/
1144
1145 /* Scan a pattern and compute the fixed length of subject that will match it,
1146 if the length is fixed. This is needed for dealing with backward assertions.
1147 In UTF8 mode, the result is in characters rather than bytes.
1148
1149 Arguments:
1150 code points to the start of the pattern (the bracket)
1151 options the compiling options
1152
1153 Returns: the fixed length, or -1 if there is no fixed length,
1154 or -2 if \C was encountered
1155 */
1156
1157 static int
1158 find_fixedlength(uschar *code, int options)
1159 {
1160 int length = -1;
1161
1162 register int branchlength = 0;
1163 register uschar *cc = code + 1 + LINK_SIZE;
1164
1165 /* Scan along the opcodes for this branch. If we get to the end of the
1166 branch, check the length against that of the other branches. */
1167
1168 for (;;)
1169 {
1170 int d;
1171 register int op = *cc;
1172 switch (op)
1173 {
1174 case OP_CBRA:
1175 case OP_BRA:
1176 case OP_ONCE:
1177 case OP_COND:
1178 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1179 if (d < 0) return d;
1180 branchlength += d;
1181 do cc += GET(cc, 1); while (*cc == OP_ALT);
1182 cc += 1 + LINK_SIZE;
1183 break;
1184
1185 /* Reached end of a branch; if it's a ket it is the end of a nested
1186 call. If it's ALT it is an alternation in a nested call. If it is
1187 END it's the end of the outer call. All can be handled by the same code. */
1188
1189 case OP_ALT:
1190 case OP_KET:
1191 case OP_KETRMAX:
1192 case OP_KETRMIN:
1193 case OP_END:
1194 if (length < 0) length = branchlength;
1195 else if (length != branchlength) return -1;
1196 if (*cc != OP_ALT) return length;
1197 cc += 1 + LINK_SIZE;
1198 branchlength = 0;
1199 break;
1200
1201 /* Skip over assertive subpatterns */
1202
1203 case OP_ASSERT:
1204 case OP_ASSERT_NOT:
1205 case OP_ASSERTBACK:
1206 case OP_ASSERTBACK_NOT:
1207 do cc += GET(cc, 1); while (*cc == OP_ALT);
1208 /* Fall through */
1209
1210 /* Skip over things that don't match chars */
1211
1212 case OP_REVERSE:
1213 case OP_CREF:
1214 case OP_RREF:
1215 case OP_DEF:
1216 case OP_OPT:
1217 case OP_CALLOUT:
1218 case OP_SOD:
1219 case OP_SOM:
1220 case OP_EOD:
1221 case OP_EODN:
1222 case OP_CIRC:
1223 case OP_DOLL:
1224 case OP_NOT_WORD_BOUNDARY:
1225 case OP_WORD_BOUNDARY:
1226 cc += _pcre_OP_lengths[*cc];
1227 break;
1228
1229 /* Handle literal characters */
1230
1231 case OP_CHAR:
1232 case OP_CHARNC:
1233 case OP_NOT:
1234 branchlength++;
1235 cc += 2;
1236 #ifdef SUPPORT_UTF8
1237 if ((options & PCRE_UTF8) != 0)
1238 {
1239 while ((*cc & 0xc0) == 0x80) cc++;
1240 }
1241 #endif
1242 break;
1243
1244 /* Handle exact repetitions. The count is already in characters, but we
1245 need to skip over a multibyte character in UTF8 mode. */
1246
1247 case OP_EXACT:
1248 branchlength += GET2(cc,1);
1249 cc += 4;
1250 #ifdef SUPPORT_UTF8
1251 if ((options & PCRE_UTF8) != 0)
1252 {
1253 while((*cc & 0x80) == 0x80) cc++;
1254 }
1255 #endif
1256 break;
1257
1258 case OP_TYPEEXACT:
1259 branchlength += GET2(cc,1);
1260 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1261 cc += 4;
1262 break;
1263
1264 /* Handle single-char matchers */
1265
1266 case OP_PROP:
1267 case OP_NOTPROP:
1268 cc += 2;
1269 /* Fall through */
1270
1271 case OP_NOT_DIGIT:
1272 case OP_DIGIT:
1273 case OP_NOT_WHITESPACE:
1274 case OP_WHITESPACE:
1275 case OP_NOT_WORDCHAR:
1276 case OP_WORDCHAR:
1277 case OP_ANY:
1278 branchlength++;
1279 cc++;
1280 break;
1281
1282 /* The single-byte matcher isn't allowed */
1283
1284 case OP_ANYBYTE:
1285 return -2;
1286
1287 /* Check a class for variable quantification */
1288
1289 #ifdef SUPPORT_UTF8
1290 case OP_XCLASS:
1291 cc += GET(cc, 1) - 33;
1292 /* Fall through */
1293 #endif
1294
1295 case OP_CLASS:
1296 case OP_NCLASS:
1297 cc += 33;
1298
1299 switch (*cc)
1300 {
1301 case OP_CRSTAR:
1302 case OP_CRMINSTAR:
1303 case OP_CRQUERY:
1304 case OP_CRMINQUERY:
1305 return -1;
1306
1307 case OP_CRRANGE:
1308 case OP_CRMINRANGE:
1309 if (GET2(cc,1) != GET2(cc,3)) return -1;
1310 branchlength += GET2(cc,1);
1311 cc += 5;
1312 break;
1313
1314 default:
1315 branchlength++;
1316 }
1317 break;
1318
1319 /* Anything else is variable length */
1320
1321 default:
1322 return -1;
1323 }
1324 }
1325 /* Control never gets here */
1326 }
1327
1328
1329
1330
1331 /*************************************************
1332 * Scan compiled regex for numbered bracket *
1333 *************************************************/
1334
1335 /* This little function scans through a compiled pattern until it finds a
1336 capturing bracket with the given number.
1337
1338 Arguments:
1339 code points to start of expression
1340 utf8 TRUE in UTF-8 mode
1341 number the required bracket number
1342
1343 Returns: pointer to the opcode for the bracket, or NULL if not found
1344 */
1345
1346 static const uschar *
1347 find_bracket(const uschar *code, BOOL utf8, int number)
1348 {
1349 for (;;)
1350 {
1351 register int c = *code;
1352 if (c == OP_END) return NULL;
1353
1354 /* XCLASS is used for classes that cannot be represented just by a bit
1355 map. This includes negated single high-valued characters. The length in
1356 the table is zero; the actual length is stored in the compiled code. */
1357
1358 if (c == OP_XCLASS) code += GET(code, 1);
1359
1360 /* Handle capturing bracket */
1361
1362 else if (c == OP_CBRA)
1363 {
1364 int n = GET2(code, 1+LINK_SIZE);
1365 if (n == number) return (uschar *)code;
1366 code += _pcre_OP_lengths[c];
1367 }
1368
1369 /* Otherwise, we can get the item's length from the table, except that for
1370 repeated character types, we have to test for \p and \P, which have an extra
1371 two bytes of parameters. */
1372
1373 else
1374 {
1375 switch(c)
1376 {
1377 case OP_TYPESTAR:
1378 case OP_TYPEMINSTAR:
1379 case OP_TYPEPLUS:
1380 case OP_TYPEMINPLUS:
1381 case OP_TYPEQUERY:
1382 case OP_TYPEMINQUERY:
1383 case OP_TYPEPOSSTAR:
1384 case OP_TYPEPOSPLUS:
1385 case OP_TYPEPOSQUERY:
1386 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1387 break;
1388
1389 case OP_TYPEUPTO:
1390 case OP_TYPEMINUPTO:
1391 case OP_TYPEEXACT:
1392 case OP_TYPEPOSUPTO:
1393 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1394 break;
1395 }
1396
1397 /* Add in the fixed length from the table */
1398
1399 code += _pcre_OP_lengths[c];
1400
1401 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1402 a multi-byte character. The length in the table is a minimum, so we have to
1403 arrange to skip the extra bytes. */
1404
1405 #ifdef SUPPORT_UTF8
1406 if (utf8) switch(c)
1407 {
1408 case OP_CHAR:
1409 case OP_CHARNC:
1410 case OP_EXACT:
1411 case OP_UPTO:
1412 case OP_MINUPTO:
1413 case OP_POSUPTO:
1414 case OP_STAR:
1415 case OP_MINSTAR:
1416 case OP_POSSTAR:
1417 case OP_PLUS:
1418 case OP_MINPLUS:
1419 case OP_POSPLUS:
1420 case OP_QUERY:
1421 case OP_MINQUERY:
1422 case OP_POSQUERY:
1423 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1424 break;
1425 }
1426 #endif
1427 }
1428 }
1429 }
1430
1431
1432
1433 /*************************************************
1434 * Scan compiled regex for recursion reference *
1435 *************************************************/
1436
1437 /* This little function scans through a compiled pattern until it finds an
1438 instance of OP_RECURSE.
1439
1440 Arguments:
1441 code points to start of expression
1442 utf8 TRUE in UTF-8 mode
1443
1444 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1445 */
1446
1447 static const uschar *
1448 find_recurse(const uschar *code, BOOL utf8)
1449 {
1450 for (;;)
1451 {
1452 register int c = *code;
1453 if (c == OP_END) return NULL;
1454 if (c == OP_RECURSE) return code;
1455
1456 /* XCLASS is used for classes that cannot be represented just by a bit
1457 map. This includes negated single high-valued characters. The length in
1458 the table is zero; the actual length is stored in the compiled code. */
1459
1460 if (c == OP_XCLASS) code += GET(code, 1);
1461
1462 /* Otherwise, we can get the item's length from the table, except that for
1463 repeated character types, we have to test for \p and \P, which have an extra
1464 two bytes of parameters. */
1465
1466 else
1467 {
1468 switch(c)
1469 {
1470 case OP_TYPESTAR:
1471 case OP_TYPEMINSTAR:
1472 case OP_TYPEPLUS:
1473 case OP_TYPEMINPLUS:
1474 case OP_TYPEQUERY:
1475 case OP_TYPEMINQUERY:
1476 case OP_TYPEPOSSTAR:
1477 case OP_TYPEPOSPLUS:
1478 case OP_TYPEPOSQUERY:
1479 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1480 break;
1481
1482 case OP_TYPEPOSUPTO:
1483 case OP_TYPEUPTO:
1484 case OP_TYPEMINUPTO:
1485 case OP_TYPEEXACT:
1486 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1487 break;
1488 }
1489
1490 /* Add in the fixed length from the table */
1491
1492 code += _pcre_OP_lengths[c];
1493
1494 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1495 by a multi-byte character. The length in the table is a minimum, so we have
1496 to arrange to skip the extra bytes. */
1497
1498 #ifdef SUPPORT_UTF8
1499 if (utf8) switch(c)
1500 {
1501 case OP_CHAR:
1502 case OP_CHARNC:
1503 case OP_EXACT:
1504 case OP_UPTO:
1505 case OP_MINUPTO:
1506 case OP_POSUPTO:
1507 case OP_STAR:
1508 case OP_MINSTAR:
1509 case OP_POSSTAR:
1510 case OP_PLUS:
1511 case OP_MINPLUS:
1512 case OP_POSPLUS:
1513 case OP_QUERY:
1514 case OP_MINQUERY:
1515 case OP_POSQUERY:
1516 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1517 break;
1518 }
1519 #endif
1520 }
1521 }
1522 }
1523
1524
1525
1526 /*************************************************
1527 * Scan compiled branch for non-emptiness *
1528 *************************************************/
1529
1530 /* This function scans through a branch of a compiled pattern to see whether it
1531 can match the empty string or not. It is called from could_be_empty()
1532 below and from compile_branch() when checking for an unlimited repeat of a
1533 group that can match nothing. Note that first_significant_code() skips over
1534 backward and negative forward assertions when its final argument is TRUE. If we
1535 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1536 bracket whose current branch will already have been scanned.
1537
1538 Arguments:
1539 code points to start of search
1540 endcode points to where to stop
1541 utf8 TRUE if in UTF8 mode
1542
1543 Returns: TRUE if what is matched could be empty
1544 */
1545
1546 static BOOL
1547 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1548 {
1549 register int c;
1550 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1551 code < endcode;
1552 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1553 {
1554 const uschar *ccode;
1555
1556 c = *code;
1557
1558 /* Skip over forward assertions; the other assertions are skipped by
1559 first_significant_code() with a TRUE final argument. */
1560
1561 if (c == OP_ASSERT)
1562 {
1563 do code += GET(code, 1); while (*code == OP_ALT);
1564 c = *code;
1565 continue;
1566 }
1567
1568 /* Groups with zero repeats can of course be empty; skip them. */
1569
1570 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1571 {
1572 code += _pcre_OP_lengths[c];
1573 do code += GET(code, 1); while (*code == OP_ALT);
1574 c = *code;
1575 continue;
1576 }
1577
1578 /* For other groups, scan the branches. */
1579
1580 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1581 {
1582 BOOL empty_branch;
1583 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1584
1585 /* Scan a closed bracket */
1586
1587 empty_branch = FALSE;
1588 do
1589 {
1590 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1591 empty_branch = TRUE;
1592 code += GET(code, 1);
1593 }
1594 while (*code == OP_ALT);
1595 if (!empty_branch) return FALSE; /* All branches are non-empty */
1596 c = *code;
1597 continue;
1598 }
1599
1600 /* Handle the other opcodes */
1601
1602 switch (c)
1603 {
1604 /* Check for quantifiers after a class. XCLASS is used for classes that
1605 cannot be represented just by a bit map. This includes negated single
1606 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1607 actual length is stored in the compiled code, so we must update "code"
1608 here. */
1609
1610 #ifdef SUPPORT_UTF8
1611 case OP_XCLASS:
1612 ccode = code += GET(code, 1);
1613 goto CHECK_CLASS_REPEAT;
1614 #endif
1615
1616 case OP_CLASS:
1617 case OP_NCLASS:
1618 ccode = code + 33;
1619
1620 #ifdef SUPPORT_UTF8
1621 CHECK_CLASS_REPEAT:
1622 #endif
1623
1624 switch (*ccode)
1625 {
1626 case OP_CRSTAR: /* These could be empty; continue */
1627 case OP_CRMINSTAR:
1628 case OP_CRQUERY:
1629 case OP_CRMINQUERY:
1630 break;
1631
1632 default: /* Non-repeat => class must match */
1633 case OP_CRPLUS: /* These repeats aren't empty */
1634 case OP_CRMINPLUS:
1635 return FALSE;
1636
1637 case OP_CRRANGE:
1638 case OP_CRMINRANGE:
1639 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1640 break;
1641 }
1642 break;
1643
1644 /* Opcodes that must match a character */
1645
1646 case OP_PROP:
1647 case OP_NOTPROP:
1648 case OP_EXTUNI:
1649 case OP_NOT_DIGIT:
1650 case OP_DIGIT:
1651 case OP_NOT_WHITESPACE:
1652 case OP_WHITESPACE:
1653 case OP_NOT_WORDCHAR:
1654 case OP_WORDCHAR:
1655 case OP_ANY:
1656 case OP_ANYBYTE:
1657 case OP_CHAR:
1658 case OP_CHARNC:
1659 case OP_NOT:
1660 case OP_PLUS:
1661 case OP_MINPLUS:
1662 case OP_POSPLUS:
1663 case OP_EXACT:
1664 case OP_NOTPLUS:
1665 case OP_NOTMINPLUS:
1666 case OP_NOTPOSPLUS:
1667 case OP_NOTEXACT:
1668 case OP_TYPEPLUS:
1669 case OP_TYPEMINPLUS:
1670 case OP_TYPEPOSPLUS:
1671 case OP_TYPEEXACT:
1672 return FALSE;
1673
1674 /* These are going to continue, as they may be empty, but we have to
1675 fudge the length for the \p and \P cases. */
1676
1677 case OP_TYPESTAR:
1678 case OP_TYPEMINSTAR:
1679 case OP_TYPEPOSSTAR:
1680 case OP_TYPEQUERY:
1681 case OP_TYPEMINQUERY:
1682 case OP_TYPEPOSQUERY:
1683 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1684 break;
1685
1686 /* Same for these */
1687
1688 case OP_TYPEUPTO:
1689 case OP_TYPEMINUPTO:
1690 case OP_TYPEPOSUPTO:
1691 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1692 break;
1693
1694 /* End of branch */
1695
1696 case OP_KET:
1697 case OP_KETRMAX:
1698 case OP_KETRMIN:
1699 case OP_ALT:
1700 return TRUE;
1701
1702 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1703 MINUPTO, and POSUPTO may be followed by a multibyte character */
1704
1705 #ifdef SUPPORT_UTF8
1706 case OP_STAR:
1707 case OP_MINSTAR:
1708 case OP_POSSTAR:
1709 case OP_QUERY:
1710 case OP_MINQUERY:
1711 case OP_POSQUERY:
1712 case OP_UPTO:
1713 case OP_MINUPTO:
1714 case OP_POSUPTO:
1715 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1716 break;
1717 #endif
1718 }
1719 }
1720
1721 return TRUE;
1722 }
1723
1724
1725
1726 /*************************************************
1727 * Scan compiled regex for non-emptiness *
1728 *************************************************/
1729
1730 /* This function is called to check for left recursive calls. We want to check
1731 the current branch of the current pattern to see if it could match the empty
1732 string. If it could, we must look outwards for branches at other levels,
1733 stopping when we pass beyond the bracket which is the subject of the recursion.
1734
1735 Arguments:
1736 code points to start of the recursion
1737 endcode points to where to stop (current RECURSE item)
1738 bcptr points to the chain of current (unclosed) branch starts
1739 utf8 TRUE if in UTF-8 mode
1740
1741 Returns: TRUE if what is matched could be empty
1742 */
1743
1744 static BOOL
1745 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1746 BOOL utf8)
1747 {
1748 while (bcptr != NULL && bcptr->current >= code)
1749 {
1750 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1751 bcptr = bcptr->outer;
1752 }
1753 return TRUE;
1754 }
1755
1756
1757
1758 /*************************************************
1759 * Check for POSIX class syntax *
1760 *************************************************/
1761
1762 /* This function is called when the sequence "[:" or "[." or "[=" is
1763 encountered in a character class. It checks whether this is followed by a
1764 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1765 reach an unescaped ']' without the special preceding character, return FALSE.
1766
1767 Originally, this function only recognized a sequence of letters between the
1768 terminators, but it seems that Perl recognizes any sequence of characters,
1769 though of course unknown POSIX names are subsequently rejected. Perl gives an
1770 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1771 didn't consider this to be a POSIX class. Likewise for [:1234:].
1772
1773 The problem in trying to be exactly like Perl is in the handling of escapes. We
1774 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1775 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1776 below handles the special case of \], but does not try to do any other escape
1777 processing. This makes it different from Perl for cases such as [:l\ower:]
1778 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1779 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1780 I think.
1781
1782 Arguments:
1783 ptr pointer to the initial [
1784 endptr where to return the end pointer
1785
1786 Returns: TRUE or FALSE
1787 */
1788
1789 static BOOL
1790 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1791 {
1792 int terminator; /* Don't combine these lines; the Solaris cc */
1793 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1794 for (++ptr; *ptr != 0; ptr++)
1795 {
1796 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1797 {
1798 if (*ptr == ']') return FALSE;
1799 if (*ptr == terminator && ptr[1] == ']')
1800 {
1801 *endptr = ptr;
1802 return TRUE;
1803 }
1804 }
1805 }
1806 return FALSE;
1807 }
1808
1809
1810
1811
1812 /*************************************************
1813 * Check POSIX class name *
1814 *************************************************/
1815
1816 /* This function is called to check the name given in a POSIX-style class entry
1817 such as [:alnum:].
1818
1819 Arguments:
1820 ptr points to the first letter
1821 len the length of the name
1822
1823 Returns: a value representing the name, or -1 if unknown
1824 */
1825
1826 static int
1827 check_posix_name(const uschar *ptr, int len)
1828 {
1829 const char *pn = posix_names;
1830 register int yield = 0;
1831 while (posix_name_lengths[yield] != 0)
1832 {
1833 if (len == posix_name_lengths[yield] &&
1834 strncmp((const char *)ptr, pn, len) == 0) return yield;
1835 pn += posix_name_lengths[yield] + 1;
1836 yield++;
1837 }
1838 return -1;
1839 }
1840
1841
1842 /*************************************************
1843 * Adjust OP_RECURSE items in repeated group *
1844 *************************************************/
1845
1846 /* OP_RECURSE items contain an offset from the start of the regex to the group
1847 that is referenced. This means that groups can be replicated for fixed
1848 repetition simply by copying (because the recursion is allowed to refer to
1849 earlier groups that are outside the current group). However, when a group is
1850 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1851 inserted before it, after it has been compiled. This means that any OP_RECURSE
1852 items within it that refer to the group itself or any contained groups have to
1853 have their offsets adjusted. That one of the jobs of this function. Before it
1854 is called, the partially compiled regex must be temporarily terminated with
1855 OP_END.
1856
1857 This function has been extended with the possibility of forward references for
1858 recursions and subroutine calls. It must also check the list of such references
1859 for the group we are dealing with. If it finds that one of the recursions in
1860 the current group is on this list, it adjusts the offset in the list, not the
1861 value in the reference (which is a group number).
1862
1863 Arguments:
1864 group points to the start of the group
1865 adjust the amount by which the group is to be moved
1866 utf8 TRUE in UTF-8 mode
1867 cd contains pointers to tables etc.
1868 save_hwm the hwm forward reference pointer at the start of the group
1869
1870 Returns: nothing
1871 */
1872
1873 static void
1874 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1875 uschar *save_hwm)
1876 {
1877 uschar *ptr = group;
1878
1879 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1880 {
1881 int offset;
1882 uschar *hc;
1883
1884 /* See if this recursion is on the forward reference list. If so, adjust the
1885 reference. */
1886
1887 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1888 {
1889 offset = GET(hc, 0);
1890 if (cd->start_code + offset == ptr + 1)
1891 {
1892 PUT(hc, 0, offset + adjust);
1893 break;
1894 }
1895 }
1896
1897 /* Otherwise, adjust the recursion offset if it's after the start of this
1898 group. */
1899
1900 if (hc >= cd->hwm)
1901 {
1902 offset = GET(ptr, 1);
1903 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1904 }
1905
1906 ptr += 1 + LINK_SIZE;
1907 }
1908 }
1909
1910
1911
1912 /*************************************************
1913 * Insert an automatic callout point *
1914 *************************************************/
1915
1916 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1917 callout points before each pattern item.
1918
1919 Arguments:
1920 code current code pointer
1921 ptr current pattern pointer
1922 cd pointers to tables etc
1923
1924 Returns: new code pointer
1925 */
1926
1927 static uschar *
1928 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1929 {
1930 *code++ = OP_CALLOUT;
1931 *code++ = 255;
1932 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1933 PUT(code, LINK_SIZE, 0); /* Default length */
1934 return code + 2*LINK_SIZE;
1935 }
1936
1937
1938
1939 /*************************************************
1940 * Complete a callout item *
1941 *************************************************/
1942
1943 /* A callout item contains the length of the next item in the pattern, which
1944 we can't fill in till after we have reached the relevant point. This is used
1945 for both automatic and manual callouts.
1946
1947 Arguments:
1948 previous_callout points to previous callout item
1949 ptr current pattern pointer
1950 cd pointers to tables etc
1951
1952 Returns: nothing
1953 */
1954
1955 static void
1956 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1957 {
1958 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1959 PUT(previous_callout, 2 + LINK_SIZE, length);
1960 }
1961
1962
1963
1964 #ifdef SUPPORT_UCP
1965 /*************************************************
1966 * Get othercase range *
1967 *************************************************/
1968
1969 /* This function is passed the start and end of a class range, in UTF-8 mode
1970 with UCP support. It searches up the characters, looking for internal ranges of
1971 characters in the "other" case. Each call returns the next one, updating the
1972 start address.
1973
1974 Arguments:
1975 cptr points to starting character value; updated
1976 d end value
1977 ocptr where to put start of othercase range
1978 odptr where to put end of othercase range
1979
1980 Yield: TRUE when range returned; FALSE when no more
1981 */
1982
1983 static BOOL
1984 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1985 unsigned int *odptr)
1986 {
1987 unsigned int c, othercase, next;
1988
1989 for (c = *cptr; c <= d; c++)
1990 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1991
1992 if (c > d) return FALSE;
1993
1994 *ocptr = othercase;
1995 next = othercase + 1;
1996
1997 for (++c; c <= d; c++)
1998 {
1999 if (_pcre_ucp_othercase(c) != next) break;
2000 next++;
2001 }
2002
2003 *odptr = next - 1;
2004 *cptr = c;
2005
2006 return TRUE;
2007 }
2008 #endif /* SUPPORT_UCP */
2009
2010
2011
2012 /*************************************************
2013 * Check if auto-possessifying is possible *
2014 *************************************************/
2015
2016 /* This function is called for unlimited repeats of certain items, to see
2017 whether the next thing could possibly match the repeated item. If not, it makes
2018 sense to automatically possessify the repeated item. FALSE is returned when the
2019 next thing is not a literal or an escape, is optional, or might match the item.
2020 Arguments:
2021 op_code the repeated op code
2022 item data for this item, depends on the opcode
2023 utf8 TRUE in UTF-8 mode
2024 utf8_char used for utf8 character bytes, NULL if not relevant
2025 ptr next character in pattern (passed by value; caller's pointer is not moved)
2026 options options bits
2027 cd contains pointers to tables etc.
2028
2029 Returns: TRUE if possessifying is wanted
2030 */
2031
2032 static BOOL
2033 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2034 const uschar *ptr, int options, compile_data *cd)
2035 {
2036 int next; /* >= 0: a character value; < 0: a negated ESC_xxx value */
2037
2038 /* Skip whitespace and comments in extended mode */
2039
2040 if ((options & PCRE_EXTENDED) != 0)
2041 {
2042 for (;;)
2043 {
2044 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2045 if (*ptr == '#')
2046 {
2047 while (*(++ptr) != 0)
2048 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2049 }
2050 else break;
2051 }
2052 }
2053
2054 /* If the next item is one that we can handle, get its value. A non-negative
2055 value is a character, a negative value is an escape value. */
2056
2057 if (*ptr == '\\')
2058 {
2059 int temperrorcode = 0;
2060 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2061 if (temperrorcode != 0) return FALSE; /* escape did not parse: do not possessify */
2062 ptr++; /* Point after the escape sequence */
2063 }
2064
2065 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2066 {
2067 #ifdef SUPPORT_UTF8
2068 if (utf8) { GETCHARINC(next, ptr); } else
2069 #endif
2070 next = *ptr++;
2071 }
2072
2073 else return FALSE; /* the next thing is a metacharacter */
2074
2075 /* Skip whitespace and comments in extended mode */
2076
2077 if ((options & PCRE_EXTENDED) != 0)
2078 {
2079 for (;;)
2080 {
2081 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2082 if (*ptr == '#')
2083 {
2084 while (*(++ptr) != 0)
2085 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2086 }
2087 else break;
2088 }
2089 }
2090
2091 /* If the next thing is itself optional, we have to give up. A zero minimum may
2092 be spelled "{0," or "{0}"; with either, what follows it can come next instead. */
2093 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0 ||
2094 strncmp((char *)ptr, "{0}", 3) == 0) return FALSE;
2095
2096 /* Now compare the next item with the previous opcode. If the previous is a
2097 positive single character match, "item" either contains the character or, if
2098 "item" is greater than 127 in utf8 mode, the character's bytes are in
2099 utf8_char. */
2100
2101
2102 /* Handle cases when the next item is a character. */
2103
2104 if (next >= 0) switch(op_code)
2105 {
2106 case OP_CHAR:
2107 #ifdef SUPPORT_UTF8
2108 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2109 #endif
2110 return item != next;
2111
2112 /* For CHARNC (caseless character) we must check the other case. If we have
2113 Unicode property support, we can use it to test the other case of
2114 high-valued characters. */
2115
2116 case OP_CHARNC:
2117 #ifdef SUPPORT_UTF8
2118 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2119 #endif
2120 if (item == next) return FALSE;
2121 #ifdef SUPPORT_UTF8
2122 if (utf8)
2123 {
2124 unsigned int othercase;
2125 if (next < 128) othercase = cd->fcc[next]; else
2126 #ifdef SUPPORT_UCP
2127 othercase = _pcre_ucp_othercase((unsigned int)next);
2128 #else
2129 othercase = NOTACHAR;
2130 #endif
2131 return (unsigned int)item != othercase;
2132 }
2133 else
2134 #endif /* SUPPORT_UTF8 */
2135 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2136
2137 /* For OP_NOT, "item" must be a single-byte character. */
2138
2139 case OP_NOT:
2140 if (item == next) return TRUE;
2141 if ((options & PCRE_CASELESS) == 0) return FALSE;
2142 #ifdef SUPPORT_UTF8
2143 if (utf8)
2144 {
2145 unsigned int othercase;
2146 if (next < 128) othercase = cd->fcc[next]; else
2147 #ifdef SUPPORT_UCP
2148 othercase = _pcre_ucp_othercase(next);
2149 #else
2150 othercase = NOTACHAR;
2151 #endif
2152 return (unsigned int)item == othercase;
2153 }
2154 else
2155 #endif /* SUPPORT_UTF8 */
2156 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2157
2158 case OP_DIGIT:
2159 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2160
2161 case OP_NOT_DIGIT:
2162 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2163
2164 case OP_WHITESPACE:
2165 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2166
2167 case OP_NOT_WHITESPACE:
2168 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2169
2170 case OP_WORDCHAR:
2171 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2172
2173 case OP_NOT_WORDCHAR:
2174 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2175
2176 case OP_HSPACE:
2177 case OP_NOT_HSPACE:
2178 switch(next)
2179 {
2180 case 0x09:
2181 case 0x20:
2182 case 0xa0:
2183 case 0x1680:
2184 case 0x180e:
2185 case 0x2000:
2186 case 0x2001:
2187 case 0x2002:
2188 case 0x2003:
2189 case 0x2004:
2190 case 0x2005:
2191 case 0x2006:
2192 case 0x2007:
2193 case 0x2008:
2194 case 0x2009:
2195 case 0x200A:
2196 case 0x202f:
2197 case 0x205f:
2198 case 0x3000:
2199 return op_code != OP_HSPACE;
2200 default:
2201 return op_code == OP_HSPACE;
2202 }
2203
2204 case OP_VSPACE:
2205 case OP_NOT_VSPACE:
2206 switch(next)
2207 {
2208 case 0x0a:
2209 case 0x0b:
2210 case 0x0c:
2211 case 0x0d:
2212 case 0x85:
2213 case 0x2028:
2214 case 0x2029:
2215 return op_code != OP_VSPACE;
2216 default:
2217 return op_code == OP_VSPACE;
2218 }
2219
2220 default:
2221 return FALSE;
2222 }
2223
2224
2225 /* Handle the case when the next item is \d, \s, etc. */
2226
2227 switch(op_code)
2228 {
2229 case OP_CHAR:
2230 case OP_CHARNC:
2231 #ifdef SUPPORT_UTF8
2232 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2233 #endif
2234 switch(-next)
2235 {
2236 case ESC_d:
2237 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2238
2239 case ESC_D:
2240 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2241
2242 case ESC_s:
2243 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2244
2245 case ESC_S:
2246 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2247
2248 case ESC_w:
2249 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2250
2251 case ESC_W:
2252 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2253
2254 case ESC_h:
2255 case ESC_H:
2256 switch(item)
2257 {
2258 case 0x09:
2259 case 0x20:
2260 case 0xa0:
2261 case 0x1680:
2262 case 0x180e:
2263 case 0x2000:
2264 case 0x2001:
2265 case 0x2002:
2266 case 0x2003:
2267 case 0x2004:
2268 case 0x2005:
2269 case 0x2006:
2270 case 0x2007:
2271 case 0x2008:
2272 case 0x2009:
2273 case 0x200A:
2274 case 0x202f:
2275 case 0x205f:
2276 case 0x3000:
2277 return -next != ESC_h;
2278 default:
2279 return -next == ESC_h;
2280 }
2281
2282 case ESC_v:
2283 case ESC_V:
2284 switch(item)
2285 {
2286 case 0x0a:
2287 case 0x0b:
2288 case 0x0c:
2289 case 0x0d:
2290 case 0x85:
2291 case 0x2028:
2292 case 0x2029:
2293 return -next != ESC_v;
2294 default:
2295 return -next == ESC_v;
2296 }
2297
2298 default:
2299 return FALSE;
2300 }
2301
2302 case OP_DIGIT:
2303 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2304 next == -ESC_h || next == -ESC_v;
2305
2306 case OP_NOT_DIGIT:
2307 return next == -ESC_d;
2308
2309 case OP_WHITESPACE:
2310 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2311 /* Can't have \h or \v in here: VT matches \S, and 0xa0 and 0x85 are > 127 */
2312 case OP_NOT_WHITESPACE:
2313 return next == -ESC_s;
2314 /* Can't have \S in here: 0xa0 is in \h but, being > 127, is not treated as \s */
2315 case OP_HSPACE:
2316 return next == -ESC_H || next == -ESC_d || next == -ESC_w;
2317
2318 case OP_NOT_HSPACE:
2319 return next == -ESC_h;
2320
2321 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2322 case OP_VSPACE:
2323 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2324
2325 case OP_NOT_VSPACE:
2326 return next == -ESC_v;
2327
2328 case OP_WORDCHAR:
2329 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2330
2331 case OP_NOT_WORDCHAR:
2332 return next == -ESC_w || next == -ESC_d;
2333
2334 default:
2335 return FALSE;
2336 }
2337
2338 /* Control does not reach here */
2339 }
2340
2341
2342
2343 /*************************************************
2344 * Compile one branch *
2345 *************************************************/
2346
2347 /* Scan the pattern, compiling it into a vector. If the options are
2348 changed during the branch, the pointer is used to change the external options
2349 bits. This function is used during the pre-compile phase when we are trying
2350 to find out the amount of memory needed, as well as during the real compile
2351 phase. The value of lengthptr distinguishes the two phases.
2352
2353 Arguments:
2354 optionsptr pointer to the option bits
2355 codeptr points to the pointer to the current code point
2356 ptrptr points to the current pattern pointer
2357 errorcodeptr points to error code variable
2358 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2359 reqbyteptr set to the last literal character required, else < 0
2360 bcptr points to current branch chain
2361 cd contains pointers to tables etc.
2362 lengthptr NULL during the real compile phase
2363 points to length accumulator during pre-compile phase
2364
2365 Returns: TRUE on success
2366 FALSE, with *errorcodeptr set non-zero on error
2367 */
2368
2369 static BOOL
2370 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2371 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2372 compile_data *cd, int *lengthptr)
2373 {
2374 int repeat_type, op_type;
2375 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2376 int bravalue = 0;
2377 int greedy_default, greedy_non_default;
2378 int firstbyte, reqbyte;
2379 int zeroreqbyte, zerofirstbyte;
2380 int req_caseopt, reqvary, tempreqvary;
2381 int options = *optionsptr;
2382 int after_manual_callout = 0;
2383 int length_prevgroup = 0;
2384 register int c;
2385 register uschar *code = *codeptr;
2386 uschar *last_code = code;
2387 uschar *orig_code = code;
2388 uschar *tempcode;
2389 BOOL inescq = FALSE;
2390 BOOL groupsetfirstbyte = FALSE;
2391 const uschar *ptr = *ptrptr;
2392 const uschar *tempptr;
2393 uschar *previous = NULL;
2394 uschar *previous_callout = NULL;
2395 uschar *save_hwm = NULL;
2396 uschar classbits[32];
2397
2398 #ifdef SUPPORT_UTF8
2399 BOOL class_utf8;
2400 BOOL utf8 = (options & PCRE_UTF8) != 0;
2401 uschar *class_utf8data;
2402 uschar *class_utf8data_base;
2403 uschar utf8_char[6];
2404 #else
2405 BOOL utf8 = FALSE;
2406 uschar *utf8_char = NULL;
2407 #endif
2408
2409 #ifdef DEBUG
2410 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2411 #endif
2412
2413 /* Set up the default and non-default settings for greediness */
2414
2415 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2416 greedy_non_default = greedy_default ^ 1;
2417
2418 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2419 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2420 matches a non-fixed char first char; reqbyte just remains unset if we never
2421 find one.
2422
2423 When we hit a repeat whose minimum is zero, we may have to adjust these values
2424 to take the zero repeat into account. This is implemented by setting them to
2425 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2426 item types that can be repeated set these backoff variables appropriately. */
2427
2428 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2429
2430 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2431 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2432 value > 255. It is added into the firstbyte or reqbyte variables to record the
2433 case status of the value. This is used only for ASCII characters. */
2434
2435 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2436
2437 /* Switch on next character until the end of the branch */
2438
2439 for (;; ptr++)
2440 {
2441 BOOL negate_class;
2442 BOOL should_flip_negation;
2443 BOOL possessive_quantifier;
2444 BOOL is_quantifier;
2445 BOOL is_recurse;
2446 BOOL reset_bracount;
2447 int class_charcount;
2448 int class_lastchar;
2449 int newoptions;
2450 int recno;
2451 int refsign;
2452 int skipbytes;
2453 int subreqbyte;
2454 int subfirstbyte;
2455 int terminator;
2456 int mclength;
2457 uschar mcbuffer[8];
2458
2459 /* Get next byte in the pattern */
2460
2461 c = *ptr;
2462
2463 /* If we are in the pre-compile phase, accumulate the length used for the
2464 previous cycle of this loop. */
2465
2466 if (lengthptr != NULL)
2467 {
2468 #ifdef DEBUG
2469 if (code > cd->hwm) cd->hwm = code; /* High water info */
2470 #endif
2471 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2472 {
2473 *errorcodeptr = ERR52;
2474 goto FAILED;
2475 }
2476
2477 /* There is at least one situation where code goes backwards: this is the
2478 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2479 the class is simply eliminated. However, it is created first, so we have to
2480 allow memory for it. Therefore, don't ever reduce the length at this point.
2481 */
2482
2483 if (code < last_code) code = last_code;
2484
2485 /* Paranoid check for integer overflow */
2486
2487 if (OFLOW_MAX - *lengthptr < code - last_code)
2488 {
2489 *errorcodeptr = ERR20;
2490 goto FAILED;
2491 }
2492
2493 *lengthptr += code - last_code;
2494 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2495
2496 /* If "previous" is set and it is not at the start of the work space, move
2497 it back to there, in order to avoid filling up the work space. Otherwise,
2498 if "previous" is NULL, reset the current code pointer to the start. */
2499
2500 if (previous != NULL)
2501 {
2502 if (previous > orig_code)
2503 {
2504 memmove(orig_code, previous, code - previous);
2505 code -= previous - orig_code;
2506 previous = orig_code;
2507 }
2508 }
2509 else code = orig_code;
2510
2511 /* Remember where this code item starts so we can pick up the length
2512 next time round. */
2513
2514 last_code = code;
2515 }
2516
2517 /* In the real compile phase, just check the workspace used by the forward
2518 reference list. */
2519
2520 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2521 {
2522 *errorcodeptr = ERR52;
2523 goto FAILED;
2524 }
2525
2526 /* If in \Q...\E, check for the end; if not, we have a literal */
2527
2528 if (inescq && c != 0)
2529 {
2530 if (c == '\\' && ptr[1] == 'E')
2531 {
2532 inescq = FALSE;
2533 ptr++;
2534 continue;
2535 }
2536 else
2537 {
2538 if (previous_callout != NULL)
2539 {
2540 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2541 complete_callout(previous_callout, ptr, cd);
2542 previous_callout = NULL;
2543 }
2544 if ((options & PCRE_AUTO_CALLOUT) != 0)
2545 {
2546 previous_callout = code;
2547 code = auto_callout(code, ptr, cd);
2548 }
2549 goto NORMAL_CHAR;
2550 }
2551 }
2552
2553 /* Fill in length of a previous callout, except when the next thing is
2554 a quantifier. */
2555
2556 is_quantifier = c == '*' || c == '+' || c == '?' ||
2557 (c == '{' && is_counted_repeat(ptr+1));
2558
2559 if (!is_quantifier && previous_callout != NULL &&
2560 after_manual_callout-- <= 0)
2561 {
2562 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2563 complete_callout(previous_callout, ptr, cd);
2564 previous_callout = NULL;
2565 }
2566
2567 /* In extended mode, skip white space and comments */
2568
2569 if ((options & PCRE_EXTENDED) != 0)
2570 {
2571 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2572 if (c == '#')
2573 {
2574 while (*(++ptr) != 0)
2575 {
2576 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2577 }
2578 if (*ptr != 0) continue;
2579
2580 /* Else fall through to handle end of string */
2581 c = 0;
2582 }
2583 }
2584
2585 /* No auto callout for quantifiers. */
2586
2587 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2588 {
2589 previous_callout = code;
2590 code = auto_callout(code, ptr, cd);
2591 }
2592
2593 switch(c)
2594 {
2595 /* ===================================================================*/
2596 case 0: /* The branch terminates at string end */
2597 case '|': /* or | or ) */
2598 case ')':
2599 *firstbyteptr = firstbyte;
2600 *reqbyteptr = reqbyte;
2601 *codeptr = code;
2602 *ptrptr = ptr;
2603 if (lengthptr != NULL)
2604 {
2605 if (OFLOW_MAX - *lengthptr < code - last_code)
2606 {
2607 *errorcodeptr = ERR20;
2608 goto FAILED;
2609 }
2610 *lengthptr += code - last_code; /* To include callout length */
2611 DPRINTF((">> end branch\n"));
2612 }
2613 return TRUE;
2614
2615
2616 /* ===================================================================*/
2617 /* Handle single-character metacharacters. In multiline mode, ^ disables
2618 the setting of any following char as a first character. */
2619
2620 case '^':
2621 if ((options & PCRE_MULTILINE) != 0)
2622 {
2623 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2624 }
2625 previous = NULL;
2626 *code++ = OP_CIRC;
2627 break;
2628
2629 case '$':
2630 previous = NULL;
2631 *code++ = OP_DOLL;
2632 break;
2633
2634 /* There can never be a first char if '.' is first, whatever happens about
2635 repeats. The value of reqbyte doesn't change either. */
2636
2637 case '.':
2638 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2639 zerofirstbyte = firstbyte;
2640 zeroreqbyte = reqbyte;
2641 previous = code;
2642 *code++ = OP_ANY;
2643 break;
2644
2645
2646 /* ===================================================================*/
2647 /* Character classes. If the included characters are all < 256, we build a
2648 32-byte bitmap of the permitted characters, except in the special case
2649 where there is only one such character. For negated classes, we build the
2650 map as usual, then invert it at the end. However, we use a different opcode
2651 so that data characters > 255 can be handled correctly.
2652
2653 If the class contains characters outside the 0-255 range, a different
2654 opcode is compiled. It may optionally have a bit map for characters < 256,
2655 but those above are explicitly listed afterwards. A flag byte tells
2656 whether the bitmap is present, and whether this is a negated class or not.
2657 */
2658
2659 case '[':
2660 previous = code;
2661
2662 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2663 they are encountered at the top level, so we'll do that too. */
2664
2665 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2666 check_posix_syntax(ptr, &tempptr))
2667 {
2668 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2669 goto FAILED;
2670 }
2671
2672 /* If the first character is '^', set the negation flag and skip it. Also,
2673 if the first few characters (either before or after ^) are \Q\E or \E we
2674 skip them too. This makes for compatibility with Perl. */
2675
2676 negate_class = FALSE;
2677 for (;;)
2678 {
2679 c = *(++ptr);
2680 if (c == '\\')
2681 {
2682 if (ptr[1] == 'E') ptr++;
2683 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2684 else break;
2685 }
2686 else if (!negate_class && c == '^')
2687 negate_class = TRUE;
2688 else break;
2689 }
2690
2691 /* If a class contains a negative special such as \S, we need to flip the
2692 negation flag at the end, so that support for characters > 255 works
2693 correctly (they are all included in the class). */
2694
2695 should_flip_negation = FALSE;
2696
2697 /* Keep a count of chars with values < 256 so that we can optimize the case
2698 of just a single character (as long as it's < 256). However, For higher
2699 valued UTF-8 characters, we don't yet do any optimization. */
2700
2701 class_charcount = 0;
2702 class_lastchar = -1;
2703
2704 /* Initialize the 32-char bit map to all zeros. We build the map in a
2705 temporary bit of memory, in case the class contains only 1 character (less
2706 than 256), because in that case the compiled code doesn't use the bit map.
2707 */
2708
2709 memset(classbits, 0, 32 * sizeof(uschar));
2710
2711 #ifdef SUPPORT_UTF8
2712 class_utf8 = FALSE; /* No chars >= 256 */
2713 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2714 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2715 #endif
2716
2717 /* Process characters until ] is reached. By writing this as a "do" it
2718 means that an initial ] is taken as a data character. At the start of the
2719 loop, c contains the first byte of the character. */
2720
2721 if (c != 0) do
2722 {
2723 const uschar *oldptr;
2724
2725 #ifdef SUPPORT_UTF8
2726 if (utf8 && c > 127)
2727 { /* Braces are required because the */
2728 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2729 }
2730
2731 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2732 data and reset the pointer. This is so that very large classes that
2733 contain a zillion UTF-8 characters no longer overwrite the work space
2734 (which is on the stack). */
2735
2736 if (lengthptr != NULL)
2737 {
2738 *lengthptr += class_utf8data - class_utf8data_base;
2739 class_utf8data = class_utf8data_base;
2740 }
2741
2742 #endif
2743
2744 /* Inside \Q...\E everything is literal except \E */
2745
2746 if (inescq)
2747 {
2748 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2749 {
2750 inescq = FALSE; /* Reset literal state */
2751 ptr++; /* Skip the 'E' */
2752 continue; /* Carry on with next */
2753 }
2754 goto CHECK_RANGE; /* Could be range if \E follows */
2755 }
2756
2757 /* Handle POSIX class names. Perl allows a negation extension of the
2758 form [:^name:]. A square bracket that doesn't match the syntax is
2759 treated as a literal. We also recognize the POSIX constructions
2760 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2761 5.6 and 5.8 do. */
2762
2763 if (c == '[' &&
2764 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2765 check_posix_syntax(ptr, &tempptr))
2766 {
2767 BOOL local_negate = FALSE;
2768 int posix_class, taboffset, tabopt;
2769 register const uschar *cbits = cd->cbits;
2770 uschar pbits[32];
2771
2772 if (ptr[1] != ':')
2773 {
2774 *errorcodeptr = ERR31;
2775 goto FAILED;
2776 }
2777
2778 ptr += 2;
2779 if (*ptr == '^')
2780 {
2781 local_negate = TRUE;
2782 should_flip_negation = TRUE; /* Note negative special */
2783 ptr++;
2784 }
2785
2786 posix_class = check_posix_name(ptr, tempptr - ptr);
2787 if (posix_class < 0)
2788 {
2789 *errorcodeptr = ERR30;
2790 goto FAILED;
2791 }
2792
2793 /* If matching is caseless, upper and lower are converted to
2794 alpha. This relies on the fact that the class table starts with
2795 alpha, lower, upper as the first 3 entries. */
2796
2797 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2798 posix_class = 0;
2799
2800 /* We build the bit map for the POSIX class in a chunk of local store
2801 because we may be adding and subtracting from it, and we don't want to
2802 subtract bits that may be in the main map already. At the end we or the
2803 result into the bit map that is being built. */
2804
2805 posix_class *= 3;
2806
2807 /* Copy in the first table (always present) */
2808
2809 memcpy(pbits, cbits + posix_class_maps[posix_class],
2810 32 * sizeof(uschar));
2811
2812 /* If there is a second table, add or remove it as required. */
2813
2814 taboffset = posix_class_maps[posix_class + 1];
2815 tabopt = posix_class_maps[posix_class + 2];
2816
2817 if (taboffset >= 0)
2818 {
2819 if (tabopt >= 0)
2820 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2821 else
2822 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2823 }
2824
2825 /* Now see if we need to remove any special characters. An option
2826 value of 1 removes vertical space and 2 removes underscore. */
2827
2828 if (tabopt < 0) tabopt = -tabopt;
2829 if (tabopt == 1) pbits[1] &= ~0x3c;
2830 else if (tabopt == 2) pbits[11] &= 0x7f;
2831
2832 /* Add the POSIX table or its complement into the main table that is
2833 being built and we are done. */
2834
2835 if (local_negate)
2836 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2837 else
2838 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2839
2840 ptr = tempptr + 1;
2841 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2842 continue; /* End of POSIX syntax handling */
2843 }
2844
2845 /* Backslash may introduce a single character, or it may introduce one
2846 of the specials, which just set a flag. The sequence \b is a special
2847 case. Inside a class (and only there) it is treated as backspace.
2848 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2849 to 'or' into the one we are building. We assume they have more than one
2850 character in them, so set class_charcount bigger than one. */
2851
2852 if (c == '\\')
2853 {
2854 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2855 if (*errorcodeptr != 0) goto FAILED;
2856
2857 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2858 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2859 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2860 else if (-c == ESC_Q) /* Handle start of quoted string */
2861 {
2862 if (ptr[1] == '\\' && ptr[2] == 'E')
2863 {
2864 ptr += 2; /* avoid empty string */
2865 }
2866 else inescq = TRUE;
2867 continue;
2868 }
2869 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2870
2871 if (c < 0)
2872 {
2873 register const uschar *cbits = cd->cbits;
2874 class_charcount += 2; /* Greater than 1 is what matters */
2875
2876 /* Save time by not doing this in the pre-compile phase. */
2877
2878 if (lengthptr == NULL) switch (-c)
2879 {
2880 case ESC_d:
2881 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2882 continue;
2883
2884 case ESC_D:
2885 should_flip_negation = TRUE;
2886 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2887 continue;
2888
2889 case ESC_w:
2890 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2891 continue;
2892
2893 case ESC_W:
2894 should_flip_negation = TRUE;
2895 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2896 continue;
2897
2898 case ESC_s:
2899 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2900 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2901 continue;
2902
2903 case ESC_S:
2904 should_flip_negation = TRUE;
2905 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2906 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2907 continue;
2908
2909 default: /* Not recognized; fall through */
2910 break; /* Need "default" setting to stop compiler warning. */
2911 }
2912
2913 /* In the pre-compile phase, just do the recognition. */
2914
2915 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2916 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2917
2918 /* We need to deal with \H, \h, \V, and \v in both phases because
2919 they use extra memory. */
2920
2921 if (-c == ESC_h)
2922 {
2923 SETBIT(classbits, 0x09); /* VT */
2924 SETBIT(classbits, 0x20); /* SPACE */
2925 SETBIT(classbits, 0xa0); /* NSBP */
2926 #ifdef SUPPORT_UTF8
2927 if (utf8)
2928 {
2929 class_utf8 = TRUE;
2930 *class_utf8data++ = XCL_SINGLE;
2931 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2932 *class_utf8data++ = XCL_SINGLE;
2933 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2934 *class_utf8data++ = XCL_RANGE;
2935 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2936 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2937 *class_utf8data++ = XCL_SINGLE;
2938 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2939 *class_utf8data++ = XCL_SINGLE;
2940 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2941 *class_utf8data++ = XCL_SINGLE;
2942 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2943 }
2944 #endif
2945 continue;
2946 }
2947
2948 if (-c == ESC_H)
2949 {
2950 for (c = 0; c < 32; c++)
2951 {
2952 int x = 0xff;
2953 switch (c)
2954 {
2955 case 0x09/8: x ^= 1 << (0x09%8); break;
2956 case 0x20/8: x ^= 1 << (0x20%8); break;
2957 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2958 default: break;
2959 }
2960 classbits[c] |= x;
2961 }
2962
2963 #ifdef SUPPORT_UTF8
2964 if (utf8)
2965 {
2966 class_utf8 = TRUE;
2967 *class_utf8data++ = XCL_RANGE;
2968 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2969 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2970 *class_utf8data++ = XCL_RANGE;
2971 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2972 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2973 *class_utf8data++ = XCL_RANGE;
2974 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2975 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2976 *class_utf8data++ = XCL_RANGE;
2977 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2978 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2979 *class_utf8data++ = XCL_RANGE;
2980 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2981 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2982 *class_utf8data++ = XCL_RANGE;
2983 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2984 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2985 *class_utf8data++ = XCL_RANGE;
2986 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2987 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2988 }
2989 #endif
2990 continue;
2991 }
2992
2993 if (-c == ESC_v)
2994 {
2995 SETBIT(classbits, 0x0a); /* LF */
2996 SETBIT(classbits, 0x0b); /* VT */
2997 SETBIT(classbits, 0x0c); /* FF */
2998 SETBIT(classbits, 0x0d); /* CR */
2999 SETBIT(classbits, 0x85); /* NEL */
3000 #ifdef SUPPORT_UTF8
3001 if (utf8)
3002 {
3003 class_utf8 = TRUE;
3004 *class_utf8data++ = XCL_RANGE;
3005 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3006 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3007 }
3008 #endif
3009 continue;
3010 }
3011
3012 if (-c == ESC_V)
3013 {
3014 for (c = 0; c < 32; c++)
3015 {
3016 int x = 0xff;
3017 switch (c)
3018 {
3019 case 0x0a/8: x ^= 1 << (0x0a%8);
3020 x ^= 1 << (0x0b%8);
3021 x ^= 1 << (0x0c%8);
3022 x ^= 1 << (0x0d%8);
3023 break;
3024 case 0x85/8: x ^= 1 << (0x85%8); break;
3025 default: break;
3026 }
3027 classbits[c] |= x;
3028 }
3029
3030 #ifdef SUPPORT_UTF8
3031 if (utf8)
3032 {
3033 class_utf8 = TRUE;
3034 *class_utf8data++ = XCL_RANGE;
3035 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3036 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3037 *class_utf8data++ = XCL_RANGE;
3038 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3039 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3040 }
3041 #endif
3042 continue;
3043 }
3044
3045 /* We need to deal with \P and \p in both phases. */
3046
3047 #ifdef SUPPORT_UCP
3048 if (-c == ESC_p || -c == ESC_P)
3049 {
3050 BOOL negated;
3051 int pdata;
3052 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3053 if (ptype < 0) goto FAILED;
3054 class_utf8 = TRUE;
3055 *class_utf8data++ = ((-c == ESC_p) != negated)?
3056 XCL_PROP : XCL_NOTPROP;
3057 *class_utf8data++ = ptype;
3058 *class_utf8data++ = pdata;
3059 class_charcount -= 2; /* Not a < 256 character */
3060 continue;
3061 }
3062 #endif
3063 /* Unrecognized escapes are faulted if PCRE is running in its
3064 strict mode. By default, for compatibility with Perl, they are
3065 treated as literals. */
3066
3067 if ((options & PCRE_EXTRA) != 0)
3068 {
3069 *errorcodeptr = ERR7;
3070 goto FAILED;
3071 }
3072
3073 class_charcount -= 2; /* Undo the default count from above */
3074 c = *ptr; /* Get the final character and fall through */
3075 }
3076
3077 /* Fall through if we have a single character (c >= 0). This may be
3078 greater than 256 in UTF-8 mode. */
3079
3080 } /* End of backslash handling */
3081
3082 /* A single character may be followed by '-' to form a range. However,
3083 Perl does not permit ']' to be the end of the range. A '-' character
3084 at the end is treated as a literal. Perl ignores orphaned \E sequences
3085 entirely. The code for handling \Q and \E is messy. */
3086
3087 CHECK_RANGE:
3088 while (ptr[1] == '\\' && ptr[2] == 'E')
3089 {
3090 inescq = FALSE;
3091 ptr += 2;
3092 }
3093
3094 oldptr = ptr;
3095
3096 /* Remember \r or \n */
3097
3098 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3099
3100 /* Check for range */
3101
3102 if (!inescq && ptr[1] == '-')
3103 {
3104 int d;
3105 ptr += 2;
3106 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3107
3108 /* If we hit \Q (not followed by \E) at this point, go into escaped
3109 mode. */
3110
3111 while (*ptr == '\\' && ptr[1] == 'Q')
3112 {
3113 ptr += 2;
3114 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3115 inescq = TRUE;
3116 break;
3117 }
3118
3119 if (*ptr == 0 || (!inescq && *ptr == ']'))
3120 {
3121 ptr = oldptr;
3122 goto LONE_SINGLE_CHARACTER;
3123 }
3124
3125 #ifdef SUPPORT_UTF8
3126 if (utf8)
3127 { /* Braces are required because the */
3128 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3129 }
3130 else
3131 #endif
3132 d = *ptr; /* Not UTF-8 mode */
3133
3134 /* The second part of a range can be a single-character escape, but
3135 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3136 in such circumstances. */
3137
3138 if (!inescq && d == '\\')
3139 {
3140 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3141 if (*errorcodeptr != 0) goto FAILED;
3142
3143 /* \b is backspace; \X is literal X; \R is literal R; any other
3144 special means the '-' was literal */
3145
3146 if (d < 0)
3147 {
3148 if (d == -ESC_b) d = '\b';
3149 else if (d == -ESC_X) d = 'X';
3150 else if (d == -ESC_R) d = 'R'; else
3151 {
3152 ptr = oldptr;
3153 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3154 }
3155 }
3156 }
3157
3158 /* Check that the two values are in the correct order. Optimize
3159 one-character ranges */
3160
3161 if (d < c)
3162 {
3163 *errorcodeptr = ERR8;
3164 goto FAILED;
3165 }
3166
3167 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3168
3169 /* Remember \r or \n */
3170
3171 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3172
3173 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3174 matching, we have to use an XCLASS with extra data items. Caseless
3175 matching for characters > 127 is available only if UCP support is
3176 available. */
3177
3178 #ifdef SUPPORT_UTF8
3179 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3180 {
3181 class_utf8 = TRUE;
3182
3183 /* With UCP support, we can find the other case equivalents of
3184 the relevant characters. There may be several ranges. Optimize how
3185 they fit with the basic range. */
3186
3187 #ifdef SUPPORT_UCP
3188 if ((options & PCRE_CASELESS) != 0)
3189 {
3190 unsigned int occ, ocd;
3191 unsigned int cc = c;
3192 unsigned int origd = d;
3193 while (get_othercase_range(&cc, origd, &occ, &ocd))
3194 {
3195 if (occ >= (unsigned int)c &&
3196 ocd <= (unsigned int)d)
3197 continue; /* Skip embedded ranges */
3198
3199 if (occ < (unsigned int)c &&
3200 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3201 { /* if there is overlap, */
3202 c = occ; /* noting that if occ < c */
3203 continue; /* we can't have ocd > d */
3204 } /* because a subrange is */
3205 if (ocd > (unsigned int)d &&
3206 occ <= (unsigned int)d + 1) /* always shorter than */
3207 { /* the basic range. */
3208 d = ocd;
3209 continue;
3210 }
3211
3212 if (occ == ocd)
3213 {
3214 *class_utf8data++ = XCL_SINGLE;
3215 }
3216 else
3217 {
3218 *class_utf8data++ = XCL_RANGE;
3219 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3220 }
3221 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3222 }
3223 }
3224 #endif /* SUPPORT_UCP */
3225
3226 /* Now record the original range, possibly modified for UCP caseless
3227 overlapping ranges. */
3228
3229 *class_utf8data++ = XCL_RANGE;
3230 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3231 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3232
3233 /* With UCP support, we are done. Without UCP support, there is no
3234 caseless matching for UTF-8 characters > 127; we can use the bit map
3235 for the smaller ones. */
3236
3237 #ifdef SUPPORT_UCP
3238 continue; /* With next character in the class */
3239 #else
3240 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3241
3242 /* Adjust upper limit and fall through to set up the map */
3243
3244 d = 127;
3245
3246 #endif /* SUPPORT_UCP */
3247 }
3248 #endif /* SUPPORT_UTF8 */
3249
3250 /* We use the bit map for all cases when not in UTF-8 mode; else
3251 ranges that lie entirely within 0-127 when there is UCP support; else
3252 for partial ranges without UCP support. */
3253
3254 class_charcount += d - c + 1;
3255 class_lastchar = d;
3256
3257 /* We can save a bit of time by skipping this in the pre-compile. */
3258
3259 if (lengthptr == NULL) for (; c <= d; c++)
3260 {
3261 classbits[c/8] |= (1 << (c&7));
3262 if ((options & PCRE_CASELESS) != 0)
3263 {
3264 int uc = cd->fcc[c]; /* flip case */
3265 classbits[uc/8] |= (1 << (uc&7));
3266 }
3267 }
3268
3269 continue; /* Go get the next char in the class */
3270 }
3271
3272 /* Handle a lone single character - we can get here for a normal
3273 non-escape char, or after \ that introduces a single character or for an
3274 apparent range that isn't. */
3275
3276 LONE_SINGLE_CHARACTER:
3277
3278 /* Handle a character that cannot go in the bit map */
3279
3280 #ifdef SUPPORT_UTF8
3281 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3282 {
3283 class_utf8 = TRUE;
3284 *class_utf8data++ = XCL_SINGLE;
3285 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3286
3287 #ifdef SUPPORT_UCP
3288 if ((options & PCRE_CASELESS) != 0)
3289 {
3290 unsigned int othercase;
3291 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3292 {
3293 *class_utf8data++ = XCL_SINGLE;
3294 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3295 }
3296 }
3297 #endif /* SUPPORT_UCP */
3298
3299 }
3300 else
3301 #endif /* SUPPORT_UTF8 */
3302
3303 /* Handle a single-byte character */
3304 {
3305 classbits[c/8] |= (1 << (c&7));
3306 if ((options & PCRE_CASELESS) != 0)
3307 {
3308 c = cd->fcc[c]; /* flip case */
3309 classbits[c/8] |= (1 << (c&7));
3310 }
3311 class_charcount++;
3312 class_lastchar = c;
3313 }
3314 }
3315
3316 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3317
3318 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3319
3320 if (c == 0) /* Missing terminating ']' */
3321 {
3322 *errorcodeptr = ERR6;
3323 goto FAILED;
3324 }
3325
3326
3327 /* This code has been disabled because it would mean that \s counts as
3328 an explicit \r or \n reference, and that's not really what is wanted. Now
3329 we set the flag only if there is a literal "\r" or "\n" in the class. */
3330
3331 #if 0
3332 /* Remember whether \r or \n are in this class */
3333
3334 if (negate_class)
3335 {
3336 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3337 }
3338 else
3339 {
3340 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3341 }
3342 #endif
3343
3344
3345 /* If class_charcount is 1, we saw precisely one character whose value is
3346 less than 256. As long as there were no characters >= 128 and there was no
3347 use of \p or \P, in other words, no use of any XCLASS features, we can
3348 optimize.
3349
3350 In UTF-8 mode, we can optimize the negative case only if there were no
3351 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3352 operate on single-bytes only. This is an historical hangover. Maybe one day
3353 we can tidy these opcodes to handle multi-byte characters.
3354
3355 The optimization throws away the bit map. We turn the item into a
3356 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3357 that OP_NOT does not support multibyte characters. In the positive case, it
3358 can cause firstbyte to be set. Otherwise, there can be no first char if
3359 this item is first, whatever repeat count may follow. In the case of
3360 reqbyte, save the previous value for reinstating. */
3361
3362 #ifdef SUPPORT_UTF8
3363 if (class_charcount == 1 && !class_utf8 &&
3364 (!utf8 || !negate_class || class_lastchar < 128))
3365 #else
3366 if (class_charcount == 1)
3367 #endif
3368 {
3369 zeroreqbyte = reqbyte;
3370
3371 /* The OP_NOT opcode works on one-byte characters only. */
3372
3373 if (negate_class)
3374 {
3375 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3376 zerofirstbyte = firstbyte;
3377 *code++ = OP_NOT;
3378 *code++ = class_lastchar;
3379 break;
3380 }
3381
3382 /* For a single, positive character, get the value into mcbuffer, and
3383 then we can handle this with the normal one-character code. */
3384
3385 #ifdef SUPPORT_UTF8
3386 if (utf8 && class_lastchar > 127)
3387 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3388 else
3389 #endif
3390 {
3391 mcbuffer[0] = class_lastchar;
3392 mclength = 1;
3393 }
3394 goto ONE_CHAR;
3395 } /* End of 1-char optimization */
3396
3397 /* The general case - not the one-char optimization. If this is the first
3398 thing in the branch, there can be no first char setting, whatever the
3399 repeat count. Any reqbyte setting must remain unchanged after any kind of
3400 repeat. */
3401
3402 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3403 zerofirstbyte = firstbyte;
3404 zeroreqbyte = reqbyte;
3405
3406 /* If there are characters with values > 255, we have to compile an
3407 extended class, with its own opcode, unless there was a negated special
3408 such as \S in the class, because in that case all characters > 255 are in
3409 the class, so any that were explicitly given as well can be ignored. If
3410 (when there are explicit characters > 255 that must be listed) there are no
3411 characters < 256, we can omit the bitmap in the actual compiled code. */
3412
3413 #ifdef SUPPORT_UTF8
3414 if (class_utf8 && !should_flip_negation)
3415 {
3416 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3417 *code++ = OP_XCLASS;
3418 code += LINK_SIZE;
3419 *code = negate_class? XCL_NOT : 0;
3420
3421 /* If the map is required, move up the extra data to make room for it;
3422 otherwise just move the code pointer to the end of the extra data. */
3423
3424 if (class_charcount > 0)
3425 {
3426 *code++ |= XCL_MAP;
3427 memmove(code + 32, code, class_utf8data - code);
3428 memcpy(code, classbits, 32);
3429 code = class_utf8data + 32;
3430 }
3431 else code = class_utf8data;
3432
3433 /* Now fill in the complete length of the item */
3434
3435 PUT(previous, 1, code - previous);
3436 break; /* End of class handling */
3437 }
3438 #endif
3439
3440 /* If there are no characters > 255, set the opcode to OP_CLASS or
3441 OP_NCLASS, depending on whether the whole class was negated and whether
3442 there were negative specials such as \S in the class. Then copy the 32-byte
3443 map into the code vector, negating it if necessary. */
3444
3445 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3446 if (negate_class)
3447 {
3448 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3449 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3450 }
3451 else
3452 {
3453 memcpy(code, classbits, 32);
3454 }
3455 code += 32;
3456 break;
3457
3458
3459 /* ===================================================================*/
3460 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3461 has been tested above. */
3462
3463 case '{':
3464 if (!is_quantifier) goto NORMAL_CHAR;
3465 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3466 if (*errorcodeptr != 0) goto FAILED;
3467 goto REPEAT;
3468
3469 case '*':
3470 repeat_min = 0;
3471 repeat_max = -1;
3472 goto REPEAT;
3473
3474 case '+':
3475 repeat_min = 1;
3476 repeat_max = -1;
3477 goto REPEAT;
3478
3479 case '?':
3480 repeat_min = 0;
3481 repeat_max = 1;
3482
3483 REPEAT:
3484 if (previous == NULL)
3485 {
3486 *errorcodeptr = ERR9;
3487 goto FAILED;
3488 }
3489
3490 if (repeat_min == 0)
3491 {
3492 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3493 reqbyte = zeroreqbyte; /* Ditto */
3494 }
3495
3496 /* Remember whether this is a variable length repeat */
3497
3498 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3499
3500 op_type = 0; /* Default single-char op codes */
3501 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3502
3503 /* Save start of previous item, in case we have to move it up to make space
3504 for an inserted OP_ONCE for the additional '+' extension. */
3505
3506 tempcode = previous;
3507
3508 /* If the next character is '+', we have a possessive quantifier. This
3509 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3510 If the next character is '?' this is a minimizing repeat, by default,
3511 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3512 repeat type to the non-default. */
3513
3514 if (ptr[1] == '+')
3515 {
3516 repeat_type = 0; /* Force greedy */
3517 possessive_quantifier = TRUE;
3518 ptr++;
3519 }
3520 else if (ptr[1] == '?')
3521 {
3522 repeat_type = greedy_non_default;
3523 ptr++;
3524 }
3525 else repeat_type = greedy_default;
3526
3527 /* If previous was a character match, abolish the item and generate a
3528 repeat item instead. If a char item has a minimum of more than one, ensure
3529 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3530 the first thing in a branch because the x will have gone into firstbyte
3531 instead. */
3532
3533 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3534 {
3535 /* Deal with UTF-8 characters that take up more than one byte. It's
3536 easier to write this out separately than try to macrify it. Use c to
3537 hold the length of the character in bytes, plus 0x80 to flag that it's a
3538 length rather than a small character. */
3539
3540 #ifdef SUPPORT_UTF8
3541 if (utf8 && (code[-1] & 0x80) != 0)
3542 {
3543 uschar *lastchar = code - 1;
3544 while((*lastchar & 0xc0) == 0x80) lastchar--;
3545 c = code - lastchar; /* Length of UTF-8 character */
3546 memcpy(utf8_char, lastchar, c); /* Save the char */
3547 c |= 0x80; /* Flag c as a length */
3548 }
3549 else
3550 #endif
3551
3552 /* Handle the case of a single byte - either with no UTF8 support, or
3553 with UTF-8 disabled, or for a UTF-8 character < 128. */
3554
3555 {
3556 c = code[-1];
3557 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3558 }
3559
3560 /* If the repetition is unlimited, it pays to see if the next thing on
3561 the line is something that cannot possibly match this character. If so,
3562 automatically possessifying this item gains some performance in the case
3563 where the match fails. */
3564
3565 if (!possessive_quantifier &&
3566 repeat_max < 0 &&
3567 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3568 options, cd))
3569 {
3570 repeat_type = 0; /* Force greedy */
3571 possessive_quantifier = TRUE;
3572 }
3573
3574 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3575 }
3576
3577 /* If previous was a single negated character ([^a] or similar), we use
3578 one of the special opcodes, replacing it. The code is shared with single-
3579 character repeats by setting op_type to add a suitable offset into
3580 repeat_type. We can also test for auto-possessification. OP_NOT is
3581 currently used only for single-byte chars. */
3582
3583 else if (*previous == OP_NOT)
3584 {
3585 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3586 c = previous[1];
3587 if (!possessive_quantifier &&
3588 repeat_max < 0 &&
3589 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3590 {
3591 repeat_type = 0; /* Force greedy */
3592 possessive_quantifier = TRUE;
3593 }
3594 goto OUTPUT_SINGLE_REPEAT;
3595 }
3596
3597 /* If previous was a character type match (\d or similar), abolish it and
3598 create a suitable repeat item. The code is shared with single-character
3599 repeats by setting op_type to add a suitable offset into repeat_type. Note
3600 that the Unicode property types will be present only when SUPPORT_UCP is
3601 defined, but we don't wrap the little bits of code here because it just
3602 makes it horribly messy. */
3603
3604 else if (*previous < OP_EODN)
3605 {
3606 uschar *oldcode;
3607 int prop_type, prop_value;
3608 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3609 c = *previous;
3610
3611 if (!possessive_quantifier &&
3612 repeat_max < 0 &&
3613 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3614 {
3615 repeat_type = 0; /* Force greedy */
3616 possessive_quantifier = TRUE;
3617 }
3618
3619 OUTPUT_SINGLE_REPEAT:
3620 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3621 {
3622 prop_type = previous[1];
3623 prop_value = previous[2];
3624 }
3625 else prop_type = prop_value = -1;
3626
3627 oldcode = code;
3628 code = previous; /* Usually overwrite previous item */
3629
3630 /* If the maximum is zero then the minimum must also be zero; Perl allows
3631 this case, so we do too - by simply omitting the item altogether. */
3632
3633 if (repeat_max == 0) goto END_REPEAT;
3634
3635 /* All real repeats make it impossible to handle partial matching (maybe
3636 one day we will be able to remove this restriction). */
3637
3638 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3639
3640 /* Combine the op_type with the repeat_type */
3641
3642 repeat_type += op_type;
3643
3644 /* A minimum of zero is handled either as the special case * or ?, or as
3645 an UPTO, with the maximum given. */
3646
3647 if (repeat_min == 0)
3648 {
3649 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3650 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3651 else
3652 {
3653 *code++ = OP_UPTO + repeat_type;
3654 PUT2INC(code, 0, repeat_max);
3655 }
3656 }
3657
3658 /* A repeat minimum of 1 is optimized into some special cases. If the
3659 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3660 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3661 one less than the maximum. */
3662
3663 else if (repeat_min == 1)
3664 {
3665 if (repeat_max == -1)
3666 *code++ = OP_PLUS + repeat_type;
3667 else
3668 {
3669 code = oldcode; /* leave previous item in place */
3670 if (repeat_max == 1) goto END_REPEAT;
3671 *code++ = OP_UPTO + repeat_type;
3672 PUT2INC(code, 0, repeat_max - 1);
3673 }
3674 }
3675
3676 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3677 handled as an EXACT followed by an UPTO. */
3678
3679 else
3680 {
3681 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3682 PUT2INC(code, 0, repeat_min);
3683
3684 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3685 we have to insert the character for the previous code. For a repeated
3686 Unicode property match, there are two extra bytes that define the
3687 required property. In UTF-8 mode, long characters have their length in
3688 c, with the 0x80 bit as a flag. */
3689
3690 if (repeat_max < 0)
3691 {
3692 #ifdef SUPPORT_UTF8
3693 if (utf8 && c >= 128)
3694 {
3695 memcpy(code, utf8_char, c & 7);
3696 code += c & 7;
3697 }
3698 else
3699 #endif
3700 {
3701 *code++ = c;
3702 if (prop_type >= 0)
3703 {
3704 *code++ = prop_type;
3705 *code++ = prop_value;
3706 }
3707 }
3708 *code++ = OP_STAR + repeat_type;
3709 }
3710
3711 /* Else insert an UPTO if the max is greater than the min, again
3712 preceded by the character, for the previously inserted code. If the
3713 UPTO is just for 1 instance, we can use QUERY instead. */
3714
3715 else if (repeat_max != repeat_min)
3716 {
3717 #ifdef SUPPORT_UTF8
3718 if (utf8 && c >= 128)
3719 {
3720 memcpy(code, utf8_char, c & 7);
3721 code += c & 7;
3722 }
3723 else
3724 #endif
3725 *code++ = c;
3726 if (prop_type >= 0)
3727 {
3728 *code++ = prop_type;
3729 *code++ = prop_value;
3730 }
3731 repeat_max -= repeat_min;
3732
3733 if (repeat_max == 1)
3734 {
3735 *code++ = OP_QUERY + repeat_type;
3736 }
3737 else
3738 {
3739 *code++ = OP_UPTO + repeat_type;
3740 PUT2INC(code, 0, repeat_max);
3741 }
3742 }
3743 }
3744
3745 /* The character or character type itself comes last in all cases. */
3746
3747 #ifdef SUPPORT_UTF8
3748 if (utf8 && c >= 128)
3749 {
3750 memcpy(code, utf8_char, c & 7);
3751 code += c & 7;
3752 }
3753 else
3754 #endif
3755 *code++ = c;
3756
3757 /* For a repeated Unicode property match, there are two extra bytes that
3758 define the required property. */
3759
3760 #ifdef SUPPORT_UCP
3761 if (prop_type >= 0)
3762 {
3763 *code++ = prop_type;
3764 *code++ = prop_value;
3765 }
3766 #endif
3767 }
3768
3769 /* If previous was a character class or a back reference, we put the repeat
3770 stuff after it, but just skip the item if the repeat was {0,0}. */
3771
3772 else if (*previous == OP_CLASS ||
3773 *previous == OP_NCLASS ||
3774 #ifdef SUPPORT_UTF8
3775 *previous == OP_XCLASS ||
3776 #endif
3777 *previous == OP_REF)
3778 {
3779 if (repeat_max == 0)
3780 {
3781 code = previous;
3782 goto END_REPEAT;
3783 }
3784
3785 /* All real repeats make it impossible to handle partial matching (maybe
3786 one day we will be able to remove this restriction). */
3787
3788 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3789
3790 if (repeat_min == 0 && repeat_max == -1)
3791 *code++ = OP_CRSTAR + repeat_type;
3792 else if (repeat_min == 1 && repeat_max == -1)
3793 *code++ = OP_CRPLUS + repeat_type;
3794 else if (repeat_min == 0 && repeat_max == 1)
3795 *code++ = OP_CRQUERY + repeat_type;
3796 else
3797 {
3798 *code++ = OP_CRRANGE + repeat_type;
3799 PUT2INC(code, 0, repeat_min);
3800 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3801 PUT2INC(code, 0, repeat_max);
3802 }
3803 }
3804
3805 /* If previous was a bracket group, we may have to replicate it in certain
3806 cases. */
3807
3808 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3809 *previous == OP_ONCE || *previous == OP_COND)
3810 {
3811 register int i;
3812 int ketoffset = 0;
3813 int len = code - previous;
3814 uschar *bralink = NULL;
3815
3816 /* Repeating a DEFINE group is pointless */
3817
3818 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3819 {
3820 *errorcodeptr = ERR55;
3821 goto FAILED;
3822 }
3823
3824 /* If the maximum repeat count is unlimited, find the end of the bracket
3825 by scanning through from the start, and compute the offset back to it
3826 from the current code pointer. There may be an OP_OPT setting following
3827 the final KET, so we can't find the end just by going back from the code
3828 pointer. */
3829
3830 if (repeat_max == -1)
3831 {
3832 register uschar *ket = previous;
3833 do ket += GET(ket, 1); while (*ket != OP_KET);
3834 ketoffset = code - ket;
3835 }
3836
3837 /* The case of a zero minimum is special because of the need to stick
3838 OP_BRAZERO in front of it, and because the group appears once in the
3839 data, whereas in other cases it appears the minimum number of times. For
3840 this reason, it is simplest to treat this case separately, as otherwise
3841 the code gets far too messy. There are several special subcases when the
3842 minimum is zero. */
3843
3844 if (repeat_min == 0)
3845 {
3846 /* If the maximum is also zero, we used to just omit the group from the
3847 output altogether, like this:
3848
3849 ** if (repeat_max == 0)
3850 ** {
3851 ** code = previous;
3852 ** goto END_REPEAT;
3853 ** }
3854
3855 However, that fails when a group is referenced as a subroutine from
3856 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3857 so that it is skipped on execution. As we don't have a list of which
3858 groups are referenced, we cannot do this selectively.
3859
3860 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3861 and do no more at this point. However, we do need to adjust any
3862 OP_RECURSE calls inside the group that refer to the group itself or any
3863 internal or forward referenced group, because the offset is from the
3864 start of the whole regex. Temporarily terminate the pattern while doing
3865 this. */
3866
3867 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
3868 {
3869 *code = OP_END;
3870 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3871 memmove(previous+1, previous, len);
3872 code++;
3873 if (repeat_max == 0)
3874 {
3875 *previous++ = OP_SKIPZERO;
3876 goto END_REPEAT;
3877 }
3878 *previous++ = OP_BRAZERO + repeat_type;
3879 }
3880
3881 /* If the maximum is greater than 1 and limited, we have to replicate
3882 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3883 The first one has to be handled carefully because it's the original
3884 copy, which has to be moved up. The remainder can be handled by code
3885 that is common with the non-zero minimum case below. We have to
3886 adjust the value of repeat_max, since one less copy is required. Once
3887 again, we may have to adjust any OP_RECURSE calls inside the group. */
3888
3889 else
3890 {
3891 int offset;
3892 *code = OP_END;
3893 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3894 memmove(previous + 2 + LINK_SIZE, previous, len);
3895 code += 2 + LINK_SIZE;
3896 *previous++ = OP_BRAZERO + repeat_type;
3897 *previous++ = OP_BRA;
3898
3899 /* We chain together the bracket offset fields that have to be
3900 filled in later when the ends of the brackets are reached. */
3901
3902 offset = (bralink == NULL)? 0 : previous - bralink;
3903 bralink = previous;
3904 PUTINC(previous, 0, offset);
3905 }
3906
3907 repeat_max--;
3908 }
3909
3910 /* If the minimum is greater than zero, replicate the group as many
3911 times as necessary, and adjust the maximum to the number of subsequent
3912 copies that we need. If we set a first char from the group, and didn't
3913 set a required char, copy the latter from the former. If there are any
3914 forward reference subroutine calls in the group, there will be entries on
3915 the workspace list; replicate these with an appropriate increment. */
3916
3917 else
3918 {
3919 if (repeat_min > 1)
3920 {
3921 /* In the pre-compile phase, we don't actually do the replication. We
3922 just adjust the length as if we had. Do some paranoid checks for
3923 potential integer overflow. */
3924
3925 if (lengthptr != NULL)
3926 {
3927 int delta = (repeat_min - 1)*length_prevgroup;
3928 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3929 (double)INT_MAX ||
3930 OFLOW_MAX - *lengthptr < delta)
3931 {
3932 *errorcodeptr = ERR20;
3933 goto FAILED;
3934 }
3935 *lengthptr += delta;
3936 }
3937
3938 /* This is compiling for real */
3939
3940 else
3941 {
3942 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3943 for (i = 1; i < repeat_min; i++)
3944 {
3945 uschar *hc;
3946 uschar *this_hwm = cd->hwm;
3947 memcpy(code, previous, len);
3948 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3949 {
3950 PUT(cd->hwm, 0, GET(hc, 0) + len);
3951 cd->hwm += LINK_SIZE;
3952 }
3953 save_hwm = this_hwm;
3954 code += len;
3955 }
3956 }
3957 }
3958
3959 if (repeat_max > 0) repeat_max -= repeat_min;
3960 }
3961
3962 /* This code is common to both the zero and non-zero minimum cases. If
3963 the maximum is limited, it replicates the group in a nested fashion,
3964 remembering the bracket starts on a stack. In the case of a zero minimum,
3965 the first one was set up above. In all cases the repeat_max now specifies
3966 the number of additional copies needed. Again, we must remember to
3967 replicate entries on the forward reference list. */
3968
3969 if (repeat_max >= 0)
3970 {
3971 /* In the pre-compile phase, we don't actually do the replication. We
3972 just adjust the length as if we had. For each repetition we must add 1
3973 to the length for BRAZERO and for all but the last repetition we must
3974 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3975 paranoid checks to avoid integer overflow. */
3976
3977 if (lengthptr != NULL && repeat_max > 0)
3978 {
3979 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3980 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3981 if ((double)repeat_max *
3982 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3983 > (double)INT_MAX ||
3984 OFLOW_MAX - *lengthptr < delta)
3985 {
3986 *errorcodeptr = ERR20;
3987 goto FAILED;
3988 }
3989 *lengthptr += delta;
3990 }
3991
3992 /* This is compiling for real */
3993
3994 else for (i = repeat_max - 1; i >= 0; i--)
3995 {
3996 uschar *hc;
3997 uschar *this_hwm = cd->hwm;
3998
3999 *code++ = OP_BRAZERO + repeat_type;
4000
4001 /* All but the final copy start a new nesting, maintaining the
4002 chain of brackets outstanding. */
4003
4004 if (i != 0)
4005 {
4006 int offset;
4007 *code++ = OP_BRA;
4008 offset = (bralink == NULL)? 0 : code - bralink;
4009 bralink = code;
4010 PUTINC(code, 0, offset);
4011 }
4012
4013 memcpy(code, previous, len);
4014 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4015 {
4016 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4017 cd->hwm += LINK_SIZE;
4018 }
4019 save_hwm = this_hwm;
4020 code += len;
4021 }
4022
4023 /* Now chain through the pending brackets, and fill in their length
4024 fields (which are holding the chain links pro tem). */
4025
4026 while (bralink != NULL)
4027 {
4028 int oldlinkoffset;
4029 int offset = code - bralink + 1;
4030 uschar *bra = code - offset;
4031 oldlinkoffset = GET(bra, 1);
4032 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4033 *code++ = OP_KET;
4034 PUTINC(code, 0, offset);
4035 PUT(bra, 1, offset);
4036 }
4037 }
4038
4039 /* If the maximum is unlimited, set a repeater in the final copy. We
4040 can't just offset backwards from the current code point, because we
4041 don't know if there's been an options resetting after the ket. The
4042 correct offset was computed above.
4043
4044 Then, when we are doing the actual compile phase, check to see whether
4045 this group is a non-atomic one that could match an empty string. If so,
4046 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4047 that runtime checking can be done. [This check is also applied to
4048 atomic groups at runtime, but in a different way.] */
4049
4050 else
4051 {
4052 uschar *ketcode = code - ketoffset;
4053 uschar *bracode = ketcode - GET(ketcode, 1);
4054 *ketcode = OP_KETRMAX + repeat_type;
4055 if (lengthptr == NULL && *bracode != OP_ONCE)
4056 {
4057 uschar *scode = bracode;
4058 do
4059 {
4060 if (could_be_empty_branch(scode, ketcode, utf8))
4061 {
4062 *bracode += OP_SBRA - OP_BRA;
4063 break;
4064 }
4065 scode += GET(scode, 1);
4066 }
4067 while (*scode == OP_ALT);
4068 }
4069 }
4070 }
4071
4072 /* Else there's some kind of shambles */
4073
4074 else
4075 {
4076 *errorcodeptr = ERR11;
4077 goto FAILED;
4078 }
4079
4080 /* If the character following a repeat is '+', or if certain optimization
4081 tests above succeeded, possessive_quantifier is TRUE. For some of the
4082 simpler opcodes, there is a special alternative opcode for this. For
4083 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4084 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4085 but the special opcodes can optimize it a bit. The repeated item starts at
4086 tempcode, not at previous, which might be the first part of a string whose
4087 (former) last char we repeated.
4088
4089 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4090 an 'upto' may follow. We skip over an 'exact' item, and then test the
4091 length of what remains before proceeding. */
4092
4093 if (possessive_quantifier)
4094 {
4095 int len;
4096 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4097 *tempcode == OP_NOTEXACT)
4098 tempcode += _pcre_OP_lengths[*tempcode] +
4099 ((*tempcode == OP_TYPEEXACT &&
4100 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4101 len = code - tempcode;
4102 if (len > 0) switch (*tempcode)
4103 {
4104 case OP_STAR: *tempcode = OP_POSSTAR; break;
4105 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4106 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4107 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4108
4109 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4110 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4111 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4112 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4113
4114 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4115 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4116 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4117 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4118
4119 default:
4120 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4121 code += 1 + LINK_SIZE;
4122 len += 1 + LINK_SIZE;
4123 tempcode[0] = OP_ONCE;
4124 *code++ = OP_KET;
4125 PUTINC(code, 0, len);
4126 PUT(tempcode, 1, len);
4127 break;
4128 }
4129 }
4130
4131 /* In all cases we no longer have a previous item. We also set the
4132 "follows varying string" flag for subsequently encountered reqbytes if
4133 it isn't already set and we have just passed a varying length item. */
4134
4135 END_REPEAT:
4136 previous = NULL;
4137 cd->req_varyopt |= reqvary;
4138 break;
4139
4140
4141 /* ===================================================================*/
4142 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4143 lookbehind or option setting or condition or all the other extended
4144 parenthesis forms. */
4145
4146 case '(':
4147 newoptions = options;
4148 skipbytes = 0;
4149 bravalue = OP_CBRA;
4150 save_hwm = cd->hwm;
4151 reset_bracount = FALSE;
4152
4153 /* First deal with various "verbs" that can be introduced by '*'. */
4154
4155 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4156 {
4157 int i, namelen;
4158 const char *vn = verbnames;
4159 const uschar *name = ++ptr;
4160 previous = NULL;
4161 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4162 if (*ptr == ':')
4163 {
4164 *errorcodeptr = ERR59; /* Not supported */
4165 goto FAILED;
4166 }
4167 if (*ptr != ')')
4168 {
4169 *errorcodeptr = ERR60;
4170 goto FAILED;
4171 }
4172 namelen = ptr - name;
4173 for (i = 0; i < verbcount; i++)
4174 {
4175 if (namelen == verbs[i].len &&
4176 strncmp((char *)name, vn, namelen) == 0)
4177 {
4178 *code = verbs[i].op;
4179 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4180 break;
4181 }
4182 vn += verbs[i].len + 1;
4183 }
4184 if (i < verbcount) continue;
4185 *errorcodeptr = ERR60;
4186 goto FAILED;
4187 }
4188
4189 /* Deal with the extended parentheses; all are introduced by '?', and the
4190 appearance of any of them means that this is not a capturing group. */
4191
4192 else if (*ptr == '?')
4193 {
4194 int i, set, unset, namelen;
4195 int *optset;
4196 const uschar *name;
4197 uschar *slot;
4198
4199 switch (*(++ptr))
4200 {
4201 case '#': /* Comment; skip to ket */
4202 ptr++;
4203 while (*ptr != 0 && *ptr != ')') ptr++;
4204 if (*ptr == 0)
4205 {
4206 *errorcodeptr = ERR18;
4207 goto FAILED;
4208 }
4209 continue;
4210
4211
4212 /* ------------------------------------------------------------ */
4213 case '|': /* Reset capture count for each branch */
4214 reset_bracount = TRUE;
4215 /* Fall through */
4216
4217 /* ------------------------------------------------------------ */
4218 case ':': /* Non-capturing bracket */
4219 bravalue = OP_BRA;
4220 ptr++;
4221 break;
4222
4223
4224 /* ------------------------------------------------------------ */
4225 case '(':
4226 bravalue = OP_COND; /* Conditional group */
4227
4228 /* A condition can be an assertion, a number (referring to a numbered
4229 group), a name (referring to a named group), or 'R', referring to
4230 recursion. R<digits> and R&name are also permitted for recursion tests.
4231
4232 There are several syntaxes for testing a named group: (?(name)) is used
4233 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4234
4235 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4236 be the recursive thing or the name 'R' (and similarly for 'R' followed
4237 by digits), and (b) a number could be a name that consists of digits.
4238 In both cases, we look for a name first; if not found, we try the other
4239 cases. */
4240
4241 /* For conditions that are assertions, check the syntax, and then exit
4242 the switch. This will take control down to where bracketed groups,
4243 including assertions, are processed. */
4244
4245 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4246 break;
4247
4248 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4249 below), and all need to skip 3 bytes at the start of the group. */
4250
4251 code[1+LINK_SIZE] = OP_CREF;
4252 skipbytes = 3;
4253 refsign = -1;
4254
4255 /* Check for a test for recursion in a named group. */
4256
4257 if (ptr[1] == 'R' && ptr[2] == '&')
4258 {
4259 terminator = -1;
4260 ptr += 2;
4261 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4262 }
4263
4264 /* Check for a test for a named group's having been set, using the Perl
4265 syntax (?(<name>) or (?('name') */
4266
4267 else if (ptr[1] == '<')
4268 {
4269 terminator = '>';
4270 ptr++;
4271 }
4272 else if (ptr[1] == '\'')
4273 {
4274 terminator = '\'';
4275 ptr++;
4276 }
4277 else
4278 {
4279 terminator = 0;
4280 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4281 }
4282
4283 /* We now expect to read a name; anything else is an error */
4284
4285 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4286 {
4287 ptr += 1; /* To get the right offset */
4288 *errorcodeptr = ERR28;
4289 goto FAILED;
4290 }
4291
4292 /* Read the name, but also get it as a number if it's all digits */
4293
4294 recno = 0;
4295 name = ++ptr;
4296 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4297 {
4298 if (recno >= 0)
4299 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4300 recno * 10 + *ptr - '0' : -1;
4301 ptr++;
4302 }
4303 namelen = ptr - name;
4304
4305 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4306 {
4307 ptr--; /* Error offset */
4308 *errorcodeptr = ERR26;
4309 goto FAILED;
4310 }
4311
4312 /* Do no further checking in the pre-compile phase. */
4313
4314 if (lengthptr != NULL) break;
4315
4316 /* In the real compile we do the work of looking for the actual
4317 reference. If the string started with "+" or "-" we require the rest to
4318 be digits, in which case recno will be set. */
4319
4320 if (refsign > 0)
4321 {
4322 if (recno <= 0)
4323 {
4324 *errorcodeptr = ERR58;
4325 goto FAILED;
4326 }
4327 recno = (refsign == '-')?
4328 cd->bracount - recno + 1 : recno +cd->bracount;
4329 if (recno <= 0 || recno > cd->final_bracount)
4330 {
4331 *errorcodeptr = ERR15;
4332 goto FAILED;
4333 }
4334 PUT2(code, 2+LINK_SIZE, recno);
4335 break;
4336 }
4337
4338 /* Otherwise (did not start with "+" or "-"), start by looking for the
4339 name. */
4340
4341 slot = cd->name_table;
4342 for (i = 0; i < cd->names_found; i++)
4343 {
4344 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4345 slot += cd->name_entry_size;
4346 }
4347
4348 /* Found a previous named subpattern */
4349
4350 if (i < cd->names_found)
4351 {
4352 recno = GET2(slot, 0);
4353 PUT2(code, 2+LINK_SIZE, recno);
4354 }
4355
4356 /* Search the pattern for a forward reference */
4357
4358 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4359 (options & PCRE_EXTENDED) != 0)) > 0)
4360 {
4361 PUT2(code, 2+LINK_SIZE, i);
4362 }
4363
4364 /* If terminator == 0 it means that the name followed directly after
4365 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4366 some further alternatives to try. For the cases where terminator != 0
4367 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4368 now checked all the possibilities, so give an error. */
4369
4370 else if (terminator != 0)
4371 {
4372 *errorcodeptr = ERR15;
4373 goto FAILED;
4374 }
4375
4376 /* Check for (?(R) for recursion. Allow digits after R to specify a
4377 specific group number. */
4378
4379 else if (*name == 'R')
4380 {
4381 recno = 0;
4382 for (i = 1; i < namelen; i++)
4383 {
4384 if ((digitab[name[i]] & ctype_digit) == 0)
4385 {
4386 *errorcodeptr = ERR15;
4387 goto FAILED;
4388 }
4389 recno = recno * 10 + name[i] - '0';
4390 }
4391 if (recno == 0) recno = RREF_ANY;
4392 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4393 PUT2(code, 2+LINK_SIZE, recno);
4394 }
4395
4396 /* Similarly, check for the (?(DEFINE) "condition", which is always
4397 false. */
4398
4399 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4400 {
4401 code[1+LINK_SIZE] = OP_DEF;
4402 skipbytes = 1;
4403 }
4404
4405 /* Check for the "name" actually being a subpattern number. We are
4406 in the second pass here, so final_bracount is set. */
4407
4408 else if (recno > 0 && recno <= cd->final_bracount)
4409 {
4410 PUT2(code, 2+LINK_SIZE, recno);
4411 }
4412
4413 /* Either an unidentified subpattern, or a reference to (?(0) */
4414
4415 else
4416 {
4417 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4418 goto FAILED;
4419 }
4420 break;
4421
4422
4423 /* ------------------------------------------------------------ */
4424 case '=': /* Positive lookahead */
4425 bravalue = OP_ASSERT;
4426 ptr++;
4427 break;
4428
4429
4430 /* ------------------------------------------------------------ */
4431 case '!': /* Negative lookahead */
4432 ptr++;
4433 if (*ptr == ')') /* Optimize (?!) */
4434 {
4435 *code++ = OP_FAIL;
4436 previous = NULL;
4437 continue;
4438 }
4439 bravalue = OP_ASSERT_NOT;
4440 break;
4441
4442
4443 /* ------------------------------------------------------------ */
4444 case '<': /* Lookbehind or named define */
4445 switch (ptr[1])
4446 {
4447 case '=': /* Positive lookbehind */
4448 bravalue = OP_ASSERTBACK;
4449 ptr += 2;
4450 break;
4451
4452 case '!': /* Negative lookbehind */
4453 bravalue = OP_ASSERTBACK_NOT;
4454 ptr += 2;
4455 break;
4456
4457 default: /* Could be name define, else bad */
4458 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4459 ptr++; /* Correct offset for error */
4460 *errorcodeptr = ERR24;
4461 goto FAILED;
4462 }
4463 break;
4464
4465
4466 /* ------------------------------------------------------------ */
4467 case '>': /* One-time brackets */
4468 bravalue = OP_ONCE;
4469 ptr++;
4470 break;
4471
4472
4473 /* ------------------------------------------------------------ */
4474 case 'C': /* Callout - may be followed by digits; */
4475 previous_callout = code; /* Save for later completion */
4476 after_manual_callout = 1; /* Skip one item before completing */
4477 *code++ = OP_CALLOUT;
4478 {
4479 int n = 0;
4480 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4481 n = n * 10 + *ptr - '0';
4482 if (*ptr != ')')
4483 {
4484 *errorcodeptr = ERR39;
4485 goto FAILED;
4486 }
4487 if (n > 255)
4488 {
4489 *errorcodeptr = ERR38;
4490 goto FAILED;
4491 }
4492 *code++ = n;
4493 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4494 PUT(code, LINK_SIZE, 0); /* Default length */
4495 code += 2 * LINK_SIZE;
4496 }
4497 previous = NULL;
4498 continue;
4499
4500
4501 /* ------------------------------------------------------------ */
4502 case 'P': /* Python-style named subpattern handling */
4503 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4504 {
4505 is_recurse = *ptr == '>';
4506 terminator = ')';
4507 goto NAMED_REF_OR_RECURSE;
4508 }
4509 else if (*ptr != '<') /* Test for Python-style definition */
4510 {
4511 *errorcodeptr = ERR41;
4512 goto FAILED;
4513 }
4514 /* Fall through to handle (?P< as (?< is handled */
4515
4516
4517 /* ------------------------------------------------------------ */
4518 DEFINE_NAME: /* Come here from (?< handling */
4519 case '\'':
4520 {
4521 terminator = (*ptr == '<')? '>' : '\'';
4522 name = ++ptr;
4523
4524 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4525 namelen = ptr - name;
4526
4527 /* In the pre-compile phase, just do a syntax check. */
4528
4529 if (lengthptr != NULL)
4530 {
4531 if (*ptr != terminator)
4532 {
4533 *errorcodeptr = ERR42;
4534 goto FAILED;
4535 }
4536 if (cd->names_found >= MAX_NAME_COUNT)
4537 {
4538 *errorcodeptr = ERR49;
4539 goto FAILED;
4540 }
4541 if (namelen + 3 > cd->name_entry_size)
4542 {
4543 cd->name_entry_size = namelen + 3;
4544 if (namelen > MAX_NAME_SIZE)
4545 {
4546 *errorcodeptr = ERR48;
4547 goto FAILED;
4548 }
4549 }
4550 }
4551
4552 /* In the real compile, create the entry in the table */
4553
4554 else
4555 {
4556 slot = cd->name_table;
4557 for (i = 0; i < cd->names_found; i++)
4558 {
4559 int crc = memcmp(name, slot+2, namelen);
4560 if (crc == 0)
4561 {
4562 if (slot[2+namelen] == 0)
4563 {
4564 if ((options & PCRE_DUPNAMES) == 0)
4565 {
4566 *errorcodeptr = ERR43;
4567 goto FAILED;
4568 }
4569 }
4570 else crc = -1; /* Current name is substring */
4571 }
4572 if (crc < 0)
4573 {
4574 memmove(slot + cd->name_entry_size, slot,
4575 (cd->names_found - i) * cd->name_entry_size);
4576 break;
4577 }
4578 slot += cd->name_entry_size;
4579 }
4580
4581 PUT2(slot, 0, cd->bracount + 1);
4582 memcpy(slot + 2, name, namelen);
4583 slot[2+namelen] = 0;
4584 }
4585 }
4586
4587 /* In both cases, count the number of names we've encountered. */
4588
4589 ptr++; /* Move past > or ' */
4590 cd->names_found++;
4591 goto NUMBERED_GROUP;
4592
4593
4594 /* ------------------------------------------------------------ */
4595 case '&': /* Perl recursion/subroutine syntax */
4596 terminator = ')';
4597 is_recurse = TRUE;
4598 /* Fall through */
4599
4600 /* We come here from the Python syntax above that handles both
4601 references (?P=name) and recursion (?P>name), as well as falling
4602 through from the Perl recursion syntax (?&name). We also come here from
4603 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4604 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4605
4606 NAMED_REF_OR_RECURSE:
4607 name = ++ptr;
4608 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4609 namelen = ptr - name;
4610
4611 /* In the pre-compile phase, do a syntax check and set a dummy
4612 reference number. */
4613
4614 if (lengthptr != NULL)
4615 {
4616 if (namelen == 0)
4617 {
4618 *errorcodeptr = ERR62;
4619 goto FAILED;
4620 }
4621 if (*ptr != terminator)
4622 {
4623 *errorcodeptr = ERR42;
4624 goto FAILED;
4625 }
4626 if (namelen > MAX_NAME_SIZE)
4627 {
4628 *errorcodeptr = ERR48;
4629 goto FAILED;
4630 }
4631 recno = 0;
4632 }
4633
4634 /* In the real compile, seek the name in the table. We check the name
4635 first, and then check that we have reached the end of the name in the
4636 table. That way, if the name is longer than any in the table,
4637 the comparison will fail without reading beyond the table entry. */
4638
4639 else
4640 {
4641 slot = cd->name_table;
4642 for (i = 0; i < cd->names_found; i++)
4643 {
4644 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4645 slot[2+namelen] == 0)
4646 break;
4647 slot += cd->name_entry_size;
4648 }
4649
4650 if (i < cd->names_found) /* Back reference */
4651 {
4652 recno = GET2(slot, 0);
4653 }
4654 else if ((recno = /* Forward back reference */
4655 find_parens(ptr, cd->bracount, name, namelen,
4656 (options & PCRE_EXTENDED) != 0)) <= 0)
4657 {
4658 *errorcodeptr = ERR15;
4659 goto FAILED;
4660 }
4661 }
4662
4663 /* In both phases, we can now go to the code that handles numerical
4664 recursion or backreferences. */
4665
4666 if (is_recurse) goto HANDLE_RECURSION;
4667 else goto HANDLE_REFERENCE;
4668
4669
4670 /* ------------------------------------------------------------ */
4671 case 'R': /* Recursion */
4672 ptr++; /* Same as (?0) */
4673 /* Fall through */
4674
4675
4676 /* ------------------------------------------------------------ */
4677 case '-': case '+':
4678 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4679 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4680 {
4681 const uschar *called;
4682 terminator = ')';
4683
4684 /* Come here from the \g<...> and \g'...' code (Oniguruma
4685 compatibility). However, the syntax has been checked to ensure that
4686 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4687 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4688 ever be taken. */
4689
4690 HANDLE_NUMERICAL_RECURSION:
4691
4692 if ((refsign = *ptr) == '+')
4693 {
4694 ptr++;
4695 if ((digitab[*ptr] & ctype_digit) == 0)
4696 {
4697 *errorcodeptr = ERR63;
4698 goto FAILED;
4699 }
4700 }
4701 else if (refsign == '-')
4702 {
4703 if ((digitab[ptr[1]] & ctype_digit) == 0)
4704 goto OTHER_CHAR_AFTER_QUERY;
4705 ptr++;
4706 }
4707
4708 recno = 0;
4709 while((digitab[*ptr] & ctype_digit) != 0)
4710 recno = recno * 10 + *ptr++ - '0';
4711
4712 if (*ptr != terminator)
4713 {
4714 *errorcodeptr = ERR29;
4715 goto FAILED;
4716 }
4717
4718 if (refsign == '-')
4719 {
4720 if (recno == 0)
4721 {
4722 *errorcodeptr = ERR58;
4723 goto FAILED;
4724 }
4725 recno = cd->bracount - recno + 1;
4726 if (recno <= 0)
4727 {
4728 *errorcodeptr = ERR15;
4729 goto FAILED;
4730 }
4731 }
4732 else if (refsign == '+')
4733 {
4734 if (recno == 0)
4735 {
4736 *errorcodeptr = ERR58;
4737 goto FAILED;
4738 }
4739 recno += cd->bracount;
4740 }
4741
4742 /* Come here from code above that handles a named recursion */
4743
4744 HANDLE_RECURSION:
4745
4746 previous = code;
4747 called = cd->start_code;
4748
4749 /* When we are actually compiling, find the bracket that is being
4750 referenced. Temporarily end the regex in case it doesn't exist before
4751 this point. If we end up with a forward reference, first check that
4752 the bracket does occur later so we can give the error (and position)
4753 now. Then remember this forward reference in the workspace so it can
4754 be filled in at the end. */
4755
4756 if (lengthptr == NULL)
4757 {
4758 *code = OP_END;
4759 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4760
4761 /* Forward reference */
4762
4763 if (called == NULL)
4764 {
4765 if (find_parens(ptr, cd->bracount, NULL, recno,
4766 (options & PCRE_EXTENDED) != 0) < 0)
4767 {
4768 *errorcodeptr = ERR15;
4769 goto FAILED;
4770 }
4771 called = cd->start_code + recno;
4772 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4773 }
4774
4775 /* If not a forward reference, and the subpattern is still open,
4776 this is a recursive call. We check to see if this is a left
4777 recursion that could loop for ever, and diagnose that case. */
4778
4779 else if (GET(called, 1) == 0 &&
4780 could_be_empty(called, code, bcptr, utf8))
4781 {
4782 *errorcodeptr = ERR40;
4783 goto FAILED;
4784 }
4785 }
4786
4787 /* Insert the recursion/subroutine item, automatically wrapped inside
4788 "once" brackets. Set up a "previous group" length so that a
4789 subsequent quantifier will work. */
4790
4791 *code = OP_ONCE;
4792 PUT(code, 1, 2 + 2*LINK_SIZE);
4793 code += 1 + LINK_SIZE;
4794
4795 *code = OP_RECURSE;
4796 PUT(code, 1, called - cd->start_code);
4797 code += 1 + LINK_SIZE;
4798
4799 *code = OP_KET;
4800 PUT(code, 1, 2 + 2*LINK_SIZE);
4801 code += 1 + LINK_SIZE;
4802
4803 length_prevgroup = 3 + 3*LINK_SIZE;
4804 }
4805
4806 /* Can't determine a first byte now */
4807
4808 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4809 continue;
4810
4811
4812 /* ------------------------------------------------------------ */
4813 default: /* Other characters: check option setting */
4814 OTHER_CHAR_AFTER_QUERY:
4815 set = unset = 0;
4816 optset = &set;
4817
4818 while (*ptr != ')' && *ptr != ':')
4819 {
4820 switch (*ptr++)
4821 {
4822 case '-': optset = &unset; break;
4823
4824 case 'J': /* Record that it changed in the external options */
4825 *optset |= PCRE_DUPNAMES;
4826 cd->external_flags |= PCRE_JCHANGED;
4827 break;
4828
4829 case 'i': *optset |= PCRE_CASELESS; break;
4830 case 'm': *optset |= PCRE_MULTILINE; break;
4831 case 's': *optset |= PCRE_DOTALL; break;
4832 case 'x': *optset |= PCRE_EXTENDED; break;
4833 case 'U': *optset |= PCRE_UNGREEDY; break;
4834 case 'X': *optset |= PCRE_EXTRA; break;
4835
4836 default: *errorcodeptr = ERR12;
4837 ptr--; /* Correct the offset */
4838 goto FAILED;
4839 }
4840 }
4841
4842 /* Set up the changed option bits, but don't change anything yet. */
4843
4844 newoptions = (options | set) & (~unset);
4845
4846 /* If the options ended with ')' this is not the start of a nested
4847 group with option changes, so the options change at this level. If this
4848 item is right at the start of the pattern, the options can be
4849 abstracted and made external in the pre-compile phase, and ignored in
4850 the compile phase. This can be helpful when matching -- for instance in
4851 caseless checking of required bytes.
4852
4853 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4854 definitely *not* at the start of the pattern because something has been
4855 compiled. In the pre-compile phase, however, the code pointer can have
4856 that value after the start, because it gets reset as code is discarded
4857 during the pre-compile. However, this can happen only at top level - if
4858 we are within parentheses, the starting BRA will still be present. At
4859 any parenthesis level, the length value can be used to test if anything
4860 has been compiled at that level. Thus, a test for both these conditions
4861 is necessary to ensure we correctly detect the start of the pattern in
4862 both phases.
4863
4864 If we are not at the pattern start, compile code to change the ims
4865 options if this setting actually changes any of them. We also pass the
4866 new setting back so that it can be put at the start of any following
4867 branches, and when this group ends (if we are in a group), a resetting
4868 item can be compiled. */
4869
4870 if (*ptr == ')')
4871 {
4872 if (code == cd->start_code + 1 + LINK_SIZE &&
4873 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4874 {
4875 cd->external_options = newoptions;
4876 options = newoptions;
4877 }
4878 else
4879 {
4880 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4881 {
4882 *code++ = OP_OPT;
4883 *code++ = newoptions & PCRE_IMS;
4884 }
4885
4886 /* Change options at this level, and pass them back for use
4887 in subsequent branches. Reset the greedy defaults and the case
4888 value for firstbyte and reqbyte. */
4889
4890 *optionsptr = options = newoptions;
4891 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4892 greedy_non_default = greedy_default ^ 1;
4893 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4894 }
4895
4896 previous = NULL; /* This item can't be repeated */
4897 continue; /* It is complete */
4898 }
4899
4900 /* If the options ended with ':' we are heading into a nested group
4901 with possible change of options. Such groups are non-capturing and are
4902 not assertions of any kind. All we need to do is skip over the ':';
4903 the newoptions value is handled below. */
4904
4905 bravalue = OP_BRA;
4906 ptr++;
4907 } /* End of switch for character following (? */
4908 } /* End of (? handling */
4909
4910 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4911 all unadorned brackets become non-capturing and behave like (?:...)
4912 brackets. */
4913
4914 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4915 {
4916 bravalue = OP_BRA;
4917 }
4918
4919 /* Else we have a capturing group. */
4920
4921 else
4922 {
4923 NUMBERED_GROUP:
4924 cd->bracount += 1;
4925 PUT2(code, 1+LINK_SIZE, cd->bracount);
4926 skipbytes = 2;
4927 }
4928
4929 /* Process nested bracketed regex. Assertions may not be repeated, but
4930 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4931 non-register variable in order to be able to pass its address because some
4932 compilers complain otherwise. Pass in a new setting for the ims options if
4933 they have changed. */
4934
4935 previous = (bravalue >= OP_ONCE)? code : NULL;
4936 *code = bravalue;
4937 tempcode = code;
4938 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4939 length_prevgroup = 0; /* Initialize for pre-compile phase */
4940
4941 if (!compile_regex(
4942 newoptions, /* The complete new option state */
4943 options & PCRE_IMS, /* The previous ims option state */
4944 &tempcode, /* Where to put code (updated) */
4945 &ptr, /* Input pointer (updated) */
4946 errorcodeptr, /* Where to put an error message */
4947 (bravalue == OP_ASSERTBACK ||
4948 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4949 reset_bracount, /* True if (?| group */
4950 skipbytes, /* Skip over bracket number */
4951 &subfirstbyte, /* For possible first char */
4952 &subreqbyte, /* For possible last char */
4953 bcptr, /* Current branch chain */
4954 cd, /* Tables block */
4955 (lengthptr == NULL)? NULL : /* Actual compile phase */
4956 &length_prevgroup /* Pre-compile phase */
4957 ))
4958 goto FAILED;
4959
4960 /* At the end of compiling, code is still pointing to the start of the
4961 group, while tempcode has been updated to point past the end of the group
4962 and any option resetting that may follow it. The pattern pointer (ptr)
4963 is on the bracket. */
4964
4965 /* If this is a conditional bracket, check that there are no more than
4966 two branches in the group, or just one if it's a DEFINE group. We do this
4967 in the real compile phase, not in the pre-pass, where the whole group may
4968 not be available. */
4969
4970 if (bravalue == OP_COND && lengthptr == NULL)
4971 {
4972 uschar *tc = code;
4973 int condcount = 0;
4974
4975 do {
4976 condcount++;
4977 tc += GET(tc,1);
4978 }
4979 while (*tc != OP_KET);
4980
4981 /* A DEFINE group is never obeyed inline (the "condition" is always
4982 false). It must have only one branch. */
4983
4984 if (code[LINK_SIZE+1] == OP_DEF)
4985 {
4986 if (condcount > 1)
4987 {
4988 *errorcodeptr = ERR54;
4989 goto FAILED;
4990 }
4991 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4992 }
4993
4994 /* A "normal" conditional group. If there is just one branch, we must not
4995 make use of its firstbyte or reqbyte, because this is equivalent to an
4996 empty second branch. */
4997
4998 else
4999 {
5000 if (condcount > 2)
5001 {
5002 *errorcodeptr = ERR27;
5003 goto FAILED;
5004 }
5005 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5006 }
5007 }
5008
5009 /* Error if hit end of pattern */
5010
5011 if (*ptr != ')')
5012 {
5013 *errorcodeptr = ERR14;
5014 goto FAILED;
5015 }
5016
5017 /* In the pre-compile phase, update the length by the length of the group,
5018 less the brackets at either end. Then reduce the compiled code to just a
5019 set of non-capturing brackets so that it doesn't use much memory if it is
5020 duplicated by a quantifier.*/
5021
5022 if (lengthptr != NULL)
5023 {
5024 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5025 {
5026 *errorcodeptr = ERR20;
5027 goto FAILED;
5028 }
5029 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5030 *code++ = OP_BRA;
5031 PUTINC(code, 0, 1 + LINK_SIZE);
5032 *code++ = OP_KET;
5033 PUTINC(code, 0, 1 + LINK_SIZE);
5034 break; /* No need to waste time with special character handling */
5035 }
5036
5037 /* Otherwise update the main code pointer to the end of the group. */
5038
5039 code = tempcode;
5040
5041 /* For a DEFINE group, required and first character settings are not
5042 relevant. */
5043
5044 if (bravalue == OP_DEF) break;
5045
5046 /* Handle updating of the required and first characters for other types of
5047 group. Update for normal brackets of all kinds, and conditions with two
5048 branches (see code above). If the bracket is followed by a quantifier with
5049 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5050 zerofirstbyte outside the main loop so that they can be accessed for the
5051 back off. */
5052
5053 zeroreqbyte = reqbyte;
5054 zerofirstbyte = firstbyte;
5055 groupsetfirstbyte = FALSE;
5056
5057 if (bravalue >= OP_ONCE)
5058 {
5059 /* If we have not yet set a firstbyte in this branch, take it from the
5060 subpattern, remembering that it was set here so that a repeat of more
5061 than one can replicate it as reqbyte if necessary. If the subpattern has
5062 no firstbyte, set "none" for the whole branch. In both cases, a zero
5063 repeat forces firstbyte to "none". */
5064
5065 if (firstbyte == REQ_UNSET)
5066 {
5067 if (subfirstbyte >= 0)
5068 {
5069 firstbyte = subfirstbyte;
5070 groupsetfirstbyte = TRUE;
5071 }
5072 else firstbyte = REQ_NONE;
5073 zerofirstbyte = REQ_NONE;
5074 }
5075
5076 /* If firstbyte was previously set, convert the subpattern's firstbyte
5077 into reqbyte if there wasn't one, using the vary flag that was in
5078 existence beforehand. */
5079
5080 else if (subfirstbyte >= 0 && subreqbyte < 0)
5081 subreqbyte = subfirstbyte | tempreqvary;
5082
5083 /* If the subpattern set a required byte (or set a first byte that isn't
5084 really the first byte - see above), set it. */
5085
5086 if (subreqbyte >= 0) reqbyte = subreqbyte;
5087 }
5088
5089 /* For a forward assertion, we take the reqbyte, if set. This can be
5090 helpful if the pattern that follows the assertion doesn't set a different
5091 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5092 for an assertion, however, because it leads to incorrect effects for patterns
5093 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5094 of a firstbyte. This is overcome by a scan at the end if there's no
5095 firstbyte, looking for an asserted first char. */
5096
5097 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5098 break; /* End of processing '(' */
5099
5100
5101 /* ===================================================================*/
5102 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5103 are arranged to be the negation of the corresponding OP_values. For the
5104 back references, the values are ESC_REF plus the reference number. Only
5105 back references and those types that consume a character may be repeated.
5106 We can test for values between ESC_b and ESC_Z for the latter; this may
5107 have to change if any new ones are ever created. */
5108
5109 case '\\':
5110 tempptr = ptr;
5111 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5112 if (*errorcodeptr != 0) goto FAILED;
5113
5114 if (c < 0)
5115 {
5116 if (-c == ESC_Q) /* Handle start of quoted string */
5117 {
5118 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5119 else inescq = TRUE;
5120 continue;
5121 }
5122
5123 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5124
5125 /* For metasequences that actually match a character, we disable the
5126 setting of a first character if it hasn't already been set. */
5127
5128 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5129 firstbyte = REQ_NONE;
5130
5131 /* Set values to reset to if this is followed by a zero repeat. */
5132
5133 zerofirstbyte = firstbyte;
5134 zeroreqbyte = reqbyte;
5135
5136 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5137 is a subroutine call by number (Oniguruma syntax). In fact, the value
5138 -ESC_g is returned only for these cases. So we don't need to check for <
5139 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5140 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5141 that is a synonym for a named back reference). */
5142
5143 if (-c == ESC_g)
5144 {
5145 const uschar *p;
5146 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5147 terminator = (*(++ptr) == '<')? '>' : '\'';
5148
5149 /* These two statements stop the compiler from warning about possibly
5150 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5151 fact, because we actually check for a number below, the paths that
5152 would actually be in error are never taken. */
5153
5154 skipbytes = 0;
5155 reset_bracount = FALSE;
5156
5157 /* Test for a name */
5158
5159 if (ptr[1] != '+' && ptr[1] != '-')
5160 {
5161 BOOL isnumber = TRUE;
5162 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5163 {
5164 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5165 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5166 }
5167 if (*p != terminator)
5168 {
5169 *errorcodeptr = ERR57;
5170 break;
5171 }
5172 if (isnumber)
5173 {
5174 ptr++;
5175 goto HANDLE_NUMERICAL_RECURSION;
5176 }
5177 is_recurse = TRUE;
5178 goto NAMED_REF_OR_RECURSE;
5179 }
5180
5181 /* Test a signed number in angle brackets or quotes. */
5182
5183 p = ptr + 2;
5184 while ((digitab[*p] & ctype_digit) != 0) p++;
5185 if (*p != terminator)
5186 {
5187 *errorcodeptr = ERR57;
5188 break;
5189 }
5190 ptr++;
5191 goto HANDLE_NUMERICAL_RECURSION;
5192 }
5193
5194 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5195 We also support \k{name} (.NET syntax) */
5196
5197 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5198 {
5199 is_recurse = FALSE;
5200 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5201 goto NAMED_REF_OR_RECURSE;
5202 }
5203
5204 /* Back references are handled specially; must disable firstbyte if
5205 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5206 ':' later. */
5207
5208 if (-c >= ESC_REF)
5209 {
5210 recno = -c - ESC_REF;
5211
5212 HANDLE_REFERENCE: /* Come here from named backref handling */
5213 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5214 previous = code;
5215 *code++ = OP_REF;
5216 PUT2INC(code, 0, recno);
5217 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5218 if (recno > cd->top_backref) cd->top_backref = recno;
5219 }
5220
5221 /* So are Unicode property matches, if supported. */
5222
5223 #ifdef SUPPORT_UCP
5224 else if (-c == ESC_P || -c == ESC_p)
5225 {
5226 BOOL negated;
5227 int pdata;
5228 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5229 if (ptype < 0) goto FAILED;
5230 previous = code;
5231 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5232 *code++ = ptype;
5233 *code++ = pdata;
5234 }
5235 #else
5236
5237 /* If Unicode properties are not supported, \X, \P, and \p are not
5238 allowed. */
5239
5240 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5241 {
5242 *errorcodeptr = ERR45;
5243 goto FAILED;
5244 }
5245 #endif
5246
5247 /* For the rest (including \X when Unicode properties are supported), we
5248 can obtain the OP value by negating the escape value. */
5249
5250 else
5251 {
5252 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5253 *code++ = -c;
5254 }
5255 continue;
5256 }
5257
5258 /* We have a data character whose value is in c. In UTF-8 mode it may have
5259 a value > 127. We set its representation in the length/buffer, and then
5260 handle it as a data character. */
5261
5262 #ifdef SUPPORT_UTF8
5263 if (utf8 && c > 127)
5264 mclength = _pcre_ord2utf8(c, mcbuffer);
5265 else
5266 #endif
5267
5268 {
5269 mcbuffer[0] = c;
5270 mclength = 1;
5271 }
5272 goto ONE_CHAR;
5273
5274
5275 /* ===================================================================*/
5276 /* Handle a literal character. It is guaranteed not to be whitespace or #
5277 when the extended flag is set. If we are in UTF-8 mode, it may be a
5278 multi-byte literal character. */
5279
5280 default:
5281 NORMAL_CHAR:
5282 mclength = 1;
5283 mcbuffer[0] = c;
5284
5285 #ifdef SUPPORT_UTF8
5286 if (utf8 && c >= 0xc0)
5287 {
5288 while ((ptr[1] & 0xc0) == 0x80)
5289 mcbuffer[mclength++] = *(++ptr);
5290 }
5291 #endif
5292
5293 /* At this point we have the character's bytes in mcbuffer, and the length
5294 in mclength. When not in UTF-8 mode, the length is always 1. */
5295
5296 ONE_CHAR:
5297 previous = code;
5298 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5299 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5300
5301 /* Remember if \r or \n were seen */
5302
5303 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5304 cd->external_flags |= PCRE_HASCRORLF;
5305
5306 /* Set the first and required bytes appropriately. If no previous first
5307 byte, set it from this character, but revert to none on a zero repeat.
5308 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5309 repeat. */
5310
5311 if (firstbyte == REQ_UNSET)
5312 {
5313 zerofirstbyte = REQ_NONE;
5314 zeroreqbyte = reqbyte;
5315
5316 /* If the character is more than one byte long, we can set firstbyte
5317 only if it is not to be matched caselessly. */
5318
5319 if (mclength == 1 || req_caseopt == 0)
5320 {
5321 firstbyte = mcbuffer[0] | req_caseopt;
5322 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5323 }
5324 else firstbyte = reqbyte = REQ_NONE;
5325 }
5326
5327 /* firstbyte was previously set; we can set reqbyte only the length is
5328 1 or the matching is caseful. */
5329
5330 else
5331 {
5332 zerofirstbyte = firstbyte;
5333 zeroreqbyte = reqbyte;
5334 if (mclength == 1 || req_caseopt == 0)
5335 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5336 }
5337
5338 break; /* End of literal character handling */
5339 }
5340 } /* end of big loop */
5341
5342
5343 /* Control never reaches here by falling through, only by a goto for all the
5344 error states. Pass back the position in the pattern so that it can be displayed
5345 to the user for diagnosing the error. */
5346
5347 FAILED:
5348 *ptrptr = ptr;
5349 return FALSE;
5350 }
5351
5352
5353
5354
5355 /*************************************************
5356 * Compile sequence of alternatives *
5357 *************************************************/
5358
5359 /* On entry, ptr is pointing past the bracket character, but on return it
5360 points to the closing bracket, or vertical bar, or end of string. The code
5361 variable is pointing at the byte into which the BRA operator has been stored.
5362 If the ims options are changed at the start (for a (?ims: group) or during any
5363 branch, we need to insert an OP_OPT item at the start of every following branch
5364 to ensure they get set correctly at run time, and also pass the new options
5365 into every subsequent branch compile.
5366
5367 This function is used during the pre-compile phase when we are trying to find
5368 out the amount of memory needed, as well as during the real compile phase. The
5369 value of lengthptr distinguishes the two phases.
5370
5371 Arguments:
5372 options option bits, including any changes for this subpattern
5373 oldims previous settings of ims option bits
5374 codeptr -> the address of the current code pointer
5375 ptrptr -> the address of the current pattern pointer
5376 errorcodeptr -> pointer to error code variable
5377 lookbehind TRUE if this is a lookbehind assertion
5378 reset_bracount TRUE to reset the count for each branch
5379 skipbytes skip this many bytes at start (for brackets and OP_COND)
5380 firstbyteptr place to put the first required character, or a negative number
5381 reqbyteptr place to put the last required character, or a negative number
5382 bcptr pointer to the chain of currently open branches
5383 cd points to the data block with tables pointers etc.
5384 lengthptr NULL during the real compile phase
5385 points to length accumulator during pre-compile phase
5386
5387 Returns: TRUE on success
5388 */
5389
5390 static BOOL
5391 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5392 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5393 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5394 int *lengthptr)
5395 {
5396 const uschar *ptr = *ptrptr;  /* working copy of the pattern pointer */
5397 uschar *code = *codeptr;  /* working copy of the output pointer */
5398 uschar *last_branch = code;  /* start of the current branch: the bracket, then each OP_ALT */
5399 uschar *start_bracket = code;  /* used for the offset stored in the closing OP_KET */
5400 uschar *reverse_count = NULL;  /* operand of the current branch's OP_REVERSE (lookbehind only) */
5401 int firstbyte, reqbyte;  /* values for the whole group, passed back on exit */
5402 int branchfirstbyte, branchreqbyte;  /* values reported by compile_branch() */
5403 int length;  /* local length accumulator, added to *lengthptr on exit */
5404 int orig_bracount;  /* cd->bracount on entry; restored per branch for (?| */
5405 int max_bracount;  /* highest cd->bracount seen after any branch */
5406 branch_chain bc;  /* this group's entry in the chain of open branches */
5407
5408 bc.outer = bcptr;
5409 bc.current = code;
5410
5411 firstbyte = reqbyte = REQ_UNSET;
5412
5413 /* Accumulate the length for use in the pre-compile phase. Start with the
5414 length of the BRA and KET and any extra bytes that are required at the
5415 beginning. We accumulate in a local variable to save frequent testing of
5416 lengthptr for NULL. We cannot do this by looking at the value of code at the
5417 start and end of each alternative, because compiled items are discarded during
5418 the pre-compile phase so that the work space is not exceeded. */
5419
5420 length = 2 + 2*LINK_SIZE + skipbytes;
5421
5422 /* WARNING: If the above line is changed for any reason, you must also change
5423 the code that abstracts option settings at the start of the pattern and makes
5424 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5425 pre-compile phase to find out whether anything has yet been compiled or not. */
5426
5427 /* Offset is set zero to mark that this bracket is still open */
5428
5429 PUT(code, 1, 0);
5430 code += 1 + LINK_SIZE + skipbytes;  /* step over the opcode, its link field and any skipped bytes */
5431
5432 /* Loop for each alternative branch */
5433
5434 orig_bracount = max_bracount = cd->bracount;
5435 for (;;)
5436 {
5437 /* For a (?| group, reset the capturing bracket count so that each branch
5438 uses the same numbers. */
5439
5440 if (reset_bracount) cd->bracount = orig_bracount;
5441
5442 /* Handle a change of ims options at the start of the branch */
5443
5444 if ((options & PCRE_IMS) != oldims)
5445 {
5446 *code++ = OP_OPT;
5447 *code++ = options & PCRE_IMS;
5448 length += 2;
5449 }
5450
5451 /* Set up dummy OP_REVERSE if lookbehind assertion */
5452
5453 if (lookbehind)
5454 {
5455 *code++ = OP_REVERSE;
5456 reverse_count = code;
5457 PUTINC(code, 0, 0);  /* placeholder; the fixed length is stored here after the branch is compiled */
5458 length += 1 + LINK_SIZE;
5459 }
5460
5461 /* Now compile the branch; in the pre-compile phase its length gets added
5462 into the length. */
5463
5464 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5465 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5466 {
5467 *ptrptr = ptr;  /* pass back the position reached in the pattern */
5468 return FALSE;
5469 }
5470
5471 /* Keep the highest bracket count in case (?| was used and some branch
5472 has fewer than the rest. */
5473
5474 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5475
5476 /* In the real compile phase, there is some post-processing to be done. */
5477
5478 if (lengthptr == NULL)
5479 {
5480 /* If this is the first branch, the firstbyte and reqbyte values for the
5481 branch become the values for the regex. */
5482
5483 if (*last_branch != OP_ALT)
5484 {
5485 firstbyte = branchfirstbyte;
5486 reqbyte = branchreqbyte;
5487 }
5488
5489 /* If this is not the first branch, the first char and reqbyte have to
5490 match the values from all the previous branches, except that if the
5491 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5492 and we set REQ_VARY for the regex. */
5493
5494 else
5495 {
5496 /* If we previously had a firstbyte, but it doesn't match the new branch,
5497 we have to abandon the firstbyte for the regex, but if there was
5498 previously no reqbyte, it takes on the value of the old firstbyte. */
5499
5500 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5501 {
5502 if (reqbyte < 0) reqbyte = firstbyte;
5503 firstbyte = REQ_NONE;
5504 }
5505
5506 /* If we (now or from before) have no firstbyte, a firstbyte from the
5507 branch becomes a reqbyte if there isn't a branch reqbyte. */
5508
5509 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5510 branchreqbyte = branchfirstbyte;
5511
5512 /* Now ensure that the reqbytes match */
5513
5514 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5515 reqbyte = REQ_NONE;
5516 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5517 }
5518
5519 /* If lookbehind, check that this branch matches a fixed-length string, and
5520 put the length into the OP_REVERSE item. Temporarily mark the end of the
5521 branch with OP_END. */
5522
5523 if (lookbehind)
5524 {
5525 int fixed_length;
5526 *code = OP_END;
5527 fixed_length = find_fixedlength(last_branch, options);
5528 DPRINTF(("fixed length = %d\n", fixed_length));
5529 if (fixed_length < 0)
5530 {
5531 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5532 *ptrptr = ptr;
5533 return FALSE;
5534 }
5535 PUT(reverse_count, 0, fixed_length);
5536 }
5537 }
5538
5539 /* Reached end of expression, either ')' or end of pattern. In the real
5540 compile phase, go back through the alternative branches and reverse the chain
5541 of offsets, with the field in the BRA item now becoming an offset to the
5542 first alternative. If there are no alternatives, it points to the end of the
5543 group. The length in the terminating ket is always the length of the whole
5544 bracketed item. If any of the ims options were changed inside the group,
5545 compile a resetting op-code following, except at the very end of the pattern.
5546 Return leaving the pointer at the terminating char. */
5547
5548 if (*ptr != '|')
5549 {
5550 if (lengthptr == NULL)
5551 {
5552 int branch_length = code - last_branch;  /* length of the final branch */
5553 do
5554 {
5555 int prev_length = GET(last_branch, 1);  /* backward offset stored while the group was open */
5556 PUT(last_branch, 1, branch_length);  /* replace it with the forward offset */
5557 branch_length = prev_length;
5558 last_branch -= branch_length;  /* step back to the previous branch */
5559 }
5560 while (branch_length > 0);  /* the opening bracket's field was set to zero above */
5561 }
5562
5563 /* Fill in the ket */
5564
5565 *code = OP_KET;
5566 PUT(code, 1, code - start_bracket);
5567 code += 1 + LINK_SIZE;
5568
5569 /* Resetting option if needed */
5570
5571 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5572 {
5573 *code++ = OP_OPT;
5574 *code++ = oldims;
5575 length += 2;
5576 }
5577
5578 /* Retain the highest bracket number, in case resetting was used. */
5579
5580 cd->bracount = max_bracount;
5581
5582 /* Set values to pass back */
5583
5584 *codeptr = code;
5585 *ptrptr = ptr;
5586 *firstbyteptr = firstbyte;
5587 *reqbyteptr = reqbyte;
5588 if (lengthptr != NULL)
5589 {
5590 if (OFLOW_MAX - *lengthptr < length)  /* adding length would take the total past OFLOW_MAX */
5591 {
5592 *errorcodeptr = ERR20;
5593 return FALSE;
5594 }
5595 *lengthptr += length;
5596 }
5597 return TRUE;
5598 }
5599
5600 /* Another branch follows. In the pre-compile phase, we can move the code
5601 pointer back to where it was for the start of the first branch. (That is,
5602 pretend that each branch is the only one.)
5603
5604 In the real compile phase, insert an ALT node. Its length field points back
5605 to the previous branch while the bracket remains open. At the end the chain
5606 is reversed. It's done like this so that the start of the bracket has a
5607 zero offset until it is closed, making it possible to detect recursion. */
5608
5609 if (lengthptr != NULL)
5610 {
5611 code = *codeptr + 1 + LINK_SIZE + skipbytes;  /* rewind to the start of the first branch */
5612 length += 1 + LINK_SIZE;
5613 }
5614 else
5615 {
5616 *code = OP_ALT;
5617 PUT(code, 1, code - last_branch);
5618 bc.current = last_branch = code;
5619 code += 1 + LINK_SIZE;
5620 }
5621
5622 ptr++;
5623 }
5624 /* Control never reaches here */
5625 }
5626
5627
5628
5629
5630 /*************************************************
5631 * Check for anchored expression *
5632 *************************************************/
5633
5634 /* Try to find out if this is an anchored regular expression. Consider each
5635 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5636 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5637 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5638 counts, since OP_CIRC can match in the middle.
5639
5640 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5641 This is the code for \G, which means "match at start of match position, taking
5642 into account the match offset".
5643
5644 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5645 because that will try the rest of the pattern at all possible matching points,
5646 so there is no point trying again.... er ....
5647
5648 .... except when the .* appears inside capturing parentheses, and there is a
5649 subsequent back reference to those parentheses. We haven't enough information
5650 to catch that case precisely.
5651
5652 At first, the best we could do was to detect when .* was in capturing brackets
5653 and the highest back reference was greater than or equal to that level.
5654 However, by keeping a bitmap of the first 31 back references, we can catch some
5655 of the more common cases more precisely.
5656
5657 Arguments:
5658 code points to start of expression (the bracket)
5659 options points to the options setting
5660 bracket_map a bitmap of which brackets we are inside while testing; this
5661 handles up to substring 31; after that we just have to take
5662 the less precise approach
5663 backref_map the back reference bitmap
5664
5665 Returns: TRUE or FALSE
5666 */
5667
5668 static BOOL
5669 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5670 unsigned int backref_map)
5671 {
5672 do {  /* once for each alternative of the group that code points to */
5673 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5674 options, PCRE_MULTILINE, FALSE);
5675 register int op = *scode;
5676
5677 /* Non-capturing brackets */
5678
5679 if (op == OP_BRA)
5680 {
5681 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5682 }
5683
5684 /* Capturing brackets: add this group's bit to the map before recursing. Groups
5685 numbered 32 or more share bit 0. The shift is unsigned so that n == 31 is safe. */
5686 else if (op == OP_CBRA)
5687 {
5688 int n = GET2(scode, 1+LINK_SIZE);
5689 unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
5690 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5691 }
5692
5693 /* Other brackets */
5694
5695 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5696 {
5697 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5698 }
5699
5700 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5701 are or may be referenced. */
5702
5703 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5704 op == OP_TYPEPOSSTAR) &&
5705 (*options & PCRE_DOTALL) != 0)
5706 {
5707 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5708 }
5709
5710 /* Check for explicit anchoring */
5711
5712 else if (op != OP_SOD && op != OP_SOM &&
5713 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5714 return FALSE;
5715 code += GET(code, 1);  /* follow this alternative's link to whatever comes next */
5716 }
5717 while (*code == OP_ALT); /* Loop for each alternative */
5718 return TRUE;
5719 }
5720
5721
5722
5723 /*************************************************
5724 * Check for starting with ^ or .* *
5725 *************************************************/
5726
5727 /* This is called to find out if every branch starts with ^ or .* so that
5728 "first char" processing can be done to speed things up in multiline
5729 matching and for non-DOTALL patterns that start with .* (which must start at
5730 the beginning or after \n). As in the case of is_anchored() (see above), we
5731 have to take account of back references to capturing brackets that contain .*
5732 because in that case we can't make the assumption.
5733
5734 Arguments:
5735 code points to start of expression (the bracket)
5736 bracket_map a bitmap of which brackets we are inside while testing; this
5737 handles up to substring 31; after that we just have to take
5738 the less precise approach
5739 backref_map the back reference bitmap
5740
5741 Returns: TRUE or FALSE
5742 */
5743
5744 static BOOL
5745 is_startline(const uschar *code, unsigned int bracket_map,
5746 unsigned int backref_map)
5747 {
5748 do {  /* once for each alternative of the group that code points to */
5749 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5750 NULL, 0, FALSE);
5751 register int op = *scode;
5752
5753 /* Non-capturing brackets */
5754
5755 if (op == OP_BRA)
5756 {
5757 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5758 }
5759
5760 /* Capturing brackets: add this group's bit to the map before recursing. Groups
5761 numbered 32 or more share bit 0. The shift is unsigned so that n == 31 is safe. */
5762 else if (op == OP_CBRA)
5763 {
5764 int n = GET2(scode, 1+LINK_SIZE);
5765 unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
5766 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5767 }
5768
5769 /* Other brackets */
5770
5771 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5772 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5773
5774 /* .* means "start at start or after \n" if it isn't in brackets that
5775 may be referenced. */
5776
5777 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5778 {
5779 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5780 }
5781
5782 /* Check for explicit circumflex */
5783
5784 else if (op != OP_CIRC) return FALSE;
5785
5786 /* Move on to the next alternative */
5787
5788 code += GET(code, 1);
5789 }
5790 while (*code == OP_ALT); /* Loop for each alternative */
5791 return TRUE;
5792 }
5793
5794
5795
5796 /*************************************************
5797 * Check for asserted fixed first char *
5798 *************************************************/
5799
5800 /* During compilation, the "first char" settings from forward assertions are
5801 discarded, because they can cause conflicts with actual literals that follow.
5802 However, if we end up without a first char setting for an unanchored pattern,
5803 it is worth scanning the regex to see if there is an initial asserted first
5804 char. If all branches start with the same asserted char, or with a bracket all
5805 of whose alternatives start with the same asserted char (recurse ad lib), then
5806 we return that char, otherwise -1.
5807
5808 Arguments:
5809 code points to start of expression (the bracket)
5810 options pointer to the options (used to check casing changes)
5811 inassert TRUE if in an assertion
5812
5813 Returns: -1 or the fixed first char
5814 */
5815
5816 static int
5817 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5818 {
5819 register int c = -1;  /* agreed first char so far (with any case flag), or -1 if none yet */
5820 do {
5821 int d;  /* candidate from this alternative, compared with c including its case flag */
5822 const uschar *scode =
5823 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5824 register int op = *scode;
5825
5826 switch(op)
5827 {
5828 default:
5829 return -1;
5830
5831 case OP_BRA:
5832 case OP_CBRA:
5833 case OP_ASSERT:
5834 case OP_ONCE:
5835 case OP_COND:
5836 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5837 return -1;
5838 if (c < 0) c = d; else if (c != d) return -1;
5839 break;
5840
5841 case OP_EXACT:
5842 scode += 2;  /* step over the two bytes after the opcode so scode[1] is the character */
5843 /* Fall through */
5844 case OP_CHAR:
5845 case OP_CHARNC:
5846 case OP_PLUS:
5847 case OP_MINPLUS:
5848 case OP_POSPLUS:
5849 if (!inassert) return -1;
5850 d = scode[1];  /* build the flagged value first so branches differing only in case sensitivity do not agree */
5851 if ((*options & PCRE_CASELESS) != 0)
5852 {
5853 d |= REQ_CASELESS;
5854 }
5855 if (c < 0) c = d; else if (c != d) return -1;
5856 break;
5857 }
5858
5859 code += GET(code, 1);  /* follow this alternative's link to whatever comes next */
5860 }
5861 while (*code == OP_ALT);
5862 return c;
5863 }
5864
5865
5866
5867 /*************************************************
5868 * Compile a Regular Expression *
5869 *************************************************/
5870
5871 /* This function takes a string and returns a pointer to a block of store
5872 holding a compiled version of the expression. The original API for this
5873 function had no error code return variable; it is retained for backwards
5874 compatibility. The new function is given a new name.
5875
5876 Arguments:
5877 pattern the regular expression
5878 options various option bits
5879 errorcodeptr pointer to error code variable (pcre_compile2() only)
5880 can be NULL if you don't want a code value
5881 errorptr pointer to pointer to error text
5882 erroroffset ptr offset in pattern where error was detected
5883 tables pointer to character tables or NULL
5884
5885 Returns: pointer to compiled data block, or NULL on error,
5886 with errorptr and erroroffset set
5887 */
5888
5889 PCRE_EXP_DEFN pcre *
5890 pcre_compile(const char *pattern, int options, const char **errorptr,
5891 int *erroroffset, const unsigned char *tables)
5892 {
5893 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);  /* NULL errorcodeptr: this older API returns no numeric error code */
5894 }
5895
5896
5897 PCRE_EXP_DEFN pcre *
5898 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5899 const char **errorptr, int *erroroffset, const unsigned char *tables)
5900 {
5901 real_pcre *re;
5902 int length = 1; /* For final END opcode */
5903 int firstbyte, reqbyte, newline;
5904 int errorcode = 0;
5905 int skipatstart = 0;
5906 #ifdef SUPPORT_UTF8
5907 BOOL utf8;
5908 #endif
5909 size_t size;
5910 uschar *code;
5911 const uschar *codestart;
5912 const uschar *ptr;
5913 compile_data compile_block;
5914 compile_data *cd = &compile_block;
5915
5916 /* This space is used for "compiling" into during the first phase, when we are
5917 computing the amount of memory that is needed. Compiled items are thrown away
5918 as soon as possible, so that a fairly large buffer should be sufficient for
5919 this purpose. The same space is used in the second phase for remembering where
5920 to fill in forward references to subpatterns. */
5921
5922 uschar cworkspace[COMPILE_WORK_SIZE];
5923
5924 /* Set this early so that early errors get offset 0. */
5925
5926 ptr = (const uschar *)pattern;
5927
5928 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5929 can do is just return NULL, but we can set a code value if there is a code
5930 pointer. */
5931
5932 if (errorptr == NULL)
5933 {
5934 if (errorcodeptr != NULL) *errorcodeptr = 99;
5935 return NULL;
5936 }
5937
5938 *errorptr = NULL;
5939 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5940
5941 /* However, we can give a message for this error */
5942
5943 if (erroroffset == NULL)
5944 {
5945 errorcode = ERR16;
5946 goto PCRE_EARLY_ERROR_RETURN2;
5947 }
5948
5949 *erroroffset = 0;
5950
5951 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5952
5953 #ifdef SUPPORT_UTF8
5954 utf8 = (options & PCRE_UTF8) != 0;
5955 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5956 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5957 {
5958 errorcode = ERR44;
5959 goto PCRE_EARLY_ERROR_RETURN2;
5960 }
5961 #else
5962 if ((options & PCRE_UTF8) != 0)
5963 {
5964 errorcode = ERR32;
5965 goto PCRE_EARLY_ERROR_RETURN;
5966 }
5967 #endif
5968
5969 if ((options & ~PUBLIC_OPTIONS) != 0)
5970 {
5971 errorcode = ERR17;
5972 goto PCRE_EARLY_ERROR_RETURN;
5973 }
5974
5975 /* Set up pointers to the individual character tables */
5976
5977 if (tables == NULL) tables = _pcre_default_tables;
5978 cd->lcc = tables + lcc_offset;
5979 cd->fcc = tables + fcc_offset;
5980 cd->cbits = tables + cbits_offset;
5981 cd->ctypes = tables + ctypes_offset;
5982
5983 /* Check for global one-time settings at the start of the pattern, and remember
5984 the offset for later. */
5985
5986 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5987 {
5988 int newnl = 0;
5989 int newbsr = 0;
5990
5991 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5992 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5993 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5994 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5995 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5996 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5997 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5998 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5999 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
6000 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6001
6002 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
6003 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6004 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
6005 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6006
6007 if (newnl != 0)
6008 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6009 else if (newbsr != 0)
6010 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6011 else break;
6012 }
6013
6014 /* Check validity of \R options. */
6015
6016 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6017 {
6018 case 0:
6019 case PCRE_BSR_ANYCRLF:
6020 case PCRE_BSR_UNICODE:
6021 break;
6022 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6023 }
6024
6025 /* Handle different types of newline. The three bits give seven cases. The
6026 current code allows for fixed one- or two-byte sequences, plus "any" and
6027 "anycrlf". */
6028
6029 switch (options & PCRE_NEWLINE_BITS)
6030 {
6031 case 0: newline = NEWLINE; break; /* Build-time default */
6032 case PCRE_NEWLINE_CR: newline = '\r'; break;
6033 case PCRE_NEWLINE_LF: newline = '\n'; break;
6034 case PCRE_NEWLINE_CR+
6035 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
6036 case PCRE_NEWLINE_ANY: newline = -1; break;
6037 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6038 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6039 }
6040
6041 if (newline == -2)
6042 {
6043 cd->nltype = NLTYPE_ANYCRLF;
6044 }
6045 else if (newline < 0)
6046 {
6047 cd->nltype = NLTYPE_ANY;
6048 }
6049 else
6050 {
6051 cd->nltype = NLTYPE_FIXED;
6052 if (newline > 255)
6053 {
6054 cd->nllen = 2;
6055 cd->nl[0] = (newline >> 8) & 255;
6056 cd->nl[1] = newline & 255;
6057 }
6058 else
6059 {
6060 cd->nllen = 1;
6061 cd->nl[0] = newline;
6062 }
6063 }
6064
6065 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6066 references to help in deciding whether (.*) can be treated as anchored or not.
6067 */
6068
6069 cd->top_backref = 0;
6070 cd->backref_map = 0;
6071
6072 /* Reflect pattern for debugging output */
6073
6074 DPRINTF(("------------------------------------------------------------------\n"));
6075 DPRINTF(("%s\n", pattern));
6076
6077 /* Pretend to compile the pattern while actually just accumulating the length
6078 of memory required. This behaviour is triggered by passing a non-NULL final
6079 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6080 to compile parts of the pattern into; the compiled code is discarded when it is
6081 no longer needed, so hopefully this workspace will never overflow, though there
6082 is a test for its doing so. */
6083
6084 cd->bracount = cd->final_bracount = 0;
6085 cd->names_found = 0;
6086 cd->name_entry_size = 0;
6087 cd->name_table = NULL;
6088 cd->start_workspace = cworkspace;
6089 cd->start_code = cworkspace;
6090 cd->hwm = cworkspace;
6091 cd->start_pattern = (const uschar *)pattern;
6092 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6093 cd->req_varyopt = 0;
6094 cd->external_options = options;
6095 cd->external_flags = 0;
6096
6097 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6098 don't need to look at the result of the function here. The initial options have
6099 been put into the cd block so that they can be changed if an option setting is
6100 found within the regex right at the beginning. Bringing initial option settings
6101 outside can help speed up starting point checks. */
6102
6103 ptr += skipatstart;
6104 code = cworkspace;
6105 *code = OP_BRA;
6106 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6107 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6108 &length);
6109 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6110
6111 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6112 cd->hwm - cworkspace));
6113
6114 if (length > MAX_PATTERN_SIZE)
6115 {
6116 errorcode = ERR20;
6117 goto PCRE_EARLY_ERROR_RETURN;
6118 }
6119
6120 /* Compute the size of data block needed and get it, either from malloc or
6121 externally provided function. Integer overflow should no longer be possible
6122 because nowadays we limit the maximum value of cd->names_found and
6123 cd->name_entry_size. */
6124
6125 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6126 re = (real_pcre *)(pcre_malloc)(size);
6127
6128 if (re == NULL)
6129 {
6130 errorcode = ERR21;
6131 goto PCRE_EARLY_ERROR_RETURN;
6132 }
6133
6134 /* Put in the magic number, and save the sizes, initial options, internal
6135 flags, and character table pointer. NULL is used for the default character
6136 tables. The nullpad field is at the end; it's there to help in the case when a
6137 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6138 pointers. */
6139
6140 re->magic_number = MAGIC_NUMBER;
6141 re->size = size;
6142 re->options = cd->external_options;
6143 re->flags = cd->external_flags;
6144 re->dummy1 = 0;
6145 re->first_byte = 0;
6146 re->req_byte = 0;
6147 re->name_table_offset = sizeof(real_pcre);
6148 re->name_entry_size = cd->name_entry_size;
6149 re->name_count = cd->names_found;
6150 re->ref_count = 0;
6151 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6152 re->nullpad = NULL;
6153
6154 /* The starting points of the name/number translation table and of the code are
6155 passed around in the compile data block. The start/end pattern and initial
6156 options are already set from the pre-compile phase, as is the name_entry_size
6157 field. Reset the bracket count and the names_found field. Also reset the hwm
6158 field; this time it's used for remembering forward references to subpatterns.
6159 */
6160
6161 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6162 cd->bracount = 0;
6163 cd->names_found = 0;
6164 cd->name_table = (uschar *)re + re->name_table_offset;
6165 codestart = cd->name_table + re->name_entry_size * re->name_count;
6166 cd->start_code = codestart;
6167 cd->hwm = cworkspace;
6168 cd->req_varyopt = 0;
6169 cd->had_accept = FALSE;
6170
6171 /* Set up a starting, non-extracting bracket, then compile the expression. On
6172 error, errorcode will be set non-zero, so we don't need to look at the result
6173 of the function here. */
6174
6175 ptr = (const uschar *)pattern + skipatstart;
6176 code = (uschar *)codestart;
6177 *code = OP_BRA;
6178 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6179 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6180 re->top_bracket = cd->bracount;
6181 re->top_backref = cd->top_backref;
6182 re->flags = cd->external_flags;
6183
6184 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6185
6186 /* If not reached end of pattern on success, there's an excess bracket. */
6187
6188 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6189
6190 /* Fill in the terminating state and check for disastrous overflow, but
6191 if debugging, leave the test till after things are printed out. */
6192
6193 *code++ = OP_END;
6194
6195 #ifndef DEBUG
6196 if (code - codestart > length) errorcode = ERR23;
6197 #endif
6198
6199 /* Fill in any forward references that are required. */
6200
6201 while (errorcode == 0 && cd->hwm > cworkspace)
6202 {
6203 int offset, recno;
6204 const uschar *groupptr;
6205 cd->hwm -= LINK_SIZE;
6206 offset = GET(cd->hwm, 0);
6207 recno = GET(codestart, offset);
6208 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6209 if (groupptr == NULL) errorcode = ERR53;
6210 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6211 }
6212
6213 /* Give an error if there's a back reference to a non-existent capturing
6214 subpattern. */
6215
6216 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6217
6218 /* Failed to compile, or error while post-processing */
6219
6220 if (errorcode != 0)
6221 {
6222 (pcre_free)(re);
6223 PCRE_EARLY_ERROR_RETURN:
6224 *erroroffset = ptr - (const uschar *)pattern;
6225 PCRE_EARLY_ERROR_RETURN2:
6226 *errorptr = find_error_text(errorcode);
6227 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6228 return NULL;
6229 }
6230
6231 /* If the anchored option was not passed, set the flag if we can determine that
6232 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6233 as starting with .* when DOTALL is set).
6234
6235 Otherwise, if we know what the first byte has to be, save it, because that
6236 speeds up unanchored matches no end. If not, see if we can set the
6237 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6238 start with ^, and also when all branches start with .* for non-DOTALL matches.
6239 */
6240
6241 if ((re->options & PCRE_ANCHORED) == 0)
6242 {
6243 int temp_options = re->options; /* May get changed during these scans */
6244 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6245 re->options |= PCRE_ANCHORED;
6246 else
6247 {
6248 if (firstbyte < 0)
6249 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6250 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6251 {
6252 int ch = firstbyte & 255;
6253 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6254 cd->fcc[ch] == ch)? ch : firstbyte;
6255 re->flags |= PCRE_FIRSTSET;
6256 }
6257 else if (is_startline(codestart, 0, cd->backref_map))
6258 re->flags |= PCRE_STARTLINE;
6259 }
6260 }
6261
6262 /* For an anchored pattern, we use the "required byte" only if it follows a
6263 variable length item in the regex. Remove the caseless flag for non-caseable
6264 bytes. */
6265
6266 if (reqbyte >= 0 &&
6267 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6268 {
6269 int ch = reqbyte & 255;
6270 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6271 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6272 re->flags |= PCRE_REQCHSET;
6273 }
6274
6275 /* Print out the compiled data if debugging is enabled. This is never the
6276 case when building a production library. */
6277
6278 #ifdef DEBUG
6279
6280 printf("Length = %d top_bracket = %d top_backref = %d\n",
6281 length, re->top_bracket, re->top_backref);
6282
6283 printf("Options=%08x\n", re->options);
6284
6285 if ((re->flags & PCRE_FIRSTSET) != 0)
6286 {
6287 int ch = re->first_byte & 255;
6288 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6289 "" : " (caseless)";
6290 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6291 else printf("First char = \\x%02x%s\n", ch, caseless);
6292 }
6293
6294 if ((re->flags & PCRE_REQCHSET) != 0)
6295 {
6296 int ch = re->req_byte & 255;
6297 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6298 "" : " (caseless)";
6299 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6300 else printf("Req char = \\x%02x%s\n", ch, caseless);
6301 }
6302
6303 pcre_printint(re, stdout, TRUE);
6304
6305 /* This check is done here in the debugging case so that the code that
6306 was compiled can be seen. */
6307
6308 if (code - codestart > length)
6309 {
6310 (pcre_free)(re);
6311 *errorptr = find_error_text(ERR23);
6312 *erroroffset = ptr - (uschar *)pattern;
6313 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6314 return NULL;
6315 }
6316 #endif /* DEBUG */
6317
6318 return (pcre *)re;
6319 }
6320
6321 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12