/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 336 - (show annotations) (download)
Sat Apr 12 15:59:03 2008 UTC (6 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 201134 byte(s)
Added PCRE_JAVASCRIPT_COMPAT option.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: sets bit number b in
the byte array a. Each argument and the whole expansion are parenthesized so
that expression arguments such as SETBIT(map, c + 1) select the intended byte
and bit. Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
146
147 typedef struct verbitem {
148 int len;
149 int op;
150 } verbitem;
151
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
160
161 static const verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
169 };
170
171 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence. */
179
180 static const char posix_names[] =
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
184
185 static const uschar posix_name_lengths[] = {
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
197
198 static const int posix_class_maps[] = {
199 cbit_word, cbit_digit, -2, /* alpha */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii */
204 cbit_space, -1, 1, /* blank - a GNU extension */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
213 };
214
215
216 #define STRING(a) # a
217 #define XSTRING(s) STRING(s)
218
219 /* The texts of compile-time error messages. These are "char *" because they
220 are passed to the outside world. Do not ever re-use any error number, because
221 they are documented. Always add a new error instead. Messages marked DEAD below
222 are no longer used. This used to be a table of strings, but in order to reduce
223 the number of relocations needed when a shared library is loaded dynamically,
224 it is now one long string. We cannot use a table of offsets, because the
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226 simply count through to the one we want - this isn't a performance issue
227 because these strings are used only when there is a compilation error. */
228
229 static const char error_texts[] =
230 "no error\0"
231 "\\ at end of pattern\0"
232 "\\c at end of pattern\0"
233 "unrecognized character follows \\\0"
234 "numbers out of order in {} quantifier\0"
235 /* 5 */
236 "number too big in {} quantifier\0"
237 "missing terminating ] for character class\0"
238 "invalid escape sequence in character class\0"
239 "range out of order in character class\0"
240 "nothing to repeat\0"
241 /* 10 */
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243 "internal error: unexpected repeat\0"
244 "unrecognized character after (? or (?-\0"
245 "POSIX named classes are supported only within a class\0"
246 "missing )\0"
247 /* 15 */
248 "reference to non-existent subpattern\0"
249 "erroffset passed as NULL\0"
250 "unknown option bit(s) set\0"
251 "missing ) after comment\0"
252 "parentheses nested too deeply\0" /** DEAD **/
253 /* 20 */
254 "regular expression is too large\0"
255 "failed to get memory\0"
256 "unmatched parentheses\0"
257 "internal error: code overflow\0"
258 "unrecognized character after (?<\0"
259 /* 25 */
260 "lookbehind assertion is not fixed length\0"
261 "malformed number or name after (?(\0"
262 "conditional group contains more than two branches\0"
263 "assertion expected after (?(\0"
264 "(?R or (?[+-]digits must be followed by )\0"
265 /* 30 */
266 "unknown POSIX class name\0"
267 "POSIX collating elements are not supported\0"
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269 "spare error\0" /** DEAD **/
270 "character value in \\x{...} sequence is too large\0"
271 /* 35 */
272 "invalid condition (?(0)\0"
273 "\\C not allowed in lookbehind assertion\0"
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275 "number after (?C is > 255\0"
276 "closing ) for (?C expected\0"
277 /* 40 */
278 "recursive call could loop indefinitely\0"
279 "unrecognized character after (?P\0"
280 "syntax error in subpattern name (missing terminator)\0"
281 "two named subpatterns have the same name\0"
282 "invalid UTF-8 string\0"
283 /* 45 */
284 "support for \\P, \\p, and \\X has not been compiled\0"
285 "malformed \\P or \\p sequence\0"
286 "unknown property name after \\P or \\p\0"
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 /* 50 */
290 "repeated subpattern is too long\0" /** DEAD **/
291 "octal value is greater than \\377 (not in UTF-8 mode)\0"
292 "internal error: overran compiling workspace\0"
293 "internal error: previously-checked referenced subpattern not found\0"
294 "DEFINE group contains more than one branch\0"
295 /* 55 */
296 "repeating a DEFINE group is not allowed\0"
297 "inconsistent NEWLINE options\0"
298 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299 "a numbered reference must not be zero\0"
300 "(*VERB) with an argument is not supported\0"
301 /* 60 */
302 "(*VERB) not recognized\0"
303 "number is too big\0"
304 "subpattern name expected\0"
305 "digit expected after (?+\0"
306 "] is an invalid data character in JavaScript compatibility mode";
307
308
309 /* Table to identify digits and hex digits. This is used when compiling
310 patterns. Note that the tables in chartables are dependent on the locale, and
311 may mark arbitrary characters as digits - but the PCRE compiling code expects
312 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
313 a private table here. It costs 256 bytes, but it is a lot faster than doing
314 character value tests (at least in some simple cases I timed), and in some
315 applications one wants PCRE to compile efficiently as well as match
316 efficiently.
317
318 For convenience, we use the same bit definitions as in chartables:
319
320 0x04 decimal digit
321 0x08 hexadecimal digit
322
323 Then we can use ctype_digit and ctype_xdigit in the code. */
324
325 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
326 static const unsigned char digitab[] =
327 {
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
334 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
335 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
336 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
340 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
360
361 #else /* This is the "abnormal" case, for EBCDIC systems */
362 static const unsigned char digitab[] =
363 {
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
380 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
388 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
394 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
395 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
396
397 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
398 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
399 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
400 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
402 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
406 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
407 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
408 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
409 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
411 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
413 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
414 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
415 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
416 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
417 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
418 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
419 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
420 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
421 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
422 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
423 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
424 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
425 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
426 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
427 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
428 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
429 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
430 #endif
431
432
433 /* Definition to allow mutual recursion */
434
435 static BOOL
436 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
437 int *, int *, branch_chain *, compile_data *, int *);
438
439
440
441 /*************************************************
442 * Find an error text *
443 *************************************************/
444
445 /* The error texts are now all in one long string, to save on relocations. As
446 some of the text is of unknown length, we can't use a table of offsets.
447 Instead, just count through the strings. This is not a performance issue
448 because it happens only when there has been a compilation error.
449
450 Argument: the error number
451 Returns: pointer to the error string
452 */
453
454 static const char *
455 find_error_text(int n)
456 {
457 const char *s = error_texts;
458 for (; n > 0; n--) while (*s++ != 0);
459 return s;
460 }
461
462
463 /*************************************************
464 * Handle escapes *
465 *************************************************/
466
467 /* This function is called when a \ has been encountered. It either returns a
468 positive value for a simple escape such as \n, or a negative value which
469 encodes one of the more complicated things such as \d. A backreference to group
470 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
471 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
472 ptr is pointing at the \. On exit, it is on the final character of the escape
473 sequence.
474
475 Arguments:
476 ptrptr points to the pattern position pointer
477 errorcodeptr points to the errorcode variable
478 bracount number of previous extracting brackets
479 options the options bits
480 isclass TRUE if inside a character class
481
482 Returns: zero or positive => a data character
483 negative => a special escape sequence
484 on error, errorcodeptr is set
485 */
486
487 static int
488 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
489 int options, BOOL isclass)
490 {
491 BOOL utf8 = (options & PCRE_UTF8) != 0;
492 const uschar *ptr = *ptrptr + 1;
493 int c, i;
494
495 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
496 ptr--; /* Set pointer back to the last byte */
497
498 /* If backslash is at the end of the pattern, it's an error. */
499
500 if (c == 0) *errorcodeptr = ERR1;
501
502 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
503 in a table. A non-zero result is something that can be returned immediately.
504 Otherwise further processing may be required. */
505
506 #ifndef EBCDIC /* ASCII coding */
507 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
508 else if ((i = escapes[c - '0']) != 0) c = i;
509
510 #else /* EBCDIC coding */
511 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
512 else if ((i = escapes[c - 0x48]) != 0) c = i;
513 #endif
514
515 /* Escapes that need further processing, or are illegal. */
516
517 else
518 {
519 const uschar *oldptr;
520 BOOL braced, negated;
521
522 switch (c)
523 {
524 /* A number of Perl escapes are not handled by PCRE. We give an explicit
525 error. */
526
527 case 'l':
528 case 'L':
529 case 'N':
530 case 'u':
531 case 'U':
532 *errorcodeptr = ERR37;
533 break;
534
535 /* \g must be followed by one of a number of specific things:
536
537 (1) A number, either plain or braced. If positive, it is an absolute
538 backreference. If negative, it is a relative backreference. This is a Perl
539 5.10 feature.
540
541 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542 is part of Perl's movement towards a unified syntax for back references. As
543 this is synonymous with \k{name}, we fudge it up by pretending it really
544 was \k.
545
546 (3) For Oniguruma compatibility we also support \g followed by a name or a
547 number either in angle brackets or in single quotes. However, these are
548 (possibly recursive) subroutine calls, _not_ backreferences. Just return
549 the -ESC_g code (cf \k). */
550
551 case 'g':
552 if (ptr[1] == '<' || ptr[1] == '\'')
553 {
554 c = -ESC_g;
555 break;
556 }
557
558 /* Handle the Perl-compatible cases */
559
560 if (ptr[1] == '{')
561 {
562 const uschar *p;
563 for (p = ptr+2; *p != 0 && *p != '}'; p++)
564 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
565 if (*p != 0 && *p != '}')
566 {
567 c = -ESC_k;
568 break;
569 }
570 braced = TRUE;
571 ptr++;
572 }
573 else braced = FALSE;
574
575 if (ptr[1] == '-')
576 {
577 negated = TRUE;
578 ptr++;
579 }
580 else negated = FALSE;
581
582 c = 0;
583 while ((digitab[ptr[1]] & ctype_digit) != 0)
584 c = c * 10 + *(++ptr) - '0';
585
586 if (c < 0) /* Integer overflow */
587 {
588 *errorcodeptr = ERR61;
589 break;
590 }
591
592 if (braced && *(++ptr) != '}')
593 {
594 *errorcodeptr = ERR57;
595 break;
596 }
597
598 if (c == 0)
599 {
600 *errorcodeptr = ERR58;
601 break;
602 }
603
604 if (negated)
605 {
606 if (c > bracount)
607 {
608 *errorcodeptr = ERR15;
609 break;
610 }
611 c = bracount - (c - 1);
612 }
613
614 c = -(ESC_REF + c);
615 break;
616
617 /* The handling of escape sequences consisting of a string of digits
618 starting with one that is not zero is not straightforward. By experiment,
619 the way Perl works seems to be as follows:
620
621 Outside a character class, the digits are read as a decimal number. If the
622 number is less than 10, or if there are that many previous extracting
623 left brackets, then it is a back reference. Otherwise, up to three octal
624 digits are read to form an escaped byte. Thus \123 is likely to be octal
625 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
626 value is greater than 377, the least significant 8 bits are taken. Inside a
627 character class, \ followed by a digit is always an octal number. */
628
629 case '1': case '2': case '3': case '4': case '5':
630 case '6': case '7': case '8': case '9':
631
632 if (!isclass)
633 {
634 oldptr = ptr;
635 c -= '0';
636 while ((digitab[ptr[1]] & ctype_digit) != 0)
637 c = c * 10 + *(++ptr) - '0';
638 if (c < 0) /* Integer overflow */
639 {
640 *errorcodeptr = ERR61;
641 break;
642 }
643 if (c < 10 || c <= bracount)
644 {
645 c = -(ESC_REF + c);
646 break;
647 }
648 ptr = oldptr; /* Put the pointer back and fall through */
649 }
650
651 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
652 generates a binary zero byte and treats the digit as a following literal.
653 Thus we have to pull back the pointer by one. */
654
655 if ((c = *ptr) >= '8')
656 {
657 ptr--;
658 c = 0;
659 break;
660 }
661
662 /* \0 always starts an octal number, but we may drop through to here with a
663 larger first octal digit. The original code used just to take the least
664 significant 8 bits of octal numbers (I think this is what early Perls used
665 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
666 than 3 octal digits. */
667
668 case '0':
669 c -= '0';
670 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
671 c = c * 8 + *(++ptr) - '0';
672 if (!utf8 && c > 255) *errorcodeptr = ERR51;
673 break;
674
675 /* \x is complicated. \x{ddd} is a character number which can be greater
676 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
677 treated as a data character. */
678
679 case 'x':
680 if (ptr[1] == '{')
681 {
682 const uschar *pt = ptr + 2;
683 int count = 0;
684
685 c = 0;
686 while ((digitab[*pt] & ctype_xdigit) != 0)
687 {
688 register int cc = *pt++;
689 if (c == 0 && cc == '0') continue; /* Leading zeroes */
690 count++;
691
692 #ifndef EBCDIC /* ASCII coding */
693 if (cc >= 'a') cc -= 32; /* Convert to upper case */
694 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
695 #else /* EBCDIC coding */
696 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
697 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
698 #endif
699 }
700
701 if (*pt == '}')
702 {
703 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
704 ptr = pt;
705 break;
706 }
707
708 /* If the sequence of hex digits does not end with '}', then we don't
709 recognize this construct; fall through to the normal \x handling. */
710 }
711
712 /* Read just a single-byte hex-defined char */
713
714 c = 0;
715 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
716 {
717 int cc; /* Some compilers don't like ++ */
718 cc = *(++ptr); /* in initializers */
719 #ifndef EBCDIC /* ASCII coding */
720 if (cc >= 'a') cc -= 32; /* Convert to upper case */
721 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
722 #else /* EBCDIC coding */
723 if (cc <= 'z') cc += 64; /* Convert to upper case */
724 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
725 #endif
726 }
727 break;
728
729 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
730 This coding is ASCII-specific, but then the whole concept of \cx is
731 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
732
733 case 'c':
734 c = *(++ptr);
735 if (c == 0)
736 {
737 *errorcodeptr = ERR2;
738 break;
739 }
740
741 #ifndef EBCDIC /* ASCII coding */
742 if (c >= 'a' && c <= 'z') c -= 32;
743 c ^= 0x40;
744 #else /* EBCDIC coding */
745 if (c >= 'a' && c <= 'z') c += 64;
746 c ^= 0xC0;
747 #endif
748 break;
749
750 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
751 other alphanumeric following \ is an error if PCRE_EXTRA was set;
752 otherwise, for Perl compatibility, it is a literal. This code looks a bit
753 odd, but there used to be some cases other than the default, and there may
754 be again in future, so I haven't "optimized" it. */
755
756 default:
757 if ((options & PCRE_EXTRA) != 0) switch(c)
758 {
759 default:
760 *errorcodeptr = ERR3;
761 break;
762 }
763 break;
764 }
765 }
766
767 *ptrptr = ptr;
768 return c;
769 }
770
771
772
773 #ifdef SUPPORT_UCP
774 /*************************************************
775 * Handle \P and \p *
776 *************************************************/
777
778 /* This function is called after \P or \p has been encountered, provided that
779 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
780 pointing at the P or p. On exit, it is pointing at the final character of the
781 escape sequence.
782
783 Argument:
784 ptrptr points to the pattern position pointer
785 negptr points to a boolean that is set TRUE for negation else FALSE
786 dptr points to an int that is set to the detailed property value
787 errorcodeptr points to the error code variable
788
789 Returns: type value from ucp_type_table, or -1 for an invalid type
790 */
791
792 static int
793 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
794 {
795 int c, i, bot, top;
796 const uschar *ptr = *ptrptr;
797 char name[32];
798
799 c = *(++ptr);
800 if (c == 0) goto ERROR_RETURN;
801
802 *negptr = FALSE;
803
804 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
805 negation. */
806
807 if (c == '{')
808 {
809 if (ptr[1] == '^')
810 {
811 *negptr = TRUE;
812 ptr++;
813 }
814 for (i = 0; i < (int)sizeof(name) - 1; i++)
815 {
816 c = *(++ptr);
817 if (c == 0) goto ERROR_RETURN;
818 if (c == '}') break;
819 name[i] = c;
820 }
821 if (c !='}') goto ERROR_RETURN;
822 name[i] = 0;
823 }
824
825 /* Otherwise there is just one following character */
826
827 else
828 {
829 name[0] = c;
830 name[1] = 0;
831 }
832
833 *ptrptr = ptr;
834
835 /* Search for a recognized property name using binary chop */
836
837 bot = 0;
838 top = _pcre_utt_size;
839
840 while (bot < top)
841 {
842 i = (bot + top) >> 1;
843 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
844 if (c == 0)
845 {
846 *dptr = _pcre_utt[i].value;
847 return _pcre_utt[i].type;
848 }
849 if (c > 0) bot = i + 1; else top = i;
850 }
851
852 *errorcodeptr = ERR47;
853 *ptrptr = ptr;
854 return -1;
855
856 ERROR_RETURN:
857 *errorcodeptr = ERR46;
858 *ptrptr = ptr;
859 return -1;
860 }
861 #endif
862
863
864
865
866 /*************************************************
867 * Check for counted repeat *
868 *************************************************/
869
870 /* This function is called when a '{' is encountered in a place where it might
871 start a quantifier. It looks ahead to see if it really is a quantifier or not.
872 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
873 where the ddds are digits.
874
875 Arguments:
876 p pointer to the first char after '{'
877
878 Returns: TRUE or FALSE
879 */
880
881 static BOOL
882 is_counted_repeat(const uschar *p)
883 {
884 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
885 while ((digitab[*p] & ctype_digit) != 0) p++;
886 if (*p == '}') return TRUE;
887
888 if (*p++ != ',') return FALSE;
889 if (*p == '}') return TRUE;
890
891 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
892 while ((digitab[*p] & ctype_digit) != 0) p++;
893
894 return (*p == '}');
895 }
896
897
898
899 /*************************************************
900 * Read repeat counts *
901 *************************************************/
902
903 /* Read an item of the form {n,m} and return the values. This is called only
904 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
905 so the syntax is guaranteed to be correct, but we need to check the values.
906
907 Arguments:
908 p pointer to first char after '{'
909 minp pointer to int for min
910 maxp pointer to int for max
911 returned as -1 if no max
912 errorcodeptr points to error code variable
913
914 Returns: pointer to '}' on success;
915 current ptr on error, with errorcodeptr set non-zero
916 */
917
918 static const uschar *
919 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
920 {
921 int min = 0;
922 int max = -1;
923
924 /* Read the minimum value and do a paranoid check: a negative value indicates
925 an integer overflow. */
926
927 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
928 if (min < 0 || min > 65535)
929 {
930 *errorcodeptr = ERR5;
931 return p;
932 }
933
934 /* Read the maximum value if there is one, and again do a paranoid on its size.
935 Also, max must not be less than min. */
936
937 if (*p == '}') max = min; else
938 {
939 if (*(++p) != '}')
940 {
941 max = 0;
942 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
943 if (max < 0 || max > 65535)
944 {
945 *errorcodeptr = ERR5;
946 return p;
947 }
948 if (max < min)
949 {
950 *errorcodeptr = ERR4;
951 return p;
952 }
953 }
954 }
955
956 /* Fill in the required variables, and pass back the pointer to the terminating
957 '}'. */
958
959 *minp = min;
960 *maxp = max;
961 return p;
962 }
963
964
965
966 /*************************************************
967 * Find forward referenced subpattern *
968 *************************************************/
969
970 /* This function scans along a pattern's text looking for capturing
971 subpatterns, and counting them. If it finds a named pattern that matches the
972 name it is given, it returns its number. Alternatively, if the name is NULL, it
973 returns when it reaches a given numbered subpattern. This is used for forward
974 references to subpatterns. We know that if (?P< is encountered, the name will
975 be terminated by '>' because that is checked in the first pass.
976
977 Arguments:
978 ptr current position in the pattern
979 count current count of capturing parens so far encountered
980 name name to seek, or NULL if seeking a numbered subpattern
981 lorn name length, or subpattern number if name is NULL
982 xmode TRUE if we are in /x mode
983
984 Returns: the number of the named subpattern, or -1 if not found
985 */
986
987 static int
988 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
989 BOOL xmode)
990 {
991 const uschar *thisname;
992
993 for (; *ptr != 0; ptr++)
994 {
995 int term;
996
997 /* Skip over backslashed characters and also entire \Q...\E */
998
999 if (*ptr == '\\')
1000 {
1001 if (*(++ptr) == 0) return -1;
1002 if (*ptr == 'Q') for (;;)
1003 {
1004 while (*(++ptr) != 0 && *ptr != '\\');
1005 if (*ptr == 0) return -1;
1006 if (*(++ptr) == 'E') break;
1007 }
1008 continue;
1009 }
1010
1011 /* Skip over character classes */
1012
1013 if (*ptr == '[')
1014 {
1015 while (*(++ptr) != ']')
1016 {
1017 if (*ptr == 0) return -1;
1018 if (*ptr == '\\')
1019 {
1020 if (*(++ptr) == 0) return -1;
1021 if (*ptr == 'Q') for (;;)
1022 {
1023 while (*(++ptr) != 0 && *ptr != '\\');
1024 if (*ptr == 0) return -1;
1025 if (*(++ptr) == 'E') break;
1026 }
1027 continue;
1028 }
1029 }
1030 continue;
1031 }
1032
1033 /* Skip comments in /x mode */
1034
1035 if (xmode && *ptr == '#')
1036 {
1037 while (*(++ptr) != 0 && *ptr != '\n');
1038 if (*ptr == 0) return -1;
1039 continue;
1040 }
1041
1042 /* An opening parens must now be a real metacharacter */
1043
1044 if (*ptr != '(') continue;
1045 if (ptr[1] != '?' && ptr[1] != '*')
1046 {
1047 count++;
1048 if (name == NULL && count == lorn) return count;
1049 continue;
1050 }
1051
1052 ptr += 2;
1053 if (*ptr == 'P') ptr++; /* Allow optional P */
1054
1055 /* We have to disambiguate (?<! and (?<= from (?<name> */
1056
1057 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1058 *ptr != '\'')
1059 continue;
1060
1061 count++;
1062
1063 if (name == NULL && count == lorn) return count;
1064 term = *ptr++;
1065 if (term == '<') term = '>';
1066 thisname = ptr;
1067 while (*ptr != term) ptr++;
1068 if (name != NULL && lorn == ptr - thisname &&
1069 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1070 return count;
1071 }
1072
1073 return -1;
1074 }
1075
1076
1077
1078 /*************************************************
1079 * Find first significant op code *
1080 *************************************************/
1081
1082 /* This is called by several functions that scan a compiled expression looking
1083 for a fixed first character, or an anchoring op code etc. It skips over things
1084 that do not influence this. For some calls, a change of option is important.
1085 For some calls, it makes sense to skip negative forward and all backward
1086 assertions, and also the \b assertion; for others it does not.
1087
1088 Arguments:
1089 code pointer to the start of the group
1090 options pointer to external options
1091 optbit the option bit whose changing is significant, or
1092 zero if none are
1093 skipassert TRUE if certain assertions are to be skipped
1094
1095 Returns: pointer to the first significant opcode
1096 */
1097
1098 static const uschar*
1099 first_significant_code(const uschar *code, int *options, int optbit,
1100 BOOL skipassert)
1101 {
1102 for (;;)
1103 {
1104 switch ((int)*code)
1105 {
1106 case OP_OPT:
1107 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1108 *options = (int)code[1];
1109 code += 2;
1110 break;
1111
1112 case OP_ASSERT_NOT:
1113 case OP_ASSERTBACK:
1114 case OP_ASSERTBACK_NOT:
1115 if (!skipassert) return code;
1116 do code += GET(code, 1); while (*code == OP_ALT);
1117 code += _pcre_OP_lengths[*code];
1118 break;
1119
1120 case OP_WORD_BOUNDARY:
1121 case OP_NOT_WORD_BOUNDARY:
1122 if (!skipassert) return code;
1123 /* Fall through */
1124
1125 case OP_CALLOUT:
1126 case OP_CREF:
1127 case OP_RREF:
1128 case OP_DEF:
1129 code += _pcre_OP_lengths[*code];
1130 break;
1131
1132 default:
1133 return code;
1134 }
1135 }
1136 /* Control never reaches here */
1137 }
1138
1139
1140
1141
1142 /*************************************************
1143 * Find the fixed length of a pattern *
1144 *************************************************/
1145
1146 /* Scan a pattern and compute the fixed length of subject that will match it,
1147 if the length is fixed. This is needed for dealing with backward assertions.
1148 In UTF8 mode, the result is in characters rather than bytes.
1149
1150 Arguments:
1151 code points to the start of the pattern (the bracket)
1152 options the compiling options
1153
1154 Returns: the fixed length, or -1 if there is no fixed length,
1155 or -2 if \C was encountered
1156 */
1157
1158 static int
1159 find_fixedlength(uschar *code, int options)
1160 {
1161 int length = -1;
1162
1163 register int branchlength = 0;
1164 register uschar *cc = code + 1 + LINK_SIZE;
1165
1166 /* Scan along the opcodes for this branch. If we get to the end of the
1167 branch, check the length against that of the other branches. */
1168
1169 for (;;)
1170 {
1171 int d;
1172 register int op = *cc;
1173 switch (op)
1174 {
1175 case OP_CBRA:
1176 case OP_BRA:
1177 case OP_ONCE:
1178 case OP_COND:
1179 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1180 if (d < 0) return d;
1181 branchlength += d;
1182 do cc += GET(cc, 1); while (*cc == OP_ALT);
1183 cc += 1 + LINK_SIZE;
1184 break;
1185
1186 /* Reached end of a branch; if it's a ket it is the end of a nested
1187 call. If it's ALT it is an alternation in a nested call. If it is
1188 END it's the end of the outer call. All can be handled by the same code. */
1189
1190 case OP_ALT:
1191 case OP_KET:
1192 case OP_KETRMAX:
1193 case OP_KETRMIN:
1194 case OP_END:
1195 if (length < 0) length = branchlength;
1196 else if (length != branchlength) return -1;
1197 if (*cc != OP_ALT) return length;
1198 cc += 1 + LINK_SIZE;
1199 branchlength = 0;
1200 break;
1201
1202 /* Skip over assertive subpatterns */
1203
1204 case OP_ASSERT:
1205 case OP_ASSERT_NOT:
1206 case OP_ASSERTBACK:
1207 case OP_ASSERTBACK_NOT:
1208 do cc += GET(cc, 1); while (*cc == OP_ALT);
1209 /* Fall through */
1210
1211 /* Skip over things that don't match chars */
1212
1213 case OP_REVERSE:
1214 case OP_CREF:
1215 case OP_RREF:
1216 case OP_DEF:
1217 case OP_OPT:
1218 case OP_CALLOUT:
1219 case OP_SOD:
1220 case OP_SOM:
1221 case OP_EOD:
1222 case OP_EODN:
1223 case OP_CIRC:
1224 case OP_DOLL:
1225 case OP_NOT_WORD_BOUNDARY:
1226 case OP_WORD_BOUNDARY:
1227 cc += _pcre_OP_lengths[*cc];
1228 break;
1229
1230 /* Handle literal characters */
1231
1232 case OP_CHAR:
1233 case OP_CHARNC:
1234 case OP_NOT:
1235 branchlength++;
1236 cc += 2;
1237 #ifdef SUPPORT_UTF8
1238 if ((options & PCRE_UTF8) != 0)
1239 {
1240 while ((*cc & 0xc0) == 0x80) cc++;
1241 }
1242 #endif
1243 break;
1244
1245 /* Handle exact repetitions. The count is already in characters, but we
1246 need to skip over a multibyte character in UTF8 mode. */
1247
1248 case OP_EXACT:
1249 branchlength += GET2(cc,1);
1250 cc += 4;
1251 #ifdef SUPPORT_UTF8
1252 if ((options & PCRE_UTF8) != 0)
1253 {
1254 while((*cc & 0x80) == 0x80) cc++;
1255 }
1256 #endif
1257 break;
1258
1259 case OP_TYPEEXACT:
1260 branchlength += GET2(cc,1);
1261 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1262 cc += 4;
1263 break;
1264
1265 /* Handle single-char matchers */
1266
1267 case OP_PROP:
1268 case OP_NOTPROP:
1269 cc += 2;
1270 /* Fall through */
1271
1272 case OP_NOT_DIGIT:
1273 case OP_DIGIT:
1274 case OP_NOT_WHITESPACE:
1275 case OP_WHITESPACE:
1276 case OP_NOT_WORDCHAR:
1277 case OP_WORDCHAR:
1278 case OP_ANY:
1279 branchlength++;
1280 cc++;
1281 break;
1282
1283 /* The single-byte matcher isn't allowed */
1284
1285 case OP_ANYBYTE:
1286 return -2;
1287
1288 /* Check a class for variable quantification */
1289
1290 #ifdef SUPPORT_UTF8
1291 case OP_XCLASS:
1292 cc += GET(cc, 1) - 33;
1293 /* Fall through */
1294 #endif
1295
1296 case OP_CLASS:
1297 case OP_NCLASS:
1298 cc += 33;
1299
1300 switch (*cc)
1301 {
1302 case OP_CRSTAR:
1303 case OP_CRMINSTAR:
1304 case OP_CRQUERY:
1305 case OP_CRMINQUERY:
1306 return -1;
1307
1308 case OP_CRRANGE:
1309 case OP_CRMINRANGE:
1310 if (GET2(cc,1) != GET2(cc,3)) return -1;
1311 branchlength += GET2(cc,1);
1312 cc += 5;
1313 break;
1314
1315 default:
1316 branchlength++;
1317 }
1318 break;
1319
1320 /* Anything else is variable length */
1321
1322 default:
1323 return -1;
1324 }
1325 }
1326 /* Control never gets here */
1327 }
1328
1329
1330
1331
1332 /*************************************************
1333 * Scan compiled regex for numbered bracket *
1334 *************************************************/
1335
1336 /* This little function scans through a compiled pattern until it finds a
1337 capturing bracket with the given number.
1338
1339 Arguments:
1340 code points to start of expression
1341 utf8 TRUE in UTF-8 mode
1342 number the required bracket number
1343
1344 Returns: pointer to the opcode for the bracket, or NULL if not found
1345 */
1346
1347 static const uschar *
1348 find_bracket(const uschar *code, BOOL utf8, int number)
1349 {
1350 for (;;)
1351 {
1352 register int c = *code;
1353 if (c == OP_END) return NULL;
1354
1355 /* XCLASS is used for classes that cannot be represented just by a bit
1356 map. This includes negated single high-valued characters. The length in
1357 the table is zero; the actual length is stored in the compiled code. */
1358
1359 if (c == OP_XCLASS) code += GET(code, 1);
1360
1361 /* Handle capturing bracket */
1362
1363 else if (c == OP_CBRA)
1364 {
1365 int n = GET2(code, 1+LINK_SIZE);
1366 if (n == number) return (uschar *)code;
1367 code += _pcre_OP_lengths[c];
1368 }
1369
1370 /* Otherwise, we can get the item's length from the table, except that for
1371 repeated character types, we have to test for \p and \P, which have an extra
1372 two bytes of parameters. */
1373
1374 else
1375 {
1376 switch(c)
1377 {
1378 case OP_TYPESTAR:
1379 case OP_TYPEMINSTAR:
1380 case OP_TYPEPLUS:
1381 case OP_TYPEMINPLUS:
1382 case OP_TYPEQUERY:
1383 case OP_TYPEMINQUERY:
1384 case OP_TYPEPOSSTAR:
1385 case OP_TYPEPOSPLUS:
1386 case OP_TYPEPOSQUERY:
1387 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1388 break;
1389
1390 case OP_TYPEUPTO:
1391 case OP_TYPEMINUPTO:
1392 case OP_TYPEEXACT:
1393 case OP_TYPEPOSUPTO:
1394 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1395 break;
1396 }
1397
1398 /* Add in the fixed length from the table */
1399
1400 code += _pcre_OP_lengths[c];
1401
1402 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1403 a multi-byte character. The length in the table is a minimum, so we have to
1404 arrange to skip the extra bytes. */
1405
1406 #ifdef SUPPORT_UTF8
1407 if (utf8) switch(c)
1408 {
1409 case OP_CHAR:
1410 case OP_CHARNC:
1411 case OP_EXACT:
1412 case OP_UPTO:
1413 case OP_MINUPTO:
1414 case OP_POSUPTO:
1415 case OP_STAR:
1416 case OP_MINSTAR:
1417 case OP_POSSTAR:
1418 case OP_PLUS:
1419 case OP_MINPLUS:
1420 case OP_POSPLUS:
1421 case OP_QUERY:
1422 case OP_MINQUERY:
1423 case OP_POSQUERY:
1424 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1425 break;
1426 }
1427 #endif
1428 }
1429 }
1430 }
1431
1432
1433
1434 /*************************************************
1435 * Scan compiled regex for recursion reference *
1436 *************************************************/
1437
1438 /* This little function scans through a compiled pattern until it finds an
1439 instance of OP_RECURSE.
1440
1441 Arguments:
1442 code points to start of expression
1443 utf8 TRUE in UTF-8 mode
1444
1445 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1446 */
1447
1448 static const uschar *
1449 find_recurse(const uschar *code, BOOL utf8)
1450 {
1451 for (;;)
1452 {
1453 register int c = *code;
1454 if (c == OP_END) return NULL;
1455 if (c == OP_RECURSE) return code;
1456
1457 /* XCLASS is used for classes that cannot be represented just by a bit
1458 map. This includes negated single high-valued characters. The length in
1459 the table is zero; the actual length is stored in the compiled code. */
1460
1461 if (c == OP_XCLASS) code += GET(code, 1);
1462
1463 /* Otherwise, we can get the item's length from the table, except that for
1464 repeated character types, we have to test for \p and \P, which have an extra
1465 two bytes of parameters. */
1466
1467 else
1468 {
1469 switch(c)
1470 {
1471 case OP_TYPESTAR:
1472 case OP_TYPEMINSTAR:
1473 case OP_TYPEPLUS:
1474 case OP_TYPEMINPLUS:
1475 case OP_TYPEQUERY:
1476 case OP_TYPEMINQUERY:
1477 case OP_TYPEPOSSTAR:
1478 case OP_TYPEPOSPLUS:
1479 case OP_TYPEPOSQUERY:
1480 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1481 break;
1482
1483 case OP_TYPEPOSUPTO:
1484 case OP_TYPEUPTO:
1485 case OP_TYPEMINUPTO:
1486 case OP_TYPEEXACT:
1487 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1488 break;
1489 }
1490
1491 /* Add in the fixed length from the table */
1492
1493 code += _pcre_OP_lengths[c];
1494
1495 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1496 by a multi-byte character. The length in the table is a minimum, so we have
1497 to arrange to skip the extra bytes. */
1498
1499 #ifdef SUPPORT_UTF8
1500 if (utf8) switch(c)
1501 {
1502 case OP_CHAR:
1503 case OP_CHARNC:
1504 case OP_EXACT:
1505 case OP_UPTO:
1506 case OP_MINUPTO:
1507 case OP_POSUPTO:
1508 case OP_STAR:
1509 case OP_MINSTAR:
1510 case OP_POSSTAR:
1511 case OP_PLUS:
1512 case OP_MINPLUS:
1513 case OP_POSPLUS:
1514 case OP_QUERY:
1515 case OP_MINQUERY:
1516 case OP_POSQUERY:
1517 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1518 break;
1519 }
1520 #endif
1521 }
1522 }
1523 }
1524
1525
1526
1527 /*************************************************
1528 * Scan compiled branch for non-emptiness *
1529 *************************************************/
1530
1531 /* This function scans through a branch of a compiled pattern to see whether it
1532 can match the empty string or not. It is called from could_be_empty()
1533 below and from compile_branch() when checking for an unlimited repeat of a
1534 group that can match nothing. Note that first_significant_code() skips over
1535 backward and negative forward assertions when its final argument is TRUE. If we
1536 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1537 bracket whose current branch will already have been scanned.
1538
1539 Arguments:
1540 code points to start of search
1541 endcode points to where to stop
1542 utf8 TRUE if in UTF8 mode
1543
1544 Returns: TRUE if what is matched could be empty
1545 */
1546
1547 static BOOL
1548 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1549 {
1550 register int c;
1551 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1552 code < endcode;
1553 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1554 {
1555 const uschar *ccode;
1556
1557 c = *code;
1558
1559 /* Skip over forward assertions; the other assertions are skipped by
1560 first_significant_code() with a TRUE final argument. */
1561
1562 if (c == OP_ASSERT)
1563 {
1564 do code += GET(code, 1); while (*code == OP_ALT);
1565 c = *code;
1566 continue;
1567 }
1568
1569 /* Groups with zero repeats can of course be empty; skip them. */
1570
1571 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1572 {
1573 code += _pcre_OP_lengths[c];
1574 do code += GET(code, 1); while (*code == OP_ALT);
1575 c = *code;
1576 continue;
1577 }
1578
1579 /* For other groups, scan the branches. */
1580
1581 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1582 {
1583 BOOL empty_branch;
1584 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1585
1586 /* Scan a closed bracket */
1587
1588 empty_branch = FALSE;
1589 do
1590 {
1591 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1592 empty_branch = TRUE;
1593 code += GET(code, 1);
1594 }
1595 while (*code == OP_ALT);
1596 if (!empty_branch) return FALSE; /* All branches are non-empty */
1597 c = *code;
1598 continue;
1599 }
1600
1601 /* Handle the other opcodes */
1602
1603 switch (c)
1604 {
1605 /* Check for quantifiers after a class. XCLASS is used for classes that
1606 cannot be represented just by a bit map. This includes negated single
1607 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1608 actual length is stored in the compiled code, so we must update "code"
1609 here. */
1610
1611 #ifdef SUPPORT_UTF8
1612 case OP_XCLASS:
1613 ccode = code += GET(code, 1);
1614 goto CHECK_CLASS_REPEAT;
1615 #endif
1616
1617 case OP_CLASS:
1618 case OP_NCLASS:
1619 ccode = code + 33;
1620
1621 #ifdef SUPPORT_UTF8
1622 CHECK_CLASS_REPEAT:
1623 #endif
1624
1625 switch (*ccode)
1626 {
1627 case OP_CRSTAR: /* These could be empty; continue */
1628 case OP_CRMINSTAR:
1629 case OP_CRQUERY:
1630 case OP_CRMINQUERY:
1631 break;
1632
1633 default: /* Non-repeat => class must match */
1634 case OP_CRPLUS: /* These repeats aren't empty */
1635 case OP_CRMINPLUS:
1636 return FALSE;
1637
1638 case OP_CRRANGE:
1639 case OP_CRMINRANGE:
1640 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1641 break;
1642 }
1643 break;
1644
1645 /* Opcodes that must match a character */
1646
1647 case OP_PROP:
1648 case OP_NOTPROP:
1649 case OP_EXTUNI:
1650 case OP_NOT_DIGIT:
1651 case OP_DIGIT:
1652 case OP_NOT_WHITESPACE:
1653 case OP_WHITESPACE:
1654 case OP_NOT_WORDCHAR:
1655 case OP_WORDCHAR:
1656 case OP_ANY:
1657 case OP_ANYBYTE:
1658 case OP_CHAR:
1659 case OP_CHARNC:
1660 case OP_NOT:
1661 case OP_PLUS:
1662 case OP_MINPLUS:
1663 case OP_POSPLUS:
1664 case OP_EXACT:
1665 case OP_NOTPLUS:
1666 case OP_NOTMINPLUS:
1667 case OP_NOTPOSPLUS:
1668 case OP_NOTEXACT:
1669 case OP_TYPEPLUS:
1670 case OP_TYPEMINPLUS:
1671 case OP_TYPEPOSPLUS:
1672 case OP_TYPEEXACT:
1673 return FALSE;
1674
1675 /* These are going to continue, as they may be empty, but we have to
1676 fudge the length for the \p and \P cases. */
1677
1678 case OP_TYPESTAR:
1679 case OP_TYPEMINSTAR:
1680 case OP_TYPEPOSSTAR:
1681 case OP_TYPEQUERY:
1682 case OP_TYPEMINQUERY:
1683 case OP_TYPEPOSQUERY:
1684 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1685 break;
1686
1687 /* Same for these */
1688
1689 case OP_TYPEUPTO:
1690 case OP_TYPEMINUPTO:
1691 case OP_TYPEPOSUPTO:
1692 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1693 break;
1694
1695 /* End of branch */
1696
1697 case OP_KET:
1698 case OP_KETRMAX:
1699 case OP_KETRMIN:
1700 case OP_ALT:
1701 return TRUE;
1702
1703 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1704 MINUPTO, and POSUPTO may be followed by a multibyte character */
1705
1706 #ifdef SUPPORT_UTF8
1707 case OP_STAR:
1708 case OP_MINSTAR:
1709 case OP_POSSTAR:
1710 case OP_QUERY:
1711 case OP_MINQUERY:
1712 case OP_POSQUERY:
1713 case OP_UPTO:
1714 case OP_MINUPTO:
1715 case OP_POSUPTO:
1716 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1717 break;
1718 #endif
1719 }
1720 }
1721
1722 return TRUE;
1723 }
1724
1725
1726
1727 /*************************************************
1728 * Scan compiled regex for non-emptiness *
1729 *************************************************/
1730
1731 /* This function is called to check for left recursive calls. We want to check
1732 the current branch of the current pattern to see if it could match the empty
1733 string. If it could, we must look outwards for branches at other levels,
1734 stopping when we pass beyond the bracket which is the subject of the recursion.
1735
1736 Arguments:
1737 code points to start of the recursion
1738 endcode points to where to stop (current RECURSE item)
1739 bcptr points to the chain of current (unclosed) branch starts
1740 utf8 TRUE if in UTF-8 mode
1741
1742 Returns: TRUE if what is matched could be empty
1743 */
1744
1745 static BOOL
1746 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1747 BOOL utf8)
1748 {
1749 while (bcptr != NULL && bcptr->current >= code)
1750 {
1751 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1752 bcptr = bcptr->outer;
1753 }
1754 return TRUE;
1755 }
1756
1757
1758
1759 /*************************************************
1760 * Check for POSIX class syntax *
1761 *************************************************/
1762
1763 /* This function is called when the sequence "[:" or "[." or "[=" is
1764 encountered in a character class. It checks whether this is followed by a
1765 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1766 reach an unescaped ']' without the special preceding character, return FALSE.
1767
1768 Originally, this function only recognized a sequence of letters between the
1769 terminators, but it seems that Perl recognizes any sequence of characters,
1770 though of course unknown POSIX names are subsequently rejected. Perl gives an
1771 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1772 didn't consider this to be a POSIX class. Likewise for [:1234:].
1773
1774 The problem in trying to be exactly like Perl is in the handling of escapes. We
1775 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1776 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1777 below handles the special case of \], but does not try to do any other escape
1778 processing. This makes it different from Perl for cases such as [:l\ower:]
1779 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1780 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1781 I think.
1782
1783 Arguments:
1784 ptr pointer to the initial [
1785 endptr where to return the end pointer
1786
1787 Returns: TRUE or FALSE
1788 */
1789
1790 static BOOL
1791 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1792 {
1793 int terminator; /* Don't combine these lines; the Solaris cc */
1794 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1795 for (++ptr; *ptr != 0; ptr++)
1796 {
1797 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1798 {
1799 if (*ptr == ']') return FALSE;
1800 if (*ptr == terminator && ptr[1] == ']')
1801 {
1802 *endptr = ptr;
1803 return TRUE;
1804 }
1805 }
1806 }
1807 return FALSE;
1808 }
1809
1810
1811
1812
1813 /*************************************************
1814 * Check POSIX class name *
1815 *************************************************/
1816
1817 /* This function is called to check the name given in a POSIX-style class entry
1818 such as [:alnum:].
1819
1820 Arguments:
1821 ptr points to the first letter
1822 len the length of the name
1823
1824 Returns: a value representing the name, or -1 if unknown
1825 */
1826
1827 static int
1828 check_posix_name(const uschar *ptr, int len)
1829 {
1830 const char *pn = posix_names;
1831 register int yield = 0;
1832 while (posix_name_lengths[yield] != 0)
1833 {
1834 if (len == posix_name_lengths[yield] &&
1835 strncmp((const char *)ptr, pn, len) == 0) return yield;
1836 pn += posix_name_lengths[yield] + 1;
1837 yield++;
1838 }
1839 return -1;
1840 }
1841
1842
1843 /*************************************************
1844 * Adjust OP_RECURSE items in repeated group *
1845 *************************************************/
1846
1847 /* OP_RECURSE items contain an offset from the start of the regex to the group
1848 that is referenced. This means that groups can be replicated for fixed
1849 repetition simply by copying (because the recursion is allowed to refer to
1850 earlier groups that are outside the current group). However, when a group is
1851 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1852 inserted before it, after it has been compiled. This means that any OP_RECURSE
1853 items within it that refer to the group itself or any contained groups have to
1854 have their offsets adjusted. That one of the jobs of this function. Before it
1855 is called, the partially compiled regex must be temporarily terminated with
1856 OP_END.
1857
1858 This function has been extended with the possibility of forward references for
1859 recursions and subroutine calls. It must also check the list of such references
1860 for the group we are dealing with. If it finds that one of the recursions in
1861 the current group is on this list, it adjusts the offset in the list, not the
1862 value in the reference (which is a group number).
1863
1864 Arguments:
1865 group points to the start of the group
1866 adjust the amount by which the group is to be moved
1867 utf8 TRUE in UTF-8 mode
1868 cd contains pointers to tables etc.
1869 save_hwm the hwm forward reference pointer at the start of the group
1870
1871 Returns: nothing
1872 */
1873
1874 static void
1875 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1876 uschar *save_hwm)
1877 {
1878 uschar *ptr = group;
1879
1880 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1881 {
1882 int offset;
1883 uschar *hc;
1884
1885 /* See if this recursion is on the forward reference list. If so, adjust the
1886 reference. */
1887
1888 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1889 {
1890 offset = GET(hc, 0);
1891 if (cd->start_code + offset == ptr + 1)
1892 {
1893 PUT(hc, 0, offset + adjust);
1894 break;
1895 }
1896 }
1897
1898 /* Otherwise, adjust the recursion offset if it's after the start of this
1899 group. */
1900
1901 if (hc >= cd->hwm)
1902 {
1903 offset = GET(ptr, 1);
1904 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1905 }
1906
1907 ptr += 1 + LINK_SIZE;
1908 }
1909 }
1910
1911
1912
1913 /*************************************************
1914 * Insert an automatic callout point *
1915 *************************************************/
1916
1917 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1918 callout points before each pattern item.
1919
1920 Arguments:
1921 code current code pointer
1922 ptr current pattern pointer
1923 cd pointers to tables etc
1924
1925 Returns: new code pointer
1926 */
1927
1928 static uschar *
1929 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1930 {
1931 *code++ = OP_CALLOUT;
1932 *code++ = 255;
1933 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1934 PUT(code, LINK_SIZE, 0); /* Default length */
1935 return code + 2*LINK_SIZE;
1936 }
1937
1938
1939
1940 /*************************************************
1941 * Complete a callout item *
1942 *************************************************/
1943
1944 /* A callout item contains the length of the next item in the pattern, which
1945 we can't fill in till after we have reached the relevant point. This is used
1946 for both automatic and manual callouts.
1947
1948 Arguments:
1949 previous_callout points to previous callout item
1950 ptr current pattern pointer
1951 cd pointers to tables etc
1952
1953 Returns: nothing
1954 */
1955
1956 static void
1957 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1958 {
1959 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1960 PUT(previous_callout, 2 + LINK_SIZE, length);
1961 }
1962
1963
1964
1965 #ifdef SUPPORT_UCP
1966 /*************************************************
1967 * Get othercase range *
1968 *************************************************/
1969
1970 /* This function is passed the start and end of a class range, in UTF-8 mode
1971 with UCP support. It searches up the characters, looking for internal ranges of
1972 characters in the "other" case. Each call returns the next one, updating the
1973 start address.
1974
1975 Arguments:
1976 cptr points to starting character value; updated
1977 d end value
1978 ocptr where to put start of othercase range
1979 odptr where to put end of othercase range
1980
1981 Yield: TRUE when range returned; FALSE when no more
1982 */
1983
1984 static BOOL
1985 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1986 unsigned int *odptr)
1987 {
1988 unsigned int c, othercase, next;
1989
1990 for (c = *cptr; c <= d; c++)
1991 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1992
1993 if (c > d) return FALSE;
1994
1995 *ocptr = othercase;
1996 next = othercase + 1;
1997
1998 for (++c; c <= d; c++)
1999 {
2000 if (_pcre_ucp_othercase(c) != next) break;
2001 next++;
2002 }
2003
2004 *odptr = next - 1;
2005 *cptr = c;
2006
2007 return TRUE;
2008 }
2009 #endif /* SUPPORT_UCP */
2010
2011
2012
2013 /*************************************************
2014 * Check if auto-possessifying is possible *
2015 *************************************************/
2016
2017 /* This function is called for unlimited repeats of certain items, to see
2018 whether the next thing could possibly match the repeated item. If not, it makes
2019 sense to automatically possessify the repeated item.
2020
2021 Arguments:
2022 op_code the repeated op code
2023 this data for this item, depends on the opcode
2024 utf8 TRUE in UTF-8 mode
2025 utf8_char used for utf8 character bytes, NULL if not relevant
2026 ptr next character in pattern
2027 options options bits
2028 cd contains pointers to tables etc.
2029
2030 Returns: TRUE if possessifying is wanted
2031 */
2032
2033 static BOOL
2034 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2035 const uschar *ptr, int options, compile_data *cd)
2036 {
2037 int next;
2038
2039 /* Skip whitespace and comments in extended mode */
2040
2041 if ((options & PCRE_EXTENDED) != 0)
2042 {
2043 for (;;)
2044 {
2045 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2046 if (*ptr == '#')
2047 {
2048 while (*(++ptr) != 0)
2049 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2050 }
2051 else break;
2052 }
2053 }
2054
2055 /* If the next item is one that we can handle, get its value. A non-negative
2056 value is a character, a negative value is an escape value. */
2057
2058 if (*ptr == '\\')
2059 {
2060 int temperrorcode = 0;
2061 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2062 if (temperrorcode != 0) return FALSE;
2063 ptr++; /* Point after the escape sequence */
2064 }
2065
2066 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2067 {
2068 #ifdef SUPPORT_UTF8
2069 if (utf8) { GETCHARINC(next, ptr); } else
2070 #endif
2071 next = *ptr++;
2072 }
2073
2074 else return FALSE;
2075
2076 /* Skip whitespace and comments in extended mode */
2077
2078 if ((options & PCRE_EXTENDED) != 0)
2079 {
2080 for (;;)
2081 {
2082 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2083 if (*ptr == '#')
2084 {
2085 while (*(++ptr) != 0)
2086 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2087 }
2088 else break;
2089 }
2090 }
2091
2092 /* If the next thing is itself optional, we have to give up. */
2093
2094 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2095 return FALSE;
2096
2097 /* Now compare the next item with the previous opcode. If the previous is a
2098 positive single character match, "item" either contains the character or, if
2099 "item" is greater than 127 in utf8 mode, the character's bytes are in
2100 utf8_char. */
2101
2102
2103 /* Handle cases when the next item is a character. */
2104
2105 if (next >= 0) switch(op_code)
2106 {
2107 case OP_CHAR:
2108 #ifdef SUPPORT_UTF8
2109 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2110 #endif
2111 return item != next;
2112
2113 /* For CHARNC (caseless character) we must check the other case. If we have
2114 Unicode property support, we can use it to test the other case of
2115 high-valued characters. */
2116
2117 case OP_CHARNC:
2118 #ifdef SUPPORT_UTF8
2119 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2120 #endif
2121 if (item == next) return FALSE;
2122 #ifdef SUPPORT_UTF8
2123 if (utf8)
2124 {
2125 unsigned int othercase;
2126 if (next < 128) othercase = cd->fcc[next]; else
2127 #ifdef SUPPORT_UCP
2128 othercase = _pcre_ucp_othercase((unsigned int)next);
2129 #else
2130 othercase = NOTACHAR;
2131 #endif
2132 return (unsigned int)item != othercase;
2133 }
2134 else
2135 #endif /* SUPPORT_UTF8 */
2136 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2137
2138 /* For OP_NOT, "item" must be a single-byte character. */
2139
2140 case OP_NOT:
2141 if (item == next) return TRUE;
2142 if ((options & PCRE_CASELESS) == 0) return FALSE;
2143 #ifdef SUPPORT_UTF8
2144 if (utf8)
2145 {
2146 unsigned int othercase;
2147 if (next < 128) othercase = cd->fcc[next]; else
2148 #ifdef SUPPORT_UCP
2149 othercase = _pcre_ucp_othercase(next);
2150 #else
2151 othercase = NOTACHAR;
2152 #endif
2153 return (unsigned int)item == othercase;
2154 }
2155 else
2156 #endif /* SUPPORT_UTF8 */
2157 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2158
2159 case OP_DIGIT:
2160 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2161
2162 case OP_NOT_DIGIT:
2163 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2164
2165 case OP_WHITESPACE:
2166 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2167
2168 case OP_NOT_WHITESPACE:
2169 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2170
2171 case OP_WORDCHAR:
2172 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2173
2174 case OP_NOT_WORDCHAR:
2175 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2176
2177 case OP_HSPACE:
2178 case OP_NOT_HSPACE:
2179 switch(next)
2180 {
2181 case 0x09:
2182 case 0x20:
2183 case 0xa0:
2184 case 0x1680:
2185 case 0x180e:
2186 case 0x2000:
2187 case 0x2001:
2188 case 0x2002:
2189 case 0x2003:
2190 case 0x2004:
2191 case 0x2005:
2192 case 0x2006:
2193 case 0x2007:
2194 case 0x2008:
2195 case 0x2009:
2196 case 0x200A:
2197 case 0x202f:
2198 case 0x205f:
2199 case 0x3000:
2200 return op_code != OP_HSPACE;
2201 default:
2202 return op_code == OP_HSPACE;
2203 }
2204
2205 case OP_VSPACE:
2206 case OP_NOT_VSPACE:
2207 switch(next)
2208 {
2209 case 0x0a:
2210 case 0x0b:
2211 case 0x0c:
2212 case 0x0d:
2213 case 0x85:
2214 case 0x2028:
2215 case 0x2029:
2216 return op_code != OP_VSPACE;
2217 default:
2218 return op_code == OP_VSPACE;
2219 }
2220
2221 default:
2222 return FALSE;
2223 }
2224
2225
2226 /* Handle the case when the next item is \d, \s, etc. */
2227
2228 switch(op_code)
2229 {
2230 case OP_CHAR:
2231 case OP_CHARNC:
2232 #ifdef SUPPORT_UTF8
2233 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2234 #endif
2235 switch(-next)
2236 {
2237 case ESC_d:
2238 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2239
2240 case ESC_D:
2241 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2242
2243 case ESC_s:
2244 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2245
2246 case ESC_S:
2247 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2248
2249 case ESC_w:
2250 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2251
2252 case ESC_W:
2253 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2254
2255 case ESC_h:
2256 case ESC_H:
2257 switch(item)
2258 {
2259 case 0x09:
2260 case 0x20:
2261 case 0xa0:
2262 case 0x1680:
2263 case 0x180e:
2264 case 0x2000:
2265 case 0x2001:
2266 case 0x2002:
2267 case 0x2003:
2268 case 0x2004:
2269 case 0x2005:
2270 case 0x2006:
2271 case 0x2007:
2272 case 0x2008:
2273 case 0x2009:
2274 case 0x200A:
2275 case 0x202f:
2276 case 0x205f:
2277 case 0x3000:
2278 return -next != ESC_h;
2279 default:
2280 return -next == ESC_h;
2281 }
2282
2283 case ESC_v:
2284 case ESC_V:
2285 switch(item)
2286 {
2287 case 0x0a:
2288 case 0x0b:
2289 case 0x0c:
2290 case 0x0d:
2291 case 0x85:
2292 case 0x2028:
2293 case 0x2029:
2294 return -next != ESC_v;
2295 default:
2296 return -next == ESC_v;
2297 }
2298
2299 default:
2300 return FALSE;
2301 }
2302
2303 case OP_DIGIT:
2304 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2305 next == -ESC_h || next == -ESC_v;
2306
2307 case OP_NOT_DIGIT:
2308 return next == -ESC_d;
2309
2310 case OP_WHITESPACE:
2311 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2312
2313 case OP_NOT_WHITESPACE:
2314 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2315
2316 case OP_HSPACE:
2317 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2318
2319 case OP_NOT_HSPACE:
2320 return next == -ESC_h;
2321
2322 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2323 case OP_VSPACE:
2324 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2325
2326 case OP_NOT_VSPACE:
2327 return next == -ESC_v;
2328
2329 case OP_WORDCHAR:
2330 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2331
2332 case OP_NOT_WORDCHAR:
2333 return next == -ESC_w || next == -ESC_d;
2334
2335 default:
2336 return FALSE;
2337 }
2338
2339 /* Control does not reach here */
2340 }
2341
2342
2343
2344 /*************************************************
2345 * Compile one branch *
2346 *************************************************/
2347
2348 /* Scan the pattern, compiling it into a vector. If the options are
2349 changed during the branch, the pointer is used to change the external options
2350 bits. This function is used during the pre-compile phase when we are trying
2351 to find out the amount of memory needed, as well as during the real compile
2352 phase. The value of lengthptr distinguishes the two phases.
2353
2354 Arguments:
2355 optionsptr pointer to the option bits
2356 codeptr points to the pointer to the current code point
2357 ptrptr points to the current pattern pointer
2358 errorcodeptr points to error code variable
2359 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2360 reqbyteptr set to the last literal character required, else < 0
2361 bcptr points to current branch chain
2362 cd contains pointers to tables etc.
2363 lengthptr NULL during the real compile phase
2364 points to length accumulator during pre-compile phase
2365
2366 Returns: TRUE on success
2367 FALSE, with *errorcodeptr set non-zero on error
2368 */
2369
2370 static BOOL
2371 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2372 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2373 compile_data *cd, int *lengthptr)
2374 {
2375 int repeat_type, op_type;
2376 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2377 int bravalue = 0;
2378 int greedy_default, greedy_non_default;
2379 int firstbyte, reqbyte;
2380 int zeroreqbyte, zerofirstbyte;
2381 int req_caseopt, reqvary, tempreqvary;
2382 int options = *optionsptr;
2383 int after_manual_callout = 0;
2384 int length_prevgroup = 0;
2385 register int c;
2386 register uschar *code = *codeptr;
2387 uschar *last_code = code;
2388 uschar *orig_code = code;
2389 uschar *tempcode;
2390 BOOL inescq = FALSE;
2391 BOOL groupsetfirstbyte = FALSE;
2392 const uschar *ptr = *ptrptr;
2393 const uschar *tempptr;
2394 uschar *previous = NULL;
2395 uschar *previous_callout = NULL;
2396 uschar *save_hwm = NULL;
2397 uschar classbits[32];
2398
2399 #ifdef SUPPORT_UTF8
2400 BOOL class_utf8;
2401 BOOL utf8 = (options & PCRE_UTF8) != 0;
2402 uschar *class_utf8data;
2403 uschar *class_utf8data_base;
2404 uschar utf8_char[6];
2405 #else
2406 BOOL utf8 = FALSE;
2407 uschar *utf8_char = NULL;
2408 #endif
2409
2410 #ifdef DEBUG
2411 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2412 #endif
2413
2414 /* Set up the default and non-default settings for greediness */
2415
2416 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2417 greedy_non_default = greedy_default ^ 1;
2418
2419 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2420 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2421 matches a non-fixed char first char; reqbyte just remains unset if we never
2422 find one.
2423
2424 When we hit a repeat whose minimum is zero, we may have to adjust these values
2425 to take the zero repeat into account. This is implemented by setting them to
2426 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2427 item types that can be repeated set these backoff variables appropriately. */
2428
2429 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2430
2431 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2432 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2433 value > 255. It is added into the firstbyte or reqbyte variables to record the
2434 case status of the value. This is used only for ASCII characters. */
2435
2436 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2437
2438 /* Switch on next character until the end of the branch */
2439
2440 for (;; ptr++)
2441 {
2442 BOOL negate_class;
2443 BOOL should_flip_negation;
2444 BOOL possessive_quantifier;
2445 BOOL is_quantifier;
2446 BOOL is_recurse;
2447 BOOL reset_bracount;
2448 int class_charcount;
2449 int class_lastchar;
2450 int newoptions;
2451 int recno;
2452 int refsign;
2453 int skipbytes;
2454 int subreqbyte;
2455 int subfirstbyte;
2456 int terminator;
2457 int mclength;
2458 uschar mcbuffer[8];
2459
2460 /* Get next byte in the pattern */
2461
2462 c = *ptr;
2463
2464 /* If we are in the pre-compile phase, accumulate the length used for the
2465 previous cycle of this loop. */
2466
2467 if (lengthptr != NULL)
2468 {
2469 #ifdef DEBUG
2470 if (code > cd->hwm) cd->hwm = code; /* High water info */
2471 #endif
2472 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2473 {
2474 *errorcodeptr = ERR52;
2475 goto FAILED;
2476 }
2477
2478 /* There is at least one situation where code goes backwards: this is the
2479 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2480 the class is simply eliminated. However, it is created first, so we have to
2481 allow memory for it. Therefore, don't ever reduce the length at this point.
2482 */
2483
2484 if (code < last_code) code = last_code;
2485
2486 /* Paranoid check for integer overflow */
2487
2488 if (OFLOW_MAX - *lengthptr < code - last_code)
2489 {
2490 *errorcodeptr = ERR20;
2491 goto FAILED;
2492 }
2493
2494 *lengthptr += code - last_code;
2495 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2496
2497 /* If "previous" is set and it is not at the start of the work space, move
2498 it back to there, in order to avoid filling up the work space. Otherwise,
2499 if "previous" is NULL, reset the current code pointer to the start. */
2500
2501 if (previous != NULL)
2502 {
2503 if (previous > orig_code)
2504 {
2505 memmove(orig_code, previous, code - previous);
2506 code -= previous - orig_code;
2507 previous = orig_code;
2508 }
2509 }
2510 else code = orig_code;
2511
2512 /* Remember where this code item starts so we can pick up the length
2513 next time round. */
2514
2515 last_code = code;
2516 }
2517
2518 /* In the real compile phase, just check the workspace used by the forward
2519 reference list. */
2520
2521 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2522 {
2523 *errorcodeptr = ERR52;
2524 goto FAILED;
2525 }
2526
2527 /* If in \Q...\E, check for the end; if not, we have a literal */
2528
2529 if (inescq && c != 0)
2530 {
2531 if (c == '\\' && ptr[1] == 'E')
2532 {
2533 inescq = FALSE;
2534 ptr++;
2535 continue;
2536 }
2537 else
2538 {
2539 if (previous_callout != NULL)
2540 {
2541 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2542 complete_callout(previous_callout, ptr, cd);
2543 previous_callout = NULL;
2544 }
2545 if ((options & PCRE_AUTO_CALLOUT) != 0)
2546 {
2547 previous_callout = code;
2548 code = auto_callout(code, ptr, cd);
2549 }
2550 goto NORMAL_CHAR;
2551 }
2552 }
2553
2554 /* Fill in length of a previous callout, except when the next thing is
2555 a quantifier. */
2556
2557 is_quantifier = c == '*' || c == '+' || c == '?' ||
2558 (c == '{' && is_counted_repeat(ptr+1));
2559
2560 if (!is_quantifier && previous_callout != NULL &&
2561 after_manual_callout-- <= 0)
2562 {
2563 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2564 complete_callout(previous_callout, ptr, cd);
2565 previous_callout = NULL;
2566 }
2567
2568 /* In extended mode, skip white space and comments */
2569
2570 if ((options & PCRE_EXTENDED) != 0)
2571 {
2572 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2573 if (c == '#')
2574 {
2575 while (*(++ptr) != 0)
2576 {
2577 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2578 }
2579 if (*ptr != 0) continue;
2580
2581 /* Else fall through to handle end of string */
2582 c = 0;
2583 }
2584 }
2585
2586 /* No auto callout for quantifiers. */
2587
2588 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2589 {
2590 previous_callout = code;
2591 code = auto_callout(code, ptr, cd);
2592 }
2593
2594 switch(c)
2595 {
2596 /* ===================================================================*/
2597 case 0: /* The branch terminates at string end */
2598 case '|': /* or | or ) */
2599 case ')':
2600 *firstbyteptr = firstbyte;
2601 *reqbyteptr = reqbyte;
2602 *codeptr = code;
2603 *ptrptr = ptr;
2604 if (lengthptr != NULL)
2605 {
2606 if (OFLOW_MAX - *lengthptr < code - last_code)
2607 {
2608 *errorcodeptr = ERR20;
2609 goto FAILED;
2610 }
2611 *lengthptr += code - last_code; /* To include callout length */
2612 DPRINTF((">> end branch\n"));
2613 }
2614 return TRUE;
2615
2616
2617 /* ===================================================================*/
2618 /* Handle single-character metacharacters. In multiline mode, ^ disables
2619 the setting of any following char as a first character. */
2620
2621 case '^':
2622 if ((options & PCRE_MULTILINE) != 0)
2623 {
2624 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2625 }
2626 previous = NULL;
2627 *code++ = OP_CIRC;
2628 break;
2629
2630 case '$':
2631 previous = NULL;
2632 *code++ = OP_DOLL;
2633 break;
2634
2635 /* There can never be a first char if '.' is first, whatever happens about
2636 repeats. The value of reqbyte doesn't change either. */
2637
2638 case '.':
2639 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2640 zerofirstbyte = firstbyte;
2641 zeroreqbyte = reqbyte;
2642 previous = code;
2643 *code++ = OP_ANY;
2644 break;
2645
2646
2647 /* ===================================================================*/
2648 /* Character classes. If the included characters are all < 256, we build a
2649 32-byte bitmap of the permitted characters, except in the special case
2650 where there is only one such character. For negated classes, we build the
2651 map as usual, then invert it at the end. However, we use a different opcode
2652 so that data characters > 255 can be handled correctly.
2653
2654 If the class contains characters outside the 0-255 range, a different
2655 opcode is compiled. It may optionally have a bit map for characters < 256,
2656 but those above are explicitly listed afterwards. A flag byte tells
2657 whether the bitmap is present, and whether this is a negated class or not.
2658
2659 In JavaScript compatibility mode, an isolated ']' causes an error. In
2660 default (Perl) mode, it is treated as a data character. */
2661
2662 case ']':
2663 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2664 {
2665 *errorcodeptr = ERR64;
2666 goto FAILED;
2667 }
2668 goto NORMAL_CHAR;
2669
2670 case '[':
2671 previous = code;
2672
2673 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2674 they are encountered at the top level, so we'll do that too. */
2675
2676 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2677 check_posix_syntax(ptr, &tempptr))
2678 {
2679 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2680 goto FAILED;
2681 }
2682
2683 /* If the first character is '^', set the negation flag and skip it. Also,
2684 if the first few characters (either before or after ^) are \Q\E or \E we
2685 skip them too. This makes for compatibility with Perl. */
2686
2687 negate_class = FALSE;
2688 for (;;)
2689 {
2690 c = *(++ptr);
2691 if (c == '\\')
2692 {
2693 if (ptr[1] == 'E') ptr++;
2694 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2695 else break;
2696 }
2697 else if (!negate_class && c == '^')
2698 negate_class = TRUE;
2699 else break;
2700 }
2701
2702 /* If a class contains a negative special such as \S, we need to flip the
2703 negation flag at the end, so that support for characters > 255 works
2704 correctly (they are all included in the class). */
2705
2706 should_flip_negation = FALSE;
2707
2708 /* Keep a count of chars with values < 256 so that we can optimize the case
2709 of just a single character (as long as it's < 256). However, for higher
2710 valued UTF-8 characters, we don't yet do any optimization. */
2711
2712 class_charcount = 0;
2713 class_lastchar = -1;
2714
2715 /* Initialize the 32-char bit map to all zeros. We build the map in a
2716 temporary bit of memory, in case the class contains only 1 character (less
2717 than 256), because in that case the compiled code doesn't use the bit map.
2718 */
2719
2720 memset(classbits, 0, 32 * sizeof(uschar));
2721
2722 #ifdef SUPPORT_UTF8
2723 class_utf8 = FALSE; /* No chars >= 256 */
2724 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2725 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2726 #endif
2727
2728 /* Process characters until ] is reached. By writing this as a "do" it
2729 means that an initial ] is taken as a data character. At the start of the
2730 loop, c contains the first byte of the character. */
2731
2732 if (c != 0) do
2733 {
2734 const uschar *oldptr;
2735
2736 #ifdef SUPPORT_UTF8
2737 if (utf8 && c > 127)
2738 { /* Braces are required because the */
2739 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2740 }
2741
2742 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2743 data and reset the pointer. This is so that very large classes that
2744 contain a zillion UTF-8 characters no longer overwrite the work space
2745 (which is on the stack). */
2746
2747 if (lengthptr != NULL)
2748 {
2749 *lengthptr += class_utf8data - class_utf8data_base;
2750 class_utf8data = class_utf8data_base;
2751 }
2752
2753 #endif
2754
2755 /* Inside \Q...\E everything is literal except \E */
2756
2757 if (inescq)
2758 {
2759 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2760 {
2761 inescq = FALSE; /* Reset literal state */
2762 ptr++; /* Skip the 'E' */
2763 continue; /* Carry on with next */
2764 }
2765 goto CHECK_RANGE; /* Could be range if \E follows */
2766 }
2767
2768 /* Handle POSIX class names. Perl allows a negation extension of the
2769 form [:^name:]. A square bracket that doesn't match the syntax is
2770 treated as a literal. We also recognize the POSIX constructions
2771 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2772 5.6 and 5.8 do. */
2773
2774 if (c == '[' &&
2775 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2776 check_posix_syntax(ptr, &tempptr))
2777 {
2778 BOOL local_negate = FALSE;
2779 int posix_class, taboffset, tabopt;
2780 register const uschar *cbits = cd->cbits;
2781 uschar pbits[32];
2782
2783 if (ptr[1] != ':')
2784 {
2785 *errorcodeptr = ERR31;
2786 goto FAILED;
2787 }
2788
2789 ptr += 2;
2790 if (*ptr == '^')
2791 {
2792 local_negate = TRUE;
2793 should_flip_negation = TRUE; /* Note negative special */
2794 ptr++;
2795 }
2796
2797 posix_class = check_posix_name(ptr, tempptr - ptr);
2798 if (posix_class < 0)
2799 {
2800 *errorcodeptr = ERR30;
2801 goto FAILED;
2802 }
2803
2804 /* If matching is caseless, upper and lower are converted to
2805 alpha. This relies on the fact that the class table starts with
2806 alpha, lower, upper as the first 3 entries. */
2807
2808 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2809 posix_class = 0;
2810
2811 /* We build the bit map for the POSIX class in a chunk of local store
2812 because we may be adding and subtracting from it, and we don't want to
2813 subtract bits that may be in the main map already. At the end we or the
2814 result into the bit map that is being built. */
2815
2816 posix_class *= 3;
2817
2818 /* Copy in the first table (always present) */
2819
2820 memcpy(pbits, cbits + posix_class_maps[posix_class],
2821 32 * sizeof(uschar));
2822
2823 /* If there is a second table, add or remove it as required. */
2824
2825 taboffset = posix_class_maps[posix_class + 1];
2826 tabopt = posix_class_maps[posix_class + 2];
2827
2828 if (taboffset >= 0)
2829 {
2830 if (tabopt >= 0)
2831 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2832 else
2833 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2834 }
2835
2836 /* Now see if we need to remove any special characters. An option
2837 value of 1 removes vertical space and 2 removes underscore. */
2838
2839 if (tabopt < 0) tabopt = -tabopt;
2840 if (tabopt == 1) pbits[1] &= ~0x3c;
2841 else if (tabopt == 2) pbits[11] &= 0x7f;
2842
2843 /* Add the POSIX table or its complement into the main table that is
2844 being built and we are done. */
2845
2846 if (local_negate)
2847 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2848 else
2849 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2850
2851 ptr = tempptr + 1;
2852 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2853 continue; /* End of POSIX syntax handling */
2854 }
2855
2856 /* Backslash may introduce a single character, or it may introduce one
2857 of the specials, which just set a flag. The sequence \b is a special
2858 case. Inside a class (and only there) it is treated as backspace.
2859 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2860 to 'or' into the one we are building. We assume they have more than one
2861 character in them, so set class_charcount bigger than one. */
2862
2863 if (c == '\\')
2864 {
2865 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2866 if (*errorcodeptr != 0) goto FAILED;
2867
2868 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2869 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2870 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2871 else if (-c == ESC_Q) /* Handle start of quoted string */
2872 {
2873 if (ptr[1] == '\\' && ptr[2] == 'E')
2874 {
2875 ptr += 2; /* avoid empty string */
2876 }
2877 else inescq = TRUE;
2878 continue;
2879 }
2880 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2881
2882 if (c < 0)
2883 {
2884 register const uschar *cbits = cd->cbits;
2885 class_charcount += 2; /* Greater than 1 is what matters */
2886
2887 /* Save time by not doing this in the pre-compile phase. */
2888
2889 if (lengthptr == NULL) switch (-c)
2890 {
2891 case ESC_d:
2892 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2893 continue;
2894
2895 case ESC_D:
2896 should_flip_negation = TRUE;
2897 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2898 continue;
2899
2900 case ESC_w:
2901 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2902 continue;
2903
2904 case ESC_W:
2905 should_flip_negation = TRUE;
2906 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2907 continue;
2908
2909 case ESC_s:
2910 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2911 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2912 continue;
2913
2914 case ESC_S:
2915 should_flip_negation = TRUE;
2916 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2917 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2918 continue;
2919
2920 default: /* Not recognized; fall through */
2921 break; /* Need "default" setting to stop compiler warning. */
2922 }
2923
2924 /* In the pre-compile phase, just do the recognition. */
2925
2926 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2927 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2928
2929 /* We need to deal with \H, \h, \V, and \v in both phases because
2930 they use extra memory. */
2931
2932 if (-c == ESC_h)
2933 {
2934 SETBIT(classbits, 0x09); /* VT */
2935 SETBIT(classbits, 0x20); /* SPACE */
2936 SETBIT(classbits, 0xa0); /* NSBP */
2937 #ifdef SUPPORT_UTF8
2938 if (utf8)
2939 {
2940 class_utf8 = TRUE;
2941 *class_utf8data++ = XCL_SINGLE;
2942 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2943 *class_utf8data++ = XCL_SINGLE;
2944 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2945 *class_utf8data++ = XCL_RANGE;
2946 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2947 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2948 *class_utf8data++ = XCL_SINGLE;
2949 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2950 *class_utf8data++ = XCL_SINGLE;
2951 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2952 *class_utf8data++ = XCL_SINGLE;
2953 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2954 }
2955 #endif
2956 continue;
2957 }
2958
2959 if (-c == ESC_H)
2960 {
2961 for (c = 0; c < 32; c++)
2962 {
2963 int x = 0xff;
2964 switch (c)
2965 {
2966 case 0x09/8: x ^= 1 << (0x09%8); break;
2967 case 0x20/8: x ^= 1 << (0x20%8); break;
2968 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2969 default: break;
2970 }
2971 classbits[c] |= x;
2972 }
2973
2974 #ifdef SUPPORT_UTF8
2975 if (utf8)
2976 {
2977 class_utf8 = TRUE;
2978 *class_utf8data++ = XCL_RANGE;
2979 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2980 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2981 *class_utf8data++ = XCL_RANGE;
2982 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2983 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2984 *class_utf8data++ = XCL_RANGE;
2985 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2986 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2987 *class_utf8data++ = XCL_RANGE;
2988 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2989 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2990 *class_utf8data++ = XCL_RANGE;
2991 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2992 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2993 *class_utf8data++ = XCL_RANGE;
2994 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2995 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2996 *class_utf8data++ = XCL_RANGE;
2997 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2998 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2999 }
3000 #endif
3001 continue;
3002 }
3003
3004 if (-c == ESC_v)
3005 {
3006 SETBIT(classbits, 0x0a); /* LF */
3007 SETBIT(classbits, 0x0b); /* VT */
3008 SETBIT(classbits, 0x0c); /* FF */
3009 SETBIT(classbits, 0x0d); /* CR */
3010 SETBIT(classbits, 0x85); /* NEL */
3011 #ifdef SUPPORT_UTF8
3012 if (utf8)
3013 {
3014 class_utf8 = TRUE;
3015 *class_utf8data++ = XCL_RANGE;
3016 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3017 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3018 }
3019 #endif
3020 continue;
3021 }
3022
3023 if (-c == ESC_V)
3024 {
3025 for (c = 0; c < 32; c++)
3026 {
3027 int x = 0xff;
3028 switch (c)
3029 {
3030 case 0x0a/8: x ^= 1 << (0x0a%8);
3031 x ^= 1 << (0x0b%8);
3032 x ^= 1 << (0x0c%8);
3033 x ^= 1 << (0x0d%8);
3034 break;
3035 case 0x85/8: x ^= 1 << (0x85%8); break;
3036 default: break;
3037 }
3038 classbits[c] |= x;
3039 }
3040
3041 #ifdef SUPPORT_UTF8
3042 if (utf8)
3043 {
3044 class_utf8 = TRUE;
3045 *class_utf8data++ = XCL_RANGE;
3046 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3047 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3048 *class_utf8data++ = XCL_RANGE;
3049 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3050 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3051 }
3052 #endif
3053 continue;
3054 }
3055
3056 /* We need to deal with \P and \p in both phases. */
3057
3058 #ifdef SUPPORT_UCP
3059 if (-c == ESC_p || -c == ESC_P)
3060 {
3061 BOOL negated;
3062 int pdata;
3063 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3064 if (ptype < 0) goto FAILED;
3065 class_utf8 = TRUE;
3066 *class_utf8data++ = ((-c == ESC_p) != negated)?
3067 XCL_PROP : XCL_NOTPROP;
3068 *class_utf8data++ = ptype;
3069 *class_utf8data++ = pdata;
3070 class_charcount -= 2; /* Not a < 256 character */
3071 continue;
3072 }
3073 #endif
3074 /* Unrecognized escapes are faulted if PCRE is running in its
3075 strict mode. By default, for compatibility with Perl, they are
3076 treated as literals. */
3077
3078 if ((options & PCRE_EXTRA) != 0)
3079 {
3080 *errorcodeptr = ERR7;
3081 goto FAILED;
3082 }
3083
3084 class_charcount -= 2; /* Undo the default count from above */
3085 c = *ptr; /* Get the final character and fall through */
3086 }
3087
3088 /* Fall through if we have a single character (c >= 0). This may be
3089 greater than 256 in UTF-8 mode. */
3090
3091 } /* End of backslash handling */
3092
3093 /* A single character may be followed by '-' to form a range. However,
3094 Perl does not permit ']' to be the end of the range. A '-' character
3095 at the end is treated as a literal. Perl ignores orphaned \E sequences
3096 entirely. The code for handling \Q and \E is messy. */
3097
3098 CHECK_RANGE:
3099 while (ptr[1] == '\\' && ptr[2] == 'E')
3100 {
3101 inescq = FALSE;
3102 ptr += 2;
3103 }
3104
3105 oldptr = ptr;
3106
3107 /* Remember \r or \n */
3108
3109 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3110
3111 /* Check for range */
3112
3113 if (!inescq && ptr[1] == '-')
3114 {
3115 int d;
3116 ptr += 2;
3117 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3118
3119 /* If we hit \Q (not followed by \E) at this point, go into escaped
3120 mode. */
3121
3122 while (*ptr == '\\' && ptr[1] == 'Q')
3123 {
3124 ptr += 2;
3125 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3126 inescq = TRUE;
3127 break;
3128 }
3129
3130 if (*ptr == 0 || (!inescq && *ptr == ']'))
3131 {
3132 ptr = oldptr;
3133 goto LONE_SINGLE_CHARACTER;
3134 }
3135
3136 #ifdef SUPPORT_UTF8
3137 if (utf8)
3138 { /* Braces are required because the */
3139 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3140 }
3141 else
3142 #endif
3143 d = *ptr; /* Not UTF-8 mode */
3144
3145 /* The second part of a range can be a single-character escape, but
3146 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3147 in such circumstances. */
3148
3149 if (!inescq && d == '\\')
3150 {
3151 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3152 if (*errorcodeptr != 0) goto FAILED;
3153
3154 /* \b is backspace; \X is literal X; \R is literal R; any other
3155 special means the '-' was literal */
3156
3157 if (d < 0)
3158 {
3159 if (d == -ESC_b) d = '\b';
3160 else if (d == -ESC_X) d = 'X';
3161 else if (d == -ESC_R) d = 'R'; else
3162 {
3163 ptr = oldptr;
3164 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3165 }
3166 }
3167 }
3168
3169 /* Check that the two values are in the correct order. Optimize
3170 one-character ranges */
3171
3172 if (d < c)
3173 {
3174 *errorcodeptr = ERR8;
3175 goto FAILED;
3176 }
3177
3178 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3179
3180 /* Remember \r or \n */
3181
3182 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3183
3184 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3185 matching, we have to use an XCLASS with extra data items. Caseless
3186 matching for characters > 127 is available only if UCP support is
3187 available. */
3188
3189 #ifdef SUPPORT_UTF8
3190 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3191 {
3192 class_utf8 = TRUE;
3193
3194 /* With UCP support, we can find the other case equivalents of
3195 the relevant characters. There may be several ranges. Optimize how
3196 they fit with the basic range. */
3197
3198 #ifdef SUPPORT_UCP
3199 if ((options & PCRE_CASELESS) != 0)
3200 {
3201 unsigned int occ, ocd;
3202 unsigned int cc = c;
3203 unsigned int origd = d;
3204 while (get_othercase_range(&cc, origd, &occ, &ocd))
3205 {
3206 if (occ >= (unsigned int)c &&
3207 ocd <= (unsigned int)d)
3208 continue; /* Skip embedded ranges */
3209
3210 if (occ < (unsigned int)c &&
3211 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3212 { /* if there is overlap, */
3213 c = occ; /* noting that if occ < c */
3214 continue; /* we can't have ocd > d */
3215 } /* because a subrange is */
3216 if (ocd > (unsigned int)d &&
3217 occ <= (unsigned int)d + 1) /* always shorter than */
3218 { /* the basic range. */
3219 d = ocd;
3220 continue;
3221 }
3222
3223 if (occ == ocd)
3224 {
3225 *class_utf8data++ = XCL_SINGLE;
3226 }
3227 else
3228 {
3229 *class_utf8data++ = XCL_RANGE;
3230 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3231 }
3232 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3233 }
3234 }
3235 #endif /* SUPPORT_UCP */
3236
3237 /* Now record the original range, possibly modified for UCP caseless
3238 overlapping ranges. */
3239
3240 *class_utf8data++ = XCL_RANGE;
3241 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3242 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3243
3244 /* With UCP support, we are done. Without UCP support, there is no
3245 caseless matching for UTF-8 characters > 127; we can use the bit map
3246 for the smaller ones. */
3247
3248 #ifdef SUPPORT_UCP
3249 continue; /* With next character in the class */
3250 #else
3251 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3252
3253 /* Adjust upper limit and fall through to set up the map */
3254
3255 d = 127;
3256
3257 #endif /* SUPPORT_UCP */
3258 }
3259 #endif /* SUPPORT_UTF8 */
3260
3261 /* We use the bit map for all cases when not in UTF-8 mode; else
3262 ranges that lie entirely within 0-127 when there is UCP support; else
3263 for partial ranges without UCP support. */
3264
3265 class_charcount += d - c + 1;
3266 class_lastchar = d;
3267
3268 /* We can save a bit of time by skipping this in the pre-compile. */
3269
3270 if (lengthptr == NULL) for (; c <= d; c++)
3271 {
3272 classbits[c/8] |= (1 << (c&7));
3273 if ((options & PCRE_CASELESS) != 0)
3274 {
3275 int uc = cd->fcc[c]; /* flip case */
3276 classbits[uc/8] |= (1 << (uc&7));
3277 }
3278 }
3279
3280 continue; /* Go get the next char in the class */
3281 }
3282
3283 /* Handle a lone single character - we can get here for a normal
3284 non-escape char, or after \ that introduces a single character or for an
3285 apparent range that isn't. */
3286
3287 LONE_SINGLE_CHARACTER:
3288
3289 /* Handle a character that cannot go in the bit map */
3290
3291 #ifdef SUPPORT_UTF8
3292 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3293 {
3294 class_utf8 = TRUE;
3295 *class_utf8data++ = XCL_SINGLE;
3296 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3297
3298 #ifdef SUPPORT_UCP
3299 if ((options & PCRE_CASELESS) != 0)
3300 {
3301 unsigned int othercase;
3302 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3303 {
3304 *class_utf8data++ = XCL_SINGLE;
3305 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3306 }
3307 }
3308 #endif /* SUPPORT_UCP */
3309
3310 }
3311 else
3312 #endif /* SUPPORT_UTF8 */
3313
3314 /* Handle a single-byte character */
3315 {
3316 classbits[c/8] |= (1 << (c&7));
3317 if ((options & PCRE_CASELESS) != 0)
3318 {
3319 c = cd->fcc[c]; /* flip case */
3320 classbits[c/8] |= (1 << (c&7));
3321 }
3322 class_charcount++;
3323 class_lastchar = c;
3324 }
3325 }
3326
3327 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3328
3329 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3330
3331 if (c == 0) /* Missing terminating ']' */
3332 {
3333 *errorcodeptr = ERR6;
3334 goto FAILED;
3335 }
3336
3337
3338 /* This code has been disabled because it would mean that \s counts as
3339 an explicit \r or \n reference, and that's not really what is wanted. Now
3340 we set the flag only if there is a literal "\r" or "\n" in the class. */
3341
3342 #if 0
3343 /* Remember whether \r or \n are in this class */
3344
3345 if (negate_class)
3346 {
3347 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3348 }
3349 else
3350 {
3351 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3352 }
3353 #endif
3354
3355
3356 /* If class_charcount is 1, we saw precisely one character whose value is
3357 less than 256. As long as there were no characters >= 128 and there was no
3358 use of \p or \P, in other words, no use of any XCLASS features, we can
3359 optimize.
3360
3361 In UTF-8 mode, we can optimize the negative case only if there were no
3362 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3363 operate on single-bytes only. This is an historical hangover. Maybe one day
3364 we can tidy these opcodes to handle multi-byte characters.
3365
3366 The optimization throws away the bit map. We turn the item into a
3367 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3368 that OP_NOT does not support multibyte characters. In the positive case, it
3369 can cause firstbyte to be set. Otherwise, there can be no first char if
3370 this item is first, whatever repeat count may follow. In the case of
3371 reqbyte, save the previous value for reinstating. */
3372
3373 #ifdef SUPPORT_UTF8
3374 if (class_charcount == 1 && !class_utf8 &&
3375 (!utf8 || !negate_class || class_lastchar < 128))
3376 #else
3377 if (class_charcount == 1)
3378 #endif
3379 {
3380 zeroreqbyte = reqbyte;
3381
3382 /* The OP_NOT opcode works on one-byte characters only. */
3383
3384 if (negate_class)
3385 {
3386 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3387 zerofirstbyte = firstbyte;
3388 *code++ = OP_NOT;
3389 *code++ = class_lastchar;
3390 break;
3391 }
3392
3393 /* For a single, positive character, get the value into mcbuffer, and
3394 then we can handle this with the normal one-character code. */
3395
3396 #ifdef SUPPORT_UTF8
3397 if (utf8 && class_lastchar > 127)
3398 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3399 else
3400 #endif
3401 {
3402 mcbuffer[0] = class_lastchar;
3403 mclength = 1;
3404 }
3405 goto ONE_CHAR;
3406 } /* End of 1-char optimization */
3407
3408 /* The general case - not the one-char optimization. If this is the first
3409 thing in the branch, there can be no first char setting, whatever the
3410 repeat count. Any reqbyte setting must remain unchanged after any kind of
3411 repeat. */
3412
3413 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3414 zerofirstbyte = firstbyte;
3415 zeroreqbyte = reqbyte;
3416
3417 /* If there are characters with values > 255, we have to compile an
3418 extended class, with its own opcode, unless there was a negated special
3419 such as \S in the class, because in that case all characters > 255 are in
3420 the class, so any that were explicitly given as well can be ignored. If
3421 (when there are explicit characters > 255 that must be listed) there are no
3422 characters < 256, we can omit the bitmap in the actual compiled code. */
3423
3424 #ifdef SUPPORT_UTF8
3425 if (class_utf8 && !should_flip_negation)
3426 {
3427 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3428 *code++ = OP_XCLASS;
3429 code += LINK_SIZE;
3430 *code = negate_class? XCL_NOT : 0;
3431
3432 /* If the map is required, move up the extra data to make room for it;
3433 otherwise just move the code pointer to the end of the extra data. */
3434
3435 if (class_charcount > 0)
3436 {
3437 *code++ |= XCL_MAP;
3438 memmove(code + 32, code, class_utf8data - code);
3439 memcpy(code, classbits, 32);
3440 code = class_utf8data + 32;
3441 }
3442 else code = class_utf8data;
3443
3444 /* Now fill in the complete length of the item */
3445
3446 PUT(previous, 1, code - previous);
3447 break; /* End of class handling */
3448 }
3449 #endif
3450
3451 /* If there are no characters > 255, set the opcode to OP_CLASS or
3452 OP_NCLASS, depending on whether the whole class was negated and whether
3453 there were negative specials such as \S in the class. Then copy the 32-byte
3454 map into the code vector, negating it if necessary. */
3455
3456 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3457 if (negate_class)
3458 {
3459 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3460 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3461 }
3462 else
3463 {
3464 memcpy(code, classbits, 32);
3465 }
3466 code += 32;
3467 break;
3468
3469
3470 /* ===================================================================*/
3471 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3472 has been tested above. */
3473
3474 case '{':
3475 if (!is_quantifier) goto NORMAL_CHAR;
3476 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3477 if (*errorcodeptr != 0) goto FAILED;
3478 goto REPEAT;
3479
3480 case '*':
3481 repeat_min = 0;
3482 repeat_max = -1;
3483 goto REPEAT;
3484
3485 case '+':
3486 repeat_min = 1;
3487 repeat_max = -1;
3488 goto REPEAT;
3489
3490 case '?':
3491 repeat_min = 0;
3492 repeat_max = 1;
3493
3494 REPEAT:
3495 if (previous == NULL)
3496 {
3497 *errorcodeptr = ERR9;
3498 goto FAILED;
3499 }
3500
3501 if (repeat_min == 0)
3502 {
3503 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3504 reqbyte = zeroreqbyte; /* Ditto */
3505 }
3506
3507 /* Remember whether this is a variable length repeat */
3508
3509 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3510
3511 op_type = 0; /* Default single-char op codes */
3512 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3513
3514 /* Save start of previous item, in case we have to move it up to make space
3515 for an inserted OP_ONCE for the additional '+' extension. */
3516
3517 tempcode = previous;
3518
3519 /* If the next character is '+', we have a possessive quantifier. This
3520 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3521 If the next character is '?' this is a minimizing repeat, by default,
3522 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3523 repeat type to the non-default. */
3524
3525 if (ptr[1] == '+')
3526 {
3527 repeat_type = 0; /* Force greedy */
3528 possessive_quantifier = TRUE;
3529 ptr++;
3530 }
3531 else if (ptr[1] == '?')
3532 {
3533 repeat_type = greedy_non_default;
3534 ptr++;
3535 }
3536 else repeat_type = greedy_default;
3537
3538 /* If previous was a character match, abolish the item and generate a
3539 repeat item instead. If a char item has a minimum of more than one, ensure
3540 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3541 the first thing in a branch because the x will have gone into firstbyte
3542 instead. */
3543
3544 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3545 {
3546 /* Deal with UTF-8 characters that take up more than one byte. It's
3547 easier to write this out separately than try to macrify it. Use c to
3548 hold the length of the character in bytes, plus 0x80 to flag that it's a
3549 length rather than a small character. */
3550
3551 #ifdef SUPPORT_UTF8
3552 if (utf8 && (code[-1] & 0x80) != 0)
3553 {
3554 uschar *lastchar = code - 1;
3555 while((*lastchar & 0xc0) == 0x80) lastchar--;
3556 c = code - lastchar; /* Length of UTF-8 character */
3557 memcpy(utf8_char, lastchar, c); /* Save the char */
3558 c |= 0x80; /* Flag c as a length */
3559 }
3560 else
3561 #endif
3562
3563 /* Handle the case of a single byte - either with no UTF8 support, or
3564 with UTF-8 disabled, or for a UTF-8 character < 128. */
3565
3566 {
3567 c = code[-1];
3568 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3569 }
3570
3571 /* If the repetition is unlimited, it pays to see if the next thing on
3572 the line is something that cannot possibly match this character. If so,
3573 automatically possessifying this item gains some performance in the case
3574 where the match fails. */
3575
3576 if (!possessive_quantifier &&
3577 repeat_max < 0 &&
3578 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3579 options, cd))
3580 {
3581 repeat_type = 0; /* Force greedy */
3582 possessive_quantifier = TRUE;
3583 }
3584
3585 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3586 }
3587
3588 /* If previous was a single negated character ([^a] or similar), we use
3589 one of the special opcodes, replacing it. The code is shared with single-
3590 character repeats by setting op_type to add a suitable offset into
3591 repeat_type. We can also test for auto-possessification. OP_NOT is
3592 currently used only for single-byte chars. */
3593
3594 else if (*previous == OP_NOT)
3595 {
3596 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3597 c = previous[1];
3598 if (!possessive_quantifier &&
3599 repeat_max < 0 &&
3600 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3601 {
3602 repeat_type = 0; /* Force greedy */
3603 possessive_quantifier = TRUE;
3604 }
3605 goto OUTPUT_SINGLE_REPEAT;
3606 }
3607
3608 /* If previous was a character type match (\d or similar), abolish it and
3609 create a suitable repeat item. The code is shared with single-character
3610 repeats by setting op_type to add a suitable offset into repeat_type. Note
3611 that the Unicode property types will be present only when SUPPORT_UCP is
3612 defined, but we don't wrap the little bits of code here because it just
3613 makes it horribly messy. */
3614
3615 else if (*previous < OP_EODN)
3616 {
3617 uschar *oldcode;
3618 int prop_type, prop_value;
3619 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3620 c = *previous;
3621
3622 if (!possessive_quantifier &&
3623 repeat_max < 0 &&
3624 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3625 {
3626 repeat_type = 0; /* Force greedy */
3627 possessive_quantifier = TRUE;
3628 }
3629
3630 OUTPUT_SINGLE_REPEAT:
3631 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3632 {
3633 prop_type = previous[1];
3634 prop_value = previous[2];
3635 }
3636 else prop_type = prop_value = -1;
3637
3638 oldcode = code;
3639 code = previous; /* Usually overwrite previous item */
3640
3641 /* If the maximum is zero then the minimum must also be zero; Perl allows
3642 this case, so we do too - by simply omitting the item altogether. */
3643
3644 if (repeat_max == 0) goto END_REPEAT;
3645
3646 /* All real repeats make it impossible to handle partial matching (maybe
3647 one day we will be able to remove this restriction). */
3648
3649 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3650
3651 /* Combine the op_type with the repeat_type */
3652
3653 repeat_type += op_type;
3654
3655 /* A minimum of zero is handled either as the special case * or ?, or as
3656 an UPTO, with the maximum given. */
3657
3658 if (repeat_min == 0)
3659 {
3660 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3661 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3662 else
3663 {
3664 *code++ = OP_UPTO + repeat_type;
3665 PUT2INC(code, 0, repeat_max);
3666 }
3667 }
3668
3669 /* A repeat minimum of 1 is optimized into some special cases. If the
3670 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3671 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3672 one less than the maximum. */
3673
3674 else if (repeat_min == 1)
3675 {
3676 if (repeat_max == -1)
3677 *code++ = OP_PLUS + repeat_type;
3678 else
3679 {
3680 code = oldcode; /* leave previous item in place */
3681 if (repeat_max == 1) goto END_REPEAT;
3682 *code++ = OP_UPTO + repeat_type;
3683 PUT2INC(code, 0, repeat_max - 1);
3684 }
3685 }
3686
3687 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3688 handled as an EXACT followed by an UPTO. */
3689
3690 else
3691 {
3692 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3693 PUT2INC(code, 0, repeat_min);
3694
3695 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3696 we have to insert the character for the previous code. For a repeated
3697 Unicode property match, there are two extra bytes that define the
3698 required property. In UTF-8 mode, long characters have their length in
3699 c, with the 0x80 bit as a flag. */
3700
3701 if (repeat_max < 0)
3702 {
3703 #ifdef SUPPORT_UTF8
3704 if (utf8 && c >= 128)
3705 {
3706 memcpy(code, utf8_char, c & 7);
3707 code += c & 7;
3708 }
3709 else
3710 #endif
3711 {
3712 *code++ = c;
3713 if (prop_type >= 0)
3714 {
3715 *code++ = prop_type;
3716 *code++ = prop_value;
3717 }
3718 }
3719 *code++ = OP_STAR + repeat_type;
3720 }
3721
3722 /* Else insert an UPTO if the max is greater than the min, again
3723 preceded by the character, for the previously inserted code. If the
3724 UPTO is just for 1 instance, we can use QUERY instead. */
3725
3726 else if (repeat_max != repeat_min)
3727 {
3728 #ifdef SUPPORT_UTF8
3729 if (utf8 && c >= 128)
3730 {
3731 memcpy(code, utf8_char, c & 7);
3732 code += c & 7;
3733 }
3734 else
3735 #endif
3736 *code++ = c;
3737 if (prop_type >= 0)
3738 {
3739 *code++ = prop_type;
3740 *code++ = prop_value;
3741 }
3742 repeat_max -= repeat_min;
3743
3744 if (repeat_max == 1)
3745 {
3746 *code++ = OP_QUERY + repeat_type;
3747 }
3748 else
3749 {
3750 *code++ = OP_UPTO + repeat_type;
3751 PUT2INC(code, 0, repeat_max);
3752 }
3753 }
3754 }
3755
3756 /* The character or character type itself comes last in all cases. */
3757
3758 #ifdef SUPPORT_UTF8
3759 if (utf8 && c >= 128)
3760 {
3761 memcpy(code, utf8_char, c & 7);
3762 code += c & 7;
3763 }
3764 else
3765 #endif
3766 *code++ = c;
3767
3768 /* For a repeated Unicode property match, there are two extra bytes that
3769 define the required property. */
3770
3771 #ifdef SUPPORT_UCP
3772 if (prop_type >= 0)
3773 {
3774 *code++ = prop_type;
3775 *code++ = prop_value;
3776 }
3777 #endif
3778 }
3779
3780 /* If previous was a character class or a back reference, we put the repeat
3781 stuff after it, but just skip the item if the repeat was {0,0}. */
3782
3783 else if (*previous == OP_CLASS ||
3784 *previous == OP_NCLASS ||
3785 #ifdef SUPPORT_UTF8
3786 *previous == OP_XCLASS ||
3787 #endif
3788 *previous == OP_REF)
3789 {
3790 if (repeat_max == 0)
3791 {
3792 code = previous;
3793 goto END_REPEAT;
3794 }
3795
3796 /* All real repeats make it impossible to handle partial matching (maybe
3797 one day we will be able to remove this restriction). */
3798
3799 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3800
3801 if (repeat_min == 0 && repeat_max == -1)
3802 *code++ = OP_CRSTAR + repeat_type;
3803 else if (repeat_min == 1 && repeat_max == -1)
3804 *code++ = OP_CRPLUS + repeat_type;
3805 else if (repeat_min == 0 && repeat_max == 1)
3806 *code++ = OP_CRQUERY + repeat_type;
3807 else
3808 {
3809 *code++ = OP_CRRANGE + repeat_type;
3810 PUT2INC(code, 0, repeat_min);
3811 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3812 PUT2INC(code, 0, repeat_max);
3813 }
3814 }
3815
3816 /* If previous was a bracket group, we may have to replicate it in certain
3817 cases. */
3818
3819 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3820 *previous == OP_ONCE || *previous == OP_COND)
3821 {
3822 register int i;
3823 int ketoffset = 0;
3824 int len = code - previous;
3825 uschar *bralink = NULL;
3826
3827 /* Repeating a DEFINE group is pointless */
3828
3829 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3830 {
3831 *errorcodeptr = ERR55;
3832 goto FAILED;
3833 }
3834
3835 /* If the maximum repeat count is unlimited, find the end of the bracket
3836 by scanning through from the start, and compute the offset back to it
3837 from the current code pointer. There may be an OP_OPT setting following
3838 the final KET, so we can't find the end just by going back from the code
3839 pointer. */
3840
3841 if (repeat_max == -1)
3842 {
3843 register uschar *ket = previous;
3844 do ket += GET(ket, 1); while (*ket != OP_KET);
3845 ketoffset = code - ket;
3846 }
3847
3848 /* The case of a zero minimum is special because of the need to stick
3849 OP_BRAZERO in front of it, and because the group appears once in the
3850 data, whereas in other cases it appears the minimum number of times. For
3851 this reason, it is simplest to treat this case separately, as otherwise
3852 the code gets far too messy. There are several special subcases when the
3853 minimum is zero. */
3854
3855 if (repeat_min == 0)
3856 {
3857 /* If the maximum is also zero, we used to just omit the group from the
3858 output altogether, like this:
3859
3860 ** if (repeat_max == 0)
3861 ** {
3862 ** code = previous;
3863 ** goto END_REPEAT;
3864 ** }
3865
3866 However, that fails when a group is referenced as a subroutine from
3867 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3868 so that it is skipped on execution. As we don't have a list of which
3869 groups are referenced, we cannot do this selectively.
3870
3871 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3872 and do no more at this point. However, we do need to adjust any
3873 OP_RECURSE calls inside the group that refer to the group itself or any
3874 internal or forward referenced group, because the offset is from the
3875 start of the whole regex. Temporarily terminate the pattern while doing
3876 this. */
3877
3878 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
3879 {
3880 *code = OP_END;
3881 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3882 memmove(previous+1, previous, len);
3883 code++;
3884 if (repeat_max == 0)
3885 {
3886 *previous++ = OP_SKIPZERO;
3887 goto END_REPEAT;
3888 }
3889 *previous++ = OP_BRAZERO + repeat_type;
3890 }
3891
3892 /* If the maximum is greater than 1 and limited, we have to replicate
3893 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3894 The first one has to be handled carefully because it's the original
3895 copy, which has to be moved up. The remainder can be handled by code
3896 that is common with the non-zero minimum case below. We have to
3897 adjust the value of repeat_max, since one less copy is required. Once
3898 again, we may have to adjust any OP_RECURSE calls inside the group. */
3899
3900 else
3901 {
3902 int offset;
3903 *code = OP_END;
3904 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3905 memmove(previous + 2 + LINK_SIZE, previous, len);
3906 code += 2 + LINK_SIZE;
3907 *previous++ = OP_BRAZERO + repeat_type;
3908 *previous++ = OP_BRA;
3909
3910 /* We chain together the bracket offset fields that have to be
3911 filled in later when the ends of the brackets are reached. */
3912
3913 offset = (bralink == NULL)? 0 : previous - bralink;
3914 bralink = previous;
3915 PUTINC(previous, 0, offset);
3916 }
3917
3918 repeat_max--;
3919 }
3920
3921 /* If the minimum is greater than zero, replicate the group as many
3922 times as necessary, and adjust the maximum to the number of subsequent
3923 copies that we need. If we set a first char from the group, and didn't
3924 set a required char, copy the latter from the former. If there are any
3925 forward reference subroutine calls in the group, there will be entries on
3926 the workspace list; replicate these with an appropriate increment. */
3927
3928 else
3929 {
3930 if (repeat_min > 1)
3931 {
3932 /* In the pre-compile phase, we don't actually do the replication. We
3933 just adjust the length as if we had. Do some paranoid checks for
3934 potential integer overflow. */
3935
3936 if (lengthptr != NULL)
3937 {
3938 int delta = (repeat_min - 1)*length_prevgroup;
3939 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3940 (double)INT_MAX ||
3941 OFLOW_MAX - *lengthptr < delta)
3942 {
3943 *errorcodeptr = ERR20;
3944 goto FAILED;
3945 }
3946 *lengthptr += delta;
3947 }
3948
3949 /* This is compiling for real */
3950
3951 else
3952 {
3953 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3954 for (i = 1; i < repeat_min; i++)
3955 {
3956 uschar *hc;
3957 uschar *this_hwm = cd->hwm;
3958 memcpy(code, previous, len);
3959 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3960 {
3961 PUT(cd->hwm, 0, GET(hc, 0) + len);
3962 cd->hwm += LINK_SIZE;
3963 }
3964 save_hwm = this_hwm;
3965 code += len;
3966 }
3967 }
3968 }
3969
3970 if (repeat_max > 0) repeat_max -= repeat_min;
3971 }
3972
3973 /* This code is common to both the zero and non-zero minimum cases. If
3974 the maximum is limited, it replicates the group in a nested fashion,
3975 remembering the bracket starts on a stack. In the case of a zero minimum,
3976 the first one was set up above. In all cases the repeat_max now specifies
3977 the number of additional copies needed. Again, we must remember to
3978 replicate entries on the forward reference list. */
3979
3980 if (repeat_max >= 0)
3981 {
3982 /* In the pre-compile phase, we don't actually do the replication. We
3983 just adjust the length as if we had. For each repetition we must add 1
3984 to the length for BRAZERO and for all but the last repetition we must
3985 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3986 paranoid checks to avoid integer overflow. */
3987
3988 if (lengthptr != NULL && repeat_max > 0)
3989 {
3990 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3991 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3992 if ((double)repeat_max *
3993 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3994 > (double)INT_MAX ||
3995 OFLOW_MAX - *lengthptr < delta)
3996 {
3997 *errorcodeptr = ERR20;
3998 goto FAILED;
3999 }
4000 *lengthptr += delta;
4001 }
4002
4003 /* This is compiling for real */
4004
4005 else for (i = repeat_max - 1; i >= 0; i--)
4006 {
4007 uschar *hc;
4008 uschar *this_hwm = cd->hwm;
4009
4010 *code++ = OP_BRAZERO + repeat_type;
4011
4012 /* All but the final copy start a new nesting, maintaining the
4013 chain of brackets outstanding. */
4014
4015 if (i != 0)
4016 {
4017 int offset;
4018 *code++ = OP_BRA;
4019 offset = (bralink == NULL)? 0 : code - bralink;
4020 bralink = code;
4021 PUTINC(code, 0, offset);
4022 }
4023
4024 memcpy(code, previous, len);
4025 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4026 {
4027 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4028 cd->hwm += LINK_SIZE;
4029 }
4030 save_hwm = this_hwm;
4031 code += len;
4032 }
4033
4034 /* Now chain through the pending brackets, and fill in their length
4035 fields (which are holding the chain links pro tem). */
4036
4037 while (bralink != NULL)
4038 {
4039 int oldlinkoffset;
4040 int offset = code - bralink + 1;
4041 uschar *bra = code - offset;
4042 oldlinkoffset = GET(bra, 1);
4043 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4044 *code++ = OP_KET;
4045 PUTINC(code, 0, offset);
4046 PUT(bra, 1, offset);
4047 }
4048 }
4049
4050 /* If the maximum is unlimited, set a repeater in the final copy. We
4051 can't just offset backwards from the current code point, because we
4052 don't know if there's been an options resetting after the ket. The
4053 correct offset was computed above.
4054
4055 Then, when we are doing the actual compile phase, check to see whether
4056 this group is a non-atomic one that could match an empty string. If so,
4057 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4058 that runtime checking can be done. [This check is also applied to
4059 atomic groups at runtime, but in a different way.] */
4060
4061 else
4062 {
4063 uschar *ketcode = code - ketoffset;
4064 uschar *bracode = ketcode - GET(ketcode, 1);
4065 *ketcode = OP_KETRMAX + repeat_type;
4066 if (lengthptr == NULL && *bracode != OP_ONCE)
4067 {
4068 uschar *scode = bracode;
4069 do
4070 {
4071 if (could_be_empty_branch(scode, ketcode, utf8))
4072 {
4073 *bracode += OP_SBRA - OP_BRA;
4074 break;
4075 }
4076 scode += GET(scode, 1);
4077 }
4078 while (*scode == OP_ALT);
4079 }
4080 }
4081 }
4082
4083 /* Else there's some kind of shambles */
4084
4085 else
4086 {
4087 *errorcodeptr = ERR11;
4088 goto FAILED;
4089 }
4090
4091 /* If the character following a repeat is '+', or if certain optimization
4092 tests above succeeded, possessive_quantifier is TRUE. For some of the
4093 simpler opcodes, there is a special alternative opcode for this. For
4094 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4095 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4096 but the special opcodes can optimize it a bit. The repeated item starts at
4097 tempcode, not at previous, which might be the first part of a string whose
4098 (former) last char we repeated.
4099
4100 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4101 an 'upto' may follow. We skip over an 'exact' item, and then test the
4102 length of what remains before proceeding. */
4103
4104 if (possessive_quantifier)
4105 {
4106 int len;
4107 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4108 *tempcode == OP_NOTEXACT)
4109 tempcode += _pcre_OP_lengths[*tempcode] +
4110 ((*tempcode == OP_TYPEEXACT &&
4111 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4112 len = code - tempcode;
4113 if (len > 0) switch (*tempcode)
4114 {
4115 case OP_STAR: *tempcode = OP_POSSTAR; break;
4116 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4117 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4118 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4119
4120 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4121 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4122 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4123 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4124
4125 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4126 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4127 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4128 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4129
4130 default:
4131 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4132 code += 1 + LINK_SIZE;
4133 len += 1 + LINK_SIZE;
4134 tempcode[0] = OP_ONCE;
4135 *code++ = OP_KET;
4136 PUTINC(code, 0, len);
4137 PUT(tempcode, 1, len);
4138 break;
4139 }
4140 }
4141
4142 /* In all cases we no longer have a previous item. We also set the
4143 "follows varying string" flag for subsequently encountered reqbytes if
4144 it isn't already set and we have just passed a varying length item. */
4145
4146 END_REPEAT:
4147 previous = NULL;
4148 cd->req_varyopt |= reqvary;
4149 break;
4150
4151
4152 /* ===================================================================*/
4153 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4154 lookbehind or option setting or condition or all the other extended
4155 parenthesis forms. */
4156
4157 case '(':
4158 newoptions = options;
4159 skipbytes = 0;
4160 bravalue = OP_CBRA;
4161 save_hwm = cd->hwm;
4162 reset_bracount = FALSE;
4163
4164 /* First deal with various "verbs" that can be introduced by '*'. */
4165
4166 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4167 {
4168 int i, namelen;
4169 const char *vn = verbnames;
4170 const uschar *name = ++ptr;
4171 previous = NULL;
4172 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4173 if (*ptr == ':')
4174 {
4175 *errorcodeptr = ERR59; /* Not supported */
4176 goto FAILED;
4177 }
4178 if (*ptr != ')')
4179 {
4180 *errorcodeptr = ERR60;
4181 goto FAILED;
4182 }
4183 namelen = ptr - name;
4184 for (i = 0; i < verbcount; i++)
4185 {
4186 if (namelen == verbs[i].len &&
4187 strncmp((char *)name, vn, namelen) == 0)
4188 {
4189 *code = verbs[i].op;
4190 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4191 break;
4192 }
4193 vn += verbs[i].len + 1;
4194 }
4195 if (i < verbcount) continue;
4196 *errorcodeptr = ERR60;
4197 goto FAILED;
4198 }
4199
4200 /* Deal with the extended parentheses; all are introduced by '?', and the
4201 appearance of any of them means that this is not a capturing group. */
4202
4203 else if (*ptr == '?')
4204 {
4205 int i, set, unset, namelen;
4206 int *optset;
4207 const uschar *name;
4208 uschar *slot;
4209
4210 switch (*(++ptr))
4211 {
4212 case '#': /* Comment; skip to ket */
4213 ptr++;
4214 while (*ptr != 0 && *ptr != ')') ptr++;
4215 if (*ptr == 0)
4216 {
4217 *errorcodeptr = ERR18;
4218 goto FAILED;
4219 }
4220 continue;
4221
4222
4223 /* ------------------------------------------------------------ */
4224 case '|': /* Reset capture count for each branch */
4225 reset_bracount = TRUE;
4226 /* Fall through */
4227
4228 /* ------------------------------------------------------------ */
4229 case ':': /* Non-capturing bracket */
4230 bravalue = OP_BRA;
4231 ptr++;
4232 break;
4233
4234
4235 /* ------------------------------------------------------------ */
4236 case '(':
4237 bravalue = OP_COND; /* Conditional group */
4238
4239 /* A condition can be an assertion, a number (referring to a numbered
4240 group), a name (referring to a named group), or 'R', referring to
4241 recursion. R<digits> and R&name are also permitted for recursion tests.
4242
4243 There are several syntaxes for testing a named group: (?(name)) is used
4244 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4245
4246 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4247 be the recursive thing or the name 'R' (and similarly for 'R' followed
4248 by digits), and (b) a number could be a name that consists of digits.
4249 In both cases, we look for a name first; if not found, we try the other
4250 cases. */
4251
4252 /* For conditions that are assertions, check the syntax, and then exit
4253 the switch. This will take control down to where bracketed groups,
4254 including assertions, are processed. */
4255
4256 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4257 break;
4258
4259 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4260 below), and all need to skip 3 bytes at the start of the group. */
4261
4262 code[1+LINK_SIZE] = OP_CREF;
4263 skipbytes = 3;
4264 refsign = -1;
4265
4266 /* Check for a test for recursion in a named group. */
4267
4268 if (ptr[1] == 'R' && ptr[2] == '&')
4269 {
4270 terminator = -1;
4271 ptr += 2;
4272 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4273 }
4274
4275 /* Check for a test for a named group's having been set, using the Perl
4276 syntax (?(<name>) or (?('name') */
4277
4278 else if (ptr[1] == '<')
4279 {
4280 terminator = '>';
4281 ptr++;
4282 }
4283 else if (ptr[1] == '\'')
4284 {
4285 terminator = '\'';
4286 ptr++;
4287 }
4288 else
4289 {
4290 terminator = 0;
4291 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4292 }
4293
4294 /* We now expect to read a name; anything else is an error */
4295
4296 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4297 {
4298 ptr += 1; /* To get the right offset */
4299 *errorcodeptr = ERR28;
4300 goto FAILED;
4301 }
4302
4303 /* Read the name, but also get it as a number if it's all digits */
4304
4305 recno = 0;
4306 name = ++ptr;
4307 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4308 {
4309 if (recno >= 0)
4310 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4311 recno * 10 + *ptr - '0' : -1;
4312 ptr++;
4313 }
4314 namelen = ptr - name;
4315
4316 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4317 {
4318 ptr--; /* Error offset */
4319 *errorcodeptr = ERR26;
4320 goto FAILED;
4321 }
4322
4323 /* Do no further checking in the pre-compile phase. */
4324
4325 if (lengthptr != NULL) break;
4326
4327 /* In the real compile we do the work of looking for the actual
4328 reference. If the string started with "+" or "-" we require the rest to
4329 be digits, in which case recno will be set. */
4330
4331 if (refsign > 0)
4332 {
4333 if (recno <= 0)
4334 {
4335 *errorcodeptr = ERR58;
4336 goto FAILED;
4337 }
4338 recno = (refsign == '-')?
4339 cd->bracount - recno + 1 : recno +cd->bracount;
4340 if (recno <= 0 || recno > cd->final_bracount)
4341 {
4342 *errorcodeptr = ERR15;
4343 goto FAILED;
4344 }
4345 PUT2(code, 2+LINK_SIZE, recno);
4346 break;
4347 }
4348
4349 /* Otherwise (did not start with "+" or "-"), start by looking for the
4350 name. */
4351
4352 slot = cd->name_table;
4353 for (i = 0; i < cd->names_found; i++)
4354 {
4355 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4356 slot += cd->name_entry_size;
4357 }
4358
4359 /* Found a previous named subpattern */
4360
4361 if (i < cd->names_found)
4362 {
4363 recno = GET2(slot, 0);
4364 PUT2(code, 2+LINK_SIZE, recno);
4365 }
4366
4367 /* Search the pattern for a forward reference */
4368
4369 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4370 (options & PCRE_EXTENDED) != 0)) > 0)
4371 {
4372 PUT2(code, 2+LINK_SIZE, i);
4373 }
4374
4375 /* If terminator == 0 it means that the name followed directly after
4376 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4377 some further alternatives to try. For the cases where terminator != 0
4378 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4379 now checked all the possibilities, so give an error. */
4380
4381 else if (terminator != 0)
4382 {
4383 *errorcodeptr = ERR15;
4384 goto FAILED;
4385 }
4386
4387 /* Check for (?(R) for recursion. Allow digits after R to specify a
4388 specific group number. */
4389
4390 else if (*name == 'R')
4391 {
4392 recno = 0;
4393 for (i = 1; i < namelen; i++)
4394 {
4395 if ((digitab[name[i]] & ctype_digit) == 0)
4396 {
4397 *errorcodeptr = ERR15;
4398 goto FAILED;
4399 }
4400 recno = recno * 10 + name[i] - '0';
4401 }
4402 if (recno == 0) recno = RREF_ANY;
4403 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4404 PUT2(code, 2+LINK_SIZE, recno);
4405 }
4406
4407 /* Similarly, check for the (?(DEFINE) "condition", which is always
4408 false. */
4409
4410 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4411 {
4412 code[1+LINK_SIZE] = OP_DEF;
4413 skipbytes = 1;
4414 }
4415
4416 /* Check for the "name" actually being a subpattern number. We are
4417 in the second pass here, so final_bracount is set. */
4418
4419 else if (recno > 0 && recno <= cd->final_bracount)
4420 {
4421 PUT2(code, 2+LINK_SIZE, recno);
4422 }
4423
4424 /* Either an unidentified subpattern, or a reference to (?(0) */
4425
4426 else
4427 {
4428 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4429 goto FAILED;
4430 }
4431 break;
4432
4433
4434 /* ------------------------------------------------------------ */
4435 case '=': /* Positive lookahead */
4436 bravalue = OP_ASSERT;
4437 ptr++;
4438 break;
4439
4440
4441 /* ------------------------------------------------------------ */
4442 case '!': /* Negative lookahead */
4443 ptr++;
4444 if (*ptr == ')') /* Optimize (?!) */
4445 {
4446 *code++ = OP_FAIL;
4447 previous = NULL;
4448 continue;
4449 }
4450 bravalue = OP_ASSERT_NOT;
4451 break;
4452
4453
4454 /* ------------------------------------------------------------ */
4455 case '<': /* Lookbehind or named define */
4456 switch (ptr[1])
4457 {
4458 case '=': /* Positive lookbehind */
4459 bravalue = OP_ASSERTBACK;
4460 ptr += 2;
4461 break;
4462
4463 case '!': /* Negative lookbehind */
4464 bravalue = OP_ASSERTBACK_NOT;
4465 ptr += 2;
4466 break;
4467
4468 default: /* Could be name define, else bad */
4469 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4470 ptr++; /* Correct offset for error */
4471 *errorcodeptr = ERR24;
4472 goto FAILED;
4473 }
4474 break;
4475
4476
4477 /* ------------------------------------------------------------ */
4478 case '>': /* One-time brackets */
4479 bravalue = OP_ONCE;
4480 ptr++;
4481 break;
4482
4483
4484 /* ------------------------------------------------------------ */
4485 case 'C': /* Callout - may be followed by digits; */
4486 previous_callout = code; /* Save for later completion */
4487 after_manual_callout = 1; /* Skip one item before completing */
4488 *code++ = OP_CALLOUT;
4489 {
4490 int n = 0;
4491 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4492 n = n * 10 + *ptr - '0';
4493 if (*ptr != ')')
4494 {
4495 *errorcodeptr = ERR39;
4496 goto FAILED;
4497 }
4498 if (n > 255)
4499 {
4500 *errorcodeptr = ERR38;
4501 goto FAILED;
4502 }
4503 *code++ = n;
4504 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4505 PUT(code, LINK_SIZE, 0); /* Default length */
4506 code += 2 * LINK_SIZE;
4507 }
4508 previous = NULL;
4509 continue;
4510
4511
4512 /* ------------------------------------------------------------ */
4513 case 'P': /* Python-style named subpattern handling */
4514 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4515 {
4516 is_recurse = *ptr == '>';
4517 terminator = ')';
4518 goto NAMED_REF_OR_RECURSE;
4519 }
4520 else if (*ptr != '<') /* Test for Python-style definition */
4521 {
4522 *errorcodeptr = ERR41;
4523 goto FAILED;
4524 }
4525 /* Fall through to handle (?P< as (?< is handled */
4526
4527
4528 /* ------------------------------------------------------------ */
4529 DEFINE_NAME: /* Come here from (?< handling */
4530 case '\'':
4531 {
4532 terminator = (*ptr == '<')? '>' : '\'';
4533 name = ++ptr;
4534
4535 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4536 namelen = ptr - name;
4537
4538 /* In the pre-compile phase, just do a syntax check. */
4539
4540 if (lengthptr != NULL)
4541 {
4542 if (*ptr != terminator)
4543 {
4544 *errorcodeptr = ERR42;
4545 goto FAILED;
4546 }
4547 if (cd->names_found >= MAX_NAME_COUNT)
4548 {
4549 *errorcodeptr = ERR49;
4550 goto FAILED;
4551 }
4552 if (namelen + 3 > cd->name_entry_size)
4553 {
4554 cd->name_entry_size = namelen + 3;
4555 if (namelen > MAX_NAME_SIZE)
4556 {
4557 *errorcodeptr = ERR48;
4558 goto FAILED;
4559 }
4560 }
4561 }
4562
4563 /* In the real compile, create the entry in the table */
4564
4565 else
4566 {
4567 slot = cd->name_table;
4568 for (i = 0; i < cd->names_found; i++)
4569 {
4570 int crc = memcmp(name, slot+2, namelen);
4571 if (crc == 0)
4572 {
4573 if (slot[2+namelen] == 0)
4574 {
4575 if ((options & PCRE_DUPNAMES) == 0)
4576 {
4577 *errorcodeptr = ERR43;
4578 goto FAILED;
4579 }
4580 }
4581 else crc = -1; /* Current name is substring */
4582 }
4583 if (crc < 0)
4584 {
4585 memmove(slot + cd->name_entry_size, slot,
4586 (cd->names_found - i) * cd->name_entry_size);
4587 break;
4588 }
4589 slot += cd->name_entry_size;
4590 }
4591
4592 PUT2(slot, 0, cd->bracount + 1);
4593 memcpy(slot + 2, name, namelen);
4594 slot[2+namelen] = 0;
4595 }
4596 }
4597
4598 /* In both cases, count the number of names we've encountered. */
4599
4600 ptr++; /* Move past > or ' */
4601 cd->names_found++;
4602 goto NUMBERED_GROUP;
4603
4604
4605 /* ------------------------------------------------------------ */
4606 case '&': /* Perl recursion/subroutine syntax */
4607 terminator = ')';
4608 is_recurse = TRUE;
4609 /* Fall through */
4610
4611 /* We come here from the Python syntax above that handles both
4612 references (?P=name) and recursion (?P>name), as well as falling
4613 through from the Perl recursion syntax (?&name). We also come here from
4614 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4615 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4616
4617 NAMED_REF_OR_RECURSE:
4618 name = ++ptr;
4619 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4620 namelen = ptr - name;
4621
4622 /* In the pre-compile phase, do a syntax check and set a dummy
4623 reference number. */
4624
4625 if (lengthptr != NULL)
4626 {
4627 if (namelen == 0)
4628 {
4629 *errorcodeptr = ERR62;
4630 goto FAILED;
4631 }
4632 if (*ptr != terminator)
4633 {
4634 *errorcodeptr = ERR42;
4635 goto FAILED;
4636 }
4637 if (namelen > MAX_NAME_SIZE)
4638 {
4639 *errorcodeptr = ERR48;
4640 goto FAILED;
4641 }
4642 recno = 0;
4643 }
4644
4645 /* In the real compile, seek the name in the table. We check the name
4646 first, and then check that we have reached the end of the name in the
4647 table. That way, if the name is longer than any in the table,
4648 the comparison will fail without reading beyond the table entry. */
4649
4650 else
4651 {
4652 slot = cd->name_table;
4653 for (i = 0; i < cd->names_found; i++)
4654 {
4655 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4656 slot[2+namelen] == 0)
4657 break;
4658 slot += cd->name_entry_size;
4659 }
4660
4661 if (i < cd->names_found) /* Back reference */
4662 {
4663 recno = GET2(slot, 0);
4664 }
4665 else if ((recno = /* Forward back reference */
4666 find_parens(ptr, cd->bracount, name, namelen,
4667 (options & PCRE_EXTENDED) != 0)) <= 0)
4668 {
4669 *errorcodeptr = ERR15;
4670 goto FAILED;
4671 }
4672 }
4673
4674 /* In both phases, we can now go to the code that handles numerical
4675 recursion or backreferences. */
4676
4677 if (is_recurse) goto HANDLE_RECURSION;
4678 else goto HANDLE_REFERENCE;
4679
4680
4681 /* ------------------------------------------------------------ */
4682 case 'R': /* Recursion */
4683 ptr++; /* Same as (?0) */
4684 /* Fall through */
4685
4686
4687 /* ------------------------------------------------------------ */
4688 case '-': case '+':
4689 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4690 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4691 {
4692 const uschar *called;
4693 terminator = ')';
4694
4695 /* Come here from the \g<...> and \g'...' code (Oniguruma
4696 compatibility). However, the syntax has been checked to ensure that
4697 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4698 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4699 ever be taken. */
4700
4701 HANDLE_NUMERICAL_RECURSION:
4702
4703 if ((refsign = *ptr) == '+')
4704 {
4705 ptr++;
4706 if ((digitab[*ptr] & ctype_digit) == 0)
4707 {
4708 *errorcodeptr = ERR63;
4709 goto FAILED;
4710 }
4711 }
4712 else if (refsign == '-')
4713 {
4714 if ((digitab[ptr[1]] & ctype_digit) == 0)
4715 goto OTHER_CHAR_AFTER_QUERY;
4716 ptr++;
4717 }
4718
4719 recno = 0;
4720 while((digitab[*ptr] & ctype_digit) != 0)
4721 recno = recno * 10 + *ptr++ - '0';
4722
4723 if (*ptr != terminator)
4724 {
4725 *errorcodeptr = ERR29;
4726 goto FAILED;
4727 }
4728
4729 if (refsign == '-')
4730 {
4731 if (recno == 0)
4732 {
4733 *errorcodeptr = ERR58;
4734 goto FAILED;
4735 }
4736 recno = cd->bracount - recno + 1;
4737 if (recno <= 0)
4738 {
4739 *errorcodeptr = ERR15;
4740 goto FAILED;
4741 }
4742 }
4743 else if (refsign == '+')
4744 {
4745 if (recno == 0)
4746 {
4747 *errorcodeptr = ERR58;
4748 goto FAILED;
4749 }
4750 recno += cd->bracount;
4751 }
4752
4753 /* Come here from code above that handles a named recursion */
4754
4755 HANDLE_RECURSION:
4756
4757 previous = code;
4758 called = cd->start_code;
4759
4760 /* When we are actually compiling, find the bracket that is being
4761 referenced. Temporarily end the regex in case it doesn't exist before
4762 this point. If we end up with a forward reference, first check that
4763 the bracket does occur later so we can give the error (and position)
4764 now. Then remember this forward reference in the workspace so it can
4765 be filled in at the end. */
4766
4767 if (lengthptr == NULL)
4768 {
4769 *code = OP_END;
4770 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4771
4772 /* Forward reference */
4773
4774 if (called == NULL)
4775 {
4776 if (find_parens(ptr, cd->bracount, NULL, recno,
4777 (options & PCRE_EXTENDED) != 0) < 0)
4778 {
4779 *errorcodeptr = ERR15;
4780 goto FAILED;
4781 }
4782 called = cd->start_code + recno;
4783 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4784 }
4785
4786 /* If not a forward reference, and the subpattern is still open,
4787 this is a recursive call. We check to see if this is a left
4788 recursion that could loop for ever, and diagnose that case. */
4789
4790 else if (GET(called, 1) == 0 &&
4791 could_be_empty(called, code, bcptr, utf8))
4792 {
4793 *errorcodeptr = ERR40;
4794 goto FAILED;
4795 }
4796 }
4797
4798 /* Insert the recursion/subroutine item, automatically wrapped inside
4799 "once" brackets. Set up a "previous group" length so that a
4800 subsequent quantifier will work. */
4801
4802 *code = OP_ONCE;
4803 PUT(code, 1, 2 + 2*LINK_SIZE);
4804 code += 1 + LINK_SIZE;
4805
4806 *code = OP_RECURSE;
4807 PUT(code, 1, called - cd->start_code);
4808 code += 1 + LINK_SIZE;
4809
4810 *code = OP_KET;
4811 PUT(code, 1, 2 + 2*LINK_SIZE);
4812 code += 1 + LINK_SIZE;
4813
4814 length_prevgroup = 3 + 3*LINK_SIZE;
4815 }
4816
4817 /* Can't determine a first byte now */
4818
4819 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4820 continue;
4821
4822
4823 /* ------------------------------------------------------------ */
4824 default: /* Other characters: check option setting */
4825 OTHER_CHAR_AFTER_QUERY:
4826 set = unset = 0;
4827 optset = &set;
4828
4829 while (*ptr != ')' && *ptr != ':')
4830 {
4831 switch (*ptr++)
4832 {
4833 case '-': optset = &unset; break;
4834
4835 case 'J': /* Record that it changed in the external options */
4836 *optset |= PCRE_DUPNAMES;
4837 cd->external_flags |= PCRE_JCHANGED;
4838 break;
4839
4840 case 'i': *optset |= PCRE_CASELESS; break;
4841 case 'm': *optset |= PCRE_MULTILINE; break;
4842 case 's': *optset |= PCRE_DOTALL; break;
4843 case 'x': *optset |= PCRE_EXTENDED; break;
4844 case 'U': *optset |= PCRE_UNGREEDY; break;
4845 case 'X': *optset |= PCRE_EXTRA; break;
4846
4847 default: *errorcodeptr = ERR12;
4848 ptr--; /* Correct the offset */
4849 goto FAILED;
4850 }
4851 }
4852
4853 /* Set up the changed option bits, but don't change anything yet. */
4854
4855 newoptions = (options | set) & (~unset);
4856
4857 /* If the options ended with ')' this is not the start of a nested
4858 group with option changes, so the options change at this level. If this
4859 item is right at the start of the pattern, the options can be
4860 abstracted and made external in the pre-compile phase, and ignored in
4861 the compile phase. This can be helpful when matching -- for instance in
4862 caseless checking of required bytes.
4863
4864 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4865 definitely *not* at the start of the pattern because something has been
4866 compiled. In the pre-compile phase, however, the code pointer can have
4867 that value after the start, because it gets reset as code is discarded
4868 during the pre-compile. However, this can happen only at top level - if
4869 we are within parentheses, the starting BRA will still be present. At
4870 any parenthesis level, the length value can be used to test if anything
4871 has been compiled at that level. Thus, a test for both these conditions
4872 is necessary to ensure we correctly detect the start of the pattern in
4873 both phases.
4874
4875 If we are not at the pattern start, compile code to change the ims
4876 options if this setting actually changes any of them. We also pass the
4877 new setting back so that it can be put at the start of any following
4878 branches, and when this group ends (if we are in a group), a resetting
4879 item can be compiled. */
4880
4881 if (*ptr == ')')
4882 {
4883 if (code == cd->start_code + 1 + LINK_SIZE &&
4884 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4885 {
4886 cd->external_options = newoptions;
4887 options = newoptions;
4888 }
4889 else
4890 {
4891 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4892 {
4893 *code++ = OP_OPT;
4894 *code++ = newoptions & PCRE_IMS;
4895 }
4896
4897 /* Change options at this level, and pass them back for use
4898 in subsequent branches. Reset the greedy defaults and the case
4899 value for firstbyte and reqbyte. */
4900
4901 *optionsptr = options = newoptions;
4902 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4903 greedy_non_default = greedy_default ^ 1;
4904 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4905 }
4906
4907 previous = NULL; /* This item can't be repeated */
4908 continue; /* It is complete */
4909 }
4910
4911 /* If the options ended with ':' we are heading into a nested group
4912 with possible change of options. Such groups are non-capturing and are
4913 not assertions of any kind. All we need to do is skip over the ':';
4914 the newoptions value is handled below. */
4915
4916 bravalue = OP_BRA;
4917 ptr++;
4918 } /* End of switch for character following (? */
4919 } /* End of (? handling */
4920
4921 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4922 all unadorned brackets become non-capturing and behave like (?:...)
4923 brackets. */
4924
4925 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4926 {
4927 bravalue = OP_BRA;
4928 }
4929
4930 /* Else we have a capturing group. */
4931
4932 else
4933 {
4934 NUMBERED_GROUP:
4935 cd->bracount += 1;
4936 PUT2(code, 1+LINK_SIZE, cd->bracount);
4937 skipbytes = 2;
4938 }
4939
4940 /* Process nested bracketed regex. Assertions may not be repeated, but
4941 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4942 non-register variable in order to be able to pass its address because some
4943 compilers complain otherwise. Pass in a new setting for the ims options if
4944 they have changed. */
4945
4946 previous = (bravalue >= OP_ONCE)? code : NULL;
4947 *code = bravalue;
4948 tempcode = code;
4949 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4950 length_prevgroup = 0; /* Initialize for pre-compile phase */
4951
4952 if (!compile_regex(
4953 newoptions, /* The complete new option state */
4954 options & PCRE_IMS, /* The previous ims option state */
4955 &tempcode, /* Where to put code (updated) */
4956 &ptr, /* Input pointer (updated) */
4957 errorcodeptr, /* Where to put an error message */
4958 (bravalue == OP_ASSERTBACK ||
4959 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4960 reset_bracount, /* True if (?| group */
4961 skipbytes, /* Skip over bracket number */
4962 &subfirstbyte, /* For possible first char */
4963 &subreqbyte, /* For possible last char */
4964 bcptr, /* Current branch chain */
4965 cd, /* Tables block */
4966 (lengthptr == NULL)? NULL : /* Actual compile phase */
4967 &length_prevgroup /* Pre-compile phase */
4968 ))
4969 goto FAILED;
4970
4971 /* At the end of compiling, code is still pointing to the start of the
4972 group, while tempcode has been updated to point past the end of the group
4973 and any option resetting that may follow it. The pattern pointer (ptr)
4974 is on the bracket. */
4975
4976 /* If this is a conditional bracket, check that there are no more than
4977 two branches in the group, or just one if it's a DEFINE group. We do this
4978 in the real compile phase, not in the pre-pass, where the whole group may
4979 not be available. */
4980
4981 if (bravalue == OP_COND && lengthptr == NULL)
4982 {
4983 uschar *tc = code;
4984 int condcount = 0;
4985
4986 do {
4987 condcount++;
4988 tc += GET(tc,1);
4989 }
4990 while (*tc != OP_KET);
4991
4992 /* A DEFINE group is never obeyed inline (the "condition" is always
4993 false). It must have only one branch. */
4994
4995 if (code[LINK_SIZE+1] == OP_DEF)
4996 {
4997 if (condcount > 1)
4998 {
4999 *errorcodeptr = ERR54;
5000 goto FAILED;
5001 }
5002 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5003 }
5004
5005 /* A "normal" conditional group. If there is just one branch, we must not
5006 make use of its firstbyte or reqbyte, because this is equivalent to an
5007 empty second branch. */
5008
5009 else
5010 {
5011 if (condcount > 2)
5012 {
5013 *errorcodeptr = ERR27;
5014 goto FAILED;
5015 }
5016 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5017 }
5018 }
5019
5020 /* Error if hit end of pattern */
5021
5022 if (*ptr != ')')
5023 {
5024 *errorcodeptr = ERR14;
5025 goto FAILED;
5026 }
5027
5028 /* In the pre-compile phase, update the length by the length of the group,
5029 less the brackets at either end. Then reduce the compiled code to just a
5030 set of non-capturing brackets so that it doesn't use much memory if it is
5031 duplicated by a quantifier.*/
5032
5033 if (lengthptr != NULL)
5034 {
5035 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5036 {
5037 *errorcodeptr = ERR20;
5038 goto FAILED;
5039 }
5040 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5041 *code++ = OP_BRA;
5042 PUTINC(code, 0, 1 + LINK_SIZE);
5043 *code++ = OP_KET;
5044 PUTINC(code, 0, 1 + LINK_SIZE);
5045 break; /* No need to waste time with special character handling */
5046 }
5047
5048 /* Otherwise update the main code pointer to the end of the group. */
5049
5050 code = tempcode;
5051
5052 /* For a DEFINE group, required and first character settings are not
5053 relevant. */
5054
5055 if (bravalue == OP_DEF) break;
5056
5057 /* Handle updating of the required and first characters for other types of
5058 group. Update for normal brackets of all kinds, and conditions with two
5059 branches (see code above). If the bracket is followed by a quantifier with
5060 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5061 zerofirstbyte outside the main loop so that they can be accessed for the
5062 back off. */
5063
5064 zeroreqbyte = reqbyte;
5065 zerofirstbyte = firstbyte;
5066 groupsetfirstbyte = FALSE;
5067
5068 if (bravalue >= OP_ONCE)
5069 {
5070 /* If we have not yet set a firstbyte in this branch, take it from the
5071 subpattern, remembering that it was set here so that a repeat of more
5072 than one can replicate it as reqbyte if necessary. If the subpattern has
5073 no firstbyte, set "none" for the whole branch. In both cases, a zero
5074 repeat forces firstbyte to "none". */
5075
5076 if (firstbyte == REQ_UNSET)
5077 {
5078 if (subfirstbyte >= 0)
5079 {
5080 firstbyte = subfirstbyte;
5081 groupsetfirstbyte = TRUE;
5082 }
5083 else firstbyte = REQ_NONE;
5084 zerofirstbyte = REQ_NONE;
5085 }
5086
5087 /* If firstbyte was previously set, convert the subpattern's firstbyte
5088 into reqbyte if there wasn't one, using the vary flag that was in
5089 existence beforehand. */
5090
5091 else if (subfirstbyte >= 0 && subreqbyte < 0)
5092 subreqbyte = subfirstbyte | tempreqvary;
5093
5094 /* If the subpattern set a required byte (or set a first byte that isn't
5095 really the first byte - see above), set it. */
5096
5097 if (subreqbyte >= 0) reqbyte = subreqbyte;
5098 }
5099
5100 /* For a forward assertion, we take the reqbyte, if set. This can be
5101 helpful if the pattern that follows the assertion doesn't set a different
5102 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5103 for an assertion, however, because it leads to incorrect effects for patterns
5104 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5105 of a firstbyte. This is overcome by a scan at the end if there's no
5106 firstbyte, looking for an asserted first char. */
5107
5108 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5109 break; /* End of processing '(' */
5110
5111
5112 /* ===================================================================*/
5113 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5114 are arranged to be the negation of the corresponding OP_values. For the
5115 back references, the values are ESC_REF plus the reference number. Only
5116 back references and those types that consume a character may be repeated.
5117 We can test for values between ESC_b and ESC_Z for the latter; this may
5118 have to change if any new ones are ever created. */
5119
5120 case '\\':
5121 tempptr = ptr;
5122 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5123 if (*errorcodeptr != 0) goto FAILED;
5124
5125 if (c < 0)
5126 {
5127 if (-c == ESC_Q) /* Handle start of quoted string */
5128 {
5129 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5130 else inescq = TRUE;
5131 continue;
5132 }
5133
5134 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5135
5136 /* For metasequences that actually match a character, we disable the
5137 setting of a first character if it hasn't already been set. */
5138
5139 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5140 firstbyte = REQ_NONE;
5141
5142 /* Set values to reset to if this is followed by a zero repeat. */
5143
5144 zerofirstbyte = firstbyte;
5145 zeroreqbyte = reqbyte;
5146
5147 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5148 is a subroutine call by number (Oniguruma syntax). In fact, the value
5149 -ESC_g is returned only for these cases. So we don't need to check for <
5150 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5151 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5152 that is a synonym for a named back reference). */
5153
5154 if (-c == ESC_g)
5155 {
5156 const uschar *p;
5157 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5158 terminator = (*(++ptr) == '<')? '>' : '\'';
5159
5160 /* These two statements stop the compiler from warning about possibly
5161 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5162 fact, because we actually check for a number below, the paths that
5163 would actually be in error are never taken. */
5164
5165 skipbytes = 0;
5166 reset_bracount = FALSE;
5167
5168 /* Test for a name */
5169
5170 if (ptr[1] != '+' && ptr[1] != '-')
5171 {
5172 BOOL isnumber = TRUE;
5173 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5174 {
5175 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5176 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5177 }
5178 if (*p != terminator)
5179 {
5180 *errorcodeptr = ERR57;
5181 break;
5182 }
5183 if (isnumber)
5184 {
5185 ptr++;
5186 goto HANDLE_NUMERICAL_RECURSION;
5187 }
5188 is_recurse = TRUE;
5189 goto NAMED_REF_OR_RECURSE;
5190 }
5191
5192 /* Test a signed number in angle brackets or quotes. */
5193
5194 p = ptr + 2;
5195 while ((digitab[*p] & ctype_digit) != 0) p++;
5196 if (*p != terminator)
5197 {
5198 *errorcodeptr = ERR57;
5199 break;
5200 }
5201 ptr++;
5202 goto HANDLE_NUMERICAL_RECURSION;
5203 }
5204
5205 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5206 We also support \k{name} (.NET syntax) */
5207
5208 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5209 {
5210 is_recurse = FALSE;
5211 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5212 goto NAMED_REF_OR_RECURSE;
5213 }
5214
5215 /* Back references are handled specially; must disable firstbyte if
5216 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5217 ':' later. */
5218
5219 if (-c >= ESC_REF)
5220 {
5221 recno = -c - ESC_REF;
5222
5223 HANDLE_REFERENCE: /* Come here from named backref handling */
5224 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5225 previous = code;
5226 *code++ = OP_REF;
5227 PUT2INC(code, 0, recno);
5228 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5229 if (recno > cd->top_backref) cd->top_backref = recno;
5230 }
5231
5232 /* So are Unicode property matches, if supported. */
5233
5234 #ifdef SUPPORT_UCP
5235 else if (-c == ESC_P || -c == ESC_p)
5236 {
5237 BOOL negated;
5238 int pdata;
5239 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5240 if (ptype < 0) goto FAILED;
5241 previous = code;
5242 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5243 *code++ = ptype;
5244 *code++ = pdata;
5245 }
5246 #else
5247
5248 /* If Unicode properties are not supported, \X, \P, and \p are not
5249 allowed. */
5250
5251 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5252 {
5253 *errorcodeptr = ERR45;
5254 goto FAILED;
5255 }
5256 #endif
5257
5258 /* For the rest (including \X when Unicode properties are supported), we
5259 can obtain the OP value by negating the escape value. */
5260
5261 else
5262 {
5263 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5264 *code++ = -c;
5265 }
5266 continue;
5267 }
5268
5269 /* We have a data character whose value is in c. In UTF-8 mode it may have
5270 a value > 127. We set its representation in the length/buffer, and then
5271 handle it as a data character. */
5272
5273 #ifdef SUPPORT_UTF8
5274 if (utf8 && c > 127)
5275 mclength = _pcre_ord2utf8(c, mcbuffer);
5276 else
5277 #endif
5278
5279 {
5280 mcbuffer[0] = c;
5281 mclength = 1;
5282 }
5283 goto ONE_CHAR;
5284
5285
5286 /* ===================================================================*/
5287 /* Handle a literal character. It is guaranteed not to be whitespace or #
5288 when the extended flag is set. If we are in UTF-8 mode, it may be a
5289 multi-byte literal character. */
5290
5291 default:
5292 NORMAL_CHAR:
5293 mclength = 1;
5294 mcbuffer[0] = c;
5295
5296 #ifdef SUPPORT_UTF8
5297 if (utf8 && c >= 0xc0)
5298 {
5299 while ((ptr[1] & 0xc0) == 0x80)
5300 mcbuffer[mclength++] = *(++ptr);
5301 }
5302 #endif
5303
5304 /* At this point we have the character's bytes in mcbuffer, and the length
5305 in mclength. When not in UTF-8 mode, the length is always 1. */
5306
5307 ONE_CHAR:
5308 previous = code;
5309 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5310 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5311
5312 /* Remember if \r or \n were seen */
5313
5314 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5315 cd->external_flags |= PCRE_HASCRORLF;
5316
5317 /* Set the first and required bytes appropriately. If no previous first
5318 byte, set it from this character, but revert to none on a zero repeat.
5319 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5320 repeat. */
5321
5322 if (firstbyte == REQ_UNSET)
5323 {
5324 zerofirstbyte = REQ_NONE;
5325 zeroreqbyte = reqbyte;
5326
5327 /* If the character is more than one byte long, we can set firstbyte
5328 only if it is not to be matched caselessly. */
5329
5330 if (mclength == 1 || req_caseopt == 0)
5331 {
5332 firstbyte = mcbuffer[0] | req_caseopt;
5333 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5334 }
5335 else firstbyte = reqbyte = REQ_NONE;
5336 }
5337
5338 /* firstbyte was previously set; we can set reqbyte only if the length is
5339 1 or the matching is caseful. */
5340
5341 else
5342 {
5343 zerofirstbyte = firstbyte;
5344 zeroreqbyte = reqbyte;
5345 if (mclength == 1 || req_caseopt == 0)
5346 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5347 }
5348
5349 break; /* End of literal character handling */
5350 }
5351 } /* end of big loop */
5352
5353
5354 /* Control never reaches here by falling through, only by a goto for all the
5355 error states. Pass back the position in the pattern so that it can be displayed
5356 to the user for diagnosing the error. */
5357
5358 FAILED:
5359 *ptrptr = ptr;
5360 return FALSE;
5361 }
5362
5363
5364
5365
5366 /*************************************************
5367 * Compile sequence of alternatives *
5368 *************************************************/
5369
5370 /* On entry, ptr is pointing past the bracket character, but on return it
5371 points to the closing bracket, or vertical bar, or end of string. The code
5372 variable is pointing at the byte into which the BRA operator has been stored.
5373 If the ims options are changed at the start (for a (?ims: group) or during any
5374 branch, we need to insert an OP_OPT item at the start of every following branch
5375 to ensure they get set correctly at run time, and also pass the new options
5376 into every subsequent branch compile.
5377
5378 This function is used during the pre-compile phase when we are trying to find
5379 out the amount of memory needed, as well as during the real compile phase. The
5380 value of lengthptr distinguishes the two phases.
5381
5382 Arguments:
5383 options option bits, including any changes for this subpattern
5384 oldims previous settings of ims option bits
5385 codeptr -> the address of the current code pointer
5386 ptrptr -> the address of the current pattern pointer
5387 errorcodeptr -> pointer to error code variable
5388 lookbehind TRUE if this is a lookbehind assertion
5389 reset_bracount TRUE to reset the count for each branch
5390 skipbytes skip this many bytes at start (for brackets and OP_COND)
5391 firstbyteptr place to put the first required character, or a negative number
5392 reqbyteptr place to put the last required character, or a negative number
5393 bcptr pointer to the chain of currently open branches
5394 cd points to the data block with tables pointers etc.
5395 lengthptr NULL during the real compile phase
5396 points to length accumulator during pre-compile phase
5397
5398 Returns: TRUE on success
5399 */
5400
5401 static BOOL
5402 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5403 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5404 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5405 int *lengthptr)
5406 {
5407 const uschar *ptr = *ptrptr;
5408 uschar *code = *codeptr;
5409 uschar *last_branch = code;
5410 uschar *start_bracket = code;
5411 uschar *reverse_count = NULL;
5412 int firstbyte, reqbyte;
5413 int branchfirstbyte, branchreqbyte;
5414 int length;
5415 int orig_bracount;
5416 int max_bracount;
5417 branch_chain bc;
5418
5419 bc.outer = bcptr;
5420 bc.current = code;
5421
5422 firstbyte = reqbyte = REQ_UNSET;
5423
5424 /* Accumulate the length for use in the pre-compile phase. Start with the
5425 length of the BRA and KET and any extra bytes that are required at the
5426 beginning. We accumulate in a local variable to save frequent testing of
5427 lenthptr for NULL. We cannot do this by looking at the value of code at the
5428 start and end of each alternative, because compiled items are discarded during
5429 the pre-compile phase so that the work space is not exceeded. */
5430
5431 length = 2 + 2*LINK_SIZE + skipbytes;
5432
5433 /* WARNING: If the above line is changed for any reason, you must also change
5434 the code that abstracts option settings at the start of the pattern and makes
5435 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5436 pre-compile phase to find out whether anything has yet been compiled or not. */
5437
5438 /* Offset is set zero to mark that this bracket is still open */
5439
5440 PUT(code, 1, 0);
5441 code += 1 + LINK_SIZE + skipbytes;
5442
5443 /* Loop for each alternative branch */
5444
5445 orig_bracount = max_bracount = cd->bracount;
5446 for (;;)
5447 {
5448 /* For a (?| group, reset the capturing bracket count so that each branch
5449 uses the same numbers. */
5450
5451 if (reset_bracount) cd->bracount = orig_bracount;
5452
5453 /* Handle a change of ims options at the start of the branch */
5454
5455 if ((options & PCRE_IMS) != oldims)
5456 {
5457 *code++ = OP_OPT;
5458 *code++ = options & PCRE_IMS;
5459 length += 2;
5460 }
5461
5462 /* Set up dummy OP_REVERSE if lookbehind assertion */
5463
5464 if (lookbehind)
5465 {
5466 *code++ = OP_REVERSE;
5467 reverse_count = code;
5468 PUTINC(code, 0, 0);
5469 length += 1 + LINK_SIZE;
5470 }
5471
5472 /* Now compile the branch; in the pre-compile phase its length gets added
5473 into the length. */
5474
5475 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5476 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5477 {
5478 *ptrptr = ptr;
5479 return FALSE;
5480 }
5481
5482 /* Keep the highest bracket count in case (?| was used and some branch
5483 has fewer than the rest. */
5484
5485 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5486
5487 /* In the real compile phase, there is some post-processing to be done. */
5488
5489 if (lengthptr == NULL)
5490 {
5491 /* If this is the first branch, the firstbyte and reqbyte values for the
5492 branch become the values for the regex. */
5493
5494 if (*last_branch != OP_ALT)
5495 {
5496 firstbyte = branchfirstbyte;
5497 reqbyte = branchreqbyte;
5498 }
5499
5500 /* If this is not the first branch, the first char and reqbyte have to
5501 match the values from all the previous branches, except that if the
5502 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5503 and we set REQ_VARY for the regex. */
5504
5505 else
5506 {
5507 /* If we previously had a firstbyte, but it doesn't match the new branch,
5508 we have to abandon the firstbyte for the regex, but if there was
5509 previously no reqbyte, it takes on the value of the old firstbyte. */
5510
5511 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5512 {
5513 if (reqbyte < 0) reqbyte = firstbyte;
5514 firstbyte = REQ_NONE;
5515 }
5516
5517 /* If we (now or from before) have no firstbyte, a firstbyte from the
5518 branch becomes a reqbyte if there isn't a branch reqbyte. */
5519
5520 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5521 branchreqbyte = branchfirstbyte;
5522
5523 /* Now ensure that the reqbytes match */
5524
5525 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5526 reqbyte = REQ_NONE;
5527 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5528 }
5529
5530 /* If lookbehind, check that this branch matches a fixed-length string, and
5531 put the length into the OP_REVERSE item. Temporarily mark the end of the
5532 branch with OP_END. */
5533
5534 if (lookbehind)
5535 {
5536 int fixed_length;
5537 *code = OP_END;
5538 fixed_length = find_fixedlength(last_branch, options);
5539 DPRINTF(("fixed length = %d\n", fixed_length));
5540 if (fixed_length < 0)
5541 {
5542 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5543 *ptrptr = ptr;
5544 return FALSE;
5545 }
5546 PUT(reverse_count, 0, fixed_length);
5547 }
5548 }
5549
5550 /* Reached end of expression, either ')' or end of pattern. In the real
5551 compile phase, go back through the alternative branches and reverse the chain
5552 of offsets, with the field in the BRA item now becoming an offset to the
5553 first alternative. If there are no alternatives, it points to the end of the
5554 group. The length in the terminating ket is always the length of the whole
5555 bracketed item. If any of the ims options were changed inside the group,
5556 compile a resetting op-code following, except at the very end of the pattern.
5557 Return leaving the pointer at the terminating char. */
5558
5559 if (*ptr != '|')
5560 {
5561 if (lengthptr == NULL)
5562 {
5563 int branch_length = code - last_branch;
5564 do
5565 {
5566 int prev_length = GET(last_branch, 1);
5567 PUT(last_branch, 1, branch_length);
5568 branch_length = prev_length;
5569 last_branch -= branch_length;
5570 }
5571 while (branch_length > 0);
5572 }
5573
5574 /* Fill in the ket */
5575
5576 *code = OP_KET;
5577 PUT(code, 1, code - start_bracket);
5578 code += 1 + LINK_SIZE;
5579
5580 /* Resetting option if needed */
5581
5582 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5583 {
5584 *code++ = OP_OPT;
5585 *code++ = oldims;
5586 length += 2;
5587 }
5588
5589 /* Retain the highest bracket number, in case resetting was used. */
5590
5591 cd->bracount = max_bracount;
5592
5593 /* Set values to pass back */
5594
5595 *codeptr = code;
5596 *ptrptr = ptr;
5597 *firstbyteptr = firstbyte;
5598 *reqbyteptr = reqbyte;
5599 if (lengthptr != NULL)
5600 {
5601 if (OFLOW_MAX - *lengthptr < length)
5602 {
5603 *errorcodeptr = ERR20;
5604 return FALSE;
5605 }
5606 *lengthptr += length;
5607 }
5608 return TRUE;
5609 }
5610
5611 /* Another branch follows. In the pre-compile phase, we can move the code
5612 pointer back to where it was for the start of the first branch. (That is,
5613 pretend that each branch is the only one.)
5614
5615 In the real compile phase, insert an ALT node. Its length field points back
5616 to the previous branch while the bracket remains open. At the end the chain
5617 is reversed. It's done like this so that the start of the bracket has a
5618 zero offset until it is closed, making it possible to detect recursion. */
5619
5620 if (lengthptr != NULL)
5621 {
5622 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5623 length += 1 + LINK_SIZE;
5624 }
5625 else
5626 {
5627 *code = OP_ALT;
5628 PUT(code, 1, code - last_branch);
5629 bc.current = last_branch = code;
5630 code += 1 + LINK_SIZE;
5631 }
5632
5633 ptr++;
5634 }
5635 /* Control never reaches here */
5636 }
5637
5638
5639
5640
5641 /*************************************************
5642 * Check for anchored expression *
5643 *************************************************/
5644
5645 /* Try to find out if this is an anchored regular expression. Consider each
5646 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5647 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5648 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5649 counts, since OP_CIRC can match in the middle.
5650
5651 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5652 This is the code for \G, which means "match at start of match position, taking
5653 into account the match offset".
5654
5655 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5656 because that will try the rest of the pattern at all possible matching points,
5657 so there is no point trying again.... er ....
5658
5659 .... except when the .* appears inside capturing parentheses, and there is a
5660 subsequent back reference to those parentheses. We haven't enough information
5661 to catch that case precisely.
5662
5663 At first, the best we could do was to detect when .* was in capturing brackets
5664 and the highest back reference was greater than or equal to that level.
5665 However, by keeping a bitmap of the first 31 back references, we can catch some
5666 of the more common cases more precisely.
5667
5668 Arguments:
5669 code points to start of expression (the bracket)
5670 options points to the options setting
5671 bracket_map a bitmap of which brackets we are inside while testing; this
5672 handles up to substring 31; after that we just have to take
5673 the less precise approach
5674 backref_map the back reference bitmap
5675
5676 Returns: TRUE or FALSE
5677 */
5678
5679 static BOOL
5680 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5681 unsigned int backref_map)
5682 {
5683 do {
5684 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5685 options, PCRE_MULTILINE, FALSE);
5686 register int op = *scode;
5687
5688 /* Non-capturing brackets */
5689
5690 if (op == OP_BRA)
5691 {
5692 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5693 }
5694
5695 /* Capturing brackets */
5696
5697 else if (op == OP_CBRA)
5698 {
5699 int n = GET2(scode, 1+LINK_SIZE);
5700 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5701 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5702 }
5703
5704 /* Other brackets */
5705
5706 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5707 {
5708 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5709 }
5710
5711 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5712 are or may be referenced. */
5713
5714 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5715 op == OP_TYPEPOSSTAR) &&
5716 (*options & PCRE_DOTALL) != 0)
5717 {
5718 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5719 }
5720
5721 /* Check for explicit anchoring */
5722
5723 else if (op != OP_SOD && op != OP_SOM &&
5724 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5725 return FALSE;
5726 code += GET(code, 1);
5727 }
5728 while (*code == OP_ALT); /* Loop for each alternative */
5729 return TRUE;
5730 }
5731
5732
5733
5734 /*************************************************
5735 * Check for starting with ^ or .* *
5736 *************************************************/
5737
5738 /* This is called to find out if every branch starts with ^ or .* so that
5739 "first char" processing can be done to speed things up in multiline
5740 matching and for non-DOTALL patterns that start with .* (which must start at
5741 the beginning or after \n). As in the case of is_anchored() (see above), we
5742 have to take account of back references to capturing brackets that contain .*
5743 because in that case we can't make the assumption.
5744
5745 Arguments:
5746 code points to start of expression (the bracket)
5747 bracket_map a bitmap of which brackets we are inside while testing; this
5748 handles up to substring 31; after that we just have to take
5749 the less precise approach
5750 backref_map the back reference bitmap
5751
5752 Returns: TRUE or FALSE
5753 */
5754
5755 static BOOL
5756 is_startline(const uschar *code, unsigned int bracket_map,
5757 unsigned int backref_map)
5758 {
5759 do {
5760 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5761 NULL, 0, FALSE);
5762 register int op = *scode;
5763
5764 /* Non-capturing brackets */
5765
5766 if (op == OP_BRA)
5767 {
5768 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5769 }
5770
5771 /* Capturing brackets */
5772
5773 else if (op == OP_CBRA)
5774 {
5775 int n = GET2(scode, 1+LINK_SIZE);
5776 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5777 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5778 }
5779
5780 /* Other brackets */
5781
5782 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5783 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5784
5785 /* .* means "start at start or after \n" if it isn't in brackets that
5786 may be referenced. */
5787
5788 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5789 {
5790 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5791 }
5792
5793 /* Check for explicit circumflex */
5794
5795 else if (op != OP_CIRC) return FALSE;
5796
5797 /* Move on to the next alternative */
5798
5799 code += GET(code, 1);
5800 }
5801 while (*code == OP_ALT); /* Loop for each alternative */
5802 return TRUE;
5803 }
5804
5805
5806
5807 /*************************************************
5808 * Check for asserted fixed first char *
5809 *************************************************/
5810
5811 /* During compilation, the "first char" settings from forward assertions are
5812 discarded, because they can cause conflicts with actual literals that follow.
5813 However, if we end up without a first char setting for an unanchored pattern,
5814 it is worth scanning the regex to see if there is an initial asserted first
5815 char. If all branches start with the same asserted char, or with a bracket all
5816 of whose alternatives start with the same asserted char (recurse ad lib), then
5817 we return that char, otherwise -1.
5818
5819 Arguments:
5820 code points to start of expression (the bracket)
5821 options pointer to the options (used to check casing changes)
5822 inassert TRUE if in an assertion
5823
5824 Returns: -1 or the fixed first char
5825 */
5826
5827 static int
5828 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5829 {
5830 register int c = -1;
5831 do {
5832 int d;
5833 const uschar *scode =
5834 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5835 register int op = *scode;
5836
5837 switch(op)
5838 {
5839 default:
5840 return -1;
5841
5842 case OP_BRA:
5843 case OP_CBRA:
5844 case OP_ASSERT:
5845 case OP_ONCE:
5846 case OP_COND:
5847 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5848 return -1;
5849 if (c < 0) c = d; else if (c != d) return -1;
5850 break;
5851
5852 case OP_EXACT: /* Fall through */
5853 scode += 2;
5854
5855 case OP_CHAR:
5856 case OP_CHARNC:
5857 case OP_PLUS:
5858 case OP_MINPLUS:
5859 case OP_POSPLUS:
5860 if (!inassert) return -1;
5861 if (c < 0)
5862 {
5863 c = scode[1];
5864 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5865 }
5866 else if (c != scode[1]) return -1;
5867 break;
5868 }
5869
5870 code += GET(code, 1);
5871 }
5872 while (*code == OP_ALT);
5873 return c;
5874 }
5875
5876
5877
5878 /*************************************************
5879 * Compile a Regular Expression *
5880 *************************************************/
5881
5882 /* This function takes a string and returns a pointer to a block of store
5883 holding a compiled version of the expression. The original API for this
5884 function had no error code return variable; it is retained for backwards
5885 compatibility. The new function is given a new name.
5886
5887 Arguments:
5888 pattern the regular expression
5889 options various option bits
5890 errorcodeptr pointer to error code variable (pcre_compile2() only)
5891 can be NULL if you don't want a code value
5892 errorptr pointer to pointer to error text
5893 erroroffset ptr offset in pattern where error was detected
5894 tables pointer to character tables or NULL
5895
5896 Returns: pointer to compiled data block, or NULL on error,
5897 with errorptr and erroroffset set
5898 */
5899
5900 PCRE_EXP_DEFN pcre *
5901 pcre_compile(const char *pattern, int options, const char **errorptr,
5902 int *erroroffset, const unsigned char *tables)
5903 {
5904 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5905 }
5906
5907
5908 PCRE_EXP_DEFN pcre *
5909 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5910 const char **errorptr, int *erroroffset, const unsigned char *tables)
5911 {
5912 real_pcre *re;
5913 int length = 1; /* For final END opcode */
5914 int firstbyte, reqbyte, newline;
5915 int errorcode = 0;
5916 int skipatstart = 0;
5917 #ifdef SUPPORT_UTF8
5918 BOOL utf8;
5919 #endif
5920 size_t size;
5921 uschar *code;
5922 const uschar *codestart;
5923 const uschar *ptr;
5924 compile_data compile_block;
5925 compile_data *cd = &compile_block;
5926
5927 /* This space is used for "compiling" into during the first phase, when we are
5928 computing the amount of memory that is needed. Compiled items are thrown away
5929 as soon as possible, so that a fairly large buffer should be sufficient for
5930 this purpose. The same space is used in the second phase for remembering where
5931 to fill in forward references to subpatterns. */
5932
5933 uschar cworkspace[COMPILE_WORK_SIZE];
5934
5935 /* Set this early so that early errors get offset 0. */
5936
5937 ptr = (const uschar *)pattern;
5938
5939 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5940 can do is just return NULL, but we can set a code value if there is a code
5941 pointer. */
5942
5943 if (errorptr == NULL)
5944 {
5945 if (errorcodeptr != NULL) *errorcodeptr = 99;
5946 return NULL;
5947 }
5948
5949 *errorptr = NULL;
5950 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5951
5952 /* However, we can give a message for this error */
5953
5954 if (erroroffset == NULL)
5955 {
5956 errorcode = ERR16;
5957 goto PCRE_EARLY_ERROR_RETURN2;
5958 }
5959
5960 *erroroffset = 0;
5961
5962 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5963
5964 #ifdef SUPPORT_UTF8
5965 utf8 = (options & PCRE_UTF8) != 0;
5966 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5967 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5968 {
5969 errorcode = ERR44;
5970 goto PCRE_EARLY_ERROR_RETURN2;
5971 }
5972 #else
5973 if ((options & PCRE_UTF8) != 0)
5974 {
5975 errorcode = ERR32;
5976 goto PCRE_EARLY_ERROR_RETURN;
5977 }
5978 #endif
5979
5980 if ((options & ~PUBLIC_OPTIONS) != 0)
5981 {
5982 errorcode = ERR17;
5983 goto PCRE_EARLY_ERROR_RETURN;
5984 }
5985
5986 /* Set up pointers to the individual character tables */
5987
5988 if (tables == NULL) tables = _pcre_default_tables;
5989 cd->lcc = tables + lcc_offset;
5990 cd->fcc = tables + fcc_offset;
5991 cd->cbits = tables + cbits_offset;
5992 cd->ctypes = tables + ctypes_offset;
5993
5994 /* Check for global one-time settings at the start of the pattern, and remember
5995 the offset for later. */
5996
5997 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5998 {
5999 int newnl = 0;
6000 int newbsr = 0;
6001
6002 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
6003 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6004 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
6005 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6006 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
6007 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6008 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
6009 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6010 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
6011 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6012
6013 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
6014 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6015 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
6016 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6017
6018 if (newnl != 0)
6019 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6020 else if (newbsr != 0)
6021 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6022 else break;
6023 }
6024
6025 /* Check validity of \R options. */
6026
6027 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6028 {
6029 case 0:
6030 case PCRE_BSR_ANYCRLF:
6031 case PCRE_BSR_UNICODE:
6032 break;
6033 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6034 }
6035
6036 /* Handle different types of newline. The three bits give seven cases. The
6037 current code allows for fixed one- or two-byte sequences, plus "any" and
6038 "anycrlf". */
6039
6040 switch (options & PCRE_NEWLINE_BITS)
6041 {
6042 case 0: newline = NEWLINE; break; /* Build-time default */
6043 case PCRE_NEWLINE_CR: newline = '\r'; break;
6044 case PCRE_NEWLINE_LF: newline = '\n'; break;
6045 case PCRE_NEWLINE_CR+
6046 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
6047 case PCRE_NEWLINE_ANY: newline = -1; break;
6048 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6049 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6050 }
6051
6052 if (newline == -2)
6053 {
6054 cd->nltype = NLTYPE_ANYCRLF;
6055 }
6056 else if (newline < 0)
6057 {
6058 cd->nltype = NLTYPE_ANY;
6059 }
6060 else
6061 {
6062 cd->nltype = NLTYPE_FIXED;
6063 if (newline > 255)
6064 {
6065 cd->nllen = 2;
6066 cd->nl[0] = (newline >> 8) & 255;
6067 cd->nl[1] = newline & 255;
6068 }
6069 else
6070 {
6071 cd->nllen = 1;
6072 cd->nl[0] = newline;
6073 }
6074 }
6075
6076 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6077 references to help in deciding whether (.*) can be treated as anchored or not.
6078 */
6079
6080 cd->top_backref = 0;
6081 cd->backref_map = 0;
6082
6083 /* Reflect pattern for debugging output */
6084
6085 DPRINTF(("------------------------------------------------------------------\n"));
6086 DPRINTF(("%s\n", pattern));
6087
6088 /* Pretend to compile the pattern while actually just accumulating the length
6089 of memory required. This behaviour is triggered by passing a non-NULL final
6090 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6091 to compile parts of the pattern into; the compiled code is discarded when it is
6092 no longer needed, so hopefully this workspace will never overflow, though there
6093 is a test for its doing so. */
6094
6095 cd->bracount = cd->final_bracount = 0;
6096 cd->names_found = 0;
6097 cd->name_entry_size = 0;
6098 cd->name_table = NULL;
6099 cd->start_workspace = cworkspace;
6100 cd->start_code = cworkspace;
6101 cd->hwm = cworkspace;
6102 cd->start_pattern = (const uschar *)pattern;
6103 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6104 cd->req_varyopt = 0;
6105 cd->external_options = options;
6106 cd->external_flags = 0;
6107
6108 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6109 don't need to look at the result of the function here. The initial options have
6110 been put into the cd block so that they can be changed if an option setting is
6111 found within the regex right at the beginning. Bringing initial option settings
6112 outside can help speed up starting point checks. */
6113
6114 ptr += skipatstart;
6115 code = cworkspace;
6116 *code = OP_BRA;
6117 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6118 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6119 &length);
6120 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6121
6122 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6123 cd->hwm - cworkspace));
6124
6125 if (length > MAX_PATTERN_SIZE)
6126 {
6127 errorcode = ERR20;
6128 goto PCRE_EARLY_ERROR_RETURN;
6129 }
6130
6131 /* Compute the size of data block needed and get it, either from malloc or
6132 externally provided function. Integer overflow should no longer be possible
6133 because nowadays we limit the maximum value of cd->names_found and
6134 cd->name_entry_size. */
6135
6136 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6137 re = (real_pcre *)(pcre_malloc)(size);
6138
6139 if (re == NULL)
6140 {
6141 errorcode = ERR21;
6142 goto PCRE_EARLY_ERROR_RETURN;
6143 }
6144
6145 /* Put in the magic number, and save the sizes, initial options, internal
6146 flags, and character table pointer. NULL is used for the default character
6147 tables. The nullpad field is at the end; it's there to help in the case when a
6148 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6149 pointers. */
6150
6151 re->magic_number = MAGIC_NUMBER;
6152 re->size = size;
6153 re->options = cd->external_options;
6154 re->flags = cd->external_flags;
6155 re->dummy1 = 0;
6156 re->first_byte = 0;
6157 re->req_byte = 0;
6158 re->name_table_offset = sizeof(real_pcre);
6159 re->name_entry_size = cd->name_entry_size;
6160 re->name_count = cd->names_found;
6161 re->ref_count = 0;
6162 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6163 re->nullpad = NULL;
6164
6165 /* The starting points of the name/number translation table and of the code are
6166 passed around in the compile data block. The start/end pattern and initial
6167 options are already set from the pre-compile phase, as is the name_entry_size
6168 field. Reset the bracket count and the names_found field. Also reset the hwm
6169 field; this time it's used for remembering forward references to subpatterns.
6170 */
6171
6172 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6173 cd->bracount = 0;
6174 cd->names_found = 0;
6175 cd->name_table = (uschar *)re + re->name_table_offset;
6176 codestart = cd->name_table + re->name_entry_size * re->name_count;
6177 cd->start_code = codestart;
6178 cd->hwm = cworkspace;
6179 cd->req_varyopt = 0;
6180 cd->had_accept = FALSE;
6181
6182 /* Set up a starting, non-extracting bracket, then compile the expression. On
6183 error, errorcode will be set non-zero, so we don't need to look at the result
6184 of the function here. */
6185
6186 ptr = (const uschar *)pattern + skipatstart;
6187 code = (uschar *)codestart;
6188 *code = OP_BRA;
6189 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6190 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6191 re->top_bracket = cd->bracount;
6192 re->top_backref = cd->top_backref;
6193 re->flags = cd->external_flags;
6194
6195 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6196
6197 /* If not reached end of pattern on success, there's an excess bracket. */
6198
6199 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6200
6201 /* Fill in the terminating state and check for disastrous overflow, but
6202 if debugging, leave the test till after things are printed out. */
6203
6204 *code++ = OP_END;
6205
6206 #ifndef DEBUG
6207 if (code - codestart > length) errorcode = ERR23;
6208 #endif
6209
6210 /* Fill in any forward references that are required. */
6211
6212 while (errorcode == 0 && cd->hwm > cworkspace)
6213 {
6214 int offset, recno;
6215 const uschar *groupptr;
6216 cd->hwm -= LINK_SIZE;
6217 offset = GET(cd->hwm, 0);
6218 recno = GET(codestart, offset);
6219 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6220 if (groupptr == NULL) errorcode = ERR53;
6221 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6222 }
6223
6224 /* Give an error if there's a back reference to a non-existent capturing
6225 subpattern. */
6226
6227 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6228
6229 /* Failed to compile, or error while post-processing */
6230
6231 if (errorcode != 0)
6232 {
6233 (pcre_free)(re);
6234 PCRE_EARLY_ERROR_RETURN:
6235 *erroroffset = ptr - (const uschar *)pattern;
6236 PCRE_EARLY_ERROR_RETURN2:
6237 *errorptr = find_error_text(errorcode);
6238 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6239 return NULL;
6240 }
6241
6242 /* If the anchored option was not passed, set the flag if we can determine that
6243 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6244 as starting with .* when DOTALL is set).
6245
6246 Otherwise, if we know what the first byte has to be, save it, because that
6247 speeds up unanchored matches no end. If not, see if we can set the
6248 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6249 start with ^, and also when all branches start with .* for non-DOTALL matches.
6250 */
6251
6252 if ((re->options & PCRE_ANCHORED) == 0)
6253 {
6254 int temp_options = re->options; /* May get changed during these scans */
6255 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6256 re->options |= PCRE_ANCHORED;
6257 else
6258 {
6259 if (firstbyte < 0)
6260 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6261 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6262 {
6263 int ch = firstbyte & 255;
6264 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6265 cd->fcc[ch] == ch)? ch : firstbyte;
6266 re->flags |= PCRE_FIRSTSET;
6267 }
6268 else if (is_startline(codestart, 0, cd->backref_map))
6269 re->flags |= PCRE_STARTLINE;
6270 }
6271 }
6272
6273 /* For an anchored pattern, we use the "required byte" only if it follows a
6274 variable length item in the regex. Remove the caseless flag for non-caseable
6275 bytes. */
6276
6277 if (reqbyte >= 0 &&
6278 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6279 {
6280 int ch = reqbyte & 255;
6281 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6282 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6283 re->flags |= PCRE_REQCHSET;
6284 }
6285
6286 /* Print out the compiled data if debugging is enabled. This is never the
6287 case when building a production library. */
6288
6289 #ifdef DEBUG
6290
6291 printf("Length = %d top_bracket = %d top_backref = %d\n",
6292 length, re->top_bracket, re->top_backref);
6293
6294 printf("Options=%08x\n", re->options);
6295
6296 if ((re->flags & PCRE_FIRSTSET) != 0)
6297 {
6298 int ch = re->first_byte & 255;
6299 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6300 "" : " (caseless)";
6301 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6302 else printf("First char = \\x%02x%s\n", ch, caseless);
6303 }
6304
6305 if ((re->flags & PCRE_REQCHSET) != 0)
6306 {
6307 int ch = re->req_byte & 255;
6308 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6309 "" : " (caseless)";
6310 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6311 else printf("Req char = \\x%02x%s\n", ch, caseless);
6312 }
6313
6314 pcre_printint(re, stdout, TRUE);
6315
6316 /* This check is done here in the debugging case so that the code that
6317 was compiled can be seen. */
6318
6319 if (code - codestart > length)
6320 {
6321 (pcre_free)(re);
6322 *errorptr = find_error_text(ERR23);
6323 *erroroffset = ptr - (uschar *)pattern;
6324 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6325 return NULL;
6326 }
6327 #endif /* DEBUG */
6328
6329 return (pcre *)re;
6330 }
6331
6332 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12