/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 268 - (show annotations) (download)
Thu Nov 15 10:28:09 2007 UTC (6 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 194795 byte(s)
Fix (?&) non-diagnosis bug and missing length check for (?&a) etc.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte-array bitmap a (byte b/8, bit b%8 within that byte). Each argument is
parenthesized so that an expression such as "c + 1" is treated as a unit, and
the whole expansion is parenthesized so that it behaves as one expression. Note
that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
/* check_escape() indexes this table with (c - '0'), so the first entry is for
'0' and the last entry is for 'z'. */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
/* check_escape() indexes this table with (c - 0x48); the hexadecimal value in
the comment at the start of each row is the code of that row's first entry. */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
146
147 typedef struct verbitem {
148 int len;
149 int op;
150 } verbitem;
151
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
160
161 static verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
169 };
170
171 static int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence. */
179
/* The class names, separated by binary zeros. */
180 static const char posix_names[] =
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
184
/* The length of each name above, in the same order; the final zero entry marks
the end of the list. */
185 static const uschar posix_name_lengths[] = {
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
197
/* Three values per class, one row per class in the order shown by the row
comments: base map offset, second map offset (or -1), and the add/subtract and
tweaking value. */
198 static const int posix_class_maps[] = {
199 cbit_word, cbit_digit, -2, /* alpha */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii */
204 cbit_space, -1, 1, /* blank - a GNU extension */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
213 };
214
215
/* STRING turns its argument into a string literal as written. XSTRING goes
through a second macro level so that an argument which is itself a macro is
expanded first; the error texts in this file use it to insert the values of
MAX_NAME_SIZE and MAX_NAME_COUNT into messages. */
216 #define STRING(a) # a
217 #define XSTRING(s) STRING(s)
218
219 /* The texts of compile-time error messages. These are "char *" because they
220 are passed to the outside world. Do not ever re-use any error number, because
221 they are documented. Always add a new error instead. Messages marked DEAD below
222 are no longer used. This used to be a table of strings, but in order to reduce
223 the number of relocations needed when a shared library is loaded dynamically,
224 it is now one long string. We cannot use a table of offsets, because the
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226 simply count through to the one we want - this isn't a performance issue
227 because these strings are used only when there is a compilation error. */
228
/* All the messages, in error-number order, each terminated by a binary zero.
The last message has no explicit "\0"; it is ended by the terminator of the
string literal itself. find_error_text() locates message n by skipping n
terminators from the start. The numbered comments mark every fifth message. */
229 static const char error_texts[] =
230 "no error\0"
231 "\\ at end of pattern\0"
232 "\\c at end of pattern\0"
233 "unrecognized character follows \\\0"
234 "numbers out of order in {} quantifier\0"
235 /* 5 */
236 "number too big in {} quantifier\0"
237 "missing terminating ] for character class\0"
238 "invalid escape sequence in character class\0"
239 "range out of order in character class\0"
240 "nothing to repeat\0"
241 /* 10 */
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243 "internal error: unexpected repeat\0"
244 "unrecognized character after (?\0"
245 "POSIX named classes are supported only within a class\0"
246 "missing )\0"
247 /* 15 */
248 "reference to non-existent subpattern\0"
249 "erroffset passed as NULL\0"
250 "unknown option bit(s) set\0"
251 "missing ) after comment\0"
252 "parentheses nested too deeply\0" /** DEAD **/
253 /* 20 */
254 "regular expression is too large\0"
255 "failed to get memory\0"
256 "unmatched parentheses\0"
257 "internal error: code overflow\0"
258 "unrecognized character after (?<\0"
259 /* 25 */
260 "lookbehind assertion is not fixed length\0"
261 "malformed number or name after (?(\0"
262 "conditional group contains more than two branches\0"
263 "assertion expected after (?(\0"
264 "(?R or (?[+-]digits must be followed by )\0"
265 /* 30 */
266 "unknown POSIX class name\0"
267 "POSIX collating elements are not supported\0"
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269 "spare error\0" /** DEAD **/
270 "character value in \\x{...} sequence is too large\0"
271 /* 35 */
272 "invalid condition (?(0)\0"
273 "\\C not allowed in lookbehind assertion\0"
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275 "number after (?C is > 255\0"
276 "closing ) for (?C expected\0"
277 /* 40 */
278 "recursive call could loop indefinitely\0"
279 "unrecognized character after (?P\0"
280 "syntax error in subpattern name (missing terminator)\0"
281 "two named subpatterns have the same name\0"
282 "invalid UTF-8 string\0"
283 /* 45 */
284 "support for \\P, \\p, and \\X has not been compiled\0"
285 "malformed \\P or \\p sequence\0"
286 "unknown property name after \\P or \\p\0"
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 /* 50 */
290 "repeated subpattern is too long\0" /** DEAD **/
291 "octal value is greater than \\377 (not in UTF-8 mode)\0"
292 "internal error: overran compiling workspace\0"
293 "internal error: previously-checked referenced subpattern not found\0"
294 "DEFINE group contains more than one branch\0"
295 /* 55 */
296 "repeating a DEFINE group is not allowed\0"
297 "inconsistent NEWLINE options\0"
298 "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300 "(*VERB) with an argument is not supported\0"
301 /* 60 */
302 "(*VERB) not recognized\0"
303 "number is too big\0"
304 "subpattern name expected after (?&";
305
306
307 /* Table to identify digits and hex digits. This is used when compiling
308 patterns. Note that the tables in chartables are dependent on the locale, and
309 may mark arbitrary characters as digits - but the PCRE compiling code expects
310 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
311 a private table here. It costs 256 bytes, but it is a lot faster than doing
312 character value tests (at least in some simple cases I timed), and in some
313 applications one wants PCRE to compile efficiently as well as match
314 efficiently.
315
316 For convenience, we use the same bit definitions as in chartables:
317
318 0x04 decimal digit
319 0x08 hexadecimal digit
320
321 Then we can use ctype_digit and ctype_xdigit in the code. */
322
323 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* One entry per byte value (32 rows of 8). 0x0c marks a character that is both
a decimal and a hexadecimal digit; 0x08 marks one that is a hexadecimal digit
only. */
324 static const unsigned char digitab[] =
325 {
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
332 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
333 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
334 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
338 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
358
359 #else /* This is the "abnormal" case, for EBCDIC systems */
/* Same layout and bit values as the ASCII table, but with the digit and letter
entries at their EBCDIC code positions. */
360 static const unsigned char digitab[] =
361 {
362 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
378 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
386 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
392 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
393 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
394
/* check_escape() tests an entry of this table against the mask 0x0E to decide
whether an EBCDIC character is alphameric. */
395 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
396 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
397 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
398 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
399 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
400 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
404 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
405 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
406 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
407 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
408 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
409 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
412 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
413 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
414 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
415 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
416 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
417 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
418 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
419 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
420 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
421 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
422 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
423 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
424 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
425 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
426 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
427 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
428 #endif
429
430
431 /* Definition to allow mutual recursion */
432
/* Forward declaration only: the definition of compile_regex() comes later in
the file. NOTE(review): the parameters are unnamed here; see the definition for
their meanings. */
433 static BOOL
434 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
435 int *, int *, branch_chain *, compile_data *, int *);
436
437
438
439 /*************************************************
440 * Find an error text *
441 *************************************************/
442
443 /* The error texts are now all in one long string, to save on relocations. As
444 some of the text is of unknown length, we can't use a table of offsets.
445 Instead, just count through the strings. This is not a performance issue
446 because it happens only when there has been a compilation error.
447
448 Argument: the error number
449 Returns: pointer to the error string
450 */
451
452 static const char *
453 find_error_text(int n)
454 {
455 const char *s = error_texts;
456 for (; n > 0; n--) while (*s++ != 0);
457 return s;
458 }
459
460
461 /*************************************************
462 * Handle escapes *
463 *************************************************/
464
465 /* This function is called when a \ has been encountered. It either returns a
466 positive value for a simple escape such as \n, or a negative value which
467 encodes one of the more complicated things such as \d. A backreference to group
468 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
469 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
470 ptr is pointing at the \. On exit, it is on the final character of the escape
471 sequence.
472
473 Arguments:
474 ptrptr points to the pattern position pointer
475 errorcodeptr points to the errorcode variable
476 bracount number of previous extracting brackets
477 options the options bits
478 isclass TRUE if inside a character class
479
480 Returns: zero or positive => a data character
481 negative => a special escape sequence
482 on error, errorcodeptr is set
483 */
484
485 static int
486 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
487 int options, BOOL isclass)
488 {
489 BOOL utf8 = (options & PCRE_UTF8) != 0;
490 const uschar *ptr = *ptrptr + 1;
491 int c, i;
492
493 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
494 ptr--; /* Set pointer back to the last byte */
495
496 /* If backslash is at the end of the pattern, it's an error. */
497
498 if (c == 0) *errorcodeptr = ERR1;
499
500 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
501 a table. A non-zero result is something that can be returned immediately.
502 Otherwise further processing may be required. */
503
504 #ifndef EBCDIC /* ASCII coding */
505 else if (c < '0' || c > 'z') {} /* Not alphameric */
506 else if ((i = escapes[c - '0']) != 0) c = i;
507
508 #else /* EBCDIC coding */
509 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
510 else if ((i = escapes[c - 0x48]) != 0) c = i;
511 #endif
512
513 /* Escapes that need further processing, or are illegal. */
514
515 else
516 {
517 const uschar *oldptr;
518 BOOL braced, negated;
519
520 switch (c)
521 {
522 /* A number of Perl escapes are not handled by PCRE. We give an explicit
523 error. */
524
525 case 'l':
526 case 'L':
527 case 'N':
528 case 'u':
529 case 'U':
530 *errorcodeptr = ERR37;
531 break;
532
533 /* \g must be followed by a number, either plain or braced. If positive, it
534 is an absolute backreference. If negative, it is a relative backreference.
535 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
536 reference to a named group. This is part of Perl's movement towards a
537 unified syntax for back references. As this is synonymous with \k{name}, we
538 fudge it up by pretending it really was \k. */
539
540 case 'g':
541 if (ptr[1] == '{')
542 {
543 const uschar *p;
544 for (p = ptr+2; *p != 0 && *p != '}'; p++)
545 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
546 if (*p != 0 && *p != '}')
547 {
548 c = -ESC_k;
549 break;
550 }
551 braced = TRUE;
552 ptr++;
553 }
554 else braced = FALSE;
555
556 if (ptr[1] == '-')
557 {
558 negated = TRUE;
559 ptr++;
560 }
561 else negated = FALSE;
562
563 c = 0;
564 while ((digitab[ptr[1]] & ctype_digit) != 0)
565 c = c * 10 + *(++ptr) - '0';
566
567 if (c < 0)
568 {
569 *errorcodeptr = ERR61;
570 break;
571 }
572
573 if (c == 0 || (braced && *(++ptr) != '}'))
574 {
575 *errorcodeptr = ERR57;
576 break;
577 }
578
579 if (negated)
580 {
581 if (c > bracount)
582 {
583 *errorcodeptr = ERR15;
584 break;
585 }
586 c = bracount - (c - 1);
587 }
588
589 c = -(ESC_REF + c);
590 break;
591
592 /* The handling of escape sequences consisting of a string of digits
593 starting with one that is not zero is not straightforward. By experiment,
594 the way Perl works seems to be as follows:
595
596 Outside a character class, the digits are read as a decimal number. If the
597 number is less than 10, or if there are that many previous extracting
598 left brackets, then it is a back reference. Otherwise, up to three octal
599 digits are read to form an escaped byte. Thus \123 is likely to be octal
600 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
601 value is greater than 377, the least significant 8 bits are taken. Inside a
602 character class, \ followed by a digit is always an octal number. */
603
604 case '1': case '2': case '3': case '4': case '5':
605 case '6': case '7': case '8': case '9':
606
607 if (!isclass)
608 {
609 oldptr = ptr;
610 c -= '0';
611 while ((digitab[ptr[1]] & ctype_digit) != 0)
612 c = c * 10 + *(++ptr) - '0';
613 if (c < 0)
614 {
615 *errorcodeptr = ERR61;
616 break;
617 }
618 if (c < 10 || c <= bracount)
619 {
620 c = -(ESC_REF + c);
621 break;
622 }
623 ptr = oldptr; /* Put the pointer back and fall through */
624 }
625
626 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
627 generates a binary zero byte and treats the digit as a following literal.
628 Thus we have to pull back the pointer by one. */
629
630 if ((c = *ptr) >= '8')
631 {
632 ptr--;
633 c = 0;
634 break;
635 }
636
637 /* \0 always starts an octal number, but we may drop through to here with a
638 larger first octal digit. The original code used just to take the least
639 significant 8 bits of octal numbers (I think this is what early Perls used
640 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
641 than 3 octal digits. */
642
643 case '0':
644 c -= '0';
645 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
646 c = c * 8 + *(++ptr) - '0';
647 if (!utf8 && c > 255) *errorcodeptr = ERR51;
648 break;
649
650 /* \x is complicated. \x{ddd} is a character number which can be greater
651 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
652 treated as a data character. */
653
654 case 'x':
655 if (ptr[1] == '{')
656 {
657 const uschar *pt = ptr + 2;
658 int count = 0;
659
660 c = 0;
661 while ((digitab[*pt] & ctype_xdigit) != 0)
662 {
663 register int cc = *pt++;
664 if (c == 0 && cc == '0') continue; /* Leading zeroes */
665 count++;
666
667 #ifndef EBCDIC /* ASCII coding */
668 if (cc >= 'a') cc -= 32; /* Convert to upper case */
669 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
670 #else /* EBCDIC coding */
671 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
672 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
673 #endif
674 }
675
676 if (*pt == '}')
677 {
678 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
679 ptr = pt;
680 break;
681 }
682
683 /* If the sequence of hex digits does not end with '}', then we don't
684 recognize this construct; fall through to the normal \x handling. */
685 }
686
687 /* Read just a single-byte hex-defined char */
688
689 c = 0;
690 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
691 {
692 int cc; /* Some compilers don't like ++ */
693 cc = *(++ptr); /* in initializers */
694 #ifndef EBCDIC /* ASCII coding */
695 if (cc >= 'a') cc -= 32; /* Convert to upper case */
696 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
697 #else /* EBCDIC coding */
698 if (cc <= 'z') cc += 64; /* Convert to upper case */
699 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
700 #endif
701 }
702 break;
703
704 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
705 This coding is ASCII-specific, but then the whole concept of \cx is
706 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
707
708 case 'c':
709 c = *(++ptr);
710 if (c == 0)
711 {
712 *errorcodeptr = ERR2;
713 break;
714 }
715
716 #ifndef EBCDIC /* ASCII coding */
717 if (c >= 'a' && c <= 'z') c -= 32;
718 c ^= 0x40;
719 #else /* EBCDIC coding */
720 if (c >= 'a' && c <= 'z') c += 64;
721 c ^= 0xC0;
722 #endif
723 break;
724
725 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
726 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
727 for Perl compatibility, it is a literal. This code looks a bit odd, but
728 there used to be some cases other than the default, and there may be again
729 in future, so I haven't "optimized" it. */
730
731 default:
732 if ((options & PCRE_EXTRA) != 0) switch(c)
733 {
734 default:
735 *errorcodeptr = ERR3;
736 break;
737 }
738 break;
739 }
740 }
741
742 *ptrptr = ptr;
743 return c;
744 }
745
746
747
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Called once \P or \p has been seen, when PCRE has been built with Unicode
property support. The property name is either the single character that
follows, or a name enclosed in {}, which may start with ^ to request negation.
The name is looked up in the table of known property names. On entry, ptrptr
is pointing at the P or p. On exit, it is pointing at the final character of
the escape sequence.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from ucp_type_table, or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The simple case: the name is just the one following character */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* Otherwise it is a name in {}, optionally preceded by ^ for negation. The
name must be terminated by } before the name buffer is full. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  for (;;)
    {
    if (len >= (int)sizeof(name) - 1) goto ERROR_RETURN;   /* Name too long */
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;                        /* No closing } */
    if (ch == '}') break;
    name[len++] = ch;
    }
  name[len] = 0;
  }

*ptrptr = ptr;

/* Look the name up in the property table by binary chop */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);
  if (cmp == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  if (cmp > 0) lo = mid + 1;
  else hi = mid;
  }

/* The name is well-formed, but is not a known property */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

/* The sequence itself is malformed */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
837
838
839
840
841 /*************************************************
842 * Check for counted repeat *
843 *************************************************/
844
845 /* This function is called when a '{' is encountered in a place where it might
846 start a quantifier. It looks ahead to see if it really is a quantifier or not.
847 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
848 where the ddds are digits.
849
850 Arguments:
851 p pointer to the first char after '{'
852
853 Returns: TRUE or FALSE
854 */
855
856 static BOOL
857 is_counted_repeat(const uschar *p)
858 {
859 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
860 while ((digitab[*p] & ctype_digit) != 0) p++;
861 if (*p == '}') return TRUE;
862
863 if (*p++ != ',') return FALSE;
864 if (*p == '}') return TRUE;
865
866 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
867 while ((digitab[*p] & ctype_digit) != 0) p++;
868
869 return (*p == '}');
870 }
871
872
873
874 /*************************************************
875 * Read repeat counts *
876 *************************************************/
877
878 /* Read an item of the form {n,m} and return the values. This is called only
879 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
880 so the syntax is guaranteed to be correct, but we need to check the values.
881
882 Arguments:
883 p pointer to first char after '{'
884 minp pointer to int for min
885 maxp pointer to int for max
886 returned as -1 if no max
887 errorcodeptr points to error code variable
888
889 Returns: pointer to '}' on success;
890 current ptr on error, with errorcodeptr set non-zero
891 */
892
893 static const uschar *
894 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
895 {
896 int min = 0;
897 int max = -1;
898
899 /* Read the minimum value and do a paranoid check: a negative value indicates
900 an integer overflow. */
901
902 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
903 if (min < 0 || min > 65535)
904 {
905 *errorcodeptr = ERR5;
906 return p;
907 }
908
909 /* Read the maximum value if there is one, and again do a paranoid on its size.
910 Also, max must not be less than min. */
911
912 if (*p == '}') max = min; else
913 {
914 if (*(++p) != '}')
915 {
916 max = 0;
917 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
918 if (max < 0 || max > 65535)
919 {
920 *errorcodeptr = ERR5;
921 return p;
922 }
923 if (max < min)
924 {
925 *errorcodeptr = ERR4;
926 return p;
927 }
928 }
929 }
930
931 /* Fill in the required variables, and pass back the pointer to the terminating
932 '}'. */
933
934 *minp = min;
935 *maxp = max;
936 return p;
937 }
938
939
940
941 /*************************************************
942 * Find forward referenced subpattern *
943 *************************************************/
944
945 /* This function scans along a pattern's text looking for capturing
946 subpatterns, and counting them. If it finds a named pattern that matches the
947 name it is given, it returns its number. Alternatively, if the name is NULL, it
948 returns when it reaches a given numbered subpattern. This is used for forward
949 references to subpatterns. We know that if (?P< is encountered, the name will
950 be terminated by '>' because that is checked in the first pass.
951
952 Arguments:
953 ptr current position in the pattern
954 count current count of capturing parens so far encountered
955 name name to seek, or NULL if seeking a numbered subpattern
956 lorn name length, or subpattern number if name is NULL
957 xmode TRUE if we are in /x mode
958
959 Returns: the number of the named subpattern, or -1 if not found
960 */
961
962 static int
963 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
964 BOOL xmode)
965 {
966 const uschar *thisname;
967
968 for (; *ptr != 0; ptr++)
969 {
970 int term;
971
972 /* Skip over backslashed characters and also entire \Q...\E */
973
974 if (*ptr == '\\')
975 {
976 if (*(++ptr) == 0) return -1;
977 if (*ptr == 'Q') for (;;)
978 {
979 while (*(++ptr) != 0 && *ptr != '\\');
980 if (*ptr == 0) return -1;
981 if (*(++ptr) == 'E') break;
982 }
983 continue;
984 }
985
986 /* Skip over character classes */
987
988 if (*ptr == '[')
989 {
990 while (*(++ptr) != ']')
991 {
992 if (*ptr == 0) return -1;
993 if (*ptr == '\\')
994 {
995 if (*(++ptr) == 0) return -1;
996 if (*ptr == 'Q') for (;;)
997 {
998 while (*(++ptr) != 0 && *ptr != '\\');
999 if (*ptr == 0) return -1;
1000 if (*(++ptr) == 'E') break;
1001 }
1002 continue;
1003 }
1004 }
1005 continue;
1006 }
1007
1008 /* Skip comments in /x mode */
1009
1010 if (xmode && *ptr == '#')
1011 {
1012 while (*(++ptr) != 0 && *ptr != '\n');
1013 if (*ptr == 0) return -1;
1014 continue;
1015 }
1016
1017 /* An opening parens must now be a real metacharacter */
1018
1019 if (*ptr != '(') continue;
1020 if (ptr[1] != '?' && ptr[1] != '*')
1021 {
1022 count++;
1023 if (name == NULL && count == lorn) return count;
1024 continue;
1025 }
1026
1027 ptr += 2;
1028 if (*ptr == 'P') ptr++; /* Allow optional P */
1029
1030 /* We have to disambiguate (?<! and (?<= from (?<name> */
1031
1032 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1033 *ptr != '\'')
1034 continue;
1035
1036 count++;
1037
1038 if (name == NULL && count == lorn) return count;
1039 term = *ptr++;
1040 if (term == '<') term = '>';
1041 thisname = ptr;
1042 while (*ptr != term) ptr++;
1043 if (name != NULL && lorn == ptr - thisname &&
1044 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1045 return count;
1046 }
1047
1048 return -1;
1049 }
1050
1051
1052
1053 /*************************************************
1054 * Find first significant op code *
1055 *************************************************/
1056
1057 /* This is called by several functions that scan a compiled expression looking
1058 for a fixed first character, or an anchoring op code etc. It skips over things
1059 that do not influence this. For some calls, a change of option is important.
1060 For some calls, it makes sense to skip negative forward and all backward
1061 assertions, and also the \b assertion; for others it does not.
1062
1063 Arguments:
1064 code pointer to the start of the group
1065 options pointer to external options
1066 optbit the option bit whose changing is significant, or
1067 zero if none are
1068 skipassert TRUE if certain assertions are to be skipped
1069
1070 Returns: pointer to the first significant opcode
1071 */
1072
1073 static const uschar*
1074 first_significant_code(const uschar *code, int *options, int optbit,
1075 BOOL skipassert)
1076 {
1077 for (;;)
1078 {
1079 switch ((int)*code)
1080 {
1081 case OP_OPT:
1082 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1083 *options = (int)code[1];
1084 code += 2;
1085 break;
1086
1087 case OP_ASSERT_NOT:
1088 case OP_ASSERTBACK:
1089 case OP_ASSERTBACK_NOT:
1090 if (!skipassert) return code;
1091 do code += GET(code, 1); while (*code == OP_ALT);
1092 code += _pcre_OP_lengths[*code];
1093 break;
1094
1095 case OP_WORD_BOUNDARY:
1096 case OP_NOT_WORD_BOUNDARY:
1097 if (!skipassert) return code;
1098 /* Fall through */
1099
1100 case OP_CALLOUT:
1101 case OP_CREF:
1102 case OP_RREF:
1103 case OP_DEF:
1104 code += _pcre_OP_lengths[*code];
1105 break;
1106
1107 default:
1108 return code;
1109 }
1110 }
1111 /* Control never reaches here */
1112 }
1113
1114
1115
1116
1117 /*************************************************
1118 * Find the fixed length of a pattern *
1119 *************************************************/
1120
1121 /* Scan a pattern and compute the fixed length of subject that will match it,
1122 if the length is fixed. This is needed for dealing with backward assertions.
1123 In UTF8 mode, the result is in characters rather than bytes.
1124
1125 Arguments:
1126 code points to the start of the pattern (the bracket)
1127 options the compiling options
1128
1129 Returns: the fixed length, or -1 if there is no fixed length,
1130 or -2 if \C was encountered
1131 */
1132
1133 static int
1134 find_fixedlength(uschar *code, int options)
1135 {
1136 int length = -1;
1137
1138 register int branchlength = 0;
1139 register uschar *cc = code + 1 + LINK_SIZE;
1140
1141 /* Scan along the opcodes for this branch. If we get to the end of the
1142 branch, check the length against that of the other branches. */
1143
1144 for (;;)
1145 {
1146 int d;
1147 register int op = *cc;
1148 switch (op)
1149 {
1150 case OP_CBRA:
1151 case OP_BRA:
1152 case OP_ONCE:
1153 case OP_COND:
1154 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1155 if (d < 0) return d;
1156 branchlength += d;
1157 do cc += GET(cc, 1); while (*cc == OP_ALT);
1158 cc += 1 + LINK_SIZE;
1159 break;
1160
1161 /* Reached end of a branch; if it's a ket it is the end of a nested
1162 call. If it's ALT it is an alternation in a nested call. If it is
1163 END it's the end of the outer call. All can be handled by the same code. */
1164
1165 case OP_ALT:
1166 case OP_KET:
1167 case OP_KETRMAX:
1168 case OP_KETRMIN:
1169 case OP_END:
1170 if (length < 0) length = branchlength;
1171 else if (length != branchlength) return -1;
1172 if (*cc != OP_ALT) return length;
1173 cc += 1 + LINK_SIZE;
1174 branchlength = 0;
1175 break;
1176
1177 /* Skip over assertive subpatterns */
1178
1179 case OP_ASSERT:
1180 case OP_ASSERT_NOT:
1181 case OP_ASSERTBACK:
1182 case OP_ASSERTBACK_NOT:
1183 do cc += GET(cc, 1); while (*cc == OP_ALT);
1184 /* Fall through */
1185
1186 /* Skip over things that don't match chars */
1187
1188 case OP_REVERSE:
1189 case OP_CREF:
1190 case OP_RREF:
1191 case OP_DEF:
1192 case OP_OPT:
1193 case OP_CALLOUT:
1194 case OP_SOD:
1195 case OP_SOM:
1196 case OP_EOD:
1197 case OP_EODN:
1198 case OP_CIRC:
1199 case OP_DOLL:
1200 case OP_NOT_WORD_BOUNDARY:
1201 case OP_WORD_BOUNDARY:
1202 cc += _pcre_OP_lengths[*cc];
1203 break;
1204
1205 /* Handle literal characters */
1206
1207 case OP_CHAR:
1208 case OP_CHARNC:
1209 case OP_NOT:
1210 branchlength++;
1211 cc += 2;
1212 #ifdef SUPPORT_UTF8
1213 if ((options & PCRE_UTF8) != 0)
1214 {
1215 while ((*cc & 0xc0) == 0x80) cc++;
1216 }
1217 #endif
1218 break;
1219
1220 /* Handle exact repetitions. The count is already in characters, but we
1221 need to skip over a multibyte character in UTF8 mode. */
1222
1223 case OP_EXACT:
1224 branchlength += GET2(cc,1);
1225 cc += 4;
1226 #ifdef SUPPORT_UTF8
1227 if ((options & PCRE_UTF8) != 0)
1228 {
1229 while((*cc & 0x80) == 0x80) cc++;
1230 }
1231 #endif
1232 break;
1233
1234 case OP_TYPEEXACT:
1235 branchlength += GET2(cc,1);
1236 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1237 cc += 4;
1238 break;
1239
1240 /* Handle single-char matchers */
1241
1242 case OP_PROP:
1243 case OP_NOTPROP:
1244 cc += 2;
1245 /* Fall through */
1246
1247 case OP_NOT_DIGIT:
1248 case OP_DIGIT:
1249 case OP_NOT_WHITESPACE:
1250 case OP_WHITESPACE:
1251 case OP_NOT_WORDCHAR:
1252 case OP_WORDCHAR:
1253 case OP_ANY:
1254 branchlength++;
1255 cc++;
1256 break;
1257
1258 /* The single-byte matcher isn't allowed */
1259
1260 case OP_ANYBYTE:
1261 return -2;
1262
1263 /* Check a class for variable quantification */
1264
1265 #ifdef SUPPORT_UTF8
1266 case OP_XCLASS:
1267 cc += GET(cc, 1) - 33;
1268 /* Fall through */
1269 #endif
1270
1271 case OP_CLASS:
1272 case OP_NCLASS:
1273 cc += 33;
1274
1275 switch (*cc)
1276 {
1277 case OP_CRSTAR:
1278 case OP_CRMINSTAR:
1279 case OP_CRQUERY:
1280 case OP_CRMINQUERY:
1281 return -1;
1282
1283 case OP_CRRANGE:
1284 case OP_CRMINRANGE:
1285 if (GET2(cc,1) != GET2(cc,3)) return -1;
1286 branchlength += GET2(cc,1);
1287 cc += 5;
1288 break;
1289
1290 default:
1291 branchlength++;
1292 }
1293 break;
1294
1295 /* Anything else is variable length */
1296
1297 default:
1298 return -1;
1299 }
1300 }
1301 /* Control never gets here */
1302 }
1303
1304
1305
1306
1307 /*************************************************
1308 * Scan compiled regex for numbered bracket *
1309 *************************************************/
1310
1311 /* This little function scans through a compiled pattern until it finds a
1312 capturing bracket with the given number.
1313
1314 Arguments:
1315 code points to start of expression
1316 utf8 TRUE in UTF-8 mode
1317 number the required bracket number
1318
1319 Returns: pointer to the opcode for the bracket, or NULL if not found
1320 */
1321
1322 static const uschar *
1323 find_bracket(const uschar *code, BOOL utf8, int number)
1324 {
1325 for (;;)
1326 {
1327 register int c = *code;
1328 if (c == OP_END) return NULL;
1329
1330 /* XCLASS is used for classes that cannot be represented just by a bit
1331 map. This includes negated single high-valued characters. The length in
1332 the table is zero; the actual length is stored in the compiled code. */
1333
1334 if (c == OP_XCLASS) code += GET(code, 1);
1335
1336 /* Handle capturing bracket */
1337
1338 else if (c == OP_CBRA)
1339 {
1340 int n = GET2(code, 1+LINK_SIZE);
1341 if (n == number) return (uschar *)code;
1342 code += _pcre_OP_lengths[c];
1343 }
1344
1345 /* Otherwise, we can get the item's length from the table, except that for
1346 repeated character types, we have to test for \p and \P, which have an extra
1347 two bytes of parameters. */
1348
1349 else
1350 {
1351 switch(c)
1352 {
1353 case OP_TYPESTAR:
1354 case OP_TYPEMINSTAR:
1355 case OP_TYPEPLUS:
1356 case OP_TYPEMINPLUS:
1357 case OP_TYPEQUERY:
1358 case OP_TYPEMINQUERY:
1359 case OP_TYPEPOSSTAR:
1360 case OP_TYPEPOSPLUS:
1361 case OP_TYPEPOSQUERY:
1362 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1363 break;
1364
1365 case OP_TYPEUPTO:
1366 case OP_TYPEMINUPTO:
1367 case OP_TYPEEXACT:
1368 case OP_TYPEPOSUPTO:
1369 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1370 break;
1371 }
1372
1373 /* Add in the fixed length from the table */
1374
1375 code += _pcre_OP_lengths[c];
1376
1377 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1378 a multi-byte character. The length in the table is a minimum, so we have to
1379 arrange to skip the extra bytes. */
1380
1381 #ifdef SUPPORT_UTF8
1382 if (utf8) switch(c)
1383 {
1384 case OP_CHAR:
1385 case OP_CHARNC:
1386 case OP_EXACT:
1387 case OP_UPTO:
1388 case OP_MINUPTO:
1389 case OP_POSUPTO:
1390 case OP_STAR:
1391 case OP_MINSTAR:
1392 case OP_POSSTAR:
1393 case OP_PLUS:
1394 case OP_MINPLUS:
1395 case OP_POSPLUS:
1396 case OP_QUERY:
1397 case OP_MINQUERY:
1398 case OP_POSQUERY:
1399 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1400 break;
1401 }
1402 #endif
1403 }
1404 }
1405 }
1406
1407
1408
1409 /*************************************************
1410 * Scan compiled regex for recursion reference *
1411 *************************************************/
1412
1413 /* This little function scans through a compiled pattern until it finds an
1414 instance of OP_RECURSE.
1415
1416 Arguments:
1417 code points to start of expression
1418 utf8 TRUE in UTF-8 mode
1419
1420 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1421 */
1422
1423 static const uschar *
1424 find_recurse(const uschar *code, BOOL utf8)
1425 {
1426 for (;;)
1427 {
1428 register int c = *code;
1429 if (c == OP_END) return NULL;
1430 if (c == OP_RECURSE) return code;
1431
1432 /* XCLASS is used for classes that cannot be represented just by a bit
1433 map. This includes negated single high-valued characters. The length in
1434 the table is zero; the actual length is stored in the compiled code. */
1435
1436 if (c == OP_XCLASS) code += GET(code, 1);
1437
1438 /* Otherwise, we can get the item's length from the table, except that for
1439 repeated character types, we have to test for \p and \P, which have an extra
1440 two bytes of parameters. */
1441
1442 else
1443 {
1444 switch(c)
1445 {
1446 case OP_TYPESTAR:
1447 case OP_TYPEMINSTAR:
1448 case OP_TYPEPLUS:
1449 case OP_TYPEMINPLUS:
1450 case OP_TYPEQUERY:
1451 case OP_TYPEMINQUERY:
1452 case OP_TYPEPOSSTAR:
1453 case OP_TYPEPOSPLUS:
1454 case OP_TYPEPOSQUERY:
1455 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1456 break;
1457
1458 case OP_TYPEPOSUPTO:
1459 case OP_TYPEUPTO:
1460 case OP_TYPEMINUPTO:
1461 case OP_TYPEEXACT:
1462 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1463 break;
1464 }
1465
1466 /* Add in the fixed length from the table */
1467
1468 code += _pcre_OP_lengths[c];
1469
1470 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1471 by a multi-byte character. The length in the table is a minimum, so we have
1472 to arrange to skip the extra bytes. */
1473
1474 #ifdef SUPPORT_UTF8
1475 if (utf8) switch(c)
1476 {
1477 case OP_CHAR:
1478 case OP_CHARNC:
1479 case OP_EXACT:
1480 case OP_UPTO:
1481 case OP_MINUPTO:
1482 case OP_POSUPTO:
1483 case OP_STAR:
1484 case OP_MINSTAR:
1485 case OP_POSSTAR:
1486 case OP_PLUS:
1487 case OP_MINPLUS:
1488 case OP_POSPLUS:
1489 case OP_QUERY:
1490 case OP_MINQUERY:
1491 case OP_POSQUERY:
1492 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1493 break;
1494 }
1495 #endif
1496 }
1497 }
1498 }
1499
1500
1501
1502 /*************************************************
1503 * Scan compiled branch for non-emptiness *
1504 *************************************************/
1505
1506 /* This function scans through a branch of a compiled pattern to see whether it
1507 can match the empty string or not. It is called from could_be_empty()
1508 below and from compile_branch() when checking for an unlimited repeat of a
1509 group that can match nothing. Note that first_significant_code() skips over
1510 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1511 struck an inner bracket whose current branch will already have been scanned.
1512
1513 Arguments:
1514 code points to start of search
1515 endcode points to where to stop
1516 utf8 TRUE if in UTF8 mode
1517
1518 Returns: TRUE if what is matched could be empty
1519 */
1520
1521 static BOOL
1522 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1523 {
1524 register int c;
1525 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1526 code < endcode;
1527 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1528 {
1529 const uschar *ccode;
1530
1531 c = *code;
1532
1533 /* Groups with zero repeats can of course be empty; skip them. */
1534
1535 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1536 {
1537 code += _pcre_OP_lengths[c];
1538 do code += GET(code, 1); while (*code == OP_ALT);
1539 c = *code;
1540 continue;
1541 }
1542
1543 /* For other groups, scan the branches. */
1544
1545 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1546 {
1547 BOOL empty_branch;
1548 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1549
1550 /* Scan a closed bracket */
1551
1552 empty_branch = FALSE;
1553 do
1554 {
1555 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1556 empty_branch = TRUE;
1557 code += GET(code, 1);
1558 }
1559 while (*code == OP_ALT);
1560 if (!empty_branch) return FALSE; /* All branches are non-empty */
1561 c = *code;
1562 continue;
1563 }
1564
1565 /* Handle the other opcodes */
1566
1567 switch (c)
1568 {
1569 /* Check for quantifiers after a class. XCLASS is used for classes that
1570 cannot be represented just by a bit map. This includes negated single
1571 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1572 actual length is stored in the compiled code, so we must update "code"
1573 here. */
1574
1575 #ifdef SUPPORT_UTF8
1576 case OP_XCLASS:
1577 ccode = code += GET(code, 1);
1578 goto CHECK_CLASS_REPEAT;
1579 #endif
1580
1581 case OP_CLASS:
1582 case OP_NCLASS:
1583 ccode = code + 33;
1584
1585 #ifdef SUPPORT_UTF8
1586 CHECK_CLASS_REPEAT:
1587 #endif
1588
1589 switch (*ccode)
1590 {
1591 case OP_CRSTAR: /* These could be empty; continue */
1592 case OP_CRMINSTAR:
1593 case OP_CRQUERY:
1594 case OP_CRMINQUERY:
1595 break;
1596
1597 default: /* Non-repeat => class must match */
1598 case OP_CRPLUS: /* These repeats aren't empty */
1599 case OP_CRMINPLUS:
1600 return FALSE;
1601
1602 case OP_CRRANGE:
1603 case OP_CRMINRANGE:
1604 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1605 break;
1606 }
1607 break;
1608
1609 /* Opcodes that must match a character */
1610
1611 case OP_PROP:
1612 case OP_NOTPROP:
1613 case OP_EXTUNI:
1614 case OP_NOT_DIGIT:
1615 case OP_DIGIT:
1616 case OP_NOT_WHITESPACE:
1617 case OP_WHITESPACE:
1618 case OP_NOT_WORDCHAR:
1619 case OP_WORDCHAR:
1620 case OP_ANY:
1621 case OP_ANYBYTE:
1622 case OP_CHAR:
1623 case OP_CHARNC:
1624 case OP_NOT:
1625 case OP_PLUS:
1626 case OP_MINPLUS:
1627 case OP_POSPLUS:
1628 case OP_EXACT:
1629 case OP_NOTPLUS:
1630 case OP_NOTMINPLUS:
1631 case OP_NOTPOSPLUS:
1632 case OP_NOTEXACT:
1633 case OP_TYPEPLUS:
1634 case OP_TYPEMINPLUS:
1635 case OP_TYPEPOSPLUS:
1636 case OP_TYPEEXACT:
1637 return FALSE;
1638
1639 /* These are going to continue, as they may be empty, but we have to
1640 fudge the length for the \p and \P cases. */
1641
1642 case OP_TYPESTAR:
1643 case OP_TYPEMINSTAR:
1644 case OP_TYPEPOSSTAR:
1645 case OP_TYPEQUERY:
1646 case OP_TYPEMINQUERY:
1647 case OP_TYPEPOSQUERY:
1648 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1649 break;
1650
1651 /* Same for these */
1652
1653 case OP_TYPEUPTO:
1654 case OP_TYPEMINUPTO:
1655 case OP_TYPEPOSUPTO:
1656 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1657 break;
1658
1659 /* End of branch */
1660
1661 case OP_KET:
1662 case OP_KETRMAX:
1663 case OP_KETRMIN:
1664 case OP_ALT:
1665 return TRUE;
1666
1667 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1668 MINUPTO, and POSUPTO may be followed by a multibyte character */
1669
1670 #ifdef SUPPORT_UTF8
1671 case OP_STAR:
1672 case OP_MINSTAR:
1673 case OP_POSSTAR:
1674 case OP_QUERY:
1675 case OP_MINQUERY:
1676 case OP_POSQUERY:
1677 case OP_UPTO:
1678 case OP_MINUPTO:
1679 case OP_POSUPTO:
1680 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1681 break;
1682 #endif
1683 }
1684 }
1685
1686 return TRUE;
1687 }
1688
1689
1690
1691 /*************************************************
1692 * Scan compiled regex for non-emptiness *
1693 *************************************************/
1694
1695 /* This function is called to check for left recursive calls. We want to check
1696 the current branch of the current pattern to see if it could match the empty
1697 string. If it could, we must look outwards for branches at other levels,
1698 stopping when we pass beyond the bracket which is the subject of the recursion.
1699
1700 Arguments:
1701 code points to start of the recursion
1702 endcode points to where to stop (current RECURSE item)
1703 bcptr points to the chain of current (unclosed) branch starts
1704 utf8 TRUE if in UTF-8 mode
1705
1706 Returns: TRUE if what is matched could be empty
1707 */
1708
1709 static BOOL
1710 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1711 BOOL utf8)
1712 {
1713 while (bcptr != NULL && bcptr->current >= code)
1714 {
1715 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1716 bcptr = bcptr->outer;
1717 }
1718 return TRUE;
1719 }
1720
1721
1722
1723 /*************************************************
1724 * Check for POSIX class syntax *
1725 *************************************************/
1726
1727 /* This function is called when the sequence "[:" or "[." or "[=" is
1728 encountered in a character class. It checks whether this is followed by an
1729 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1730 ".]" or "=]".
1731
1732 Argument:
1733 ptr pointer to the initial [
1734 endptr where to return the end pointer
1735 cd pointer to compile data
1736
1737 Returns: TRUE or FALSE
1738 */
1739
1740 static BOOL
1741 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1742 {
1743 int terminator; /* Don't combine these lines; the Solaris cc */
1744 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1745 if (*(++ptr) == '^') ptr++;
1746 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1747 if (*ptr == terminator && ptr[1] == ']')
1748 {
1749 *endptr = ptr;
1750 return TRUE;
1751 }
1752 return FALSE;
1753 }
1754
1755
1756
1757
1758 /*************************************************
1759 * Check POSIX class name *
1760 *************************************************/
1761
1762 /* This function is called to check the name given in a POSIX-style class entry
1763 such as [:alnum:].
1764
1765 Arguments:
1766 ptr points to the first letter
1767 len the length of the name
1768
1769 Returns: a value representing the name, or -1 if unknown
1770 */
1771
1772 static int
1773 check_posix_name(const uschar *ptr, int len)
1774 {
1775 const char *pn = posix_names;
1776 register int yield = 0;
1777 while (posix_name_lengths[yield] != 0)
1778 {
1779 if (len == posix_name_lengths[yield] &&
1780 strncmp((const char *)ptr, pn, len) == 0) return yield;
1781 pn += posix_name_lengths[yield] + 1;
1782 yield++;
1783 }
1784 return -1;
1785 }
1786
1787
1788 /*************************************************
1789 * Adjust OP_RECURSE items in repeated group *
1790 *************************************************/
1791
1792 /* OP_RECURSE items contain an offset from the start of the regex to the group
1793 that is referenced. This means that groups can be replicated for fixed
1794 repetition simply by copying (because the recursion is allowed to refer to
1795 earlier groups that are outside the current group). However, when a group is
1796 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1797 it, after it has been compiled. This means that any OP_RECURSE items within it
1798 that refer to the group itself or any contained groups have to have their
1799 offsets adjusted. That is one of the jobs of this function. Before it is called,
1800 the partially compiled regex must be temporarily terminated with OP_END.
1801
1802 This function has been extended with the possibility of forward references for
1803 recursions and subroutine calls. It must also check the list of such references
1804 for the group we are dealing with. If it finds that one of the recursions in
1805 the current group is on this list, it adjusts the offset in the list, not the
1806 value in the reference (which is a group number).
1807
1808 Arguments:
1809 group points to the start of the group
1810 adjust the amount by which the group is to be moved
1811 utf8 TRUE in UTF-8 mode
1812 cd contains pointers to tables etc.
1813 save_hwm the hwm forward reference pointer at the start of the group
1814
1815 Returns: nothing
1816 */
1817
1818 static void
1819 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1820 uschar *save_hwm)
1821 {
1822 uschar *ptr = group;
1823
1824 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1825 {
1826 int offset;
1827 uschar *hc;
1828
1829 /* See if this recursion is on the forward reference list. If so, adjust the
1830 reference. */
1831
1832 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1833 {
1834 offset = GET(hc, 0);
1835 if (cd->start_code + offset == ptr + 1)
1836 {
1837 PUT(hc, 0, offset + adjust);
1838 break;
1839 }
1840 }
1841
1842 /* Otherwise, adjust the recursion offset if it's after the start of this
1843 group. */
1844
1845 if (hc >= cd->hwm)
1846 {
1847 offset = GET(ptr, 1);
1848 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1849 }
1850
1851 ptr += 1 + LINK_SIZE;
1852 }
1853 }
1854
1855
1856
1857 /*************************************************
1858 * Insert an automatic callout point *
1859 *************************************************/
1860
1861 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1862 callout points before each pattern item.
1863
1864 Arguments:
1865 code current code pointer
1866 ptr current pattern pointer
1867 cd pointers to tables etc
1868
1869 Returns: new code pointer
1870 */
1871
1872 static uschar *
1873 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1874 {
1875 *code++ = OP_CALLOUT;
1876 *code++ = 255;
1877 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1878 PUT(code, LINK_SIZE, 0); /* Default length */
1879 return code + 2*LINK_SIZE;
1880 }
1881
1882
1883
1884 /*************************************************
1885 * Complete a callout item *
1886 *************************************************/
1887
1888 /* A callout item contains the length of the next item in the pattern, which
1889 we can't fill in till after we have reached the relevant point. This is used
1890 for both automatic and manual callouts.
1891
1892 Arguments:
1893 previous_callout points to previous callout item
1894 ptr current pattern pointer
1895 cd pointers to tables etc
1896
1897 Returns: nothing
1898 */
1899
1900 static void
1901 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1902 {
1903 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1904 PUT(previous_callout, 2 + LINK_SIZE, length);
1905 }
1906
1907
1908
1909 #ifdef SUPPORT_UCP
1910 /*************************************************
1911 * Get othercase range *
1912 *************************************************/
1913
1914 /* This function is passed the start and end of a class range, in UTF-8 mode
1915 with UCP support. It searches up the characters, looking for internal ranges of
1916 characters in the "other" case. Each call returns the next one, updating the
1917 start address.
1918
1919 Arguments:
1920 cptr points to starting character value; updated
1921 d end value
1922 ocptr where to put start of othercase range
1923 odptr where to put end of othercase range
1924
1925 Yield: TRUE when range returned; FALSE when no more
1926 */
1927
1928 static BOOL
1929 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1930 unsigned int *odptr)
1931 {
1932 unsigned int c, othercase, next;
1933
1934 for (c = *cptr; c <= d; c++)
1935 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1936
1937 if (c > d) return FALSE;
1938
1939 *ocptr = othercase;
1940 next = othercase + 1;
1941
1942 for (++c; c <= d; c++)
1943 {
1944 if (_pcre_ucp_othercase(c) != next) break;
1945 next++;
1946 }
1947
1948 *odptr = next - 1;
1949 *cptr = c;
1950
1951 return TRUE;
1952 }
1953 #endif /* SUPPORT_UCP */
1954
1955
1956
1957 /*************************************************
1958 * Check if auto-possessifying is possible *
1959 *************************************************/
1960
1961 /* This function is called for unlimited repeats of certain items, to see
1962 whether the next thing could possibly match the repeated item. If not, it makes
1963 sense to automatically possessify the repeated item.
1964
1965 Arguments:
1966 op_code the repeated op code
1967 this data for this item, depends on the opcode
1968 utf8 TRUE in UTF-8 mode
1969 utf8_char used for utf8 character bytes, NULL if not relevant
1970 ptr next character in pattern
1971 options options bits
1972 cd contains pointers to tables etc.
1973
1974 Returns: TRUE if possessifying is wanted
1975 */
1976
1977 static BOOL
1978 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1979 const uschar *ptr, int options, compile_data *cd)
1980 {
1981 int next;
1982
1983 /* Skip whitespace and comments in extended mode */
1984
1985 if ((options & PCRE_EXTENDED) != 0)
1986 {
1987 for (;;)
1988 {
1989 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1990 if (*ptr == '#')
1991 {
1992 while (*(++ptr) != 0)
1993 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1994 }
1995 else break;
1996 }
1997 }
1998
1999 /* If the next item is one that we can handle, get its value. A non-negative
2000 value is a character, a negative value is an escape value. */
2001
2002 if (*ptr == '\\')
2003 {
2004 int temperrorcode = 0;
2005 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2006 if (temperrorcode != 0) return FALSE;
2007 ptr++; /* Point after the escape sequence */
2008 }
2009
2010 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2011 {
2012 #ifdef SUPPORT_UTF8
2013 if (utf8) { GETCHARINC(next, ptr); } else
2014 #endif
2015 next = *ptr++;
2016 }
2017
2018 else return FALSE;
2019
2020 /* Skip whitespace and comments in extended mode */
2021
2022 if ((options & PCRE_EXTENDED) != 0)
2023 {
2024 for (;;)
2025 {
2026 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2027 if (*ptr == '#')
2028 {
2029 while (*(++ptr) != 0)
2030 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2031 }
2032 else break;
2033 }
2034 }
2035
2036 /* If the next thing is itself optional, we have to give up. */
2037
2038 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2039 return FALSE;
2040
2041 /* Now compare the next item with the previous opcode. If the previous is a
2042 positive single character match, "item" either contains the character or, if
2043 "item" is greater than 127 in utf8 mode, the character's bytes are in
2044 utf8_char. */
2045
2046
2047 /* Handle cases when the next item is a character. */
2048
2049 if (next >= 0) switch(op_code)
2050 {
2051 case OP_CHAR:
2052 #ifdef SUPPORT_UTF8
2053 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2054 #endif
2055 return item != next;
2056
2057 /* For CHARNC (caseless character) we must check the other case. If we have
2058 Unicode property support, we can use it to test the other case of
2059 high-valued characters. */
2060
2061 case OP_CHARNC:
2062 #ifdef SUPPORT_UTF8
2063 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2064 #endif
2065 if (item == next) return FALSE;
2066 #ifdef SUPPORT_UTF8
2067 if (utf8)
2068 {
2069 unsigned int othercase;
2070 if (next < 128) othercase = cd->fcc[next]; else
2071 #ifdef SUPPORT_UCP
2072 othercase = _pcre_ucp_othercase((unsigned int)next);
2073 #else
2074 othercase = NOTACHAR;
2075 #endif
2076 return (unsigned int)item != othercase;
2077 }
2078 else
2079 #endif /* SUPPORT_UTF8 */
2080 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2081
2082 /* For OP_NOT, "item" must be a single-byte character. */
2083
2084 case OP_NOT:
2085 if (next < 0) return FALSE; /* Not a character */
2086 if (item == next) return TRUE;
2087 if ((options & PCRE_CASELESS) == 0) return FALSE;
2088 #ifdef SUPPORT_UTF8
2089 if (utf8)
2090 {
2091 unsigned int othercase;
2092 if (next < 128) othercase = cd->fcc[next]; else
2093 #ifdef SUPPORT_UCP
2094 othercase = _pcre_ucp_othercase(next);
2095 #else
2096 othercase = NOTACHAR;
2097 #endif
2098 return (unsigned int)item == othercase;
2099 }
2100 else
2101 #endif /* SUPPORT_UTF8 */
2102 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2103
2104 case OP_DIGIT:
2105 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2106
2107 case OP_NOT_DIGIT:
2108 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2109
2110 case OP_WHITESPACE:
2111 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2112
2113 case OP_NOT_WHITESPACE:
2114 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2115
2116 case OP_WORDCHAR:
2117 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2118
2119 case OP_NOT_WORDCHAR:
2120 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2121
2122 case OP_HSPACE:
2123 case OP_NOT_HSPACE:
2124 switch(next)
2125 {
2126 case 0x09:
2127 case 0x20:
2128 case 0xa0:
2129 case 0x1680:
2130 case 0x180e:
2131 case 0x2000:
2132 case 0x2001:
2133 case 0x2002:
2134 case 0x2003:
2135 case 0x2004:
2136 case 0x2005:
2137 case 0x2006:
2138 case 0x2007:
2139 case 0x2008:
2140 case 0x2009:
2141 case 0x200A:
2142 case 0x202f:
2143 case 0x205f:
2144 case 0x3000:
2145 return op_code != OP_HSPACE;
2146 default:
2147 return op_code == OP_HSPACE;
2148 }
2149
2150 case OP_VSPACE:
2151 case OP_NOT_VSPACE:
2152 switch(next)
2153 {
2154 case 0x0a:
2155 case 0x0b:
2156 case 0x0c:
2157 case 0x0d:
2158 case 0x85:
2159 case 0x2028:
2160 case 0x2029:
2161 return op_code != OP_VSPACE;
2162 default:
2163 return op_code == OP_VSPACE;
2164 }
2165
2166 default:
2167 return FALSE;
2168 }
2169
2170
2171 /* Handle the case when the next item is \d, \s, etc. */
2172
2173 switch(op_code)
2174 {
2175 case OP_CHAR:
2176 case OP_CHARNC:
2177 #ifdef SUPPORT_UTF8
2178 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2179 #endif
2180 switch(-next)
2181 {
2182 case ESC_d:
2183 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2184
2185 case ESC_D:
2186 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2187
2188 case ESC_s:
2189 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2190
2191 case ESC_S:
2192 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2193
2194 case ESC_w:
2195 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2196
2197 case ESC_W:
2198 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2199
2200 case ESC_h:
2201 case ESC_H:
2202 switch(item)
2203 {
2204 case 0x09:
2205 case 0x20:
2206 case 0xa0:
2207 case 0x1680:
2208 case 0x180e:
2209 case 0x2000:
2210 case 0x2001:
2211 case 0x2002:
2212 case 0x2003:
2213 case 0x2004:
2214 case 0x2005:
2215 case 0x2006:
2216 case 0x2007:
2217 case 0x2008:
2218 case 0x2009:
2219 case 0x200A:
2220 case 0x202f:
2221 case 0x205f:
2222 case 0x3000:
2223 return -next != ESC_h;
2224 default:
2225 return -next == ESC_h;
2226 }
2227
2228 case ESC_v:
2229 case ESC_V:
2230 switch(item)
2231 {
2232 case 0x0a:
2233 case 0x0b:
2234 case 0x0c:
2235 case 0x0d:
2236 case 0x85:
2237 case 0x2028:
2238 case 0x2029:
2239 return -next != ESC_v;
2240 default:
2241 return -next == ESC_v;
2242 }
2243
2244 default:
2245 return FALSE;
2246 }
2247
2248 case OP_DIGIT:
2249 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2250 next == -ESC_h || next == -ESC_v;
2251
2252 case OP_NOT_DIGIT:
2253 return next == -ESC_d;
2254
2255 case OP_WHITESPACE:
2256 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2257
2258 case OP_NOT_WHITESPACE:
2259 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2260
2261 case OP_HSPACE:
2262 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2263
2264 case OP_NOT_HSPACE:
2265 return next == -ESC_h;
2266
2267 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2268 case OP_VSPACE:
2269 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2270
2271 case OP_NOT_VSPACE:
2272 return next == -ESC_v;
2273
2274 case OP_WORDCHAR:
2275 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2276
2277 case OP_NOT_WORDCHAR:
2278 return next == -ESC_w || next == -ESC_d;
2279
2280 default:
2281 return FALSE;
2282 }
2283
2284 /* Control does not reach here */
2285 }
2286
2287
2288
2289 /*************************************************
2290 * Compile one branch *
2291 *************************************************/
2292
2293 /* Scan the pattern, compiling it into a vector. If the options are
2294 changed during the branch, the pointer is used to change the external options
2295 bits. This function is used during the pre-compile phase when we are trying
2296 to find out the amount of memory needed, as well as during the real compile
2297 phase. The value of lengthptr distinguishes the two phases.
2298
2299 Arguments:
2300 optionsptr pointer to the option bits
2301 codeptr points to the pointer to the current code point
2302 ptrptr points to the current pattern pointer
2303 errorcodeptr points to error code variable
2304 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2305 reqbyteptr set to the last literal character required, else < 0
2306 bcptr points to current branch chain
2307 cd contains pointers to tables etc.
2308 lengthptr NULL during the real compile phase
2309 points to length accumulator during pre-compile phase
2310
2311 Returns: TRUE on success
2312 FALSE, with *errorcodeptr set non-zero on error
2313 */
2314
2315 static BOOL
2316 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2317 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2318 compile_data *cd, int *lengthptr)
2319 {
2320 int repeat_type, op_type;
2321 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2322 int bravalue = 0;
2323 int greedy_default, greedy_non_default;
2324 int firstbyte, reqbyte;
2325 int zeroreqbyte, zerofirstbyte;
2326 int req_caseopt, reqvary, tempreqvary;
2327 int options = *optionsptr;
2328 int after_manual_callout = 0;
2329 int length_prevgroup = 0;
2330 register int c;
2331 register uschar *code = *codeptr;
2332 uschar *last_code = code;
2333 uschar *orig_code = code;
2334 uschar *tempcode;
2335 BOOL inescq = FALSE;
2336 BOOL groupsetfirstbyte = FALSE;
2337 const uschar *ptr = *ptrptr;
2338 const uschar *tempptr;
2339 uschar *previous = NULL;
2340 uschar *previous_callout = NULL;
2341 uschar *save_hwm = NULL;
2342 uschar classbits[32];
2343
2344 #ifdef SUPPORT_UTF8
2345 BOOL class_utf8;
2346 BOOL utf8 = (options & PCRE_UTF8) != 0;
2347 uschar *class_utf8data;
2348 uschar utf8_char[6];
2349 #else
2350 BOOL utf8 = FALSE;
2351 uschar *utf8_char = NULL;
2352 #endif
2353
2354 #ifdef DEBUG
2355 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2356 #endif
2357
2358 /* Set up the default and non-default settings for greediness */
2359
2360 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2361 greedy_non_default = greedy_default ^ 1;
2362
2363 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2364 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2365 matches a non-fixed char first char; reqbyte just remains unset if we never
2366 find one.
2367
2368 When we hit a repeat whose minimum is zero, we may have to adjust these values
2369 to take the zero repeat into account. This is implemented by setting them to
2370 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2371 item types that can be repeated set these backoff variables appropriately. */
2372
2373 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2374
2375 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2376 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2377 value > 255. It is added into the firstbyte or reqbyte variables to record the
2378 case status of the value. This is used only for ASCII characters. */
2379
2380 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2381
2382 /* Switch on next character until the end of the branch */
2383
2384 for (;; ptr++)
2385 {
2386 BOOL negate_class;
2387 BOOL should_flip_negation;
2388 BOOL possessive_quantifier;
2389 BOOL is_quantifier;
2390 BOOL is_recurse;
2391 BOOL reset_bracount;
2392 int class_charcount;
2393 int class_lastchar;
2394 int newoptions;
2395 int recno;
2396 int refsign;
2397 int skipbytes;
2398 int subreqbyte;
2399 int subfirstbyte;
2400 int terminator;
2401 int mclength;
2402 uschar mcbuffer[8];
2403
2404 /* Get next byte in the pattern */
2405
2406 c = *ptr;
2407
2408 /* If we are in the pre-compile phase, accumulate the length used for the
2409 previous cycle of this loop. */
2410
2411 if (lengthptr != NULL)
2412 {
2413 #ifdef DEBUG
2414 if (code > cd->hwm) cd->hwm = code; /* High water info */
2415 #endif
2416 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2417 {
2418 *errorcodeptr = ERR52;
2419 goto FAILED;
2420 }
2421
2422 /* There is at least one situation where code goes backwards: this is the
2423 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2424 the class is simply eliminated. However, it is created first, so we have to
2425 allow memory for it. Therefore, don't ever reduce the length at this point.
2426 */
2427
2428 if (code < last_code) code = last_code;
2429
2430 /* Paranoid check for integer overflow */
2431
2432 if (OFLOW_MAX - *lengthptr < code - last_code)
2433 {
2434 *errorcodeptr = ERR20;
2435 goto FAILED;
2436 }
2437
2438 *lengthptr += code - last_code;
2439 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2440
2441 /* If "previous" is set and it is not at the start of the work space, move
2442 it back to there, in order to avoid filling up the work space. Otherwise,
2443 if "previous" is NULL, reset the current code pointer to the start. */
2444
2445 if (previous != NULL)
2446 {
2447 if (previous > orig_code)
2448 {
2449 memmove(orig_code, previous, code - previous);
2450 code -= previous - orig_code;
2451 previous = orig_code;
2452 }
2453 }
2454 else code = orig_code;
2455
2456 /* Remember where this code item starts so we can pick up the length
2457 next time round. */
2458
2459 last_code = code;
2460 }
2461
2462 /* In the real compile phase, just check the workspace used by the forward
2463 reference list. */
2464
2465 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2466 {
2467 *errorcodeptr = ERR52;
2468 goto FAILED;
2469 }
2470
2471 /* If in \Q...\E, check for the end; if not, we have a literal */
2472
2473 if (inescq && c != 0)
2474 {
2475 if (c == '\\' && ptr[1] == 'E')
2476 {
2477 inescq = FALSE;
2478 ptr++;
2479 continue;
2480 }
2481 else
2482 {
2483 if (previous_callout != NULL)
2484 {
2485 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2486 complete_callout(previous_callout, ptr, cd);
2487 previous_callout = NULL;
2488 }
2489 if ((options & PCRE_AUTO_CALLOUT) != 0)
2490 {
2491 previous_callout = code;
2492 code = auto_callout(code, ptr, cd);
2493 }
2494 goto NORMAL_CHAR;
2495 }
2496 }
2497
2498 /* Fill in length of a previous callout, except when the next thing is
2499 a quantifier. */
2500
2501 is_quantifier = c == '*' || c == '+' || c == '?' ||
2502 (c == '{' && is_counted_repeat(ptr+1));
2503
2504 if (!is_quantifier && previous_callout != NULL &&
2505 after_manual_callout-- <= 0)
2506 {
2507 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2508 complete_callout(previous_callout, ptr, cd);
2509 previous_callout = NULL;
2510 }
2511
2512 /* In extended mode, skip white space and comments */
2513
2514 if ((options & PCRE_EXTENDED) != 0)
2515 {
2516 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2517 if (c == '#')
2518 {
2519 while (*(++ptr) != 0)
2520 {
2521 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2522 }
2523 if (*ptr != 0) continue;
2524
2525 /* Else fall through to handle end of string */
2526 c = 0;
2527 }
2528 }
2529
2530 /* No auto callout for quantifiers. */
2531
2532 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2533 {
2534 previous_callout = code;
2535 code = auto_callout(code, ptr, cd);
2536 }
2537
2538 switch(c)
2539 {
2540 /* ===================================================================*/
2541 case 0: /* The branch terminates at string end */
2542 case '|': /* or | or ) */
2543 case ')':
2544 *firstbyteptr = firstbyte;
2545 *reqbyteptr = reqbyte;
2546 *codeptr = code;
2547 *ptrptr = ptr;
2548 if (lengthptr != NULL)
2549 {
2550 if (OFLOW_MAX - *lengthptr < code - last_code)
2551 {
2552 *errorcodeptr = ERR20;
2553 goto FAILED;
2554 }
2555 *lengthptr += code - last_code; /* To include callout length */
2556 DPRINTF((">> end branch\n"));
2557 }
2558 return TRUE;
2559
2560
2561 /* ===================================================================*/
2562 /* Handle single-character metacharacters. In multiline mode, ^ disables
2563 the setting of any following char as a first character. */
2564
2565 case '^':
2566 if ((options & PCRE_MULTILINE) != 0)
2567 {
2568 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2569 }
2570 previous = NULL;
2571 *code++ = OP_CIRC;
2572 break;
2573
2574 case '$':
2575 previous = NULL;
2576 *code++ = OP_DOLL;
2577 break;
2578
2579 /* There can never be a first char if '.' is first, whatever happens about
2580 repeats. The value of reqbyte doesn't change either. */
2581
2582 case '.':
2583 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2584 zerofirstbyte = firstbyte;
2585 zeroreqbyte = reqbyte;
2586 previous = code;
2587 *code++ = OP_ANY;
2588 break;
2589
2590
2591 /* ===================================================================*/
2592 /* Character classes. If the included characters are all < 256, we build a
2593 32-byte bitmap of the permitted characters, except in the special case
2594 where there is only one such character. For negated classes, we build the
2595 map as usual, then invert it at the end. However, we use a different opcode
2596 so that data characters > 255 can be handled correctly.
2597
2598 If the class contains characters outside the 0-255 range, a different
2599 opcode is compiled. It may optionally have a bit map for characters < 256,
2600 but those above are are explicitly listed afterwards. A flag byte tells
2601 whether the bitmap is present, and whether this is a negated class or not.
2602 */
2603
2604 case '[':
2605 previous = code;
2606
2607 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2608 they are encountered at the top level, so we'll do that too. */
2609
2610 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2611 check_posix_syntax(ptr, &tempptr, cd))
2612 {
2613 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2614 goto FAILED;
2615 }
2616
2617 /* If the first character is '^', set the negation flag and skip it. Also,
2618 if the first few characters (either before or after ^) are \Q\E or \E we
2619 skip them too. This makes for compatibility with Perl. */
2620
2621 negate_class = FALSE;
2622 for (;;)
2623 {
2624 c = *(++ptr);
2625 if (c == '\\')
2626 {
2627 if (ptr[1] == 'E') ptr++;
2628 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2629 else break;
2630 }
2631 else if (!negate_class && c == '^')
2632 negate_class = TRUE;
2633 else break;
2634 }
2635
2636 /* If a class contains a negative special such as \S, we need to flip the
2637 negation flag at the end, so that support for characters > 255 works
2638 correctly (they are all included in the class). */
2639
2640 should_flip_negation = FALSE;
2641
2642 /* Keep a count of chars with values < 256 so that we can optimize the case
2643 of just a single character (as long as it's < 256). However, For higher
2644 valued UTF-8 characters, we don't yet do any optimization. */
2645
2646 class_charcount = 0;
2647 class_lastchar = -1;
2648
2649 /* Initialize the 32-char bit map to all zeros. We build the map in a
2650 temporary bit of memory, in case the class contains only 1 character (less
2651 than 256), because in that case the compiled code doesn't use the bit map.
2652 */
2653
2654 memset(classbits, 0, 32 * sizeof(uschar));
2655
2656 #ifdef SUPPORT_UTF8
2657 class_utf8 = FALSE; /* No chars >= 256 */
2658 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2659 #endif
2660
2661 /* Process characters until ] is reached. By writing this as a "do" it
2662 means that an initial ] is taken as a data character. At the start of the
2663 loop, c contains the first byte of the character. */
2664
2665 if (c != 0) do
2666 {
2667 const uschar *oldptr;
2668
2669 #ifdef SUPPORT_UTF8
2670 if (utf8 && c > 127)
2671 { /* Braces are required because the */
2672 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2673 }
2674 #endif
2675
2676 /* Inside \Q...\E everything is literal except \E */
2677
2678 if (inescq)
2679 {
2680 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2681 {
2682 inescq = FALSE; /* Reset literal state */
2683 ptr++; /* Skip the 'E' */
2684 continue; /* Carry on with next */
2685 }
2686 goto CHECK_RANGE; /* Could be range if \E follows */
2687 }
2688
2689 /* Handle POSIX class names. Perl allows a negation extension of the
2690 form [:^name:]. A square bracket that doesn't match the syntax is
2691 treated as a literal. We also recognize the POSIX constructions
2692 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2693 5.6 and 5.8 do. */
2694
2695 if (c == '[' &&
2696 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2697 check_posix_syntax(ptr, &tempptr, cd))
2698 {
2699 BOOL local_negate = FALSE;
2700 int posix_class, taboffset, tabopt;
2701 register const uschar *cbits = cd->cbits;
2702 uschar pbits[32];
2703
2704 if (ptr[1] != ':')
2705 {
2706 *errorcodeptr = ERR31;
2707 goto FAILED;
2708 }
2709
2710 ptr += 2;
2711 if (*ptr == '^')
2712 {
2713 local_negate = TRUE;
2714 should_flip_negation = TRUE; /* Note negative special */
2715 ptr++;
2716 }
2717
2718 posix_class = check_posix_name(ptr, tempptr - ptr);
2719 if (posix_class < 0)
2720 {
2721 *errorcodeptr = ERR30;
2722 goto FAILED;
2723 }
2724
2725 /* If matching is caseless, upper and lower are converted to
2726 alpha. This relies on the fact that the class table starts with
2727 alpha, lower, upper as the first 3 entries. */
2728
2729 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2730 posix_class = 0;
2731
2732 /* We build the bit map for the POSIX class in a chunk of local store
2733 because we may be adding and subtracting from it, and we don't want to
2734 subtract bits that may be in the main map already. At the end we or the
2735 result into the bit map that is being built. */
2736
2737 posix_class *= 3;
2738
2739 /* Copy in the first table (always present) */
2740
2741 memcpy(pbits, cbits + posix_class_maps[posix_class],
2742 32 * sizeof(uschar));
2743
2744 /* If there is a second table, add or remove it as required. */
2745
2746 taboffset = posix_class_maps[posix_class + 1];
2747 tabopt = posix_class_maps[posix_class + 2];
2748
2749 if (taboffset >= 0)
2750 {
2751 if (tabopt >= 0)
2752 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2753 else
2754 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2755 }
2756
2757 /* Not see if we need to remove any special characters. An option
2758 value of 1 removes vertical space and 2 removes underscore. */
2759
2760 if (tabopt < 0) tabopt = -tabopt;
2761 if (tabopt == 1) pbits[1] &= ~0x3c;
2762 else if (tabopt == 2) pbits[11] &= 0x7f;
2763
2764 /* Add the POSIX table or its complement into the main table that is
2765 being built and we are done. */
2766
2767 if (local_negate)
2768 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2769 else
2770 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2771
2772 ptr = tempptr + 1;
2773 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2774 continue; /* End of POSIX syntax handling */
2775 }
2776
2777 /* Backslash may introduce a single character, or it may introduce one
2778 of the specials, which just set a flag. The sequence \b is a special
2779 case. Inside a class (and only there) it is treated as backspace.
2780 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2781 to 'or' into the one we are building. We assume they have more than one
2782 character in them, so set class_charcount bigger than one. */
2783
2784 if (c == '\\')
2785 {
2786 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2787 if (*errorcodeptr != 0) goto FAILED;
2788
2789 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2790 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2791 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2792 else if (-c == ESC_Q) /* Handle start of quoted string */
2793 {
2794 if (ptr[1] == '\\' && ptr[2] == 'E')
2795 {
2796 ptr += 2; /* avoid empty string */
2797 }
2798 else inescq = TRUE;
2799 continue;
2800 }
2801 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2802
2803 if (c < 0)
2804 {
2805 register const uschar *cbits = cd->cbits;
2806 class_charcount += 2; /* Greater than 1 is what matters */
2807
2808 /* Save time by not doing this in the pre-compile phase. */
2809
2810 if (lengthptr == NULL) switch (-c)
2811 {
2812 case ESC_d:
2813 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2814 continue;
2815
2816 case ESC_D:
2817 should_flip_negation = TRUE;
2818 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2819 continue;
2820
2821 case ESC_w:
2822 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2823 continue;
2824
2825 case ESC_W:
2826 should_flip_negation = TRUE;
2827 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2828 continue;
2829
2830 case ESC_s:
2831 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2832 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2833 continue;
2834
2835 case ESC_S:
2836 should_flip_negation = TRUE;
2837 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2838 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2839 continue;
2840
2841 case ESC_E: /* Perl ignores an orphan \E */
2842 continue;
2843
2844 default: /* Not recognized; fall through */
2845 break; /* Need "default" setting to stop compiler warning. */
2846 }
2847
2848 /* In the pre-compile phase, just do the recognition. */
2849
2850 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2851 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2852
2853 /* We need to deal with \H, \h, \V, and \v in both phases because
2854 they use extra memory. */
2855
2856 if (-c == ESC_h)
2857 {
2858 SETBIT(classbits, 0x09); /* VT */
2859 SETBIT(classbits, 0x20); /* SPACE */
2860 SETBIT(classbits, 0xa0); /* NSBP */
2861 #ifdef SUPPORT_UTF8
2862 if (utf8)
2863 {
2864 class_utf8 = TRUE;
2865 *class_utf8data++ = XCL_SINGLE;
2866 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2867 *class_utf8data++ = XCL_SINGLE;
2868 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2869 *class_utf8data++ = XCL_RANGE;
2870 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2871 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2872 *class_utf8data++ = XCL_SINGLE;
2873 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2874 *class_utf8data++ = XCL_SINGLE;
2875 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2876 *class_utf8data++ = XCL_SINGLE;
2877 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2878 }
2879 #endif
2880 continue;
2881 }
2882
2883 if (-c == ESC_H)
2884 {
2885 for (c = 0; c < 32; c++)
2886 {
2887 int x = 0xff;
2888 switch (c)
2889 {
2890 case 0x09/8: x ^= 1 << (0x09%8); break;
2891 case 0x20/8: x ^= 1 << (0x20%8); break;
2892 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2893 default: break;
2894 }
2895 classbits[c] |= x;
2896 }
2897
2898 #ifdef SUPPORT_UTF8
2899 if (utf8)
2900 {
2901 class_utf8 = TRUE;
2902 *class_utf8data++ = XCL_RANGE;
2903 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2904 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2905 *class_utf8data++ = XCL_RANGE;
2906 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2907 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2908 *class_utf8data++ = XCL_RANGE;
2909 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2910 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2911 *class_utf8data++ = XCL_RANGE;
2912 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2913 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2914 *class_utf8data++ = XCL_RANGE;
2915 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2916 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2917 *class_utf8data++ = XCL_RANGE;
2918 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2919 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2920 *class_utf8data++ = XCL_RANGE;
2921 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2922 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2923 }
2924 #endif
2925 continue;
2926 }
2927
2928 if (-c == ESC_v)
2929 {
2930 SETBIT(classbits, 0x0a); /* LF */
2931 SETBIT(classbits, 0x0b); /* VT */
2932 SETBIT(classbits, 0x0c); /* FF */
2933 SETBIT(classbits, 0x0d); /* CR */
2934 SETBIT(classbits, 0x85); /* NEL */
2935 #ifdef SUPPORT_UTF8
2936 if (utf8)
2937 {
2938 class_utf8 = TRUE;
2939 *class_utf8data++ = XCL_RANGE;
2940 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2941 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2942 }
2943 #endif
2944 continue;
2945 }
2946
2947 if (-c == ESC_V)
2948 {
2949 for (c = 0; c < 32; c++)
2950 {
2951 int x = 0xff;
2952 switch (c)
2953 {
2954 case 0x0a/8: x ^= 1 << (0x0a%8);
2955 x ^= 1 << (0x0b%8);
2956 x ^= 1 << (0x0c%8);
2957 x ^= 1 << (0x0d%8);
2958 break;
2959 case 0x85/8: x ^= 1 << (0x85%8); break;
2960 default: break;
2961 }
2962 classbits[c] |= x;
2963 }
2964
2965 #ifdef SUPPORT_UTF8
2966 if (utf8)
2967 {
2968 class_utf8 = TRUE;
2969 *class_utf8data++ = XCL_RANGE;
2970 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2971 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2972 *class_utf8data++ = XCL_RANGE;
2973 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2974 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2975 }
2976 #endif
2977 continue;
2978 }
2979
2980 /* We need to deal with \P and \p in both phases. */
2981
2982 #ifdef SUPPORT_UCP
2983 if (-c == ESC_p || -c == ESC_P)
2984 {
2985 BOOL negated;
2986 int pdata;
2987 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2988 if (ptype < 0) goto FAILED;
2989 class_utf8 = TRUE;
2990 *class_utf8data++ = ((-c == ESC_p) != negated)?
2991 XCL_PROP : XCL_NOTPROP;
2992 *class_utf8data++ = ptype;
2993 *class_utf8data++ = pdata;
2994 class_charcount -= 2; /* Not a < 256 character */
2995 continue;
2996 }
2997 #endif
2998 /* Unrecognized escapes are faulted if PCRE is running in its
2999 strict mode. By default, for compatibility with Perl, they are
3000 treated as literals. */
3001
3002 if ((options & PCRE_EXTRA) != 0)
3003 {
3004 *errorcodeptr = ERR7;
3005 goto FAILED;
3006 }
3007
3008 class_charcount -= 2; /* Undo the default count from above */
3009 c = *ptr; /* Get the final character and fall through */
3010 }
3011
3012 /* Fall through if we have a single character (c >= 0). This may be
3013 greater than 256 in UTF-8 mode. */
3014
3015 } /* End of backslash handling */
3016
3017 /* A single character may be followed by '-' to form a range. However,
3018 Perl does not permit ']' to be the end of the range. A '-' character
3019 at the end is treated as a literal. Perl ignores orphaned \E sequences
3020 entirely. The code for handling \Q and \E is messy. */
3021
3022 CHECK_RANGE:
3023 while (ptr[1] == '\\' && ptr[2] == 'E')
3024 {
3025 inescq = FALSE;
3026 ptr += 2;
3027 }
3028
3029 oldptr = ptr;
3030
3031 /* Remember \r or \n */
3032
3033 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3034
3035 /* Check for range */
3036
3037 if (!inescq && ptr[1] == '-')
3038 {
3039 int d;
3040 ptr += 2;
3041 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3042
3043 /* If we hit \Q (not followed by \E) at this point, go into escaped
3044 mode. */
3045
3046 while (*ptr == '\\' && ptr[1] == 'Q')
3047 {
3048 ptr += 2;
3049 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3050 inescq = TRUE;
3051 break;
3052 }
3053
3054 if (*ptr == 0 || (!inescq && *ptr == ']'))
3055 {
3056 ptr = oldptr;
3057 goto LONE_SINGLE_CHARACTER;
3058 }
3059
3060 #ifdef SUPPORT_UTF8
3061 if (utf8)
3062 { /* Braces are required because the */
3063 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3064 }
3065 else
3066 #endif
3067 d = *ptr; /* Not UTF-8 mode */
3068
3069 /* The second part of a range can be a single-character escape, but
3070 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3071 in such circumstances. */
3072
3073 if (!inescq && d == '\\')
3074 {
3075 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3076 if (*errorcodeptr != 0) goto FAILED;
3077
3078 /* \b is backslash; \X is literal X; \R is literal R; any other
3079 special means the '-' was literal */
3080
3081 if (d < 0)
3082 {
3083 if (d == -ESC_b) d = '\b';
3084 else if (d == -ESC_X) d = 'X';
3085 else if (d == -ESC_R) d = 'R'; else
3086 {
3087 ptr = oldptr;
3088 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3089 }
3090 }
3091 }
3092
3093 /* Check that the two values are in the correct order. Optimize
3094 one-character ranges */
3095
3096 if (d < c)
3097 {
3098 *errorcodeptr = ERR8;
3099 goto FAILED;
3100 }
3101
3102 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3103
3104 /* Remember \r or \n */
3105
3106 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3107
3108 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3109 matching, we have to use an XCLASS with extra data items. Caseless
3110 matching for characters > 127 is available only if UCP support is
3111 available. */
3112
3113 #ifdef SUPPORT_UTF8
3114 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3115 {
3116 class_utf8 = TRUE;
3117
3118 /* With UCP support, we can find the other case equivalents of
3119 the relevant characters. There may be several ranges. Optimize how
3120 they fit with the basic range. */
3121
3122 #ifdef SUPPORT_UCP
3123 if ((options & PCRE_CASELESS) != 0)
3124 {
3125 unsigned int occ, ocd;
3126 unsigned int cc = c;
3127 unsigned int origd = d;
3128 while (get_othercase_range(&cc, origd, &occ, &ocd))
3129 {
3130 if (occ >= (unsigned int)c &&
3131 ocd <= (unsigned int)d)
3132 continue; /* Skip embedded ranges */
3133
3134 if (occ < (unsigned int)c &&
3135 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3136 { /* if there is overlap, */
3137 c = occ; /* noting that if occ < c */
3138 continue; /* we can't have ocd > d */
3139 } /* because a subrange is */
3140 if (ocd > (unsigned int)d &&
3141 occ <= (unsigned int)d + 1) /* always shorter than */
3142 { /* the basic range. */
3143 d = ocd;
3144 continue;
3145 }
3146
3147 if (occ == ocd)
3148 {
3149 *class_utf8data++ = XCL_SINGLE;
3150 }
3151 else
3152 {
3153 *class_utf8data++ = XCL_RANGE;
3154 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3155 }
3156 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3157 }
3158 }
3159 #endif /* SUPPORT_UCP */
3160
3161 /* Now record the original range, possibly modified for UCP caseless
3162 overlapping ranges. */
3163
3164 *class_utf8data++ = XCL_RANGE;
3165 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3166 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3167
3168 /* With UCP support, we are done. Without UCP support, there is no
3169 caseless matching for UTF-8 characters > 127; we can use the bit map
3170 for the smaller ones. */
3171
3172 #ifdef SUPPORT_UCP
3173 continue; /* With next character in the class */
3174 #else
3175 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3176
3177 /* Adjust upper limit and fall through to set up the map */
3178
3179 d = 127;
3180
3181 #endif /* SUPPORT_UCP */
3182 }
3183 #endif /* SUPPORT_UTF8 */
3184
3185 /* We use the bit map for all cases when not in UTF-8 mode; else
3186 ranges that lie entirely within 0-127 when there is UCP support; else
3187 for partial ranges without UCP support. */
3188
3189 class_charcount += d - c + 1;
3190 class_lastchar = d;
3191
3192 /* We can save a bit of time by skipping this in the pre-compile. */
3193
3194 if (lengthptr == NULL) for (; c <= d; c++)
3195 {
3196 classbits[c/8] |= (1 << (c&7));
3197 if ((options & PCRE_CASELESS) != 0)
3198 {
3199 int uc = cd->fcc[c]; /* flip case */
3200 classbits[uc/8] |= (1 << (uc&7));
3201 }
3202 }
3203
3204 continue; /* Go get the next char in the class */
3205 }
3206
3207 /* Handle a lone single character - we can get here for a normal
3208 non-escape char, or after \ that introduces a single character or for an
3209 apparent range that isn't. */
3210
3211 LONE_SINGLE_CHARACTER:
3212
3213 /* Handle a character that cannot go in the bit map */
3214
3215 #ifdef SUPPORT_UTF8
3216 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3217 {
3218 class_utf8 = TRUE;
3219 *class_utf8data++ = XCL_SINGLE;
3220 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3221
3222 #ifdef SUPPORT_UCP
3223 if ((options & PCRE_CASELESS) != 0)
3224 {
3225 unsigned int othercase;
3226 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3227 {
3228 *class_utf8data++ = XCL_SINGLE;
3229 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3230 }
3231 }
3232 #endif /* SUPPORT_UCP */
3233
3234 }
3235 else
3236 #endif /* SUPPORT_UTF8 */
3237
3238 /* Handle a single-byte character */
3239 {
3240 classbits[c/8] |= (1 << (c&7));
3241 if ((options & PCRE_CASELESS) != 0)
3242 {
3243 c = cd->fcc[c]; /* flip case */
3244 classbits[c/8] |= (1 << (c&7));
3245 }
3246 class_charcount++;
3247 class_lastchar = c;
3248 }
3249 }
3250
3251 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3252
3253 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3254
3255 if (c == 0) /* Missing terminating ']' */
3256 {
3257 *errorcodeptr = ERR6;
3258 goto FAILED;
3259 }
3260
3261
3262 /* This code has been disabled because it would mean that \s counts as
3263 an explicit \r or \n reference, and that's not really what is wanted. Now
3264 we set the flag only if there is a literal "\r" or "\n" in the class. */
3265
3266 #if 0
3267 /* Remember whether \r or \n are in this class */
3268
3269 if (negate_class)
3270 {
3271 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3272 }
3273 else
3274 {
3275 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3276 }
3277 #endif
3278
3279
3280 /* If class_charcount is 1, we saw precisely one character whose value is
3281 less than 256. As long as there were no characters >= 128 and there was no
3282 use of \p or \P, in other words, no use of any XCLASS features, we can
3283 optimize.
3284
3285 In UTF-8 mode, we can optimize the negative case only if there were no
3286 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3287 operate on single-bytes only. This is an historical hangover. Maybe one day
3288 we can tidy these opcodes to handle multi-byte characters.
3289
3290 The optimization throws away the bit map. We turn the item into a
3291 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3292 that OP_NOT does not support multibyte characters. In the positive case, it
3293 can cause firstbyte to be set. Otherwise, there can be no first char if
3294 this item is first, whatever repeat count may follow. In the case of
3295 reqbyte, save the previous value for reinstating. */
3296
3297 #ifdef SUPPORT_UTF8
3298 if (class_charcount == 1 && !class_utf8 &&
3299 (!utf8 || !negate_class || class_lastchar < 128))
3300 #else
3301 if (class_charcount == 1)
3302 #endif
3303 {
3304 zeroreqbyte = reqbyte;
3305
3306 /* The OP_NOT opcode works on one-byte characters only. */
3307
3308 if (negate_class)
3309 {
3310 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3311 zerofirstbyte = firstbyte;
3312 *code++ = OP_NOT;
3313 *code++ = class_lastchar;
3314 break;
3315 }
3316
3317 /* For a single, positive character, get the value into mcbuffer, and
3318 then we can handle this with the normal one-character code. */
3319
3320 #ifdef SUPPORT_UTF8
3321 if (utf8 && class_lastchar > 127)
3322 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3323 else
3324 #endif
3325 {
3326 mcbuffer[0] = class_lastchar;
3327 mclength = 1;
3328 }
3329 goto ONE_CHAR;
3330 } /* End of 1-char optimization */
3331
3332 /* The general case - not the one-char optimization. If this is the first
3333 thing in the branch, there can be no first char setting, whatever the
3334 repeat count. Any reqbyte setting must remain unchanged after any kind of
3335 repeat. */
3336
3337 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3338 zerofirstbyte = firstbyte;
3339 zeroreqbyte = reqbyte;
3340
3341 /* If there are characters with values > 255, we have to compile an
3342 extended class, with its own opcode, unless there was a negated special
3343 such as \S in the class, because in that case all characters > 255 are in
3344 the class, so any that were explicitly given as well can be ignored. If
3345 (when there are explicit characters > 255 that must be listed) there are no
3346 characters < 256, we can omit the bitmap in the actual compiled code. */
3347
3348 #ifdef SUPPORT_UTF8
3349 if (class_utf8 && !should_flip_negation)
3350 {
3351 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3352 *code++ = OP_XCLASS;
3353 code += LINK_SIZE;
3354 *code = negate_class? XCL_NOT : 0;
3355
3356 /* If the map is required, move up the extra data to make room for it;
3357 otherwise just move the code pointer to the end of the extra data. */
3358
3359 if (class_charcount > 0)
3360 {
3361 *code++ |= XCL_MAP;
3362 memmove(code + 32, code, class_utf8data - code);
3363 memcpy(code, classbits, 32);
3364 code = class_utf8data + 32;
3365 }
3366 else code = class_utf8data;
3367
3368 /* Now fill in the complete length of the item */
3369
3370 PUT(previous, 1, code - previous);
3371 break; /* End of class handling */
3372 }
3373 #endif
3374
3375 /* If there are no characters > 255, set the opcode to OP_CLASS or
3376 OP_NCLASS, depending on whether the whole class was negated and whether
3377 there were negative specials such as \S in the class. Then copy the 32-byte
3378 map into the code vector, negating it if necessary. */
3379
3380 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3381 if (negate_class)
3382 {
3383 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3384 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3385 }
3386 else
3387 {
3388 memcpy(code, classbits, 32);
3389 }
3390 code += 32;
3391 break;
3392
3393
3394 /* ===================================================================*/
3395 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3396 has been tested above. */
3397
3398 case '{':
3399 if (!is_quantifier) goto NORMAL_CHAR;
3400 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3401 if (*errorcodeptr != 0) goto FAILED;
3402 goto REPEAT;
3403
3404 case '*':
3405 repeat_min = 0;
3406 repeat_max = -1;
3407 goto REPEAT;
3408
3409 case '+':
3410 repeat_min = 1;
3411 repeat_max = -1;
3412 goto REPEAT;
3413
3414 case '?':
3415 repeat_min = 0;
3416 repeat_max = 1;
3417
3418 REPEAT:
3419 if (previous == NULL)
3420 {
3421 *errorcodeptr = ERR9;
3422 goto FAILED;
3423 }
3424
3425 if (repeat_min == 0)
3426 {
3427 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3428 reqbyte = zeroreqbyte; /* Ditto */
3429 }
3430
3431 /* Remember whether this is a variable length repeat */
3432
3433 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3434
3435 op_type = 0; /* Default single-char op codes */
3436 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3437
3438 /* Save start of previous item, in case we have to move it up to make space
3439 for an inserted OP_ONCE for the additional '+' extension. */
3440
3441 tempcode = previous;
3442
3443 /* If the next character is '+', we have a possessive quantifier. This
3444 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3445 If the next character is '?' this is a minimizing repeat, by default,
3446 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3447 repeat type to the non-default. */
3448
3449 if (ptr[1] == '+')
3450 {
3451 repeat_type = 0; /* Force greedy */
3452 possessive_quantifier = TRUE;
3453 ptr++;
3454 }
3455 else if (ptr[1] == '?')
3456 {
3457 repeat_type = greedy_non_default;
3458 ptr++;
3459 }
3460 else repeat_type = greedy_default;
3461
3462 /* If previous was a character match, abolish the item and generate a
3463 repeat item instead. If a char item has a minimum of more than one, ensure
3464 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3465 the first thing in a branch because the x will have gone into firstbyte
3466 instead. */
3467
3468 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3469 {
3470 /* Deal with UTF-8 characters that take up more than one byte. It's
3471 easier to write this out separately than try to macrify it. Use c to
3472 hold the length of the character in bytes, plus 0x80 to flag that it's a
3473 length rather than a small character. */
3474
3475 #ifdef SUPPORT_UTF8
3476 if (utf8 && (code[-1] & 0x80) != 0)
3477 {
3478 uschar *lastchar = code - 1;
3479 while((*lastchar & 0xc0) == 0x80) lastchar--;
3480 c = code - lastchar; /* Length of UTF-8 character */
3481 memcpy(utf8_char, lastchar, c); /* Save the char */
3482 c |= 0x80; /* Flag c as a length */
3483 }
3484 else
3485 #endif
3486
3487 /* Handle the case of a single byte - either with no UTF8 support, or
3488 with UTF-8 disabled, or for a UTF-8 character < 128. */
3489
3490 {
3491 c = code[-1];
3492 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3493 }
3494
3495 /* If the repetition is unlimited, it pays to see if the next thing on
3496 the line is something that cannot possibly match this character. If so,
3497 automatically possessifying this item gains some performance in the case
3498 where the match fails. */
3499
3500 if (!possessive_quantifier &&
3501 repeat_max < 0 &&
3502 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3503 options, cd))
3504 {
3505 repeat_type = 0; /* Force greedy */
3506 possessive_quantifier = TRUE;
3507 }
3508
3509 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3510 }
3511
3512 /* If previous was a single negated character ([^a] or similar), we use
3513 one of the special opcodes, replacing it. The code is shared with single-
3514 character repeats by setting op_type to add a suitable offset into
3515 repeat_type. We can also test for auto-possessification. OP_NOT is
3516 currently used only for single-byte chars. */
3517
3518 else if (*previous == OP_NOT)
3519 {
3520 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3521 c = previous[1];
3522 if (!possessive_quantifier &&
3523 repeat_max < 0 &&
3524 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3525 {
3526 repeat_type = 0; /* Force greedy */
3527 possessive_quantifier = TRUE;
3528 }
3529 goto OUTPUT_SINGLE_REPEAT;
3530 }
3531
3532 /* If previous was a character type match (\d or similar), abolish it and
3533 create a suitable repeat item. The code is shared with single-character
3534 repeats by setting op_type to add a suitable offset into repeat_type. Note
3535 that the Unicode property types will be present only when SUPPORT_UCP is
3536 defined, but we don't wrap the little bits of code here because it just
3537 makes it horribly messy. */
3538
3539 else if (*previous < OP_EODN)
3540 {
3541 uschar *oldcode;
3542 int prop_type, prop_value;
3543 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3544 c = *previous;
3545
3546 if (!possessive_quantifier &&
3547 repeat_max < 0 &&
3548 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3549 {
3550 repeat_type = 0; /* Force greedy */
3551 possessive_quantifier = TRUE;
3552 }
3553
3554 OUTPUT_SINGLE_REPEAT:
3555 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3556 {
3557 prop_type = previous[1];
3558 prop_value = previous[2];
3559 }
3560 else prop_type = prop_value = -1;
3561
3562 oldcode = code;
3563 code = previous; /* Usually overwrite previous item */
3564
3565 /* If the maximum is zero then the minimum must also be zero; Perl allows
3566 this case, so we do too - by simply omitting the item altogether. */
3567
3568 if (repeat_max == 0) goto END_REPEAT;
3569
3570 /* All real repeats make it impossible to handle partial matching (maybe
3571 one day we will be able to remove this restriction). */
3572
3573 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3574
3575 /* Combine the op_type with the repeat_type */
3576
3577 repeat_type += op_type;
3578
3579 /* A minimum of zero is handled either as the special case * or ?, or as
3580 an UPTO, with the maximum given. */
3581
3582 if (repeat_min == 0)
3583 {
3584 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3585 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3586 else
3587 {
3588 *code++ = OP_UPTO + repeat_type;
3589 PUT2INC(code, 0, repeat_max);
3590 }
3591 }
3592
3593 /* A repeat minimum of 1 is optimized into some special cases. If the
3594 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3595 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3596 one less than the maximum. */
3597
3598 else if (repeat_min == 1)
3599 {
3600 if (repeat_max == -1)
3601 *code++ = OP_PLUS + repeat_type;
3602 else
3603 {
3604 code = oldcode; /* leave previous item in place */
3605 if (repeat_max == 1) goto END_REPEAT;
3606 *code++ = OP_UPTO + repeat_type;
3607 PUT2INC(code, 0, repeat_max - 1);
3608 }
3609 }
3610
3611 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3612 handled as an EXACT followed by an UPTO. */
3613
3614 else
3615 {
3616 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3617 PUT2INC(code, 0, repeat_min);
3618
3619 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3620 we have to insert the character for the previous code. For a repeated
3621 Unicode property match, there are two extra bytes that define the
3622 required property. In UTF-8 mode, long characters have their length in
3623 c, with the 0x80 bit as a flag. */
3624
3625 if (repeat_max < 0)
3626 {
3627 #ifdef SUPPORT_UTF8
3628 if (utf8 && c >= 128)
3629 {
3630 memcpy(code, utf8_char, c & 7);
3631 code += c & 7;
3632 }
3633 else
3634 #endif
3635 {
3636 *code++ = c;
3637 if (prop_type >= 0)
3638 {
3639 *code++ = prop_type;
3640 *code++ = prop_value;
3641 }
3642 }
3643 *code++ = OP_STAR + repeat_type;
3644 }
3645
3646 /* Else insert an UPTO if the max is greater than the min, again
3647 preceded by the character, for the previously inserted code. If the
3648 UPTO is just for 1 instance, we can use QUERY instead. */
3649
3650 else if (repeat_max != repeat_min)
3651 {
3652 #ifdef SUPPORT_UTF8
3653 if (utf8 && c >= 128)
3654 {
3655 memcpy(code, utf8_char, c & 7);
3656 code += c & 7;
3657 }
3658 else
3659 #endif
3660 *code++ = c;
3661 if (prop_type >= 0)
3662 {
3663 *code++ = prop_type;
3664 *code++ = prop_value;
3665 }
3666 repeat_max -= repeat_min;
3667
3668 if (repeat_max == 1)
3669 {
3670 *code++ = OP_QUERY + repeat_type;
3671 }
3672 else
3673 {
3674 *code++ = OP_UPTO + repeat_type;
3675 PUT2INC(code, 0, repeat_max);
3676 }
3677 }
3678 }
3679
3680 /* The character or character type itself comes last in all cases. */
3681
3682 #ifdef SUPPORT_UTF8
3683 if (utf8 && c >= 128)
3684 {
3685 memcpy(code, utf8_char, c & 7);
3686 code += c & 7;
3687 }
3688 else
3689 #endif
3690 *code++ = c;
3691
3692 /* For a repeated Unicode property match, there are two extra bytes that
3693 define the required property. */
3694
3695 #ifdef SUPPORT_UCP
3696 if (prop_type >= 0)
3697 {
3698 *code++ = prop_type;
3699 *code++ = prop_value;
3700 }
3701 #endif
3702 }
3703
3704 /* If previous was a character class or a back reference, we put the repeat
3705 stuff after it, but just skip the item if the repeat was {0,0}. */
3706
3707 else if (*previous == OP_CLASS ||
3708 *previous == OP_NCLASS ||
3709 #ifdef SUPPORT_UTF8
3710 *previous == OP_XCLASS ||
3711 #endif
3712 *previous == OP_REF)
3713 {
3714 if (repeat_max == 0)
3715 {
3716 code = previous;
3717 goto END_REPEAT;
3718 }
3719
3720 /* All real repeats make it impossible to handle partial matching (maybe
3721 one day we will be able to remove this restriction). */
3722
3723 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3724
3725 if (repeat_min == 0 && repeat_max == -1)
3726 *code++ = OP_CRSTAR + repeat_type;
3727 else if (repeat_min == 1 && repeat_max == -1)
3728 *code++ = OP_CRPLUS + repeat_type;
3729 else if (repeat_min == 0 && repeat_max == 1)
3730 *code++ = OP_CRQUERY + repeat_type;
3731 else
3732 {
3733 *code++ = OP_CRRANGE + repeat_type;
3734 PUT2INC(code, 0, repeat_min);
3735 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3736 PUT2INC(code, 0, repeat_max);
3737 }
3738 }
3739
3740 /* If previous was a bracket group, we may have to replicate it in certain
3741 cases. */
3742
3743 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3744 *previous == OP_ONCE || *previous == OP_COND)
3745 {
3746 register int i;
3747 int ketoffset = 0;
3748 int len = code - previous;
3749 uschar *bralink = NULL;
3750
3751 /* Repeating a DEFINE group is pointless */
3752
3753 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3754 {
3755 *errorcodeptr = ERR55;
3756 goto FAILED;
3757 }
3758
3759 /* If the maximum repeat count is unlimited, find the end of the bracket
3760 by scanning through from the start, and compute the offset back to it
3761 from the current code pointer. There may be an OP_OPT setting following
3762 the final KET, so we can't find the end just by going back from the code
3763 pointer. */
3764
3765 if (repeat_max == -1)
3766 {
3767 register uschar *ket = previous;
3768 do ket += GET(ket, 1); while (*ket != OP_KET);
3769 ketoffset = code - ket;
3770 }
3771
3772 /* The case of a zero minimum is special because of the need to stick
3773 OP_BRAZERO in front of it, and because the group appears once in the
3774 data, whereas in other cases it appears the minimum number of times. For
3775 this reason, it is simplest to treat this case separately, as otherwise
3776 the code gets far too messy. There are several special subcases when the
3777 minimum is zero. */
3778
3779 if (repeat_min == 0)
3780 {
3781 /* If the maximum is also zero, we just omit the group from the output
3782 altogether. */
3783
3784 if (repeat_max == 0)
3785 {
3786 code = previous;
3787 goto END_REPEAT;
3788 }
3789
3790 /* If the maximum is 1 or unlimited, we just have to stick in the
3791 BRAZERO and do no more at this point. However, we do need to adjust
3792 any OP_RECURSE calls inside the group that refer to the group itself or
3793 any internal or forward referenced group, because the offset is from
3794 the start of the whole regex. Temporarily terminate the pattern while
3795 doing this. */
3796
3797 if (repeat_max <= 1)
3798 {
3799 *code = OP_END;
3800 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3801 memmove(previous+1, previous, len);
3802 code++;
3803 *previous++ = OP_BRAZERO + repeat_type;
3804 }
3805
3806 /* If the maximum is greater than 1 and limited, we have to replicate
3807 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3808 The first one has to be handled carefully because it's the original
3809 copy, which has to be moved up. The remainder can be handled by code
3810 that is common with the non-zero minimum case below. We have to
3811 adjust the value of repeat_max, since one less copy is required. Once
3812 again, we may have to adjust any OP_RECURSE calls inside the group. */
3813
3814 else
3815 {
3816 int offset;
3817 *code = OP_END;
3818 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3819 memmove(previous + 2 + LINK_SIZE, previous, len);
3820 code += 2 + LINK_SIZE;
3821 *previous++ = OP_BRAZERO + repeat_type;
3822 *previous++ = OP_BRA;
3823
3824 /* We chain together the bracket offset fields that have to be
3825 filled in later when the ends of the brackets are reached. */
3826
3827 offset = (bralink == NULL)? 0 : previous - bralink;
3828 bralink = previous;
3829 PUTINC(previous, 0, offset);
3830 }
3831
3832 repeat_max--;
3833 }
3834
3835 /* If the minimum is greater than zero, replicate the group as many
3836 times as necessary, and adjust the maximum to the number of subsequent
3837 copies that we need. If we set a first char from the group, and didn't
3838 set a required char, copy the latter from the former. If there are any
3839 forward reference subroutine calls in the group, there will be entries on
3840 the workspace list; replicate these with an appropriate increment. */
3841
3842 else
3843 {
3844 if (repeat_min > 1)
3845 {
3846 /* In the pre-compile phase, we don't actually do the replication. We
3847 just adjust the length as if we had. Do some paranoid checks for
3848 potential integer overflow. */
3849
3850 if (lengthptr != NULL)
3851 {
3852 int delta = (repeat_min - 1)*length_prevgroup;
3853 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3854 (double)INT_MAX ||
3855 OFLOW_MAX - *lengthptr < delta)
3856 {
3857 *errorcodeptr = ERR20;
3858 goto FAILED;
3859 }
3860 *lengthptr += delta;
3861 }
3862
3863 /* This is compiling for real */
3864
3865 else
3866 {
3867 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3868 for (i = 1; i < repeat_min; i++)
3869 {
3870 uschar *hc;
3871 uschar *this_hwm = cd->hwm;
3872 memcpy(code, previous, len);
3873 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3874 {
3875 PUT(cd->hwm, 0, GET(hc, 0) + len);
3876 cd->hwm += LINK_SIZE;
3877 }
3878 save_hwm = this_hwm;
3879 code += len;
3880 }
3881 }
3882 }
3883
3884 if (repeat_max > 0) repeat_max -= repeat_min;
3885 }
3886
3887 /* This code is common to both the zero and non-zero minimum cases. If
3888 the maximum is limited, it replicates the group in a nested fashion,
3889 remembering the bracket starts on a stack. In the case of a zero minimum,
3890 the first one was set up above. In all cases the repeat_max now specifies
3891 the number of additional copies needed. Again, we must remember to
3892 replicate entries on the forward reference list. */
3893
3894 if (repeat_max >= 0)
3895 {
3896 /* In the pre-compile phase, we don't actually do the replication. We
3897 just adjust the length as if we had. For each repetition we must add 1
3898 to the length for BRAZERO and for all but the last repetition we must
3899 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3900 paranoid checks to avoid integer overflow. */
3901
3902 if (lengthptr != NULL && repeat_max > 0)
3903 {
3904 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3905 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3906 if ((double)repeat_max *
3907 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3908 > (double)INT_MAX ||
3909 OFLOW_MAX - *lengthptr < delta)
3910 {
3911 *errorcodeptr = ERR20;
3912 goto FAILED;
3913 }
3914 *lengthptr += delta;
3915 }
3916
3917 /* This is compiling for real */
3918
3919 else for (i = repeat_max - 1; i >= 0; i--)
3920 {
3921 uschar *hc;
3922 uschar *this_hwm = cd->hwm;
3923
3924 *code++ = OP_BRAZERO + repeat_type;
3925
3926 /* All but the final copy start a new nesting, maintaining the
3927 chain of brackets outstanding. */
3928
3929 if (i != 0)
3930 {
3931 int offset;
3932 *code++ = OP_BRA;
3933 offset = (bralink == NULL)? 0 : code - bralink;
3934 bralink = code;
3935 PUTINC(code, 0, offset);
3936 }
3937
3938 memcpy(code, previous, len);
3939 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3940 {
3941 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3942 cd->hwm += LINK_SIZE;
3943 }
3944 save_hwm = this_hwm;
3945 code += len;
3946 }
3947
3948 /* Now chain through the pending brackets, and fill in their length
3949 fields (which are holding the chain links pro tem). */
3950
3951 while (bralink != NULL)
3952 {
3953 int oldlinkoffset;
3954 int offset = code - bralink + 1;
3955 uschar *bra = code - offset;
3956 oldlinkoffset = GET(bra, 1);
3957 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3958 *code++ = OP_KET;
3959 PUTINC(code, 0, offset);
3960 PUT(bra, 1, offset);
3961 }
3962 }
3963
3964 /* If the maximum is unlimited, set a repeater in the final copy. We
3965 can't just offset backwards from the current code point, because we
3966 don't know if there's been an options resetting after the ket. The
3967 correct offset was computed above.
3968
3969 Then, when we are doing the actual compile phase, check to see whether
3970 this group is a non-atomic one that could match an empty string. If so,
3971 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3972 that runtime checking can be done. [This check is also applied to
3973 atomic groups at runtime, but in a different way.] */
3974
3975 else
3976 {
3977 uschar *ketcode = code - ketoffset;
3978 uschar *bracode = ketcode - GET(ketcode, 1);
3979 *ketcode = OP_KETRMAX + repeat_type;
3980 if (lengthptr == NULL && *bracode != OP_ONCE)
3981 {
3982 uschar *scode = bracode;
3983 do
3984 {
3985 if (could_be_empty_branch(scode, ketcode, utf8))
3986 {
3987 *bracode += OP_SBRA - OP_BRA;
3988 break;
3989 }
3990 scode += GET(scode, 1);
3991 }
3992 while (*scode == OP_ALT);
3993 }
3994 }
3995 }
3996
3997 /* Else there's some kind of shambles */
3998
3999 else
4000 {
4001 *errorcodeptr = ERR11;
4002 goto FAILED;
4003 }
4004
4005 /* If the character following a repeat is '+', or if certain optimization
4006 tests above succeeded, possessive_quantifier is TRUE. For some of the
4007 simpler opcodes, there is a special alternative opcode for this. For
4008 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4009 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4010 but the special opcodes can optimize it a bit. The repeated item starts at
4011 tempcode, not at previous, which might be the first part of a string whose
4012 (former) last char we repeated.
4013
4014 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4015 an 'upto' may follow. We skip over an 'exact' item, and then test the
4016 length of what remains before proceeding. */
4017
4018 if (possessive_quantifier)
4019 {
4020 int len;
4021 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4022 *tempcode == OP_NOTEXACT)
4023 tempcode += _pcre_OP_lengths[*tempcode];
4024 len = code - tempcode;
4025 if (len > 0) switch (*tempcode)
4026 {
4027 case OP_STAR: *tempcode = OP_POSSTAR; break;
4028 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4029 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4030 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4031
4032 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4033 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4034 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4035 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4036
4037 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4038 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4039 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4040 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4041
4042 default:
4043 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4044 code += 1 + LINK_SIZE;
4045 len += 1 + LINK_SIZE;
4046 tempcode[0] = OP_ONCE;
4047 *code++ = OP_KET;
4048 PUTINC(code, 0, len);
4049 PUT(tempcode, 1, len);
4050 break;
4051 }
4052 }
4053
4054 /* In all cases we no longer have a previous item. We also set the
4055 "follows varying string" flag for subsequently encountered reqbytes if
4056 it isn't already set and we have just passed a varying length item. */
4057
4058 END_REPEAT:
4059 previous = NULL;
4060 cd->req_varyopt |= reqvary;
4061 break;
4062
4063
4064 /* ===================================================================*/
4065 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4066 lookbehind or option setting or condition or all the other extended
4067 parenthesis forms. */
4068
4069 case '(':
4070 newoptions = options;
4071 skipbytes = 0;
4072 bravalue = OP_CBRA;
4073 save_hwm = cd->hwm;
4074 reset_bracount = FALSE;
4075
4076 /* First deal with various "verbs" that can be introduced by '*'. */
4077
4078 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4079 {
4080 int i, namelen;
4081 const char *vn = verbnames;
4082 const uschar *name = ++ptr;
4083 previous = NULL;
4084 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4085 if (*ptr == ':')
4086 {
4087 *errorcodeptr = ERR59; /* Not supported */
4088 goto FAILED;
4089 }
4090 if (*ptr != ')')
4091 {
4092 *errorcodeptr = ERR60;
4093 goto FAILED;
4094 }
4095 namelen = ptr - name;
4096 for (i = 0; i < verbcount; i++)
4097 {
4098 if (namelen == verbs[i].len &&
4099 strncmp((char *)name, vn, namelen) == 0)
4100 {
4101 *code = verbs[i].op;
4102 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4103 break;
4104 }
4105 vn += verbs[i].len + 1;
4106 }
4107 if (i < verbcount) continue;
4108 *errorcodeptr = ERR60;
4109 goto FAILED;
4110 }
4111
4112 /* Deal with the extended parentheses; all are introduced by '?', and the
4113 appearance of any of them means that this is not a capturing group. */
4114
4115 else if (*ptr == '?')
4116 {
4117 int i, set, unset, namelen;
4118 int *optset;
4119 const uschar *name;
4120 uschar *slot;
4121
4122 switch (*(++ptr))
4123 {
4124 case '#': /* Comment; skip to ket */
4125 ptr++;
4126 while (*ptr != 0 && *ptr != ')') ptr++;
4127 if (*ptr == 0)
4128 {
4129 *errorcodeptr = ERR18;
4130 goto FAILED;
4131 }
4132 continue;
4133
4134
4135 /* ------------------------------------------------------------ */
4136 case '|': /* Reset capture count for each branch */
4137 reset_bracount = TRUE;
4138 /* Fall through */
4139
4140 /* ------------------------------------------------------------ */
4141 case ':': /* Non-capturing bracket */
4142 bravalue = OP_BRA;
4143 ptr++;
4144 break;
4145
4146
4147 /* ------------------------------------------------------------ */
4148 case '(':
4149 bravalue = OP_COND; /* Conditional group */
4150
4151 /* A condition can be an assertion, a number (referring to a numbered
4152 group), a name (referring to a named group), or 'R', referring to
4153 recursion. R<digits> and R&name are also permitted for recursion tests.
4154
4155 There are several syntaxes for testing a named group: (?(name)) is used
4156 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4157
4158 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4159 be the recursive thing or the name 'R' (and similarly for 'R' followed
4160 by digits), and (b) a number could be a name that consists of digits.
4161 In both cases, we look for a name first; if not found, we try the other
4162 cases. */
4163
4164 /* For conditions that are assertions, check the syntax, and then exit
4165 the switch. This will take control down to where bracketed groups,
4166 including assertions, are processed. */
4167
4168 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4169 break;
4170
4171 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4172 below), and all need to skip 3 bytes at the start of the group. */
4173
4174 code[1+LINK_SIZE] = OP_CREF;
4175 skipbytes = 3;
4176 refsign = -1;
4177
4178 /* Check for a test for recursion in a named group. */
4179
4180 if (ptr[1] == 'R' && ptr[2] == '&')
4181 {
4182 terminator = -1;
4183 ptr += 2;
4184 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4185 }
4186
4187 /* Check for a test for a named group's having been set, using the Perl
4188 syntax (?(<name>) or (?('name') */
4189
4190 else if (ptr[1] == '<')
4191 {
4192 terminator = '>';
4193 ptr++;
4194 }
4195 else if (ptr[1] == '\'')
4196 {
4197 terminator = '\'';
4198 ptr++;
4199 }
4200 else
4201 {
4202 terminator = 0;
4203 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4204 }
4205
4206 /* We now expect to read a name; anything else is an error */
4207
4208 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4209 {
4210 ptr += 1; /* To get the right offset */
4211 *errorcodeptr = ERR28;
4212 goto FAILED;
4213 }
4214
4215 /* Read the name, but also get it as a number if it's all digits */
4216
4217 recno = 0;
4218 name = ++ptr;
4219 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4220 {
4221 if (recno >= 0)
4222 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4223 recno * 10 + *ptr - '0' : -1;
4224 ptr++;
4225 }
4226 namelen = ptr - name;
4227
4228 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4229 {
4230 ptr--; /* Error offset */
4231 *errorcodeptr = ERR26;
4232 goto FAILED;
4233 }
4234
4235 /* Do no further checking in the pre-compile phase. */
4236
4237 if (lengthptr != NULL) break;
4238
4239 /* In the real compile we do the work of looking for the actual
4240 reference. If the string started with "+" or "-" we require the rest to
4241 be digits, in which case recno will be set. */
4242
4243 if (refsign > 0)
4244 {
4245 if (recno <= 0)
4246 {
4247 *errorcodeptr = ERR58;
4248 goto FAILED;
4249 }
4250 if (refsign == '-')
4251 {
4252 recno = cd->bracount - recno + 1;
4253 if (recno <= 0)
4254 {
4255 *errorcodeptr = ERR15;
4256 goto FAILED;
4257 }
4258 }
4259 else recno += cd->bracount;
4260 PUT2(code, 2+LINK_SIZE, recno);
4261 break;
4262 }
4263
4264 /* Otherwise (did not start with "+" or "-"), start by looking for the
4265 name. */
4266
4267 slot = cd->name_table;
4268 for (i = 0; i < cd->names_found; i++)
4269 {
4270 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4271 slot += cd->name_entry_size;
4272 }
4273
4274 /* Found a previous named subpattern */
4275
4276 if (i < cd->names_found)
4277 {
4278 recno = GET2(slot, 0);
4279 PUT2(code, 2+LINK_SIZE, recno);
4280 }
4281
4282 /* Search the pattern for a forward reference */
4283
4284 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4285 (options & PCRE_EXTENDED) != 0)) > 0)
4286 {
4287 PUT2(code, 2+LINK_SIZE, i);
4288 }
4289
4290 /* If terminator == 0 it means that the name followed directly after
4291 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4292 some further alternatives to try. For the cases where terminator != 0
4293 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4294 now checked all the possibilities, so give an error. */
4295
4296 else if (terminator != 0)
4297 {
4298 *errorcodeptr = ERR15;
4299 goto FAILED;
4300 }
4301
4302 /* Check for (?(R) for recursion. Allow digits after R to specify a
4303 specific group number. */
4304
4305 else if (*name == 'R')
4306 {
4307 recno = 0;
4308 for (i = 1; i < namelen; i++)
4309 {
4310 if ((digitab[name[i]] & ctype_digit) == 0)
4311 {
4312 *errorcodeptr = ERR15;
4313 goto FAILED;
4314 }
4315 recno = recno * 10 + name[i] - '0';
4316 }
4317 if (recno == 0) recno = RREF_ANY;
4318 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4319 PUT2(code, 2+LINK_SIZE, recno);
4320 }
4321
4322 /* Similarly, check for the (?(DEFINE) "condition", which is always
4323 false. */
4324
4325 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4326 {
4327 code[1+LINK_SIZE] = OP_DEF;
4328 skipbytes = 1;
4329 }
4330
4331 /* Check for the "name" actually being a subpattern number. */
4332
4333 else if (recno > 0)
4334 {
4335 PUT2(code, 2+LINK_SIZE, recno);
4336 }
4337
4338 /* Either an unidentified subpattern, or a reference to (?(0) */
4339
4340 else
4341 {
4342 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4343 goto FAILED;
4344 }
4345 break;
4346
4347
4348 /* ------------------------------------------------------------ */
4349 case '=': /* Positive lookahead */
4350 bravalue = OP_ASSERT;
4351 ptr++;
4352 break;
4353
4354
4355 /* ------------------------------------------------------------ */
4356 case '!': /* Negative lookahead */
4357 ptr++;
4358 if (*ptr == ')') /* Optimize (?!) */
4359 {
4360 *code++ = OP_FAIL;
4361 previous = NULL;
4362 continue;
4363 }
4364 bravalue = OP_ASSERT_NOT;
4365 break;
4366
4367
4368 /* ------------------------------------------------------------ */
4369 case '<': /* Lookbehind or named define */
4370 switch (ptr[1])
4371 {
4372 case '=': /* Positive lookbehind */
4373 bravalue = OP_ASSERTBACK;
4374 ptr += 2;
4375 break;
4376
4377 case '!': /* Negative lookbehind */
4378 bravalue = OP_ASSERTBACK_NOT;
4379 ptr += 2;
4380 break;
4381
4382 default: /* Could be name define, else bad */
4383 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4384 ptr++; /* Correct offset for error */
4385 *errorcodeptr = ERR24;
4386 goto FAILED;
4387 }
4388 break;
4389
4390
4391 /* ------------------------------------------------------------ */
4392 case '>': /* One-time brackets */
4393 bravalue = OP_ONCE;
4394 ptr++;
4395 break;
4396
4397
4398 /* ------------------------------------------------------------ */
4399 case 'C': /* Callout - may be followed by digits; */
4400 previous_callout = code; /* Save for later completion */
4401 after_manual_callout = 1; /* Skip one item before completing */
4402 *code++ = OP_CALLOUT;
4403 {
4404 int n = 0;
4405 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4406 n = n * 10 + *ptr - '0';
4407 if (*ptr != ')')
4408 {
4409 *errorcodeptr = ERR39;
4410 goto FAILED;
4411 }
4412 if (n > 255)
4413 {
4414 *errorcodeptr = ERR38;
4415 goto FAILED;
4416 }
4417 *code++ = n;
4418 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4419 PUT(code, LINK_SIZE, 0); /* Default length */
4420 code += 2 * LINK_SIZE;
4421 }
4422 previous = NULL;
4423 continue;
4424
4425
4426 /* ------------------------------------------------------------ */
4427 case 'P': /* Python-style named subpattern handling */
4428 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4429 {
4430 is_recurse = *ptr == '>';
4431 terminator = ')';
4432 goto NAMED_REF_OR_RECURSE;
4433 }
4434 else if (*ptr != '<') /* Test for Python-style definition */
4435 {
4436 *errorcodeptr = ERR41;
4437 goto FAILED;
4438 }
4439 /* Fall through to handle (?P< as (?< is handled */
4440
4441
4442 /* ------------------------------------------------------------ */
4443 DEFINE_NAME: /* Come here from (?< handling */
4444 case '\'':
4445 {
4446 terminator = (*ptr == '<')? '>' : '\'';
4447 name = ++ptr;
4448
4449 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4450 namelen = ptr - name;
4451
4452 /* In the pre-compile phase, just do a syntax check. */
4453
4454 if (lengthptr != NULL)
4455 {
4456 if (*ptr != terminator)
4457 {
4458 *errorcodeptr = ERR42;
4459 goto FAILED;
4460 }
4461 if (cd->names_found >= MAX_NAME_COUNT)
4462 {
4463 *errorcodeptr = ERR49;
4464 goto FAILED;
4465 }
4466 if (namelen + 3 > cd->name_entry_size)
4467 {
4468 cd->name_entry_size = namelen + 3;
4469 if (namelen > MAX_NAME_SIZE)
4470 {
4471 *errorcodeptr = ERR48;
4472 goto FAILED;
4473 }
4474 }
4475 }
4476
4477 /* In the real compile, create the entry in the table */
4478
4479 else
4480 {
4481 slot = cd->name_table;
4482 for (i = 0; i < cd->names_found; i++)
4483 {
4484 int crc = memcmp(name, slot+2, namelen);
4485 if (crc == 0)
4486 {
4487 if (slot[2+namelen] == 0)
4488 {
4489 if ((options & PCRE_DUPNAMES) == 0)
4490 {
4491 *errorcodeptr = ERR43;
4492 goto FAILED;
4493 }
4494 }
4495 else crc = -1; /* Current name is substring */
4496 }
4497 if (crc < 0)
4498 {
4499 memmove(slot + cd->name_entry_size, slot,
4500 (cd->names_found - i) * cd->name_entry_size);
4501 break;
4502 }
4503 slot += cd->name_entry_size;
4504 }
4505
4506 PUT2(slot, 0, cd->bracount + 1);
4507 memcpy(slot + 2, name, namelen);
4508 slot[2+namelen] = 0;
4509 }
4510 }
4511
4512 /* In both cases, count the number of names we've encountered. */
4513
4514 ptr++; /* Move past > or ' */
4515 cd->names_found++;
4516 goto NUMBERED_GROUP;
4517
4518
4519 /* ------------------------------------------------------------ */
4520 case '&': /* Perl recursion/subroutine syntax */
4521 terminator = ')';
4522 is_recurse = TRUE;
4523 /* Fall through */
4524
4525 /* We come here from the Python syntax above that handles both
4526 references (?P=name) and recursion (?P>name), as well as falling
4527 through from the Perl recursion syntax (?&name). */
4528
4529 NAMED_REF_OR_RECURSE:
4530 name = ++ptr;
4531 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4532 namelen = ptr - name;
4533
4534 /* In the pre-compile phase, do a syntax check and set a dummy
4535 reference number. */
4536
4537 if (lengthptr != NULL)
4538 {
4539 if (namelen == 0)
4540 {
4541 *errorcodeptr = ERR62;
4542 goto FAILED;
4543 }
4544 if (*ptr != terminator)
4545 {
4546 *errorcodeptr = ERR42;
4547 goto FAILED;
4548 }
4549 if (namelen > MAX_NAME_SIZE)
4550 {
4551 *errorcodeptr = ERR48;
4552 goto FAILED;
4553 }
4554 recno = 0;
4555 }
4556
4557 /* In the real compile, seek the name in the table. We check the name
4558 first, and then check that we have reached the end of the name in the
4559 table. That way, if the name is longer than any in the table,
4560 the comparison will fail without reading beyond the table entry. */
4561
4562 else
4563 {
4564 slot = cd->name_table;
4565 for (i = 0; i < cd->names_found; i++)
4566 {
4567 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4568 slot[2+namelen] == 0)
4569 break;
4570 slot += cd->name_entry_size;
4571 }
4572
4573 if (i < cd->names_found) /* Back reference */
4574 {
4575 recno = GET2(slot, 0);
4576 }
4577 else if ((recno = /* Forward back reference */
4578 find_parens(ptr, cd->bracount, name, namelen,
4579 (options & PCRE_EXTENDED) != 0)) <= 0)
4580 {
4581 *errorcodeptr = ERR15;
4582 goto FAILED;
4583 }
4584 }
4585
4586 /* In both phases, we can now go to the code that handles numerical
4587 recursion or backreferences. */
4588
4589 if (is_recurse) goto HANDLE_RECURSION;
4590 else goto HANDLE_REFERENCE;
4591
4592
4593 /* ------------------------------------------------------------ */
4594 case 'R': /* Recursion */
4595 ptr++; /* Same as (?0) */
4596 /* Fall through */
4597
4598
4599 /* ------------------------------------------------------------ */
4600 case '-': case '+':
4601 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4602 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4603 {
4604 const uschar *called;
4605
4606 if ((refsign = *ptr) == '+') ptr++;
4607 else if (refsign == '-')
4608 {
4609 if ((digitab[ptr[1]] & ctype_digit) == 0)
4610 goto OTHER_CHAR_AFTER_QUERY;
4611 ptr++;
4612 }
4613
4614 recno = 0;
4615 while((digitab[*ptr] & ctype_digit) != 0)
4616 recno = recno * 10 + *ptr++ - '0';
4617
4618 if (*ptr != ')')
4619 {
4620 *errorcodeptr = ERR29;
4621 goto FAILED;
4622 }
4623
4624 if (refsign == '-')
4625 {
4626 if (recno == 0)
4627 {
4628 *errorcodeptr = ERR58;
4629 goto FAILED;
4630 }
4631 recno = cd->bracount - recno + 1;
4632 if (recno <= 0)
4633 {
4634 *errorcodeptr = ERR15;
4635 goto FAILED;
4636 }
4637 }
4638 else if (refsign == '+')
4639 {
4640 if (recno == 0)
4641 {
4642 *errorcodeptr = ERR58;
4643 goto FAILED;
4644 }
4645 recno += cd->bracount;
4646 }
4647
4648 /* Come here from code above that handles a named recursion */
4649
4650 HANDLE_RECURSION:
4651
4652 previous = code;
4653 called = cd->start_code;
4654
4655 /* When we are actually compiling, find the bracket that is being
4656 referenced. Temporarily end the regex in case it doesn't exist before
4657 this point. If we end up with a forward reference, first check that
4658 the bracket does occur later so we can give the error (and position)
4659 now. Then remember this forward reference in the workspace so it can
4660 be filled in at the end. */
4661
4662 if (lengthptr == NULL)
4663 {
4664 *code = OP_END;
4665 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4666
4667 /* Forward reference */
4668
4669 if (called == NULL)
4670 {
4671 if (find_parens(ptr, cd->bracount, NULL, recno,
4672 (options & PCRE_EXTENDED) != 0) < 0)
4673 {
4674 *errorcodeptr = ERR15;
4675 goto FAILED;
4676 }
4677 called = cd->start_code + recno;
4678 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4679 }
4680
4681 /* If not a forward reference, and the subpattern is still open,
4682 this is a recursive call. We check to see if this is a left
4683 recursion that could loop for ever, and diagnose that case. */
4684
4685 else if (GET(called, 1) == 0 &&
4686 could_be_empty(called, code, bcptr, utf8))
4687 {
4688 *errorcodeptr = ERR40;
4689 goto FAILED;
4690 }
4691 }
4692
4693 /* Insert the recursion/subroutine item, automatically wrapped inside
4694 "once" brackets. Set up a "previous group" length so that a
4695 subsequent quantifier will work. */
4696
4697 *code = OP_ONCE;
4698 PUT(code, 1, 2 + 2*LINK_SIZE);
4699 code += 1 + LINK_SIZE;
4700
4701 *code = OP_RECURSE;
4702 PUT(code, 1, called - cd->start_code);
4703 code += 1 + LINK_SIZE;
4704
4705 *code = OP_KET;
4706 PUT(code, 1, 2 + 2*LINK_SIZE);
4707 code += 1 + LINK_SIZE;
4708
4709 length_prevgroup = 3 + 3*LINK_SIZE;
4710 }
4711
4712 /* Can't determine a first byte now */
4713
4714 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4715 continue;
4716
4717
4718 /* ------------------------------------------------------------ */
4719 default: /* Other characters: check option setting */
4720 OTHER_CHAR_AFTER_QUERY:
4721 set = unset = 0;
4722 optset = &set;
4723
4724 while (*ptr != ')' && *ptr != ':')
4725 {
4726 switch (*ptr++)
4727 {
4728 case '-': optset = &unset; break;
4729
4730 case 'J': /* Record that it changed in the external options */
4731 *optset |= PCRE_DUPNAMES;
4732 cd->external_flags |= PCRE_JCHANGED;
4733 break;
4734
4735 case 'i': *optset |= PCRE_CASELESS; break;
4736 case 'm': *optset |= PCRE_MULTILINE; break;
4737 case 's': *optset |= PCRE_DOTALL; break;
4738 case 'x': *optset |= PCRE_EXTENDED; break;
4739 case 'U': *optset |= PCRE_UNGREEDY; break;
4740 case 'X': *optset |= PCRE_EXTRA; break;
4741
4742 default: *errorcodeptr = ERR12;
4743 ptr--; /* Correct the offset */
4744 goto FAILED;
4745 }
4746 }
4747
4748 /* Set up the changed option bits, but don't change anything yet. */
4749
4750 newoptions = (options | set) & (~unset);
4751
4752 /* If the options ended with ')' this is not the start of a nested
4753 group with option changes, so the options change at this level. If this
4754 item is right at the start of the pattern, the options can be
4755 abstracted and made external in the pre-compile phase, and ignored in
4756 the compile phase. This can be helpful when matching -- for instance in
4757 caseless checking of required bytes.
4758
4759 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4760 definitely *not* at the start of the pattern because something has been
4761 compiled. In the pre-compile phase, however, the code pointer can have
4762 that value after the start, because it gets reset as code is discarded
4763 during the pre-compile. However, this can happen only at top level - if
4764 we are within parentheses, the starting BRA will still be present. At
4765 any parenthesis level, the length value can be used to test if anything
4766 has been compiled at that level. Thus, a test for both these conditions
4767 is necessary to ensure we correctly detect the start of the pattern in
4768 both phases.
4769
4770 If we are not at the pattern start, compile code to change the ims
4771 options if this setting actually changes any of them. We also pass the
4772 new setting back so that it can be put at the start of any following
4773 branches, and when this group ends (if we are in a group), a resetting
4774 item can be compiled. */
4775
4776 if (*ptr == ')')
4777 {
4778 if (code == cd->start_code + 1 + LINK_SIZE &&
4779 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4780 {
4781 cd->external_options = newoptions;
4782 options = newoptions;
4783 }
4784 else
4785 {
4786 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4787 {
4788 *code++ = OP_OPT;
4789 *code++ = newoptions & PCRE_IMS;
4790 }
4791
4792 /* Change options at this level, and pass them back for use
4793 in subsequent branches. Reset the greedy defaults and the case
4794 value for firstbyte and reqbyte. */
4795
4796 *optionsptr = options = newoptions;
4797 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4798 greedy_non_default = greedy_default ^ 1;
4799 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4800 }
4801
4802 previous = NULL; /* This item can't be repeated */
4803 continue; /* It is complete */
4804 }
4805
4806 /* If the options ended with ':' we are heading into a nested group
4807 with possible change of options. Such groups are non-capturing and are
4808 not assertions of any kind. All we need to do is skip over the ':';
4809 the newoptions value is handled below. */
4810
4811 bravalue = OP_BRA;
4812 ptr++;
4813 } /* End of switch for character following (? */
4814 } /* End of (? handling */
4815
4816 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4817 all unadorned brackets become non-capturing and behave like (?:...)
4818 brackets. */
4819
4820 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4821 {
4822 bravalue = OP_BRA;
4823 }
4824
4825 /* Else we have a capturing group. */
4826
4827 else
4828 {
4829 NUMBERED_GROUP:
4830 cd->bracount += 1;
4831 PUT2(code, 1+LINK_SIZE, cd->bracount);
4832 skipbytes = 2;
4833 }
4834
4835 /* Process nested bracketed regex. Assertions may not be repeated, but
4836 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4837 non-register variable in order to be able to pass its address because some
4838 compilers complain otherwise. Pass in a new setting for the ims options if
4839 they have changed. */
4840
4841 previous = (bravalue >= OP_ONCE)? code : NULL;
4842 *code = bravalue;
4843 tempcode = code;
4844 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4845 length_prevgroup = 0; /* Initialize for pre-compile phase */
4846
4847 if (!compile_regex(
4848 newoptions, /* The complete new option state */
4849 options & PCRE_IMS, /* The previous ims option state */
4850 &tempcode, /* Where to put code (updated) */
4851 &ptr, /* Input pointer (updated) */
4852 errorcodeptr, /* Where to put an error message */
4853 (bravalue == OP_ASSERTBACK ||
4854 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4855 reset_bracount, /* True if (?| group */
4856 skipbytes, /* Skip over bracket number */
4857 &subfirstbyte, /* For possible first char */
4858 &subreqbyte, /* For possible last char */
4859 bcptr, /* Current branch chain */
4860 cd, /* Tables block */
4861 (lengthptr == NULL)? NULL : /* Actual compile phase */
4862 &length_prevgroup /* Pre-compile phase */
4863 ))
4864 goto FAILED;
4865
4866 /* At the end of compiling, code is still pointing to the start of the
4867 group, while tempcode has been updated to point past the end of the group
4868 and any option resetting that may follow it. The pattern pointer (ptr)
4869 is on the bracket. */
4870
4871 /* If this is a conditional bracket, check that there are no more than
4872 two branches in the group, or just one if it's a DEFINE group. We do this
4873 in the real compile phase, not in the pre-pass, where the whole group may
4874 not be available. */
4875
4876 if (bravalue == OP_COND && lengthptr == NULL)
4877 {
4878 uschar *tc = code;
4879 int condcount = 0;
4880
4881 do {
4882 condcount++;
4883 tc += GET(tc,1);
4884 }
4885 while (*tc != OP_KET);
4886
4887 /* A DEFINE group is never obeyed inline (the "condition" is always
4888 false). It must have only one branch. */
4889
4890 if (code[LINK_SIZE+1] == OP_DEF)
4891 {
4892 if (condcount > 1)
4893 {
4894 *errorcodeptr = ERR54;
4895 goto FAILED;
4896 }
4897 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4898 }
4899
4900 /* A "normal" conditional group. If there is just one branch, we must not
4901 make use of its firstbyte or reqbyte, because this is equivalent to an
4902 empty second branch. */
4903
4904 else
4905 {
4906 if (condcount > 2)
4907 {
4908 *errorcodeptr = ERR27;
4909 goto FAILED;
4910 }
4911 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4912 }
4913 }
4914
4915 /* Error if hit end of pattern */
4916
4917 if (*ptr != ')')
4918 {
4919 *errorcodeptr = ERR14;
4920 goto FAILED;
4921 }
4922
4923 /* In the pre-compile phase, update the length by the length of the group,
4924 less the brackets at either end. Then reduce the compiled code to just a
4925 set of non-capturing brackets so that it doesn't use much memory if it is
4926 duplicated by a quantifier.*/
4927
4928 if (lengthptr != NULL)
4929 {
4930 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4931 {
4932 *errorcodeptr = ERR20;
4933 goto FAILED;
4934 }
4935 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4936 *code++ = OP_BRA;
4937 PUTINC(code, 0, 1 + LINK_SIZE);
4938 *code++ = OP_KET;
4939 PUTINC(code, 0, 1 + LINK_SIZE);
4940 break; /* No need to waste time with special character handling */
4941 }
4942
4943 /* Otherwise update the main code pointer to the end of the group. */
4944
4945 code = tempcode;
4946
4947 /* For a DEFINE group, required and first character settings are not
4948 relevant. */
4949
4950 if (bravalue == OP_DEF) break;
4951
4952 /* Handle updating of the required and first characters for other types of
4953 group. Update for normal brackets of all kinds, and conditions with two
4954 branches (see code above). If the bracket is followed by a quantifier with
4955 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4956 zerofirstbyte outside the main loop so that they can be accessed for the
4957 back off. */
4958
4959 zeroreqbyte = reqbyte;
4960 zerofirstbyte = firstbyte;
4961 groupsetfirstbyte = FALSE;
4962
4963 if (bravalue >= OP_ONCE)
4964 {
4965 /* If we have not yet set a firstbyte in this branch, take it from the
4966 subpattern, remembering that it was set here so that a repeat of more
4967 than one can replicate it as reqbyte if necessary. If the subpattern has
4968 no firstbyte, set "none" for the whole branch. In both cases, a zero
4969 repeat forces firstbyte to "none". */
4970
4971 if (firstbyte == REQ_UNSET)
4972 {
4973 if (subfirstbyte >= 0)
4974 {
4975 firstbyte = subfirstbyte;
4976 groupsetfirstbyte = TRUE;
4977 }
4978 else firstbyte = REQ_NONE;
4979 zerofirstbyte = REQ_NONE;
4980 }
4981
4982 /* If firstbyte was previously set, convert the subpattern's firstbyte
4983 into reqbyte if there wasn't one, using the vary flag that was in
4984 existence beforehand. */
4985
4986 else if (subfirstbyte >= 0 && subreqbyte < 0)
4987 subreqbyte = subfirstbyte | tempreqvary;
4988
4989 /* If the subpattern set a required byte (or set a first byte that isn't
4990 really the first byte - see above), set it. */
4991
4992 if (subreqbyte >= 0) reqbyte = subreqbyte;
4993 }
4994
4995 /* For a forward assertion, we take the reqbyte, if set. This can be
4996 helpful if the pattern that follows the assertion doesn't set a different
4997 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4998 for an assertion, however, because it leads to incorrect effects for patterns
4999 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5000 of a firstbyte. This is overcome by a scan at the end if there's no
5001 firstbyte, looking for an asserted first char. */
5002
5003 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5004 break; /* End of processing '(' */
5005
5006
5007 /* ===================================================================*/
5008 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5009 are arranged to be the negation of the corresponding OP_values. For the
5010 back references, the values are ESC_REF plus the reference number. Only
5011 back references and those types that consume a character may be repeated.
5012 We can test for values between ESC_b and ESC_Z for the latter; this may
5013 have to change if any new ones are ever created. */
5014
5015 case '\\':
5016 tempptr = ptr;
5017 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5018 if (*errorcodeptr != 0) goto FAILED;
5019
5020 if (c < 0)
5021 {
5022 if (-c == ESC_Q) /* Handle start of quoted string */
5023 {
5024 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5025 else inescq = TRUE;
5026 continue;
5027 }
5028
5029 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5030
5031 /* For metasequences that actually match a character, we disable the
5032 setting of a first character if it hasn't already been set. */
5033
5034 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5035 firstbyte = REQ_NONE;
5036
5037 /* Set values to reset to if this is followed by a zero repeat. */
5038
5039 zerofirstbyte = firstbyte;
5040 zeroreqbyte = reqbyte;
5041
5042 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5043 We also support \k{name} (.NET syntax) */
5044
5045 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5046 {
5047 is_recurse = FALSE;
5048 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5049 goto NAMED_REF_OR_RECURSE;
5050 }
5051
5052 /* Back references are handled specially; must disable firstbyte if
5053 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5054 ':' later. */
5055
5056 if (-c >= ESC_REF)
5057 {
5058 recno = -c - ESC_REF;
5059
5060 HANDLE_REFERENCE: /* Come here from named backref handling */
5061 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5062 previous = code;
5063 *code++ = OP_REF;
5064 PUT2INC(code, 0, recno);
5065 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5066 if (recno > cd->top_backref) cd->top_backref = recno;
5067 }
5068
5069 /* So are Unicode property matches, if supported. */
5070
5071 #ifdef SUPPORT_UCP
5072 else if (-c == ESC_P || -c == ESC_p)
5073 {
5074 BOOL negated;
5075 int pdata;
5076 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5077 if (ptype < 0) goto FAILED;
5078 previous = code;
5079 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5080 *code++ = ptype;
5081 *code++ = pdata;
5082 }
5083 #else
5084
5085 /* If Unicode properties are not supported, \X, \P, and \p are not
5086 allowed. */
5087
5088 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5089 {
5090 *errorcodeptr = ERR45;
5091 goto FAILED;
5092 }
5093 #endif
5094
5095 /* For the rest (including \X when Unicode properties are supported), we
5096 can obtain the OP value by negating the escape value. */
5097
5098 else
5099 {
5100 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5101 *code++ = -c;
5102 }
5103 continue;
5104 }
5105
5106 /* We have a data character whose value is in c. In UTF-8 mode it may have
5107 a value > 127. We set its representation in the length/buffer, and then
5108 handle it as a data character. */
5109
5110 #ifdef SUPPORT_UTF8
5111 if (utf8 && c > 127)
5112 mclength = _pcre_ord2utf8(c, mcbuffer);
5113 else
5114 #endif
5115
5116 {
5117 mcbuffer[0] = c;
5118 mclength = 1;
5119 }
5120 goto ONE_CHAR;
5121
5122
5123 /* ===================================================================*/
5124 /* Handle a literal character. It is guaranteed not to be whitespace or #
5125 when the extended flag is set. If we are in UTF-8 mode, it may be a
5126 multi-byte literal character. */
5127
5128 default:
5129 NORMAL_CHAR:
5130 mclength = 1;
5131 mcbuffer[0] = c;
5132
5133 #ifdef SUPPORT_UTF8
5134 if (utf8 && c >= 0xc0)
5135 {
5136 while ((ptr[1] & 0xc0) == 0x80)
5137 mcbuffer[mclength++] = *(++ptr);
5138 }
5139 #endif
5140
5141 /* At this point we have the character's bytes in mcbuffer, and the length
5142 in mclength. When not in UTF-8 mode, the length is always 1. */
5143
5144 ONE_CHAR:
5145 previous = code;
5146 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5147 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5148
5149 /* Remember if \r or \n were seen */
5150
5151 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5152 cd->external_flags |= PCRE_HASCRORLF;
5153
5154 /* Set the first and required bytes appropriately. If no previous first
5155 byte, set it from this character, but revert to none on a zero repeat.
5156 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5157 repeat. */
5158
5159 if (firstbyte == REQ_UNSET)
5160 {
5161 zerofirstbyte = REQ_NONE;
5162 zeroreqbyte = reqbyte;
5163
5164 /* If the character is more than one byte long, we can set firstbyte
5165 only if it is not to be matched caselessly. */
5166
5167 if (mclength == 1 || req_caseopt == 0)
5168 {
5169 firstbyte = mcbuffer[0] | req_caseopt;
5170 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5171 }
5172 else firstbyte = reqbyte = REQ_NONE;
5173 }
5174
4175 /* firstbyte was previously set; we can set reqbyte only if the length is
5176 1 or the matching is caseful. */
5177
5178 else
5179 {
5180 zerofirstbyte = firstbyte;
5181 zeroreqbyte = reqbyte;
5182 if (mclength == 1 || req_caseopt == 0)
5183 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5184 }
5185
5186 break; /* End of literal character handling */
5187 }
5188 } /* end of big loop */
5189
5190
5191 /* Control never reaches here by falling through, only by a goto for all the
5192 error states. Pass back the position in the pattern so that it can be displayed
5193 to the user for diagnosing the error. */
5194
5195 FAILED:
5196 *ptrptr = ptr;
5197 return FALSE;
5198 }
5199
5200
5201
5202
5203 /*************************************************
5204 * Compile sequence of alternatives *
5205 *************************************************/
5206
5207 /* On entry, ptr is pointing past the bracket character, but on return it
5208 points to the closing bracket, or vertical bar, or end of string. The code
5209 variable is pointing at the byte into which the BRA operator has been stored.
5210 If the ims options are changed at the start (for a (?ims: group) or during any
5211 branch, we need to insert an OP_OPT item at the start of every following branch
5212 to ensure they get set correctly at run time, and also pass the new options
5213 into every subsequent branch compile.
5214
5215 This function is used during the pre-compile phase when we are trying to find
5216 out the amount of memory needed, as well as during the real compile phase. The
5217 value of lengthptr distinguishes the two phases.
5218
5219 Arguments:
5220 options option bits, including any changes for this subpattern
5221 oldims previous settings of ims option bits
5222 codeptr -> the address of the current code pointer
5223 ptrptr -> the address of the current pattern pointer
5224 errorcodeptr -> pointer to error code variable
5225 lookbehind TRUE if this is a lookbehind assertion
5226 reset_bracount TRUE to reset the count for each branch
5227 skipbytes skip this many bytes at start (for brackets and OP_COND)
5228 firstbyteptr place to put the first required character, or a negative number
5229 reqbyteptr place to put the last required character, or a negative number
5230 bcptr pointer to the chain of currently open branches
5231 cd points to the data block with tables pointers etc.
5232 lengthptr NULL during the real compile phase
5233 points to length accumulator during pre-compile phase
5234
5235 Returns: TRUE on success
5236 */
5237
5238 static BOOL
5239 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5240 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5241 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5242 int *lengthptr)
5243 {
5244 const uschar *ptr = *ptrptr;
5245 uschar *code = *codeptr;
5246 uschar *last_branch = code;
5247 uschar *start_bracket = code;
5248 uschar *reverse_count = NULL;
5249 int firstbyte, reqbyte;
5250 int branchfirstbyte, branchreqbyte;
5251 int length;
5252 int orig_bracount;
5253 int max_bracount;
5254 branch_chain bc;
5255
5256 bc.outer = bcptr;
5257 bc.current = code;
5258
5259 firstbyte = reqbyte = REQ_UNSET;
5260
5261 /* Accumulate the length for use in the pre-compile phase. Start with the
5262 length of the BRA and KET and any extra bytes that are required at the
5263 beginning. We accumulate in a local variable to save frequent testing of
5264 lenthptr for NULL. We cannot do this by looking at the value of code at the
5265 start and end of each alternative, because compiled items are discarded during
5266 the pre-compile phase so that the work space is not exceeded. */
5267
5268 length = 2 + 2*LINK_SIZE + skipbytes;
5269
5270 /* WARNING: If the above line is changed for any reason, you must also change
5271 the code that abstracts option settings at the start of the pattern and makes
5272 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5273 pre-compile phase to find out whether anything has yet been compiled or not. */
5274
5275 /* Offset is set zero to mark that this bracket is still open */
5276
5277 PUT(code, 1, 0);
5278 code += 1 + LINK_SIZE + skipbytes;
5279
5280 /* Loop for each alternative branch */
5281
5282 orig_bracount = max_bracount = cd->bracount;
5283 for (;;)
5284 {
5285 /* For a (?| group, reset the capturing bracket count so that each branch
5286 uses the same numbers. */
5287
5288 if (reset_bracount) cd->bracount = orig_bracount;
5289
5290 /* Handle a change of ims options at the start of the branch */
5291
5292 if ((options & PCRE_IMS) != oldims)
5293 {
5294 *code++ = OP_OPT;
5295 *code++ = options & PCRE_IMS;
5296 length += 2;
5297 }
5298
5299 /* Set up dummy OP_REVERSE if lookbehind assertion */
5300
5301 if (lookbehind)
5302 {
5303 *code++ = OP_REVERSE;
5304 reverse_count = code;
5305 PUTINC(code, 0, 0);
5306 length += 1 + LINK_SIZE;
5307 }
5308
5309 /* Now compile the branch; in the pre-compile phase its length gets added
5310 into the length. */
5311
5312 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5313 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5314 {
5315 *ptrptr = ptr;
5316 return FALSE;
5317 }
5318
5319 /* Keep the highest bracket count in case (?| was used and some branch
5320 has fewer than the rest. */
5321
5322 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5323
5324 /* In the real compile phase, there is some post-processing to be done. */
5325
5326 if (lengthptr == NULL)
5327 {
5328 /* If this is the first branch, the firstbyte and reqbyte values for the
5329 branch become the values for the regex. */
5330
5331 if (*last_branch != OP_ALT)
5332 {
5333 firstbyte = branchfirstbyte;
5334 reqbyte = branchreqbyte;
5335 }
5336
5337 /* If this is not the first branch, the first char and reqbyte have to
5338 match the values from all the previous branches, except that if the
5339 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5340 and we set REQ_VARY for the regex. */
5341
5342 else
5343 {
5344 /* If we previously had a firstbyte, but it doesn't match the new branch,
5345 we have to abandon the firstbyte for the regex, but if there was
5346 previously no reqbyte, it takes on the value of the old firstbyte. */
5347
5348 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5349 {
5350 if (reqbyte < 0) reqbyte = firstbyte;
5351 firstbyte = REQ_NONE;
5352 }
5353
5354 /* If we (now or from before) have no firstbyte, a firstbyte from the
5355 branch becomes a reqbyte if there isn't a branch reqbyte. */
5356
5357 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5358 branchreqbyte = branchfirstbyte;
5359
5360 /* Now ensure that the reqbytes match */
5361
5362 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5363 reqbyte = REQ_NONE;
5364 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5365 }
5366
5367 /* If lookbehind, check that this branch matches a fixed-length string, and
5368 put the length into the OP_REVERSE item. Temporarily mark the end of the
5369 branch with OP_END. */
5370
5371 if (lookbehind)
5372 {
5373 int fixed_length;
5374 *code = OP_END;
5375 fixed_length = find_fixedlength(last_branch, options);
5376 DPRINTF(("fixed length = %d\n", fixed_length));
5377 if (fixed_length < 0)
5378 {
5379 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5380 *ptrptr = ptr;
5381 return FALSE;
5382 }
5383 PUT(reverse_count, 0, fixed_length);
5384 }
5385 }
5386
5387 /* Reached end of expression, either ')' or end of pattern. In the real
5388 compile phase, go back through the alternative branches and reverse the chain
5389 of offsets, with the field in the BRA item now becoming an offset to the
5390 first alternative. If there are no alternatives, it points to the end of the
5391 group. The length in the terminating ket is always the length of the whole
5392 bracketed item. If any of the ims options were changed inside the group,
5393 compile a resetting op-code following, except at the very end of the pattern.
5394 Return leaving the pointer at the terminating char. */
5395
5396 if (*ptr != '|')
5397 {
5398 if (lengthptr == NULL)
5399 {
5400 int branch_length = code - last_branch;
5401 do
5402 {
5403 int prev_length = GET(last_branch, 1);
5404 PUT(last_branch, 1, branch_length);
5405 branch_length = prev_length;
5406 last_branch -= branch_length;
5407 }
5408 while (branch_length > 0);
5409 }
5410
5411 /* Fill in the ket */
5412
5413 *code = OP_KET;
5414 PUT(code, 1, code - start_bracket);
5415 code += 1 + LINK_SIZE;
5416
5417 /* Resetting option if needed */
5418
5419 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5420 {
5421 *code++ = OP_OPT;
5422 *code++ = oldims;
5423 length += 2;
5424 }
5425
5426 /* Retain the highest bracket number, in case resetting was used. */
5427
5428 cd->bracount = max_bracount;
5429
5430 /* Set values to pass back */
5431
5432 *codeptr = code;
5433 *ptrptr = ptr;
5434 *firstbyteptr = firstbyte;
5435 *reqbyteptr = reqbyte;
5436 if (lengthptr != NULL)
5437 {
5438 if (OFLOW_MAX - *lengthptr < length)
5439 {
5440 *errorcodeptr = ERR20;
5441 return FALSE;
5442 }
5443 *lengthptr += length;
5444 }
5445 return TRUE;
5446 }
5447
5448 /* Another branch follows. In the pre-compile phase, we can move the code
5449 pointer back to where it was for the start of the first branch. (That is,
5450 pretend that each branch is the only one.)
5451
5452 In the real compile phase, insert an ALT node. Its length field points back
5453 to the previous branch while the bracket remains open. At the end the chain
5454 is reversed. It's done like this so that the start of the bracket has a
5455 zero offset until it is closed, making it possible to detect recursion. */
5456
5457 if (lengthptr != NULL)
5458 {
5459 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5460 length += 1 + LINK_SIZE;
5461 }
5462 else
5463 {
5464 *code = OP_ALT;
5465 PUT(code, 1, code - last_branch);
5466 bc.current = last_branch = code;
5467 code += 1 + LINK_SIZE;
5468 }
5469
5470 ptr++;
5471 }
5472 /* Control never reaches here */
5473 }
5474
5475
5476
5477
5478 /*************************************************
5479 * Check for anchored expression *
5480 *************************************************/
5481
5482 /* Try to find out if this is an anchored regular expression. Consider each
5483 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5484 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5485 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5486 counts, since OP_CIRC can match in the middle.
5487
5488 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5489 This is the code for \G, which means "match at start of match position, taking
5490 into account the match offset".
5491
5492 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5493 because that will try the rest of the pattern at all possible matching points,
5494 so there is no point trying again.... er ....
5495
5496 .... except when the .* appears inside capturing parentheses, and there is a
5497 subsequent back reference to those parentheses. We haven't enough information
5498 to catch that case precisely.
5499
5500 At first, the best we could do was to detect when .* was in capturing brackets
5501 and the highest back reference was greater than or equal to that level.
5502 However, by keeping a bitmap of the first 31 back references, we can catch some
5503 of the more common cases more precisely.
5504
5505 Arguments:
5506 code points to start of expression (the bracket)
5507 options points to the options setting
5508 bracket_map a bitmap of which brackets we are inside while testing; this
5509 handles up to substring 31; after that we just have to take
5510 the less precise approach
5511 backref_map the back reference bitmap
5512
5513 Returns: TRUE or FALSE
5514 */
5515
5516 static BOOL
5517 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5518 unsigned int backref_map)
5519 {
5520 do {
5521 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5522 options, PCRE_MULTILINE, FALSE);
5523 register int op = *scode;
5524
5525 /* Non-capturing brackets */
5526
5527 if (op == OP_BRA)
5528 {
5529 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5530 }
5531
5532 /* Capturing brackets */
5533
5534 else if (op == OP_CBRA)
5535 {
5536 int n = GET2(scode, 1+LINK_SIZE);
5537 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5538 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5539 }
5540
5541 /* Other brackets */
5542
5543 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5544 {
5545 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5546 }
5547
5548 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5549 are or may be referenced. */
5550
5551 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5552 op == OP_TYPEPOSSTAR) &&
5553 (*options & PCRE_DOTALL) != 0)
5554 {
5555 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5556 }
5557
5558 /* Check for explicit anchoring */
5559
5560 else if (op != OP_SOD && op != OP_SOM &&
5561 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5562 return FALSE;
5563 code += GET(code, 1);
5564 }
5565 while (*code == OP_ALT); /* Loop for each alternative */
5566 return TRUE;
5567 }
5568
5569
5570
5571 /*************************************************
5572 * Check for starting with ^ or .* *
5573 *************************************************/
5574
5575 /* This is called to find out if every branch starts with ^ or .* so that
5576 "first char" processing can be done to speed things up in multiline
5577 matching and for non-DOTALL patterns that start with .* (which must start at
5578 the beginning or after \n). As in the case of is_anchored() (see above), we
5579 have to take account of back references to capturing brackets that contain .*
5580 because in that case we can't make the assumption.
5581
5582 Arguments:
5583 code points to start of expression (the bracket)
5584 bracket_map a bitmap of which brackets we are inside while testing; this
5585 handles up to substring 31; after that we just have to take
5586 the less precise approach
5587 backref_map the back reference bitmap
5588
5589 Returns: TRUE or FALSE
5590 */
5591
5592 static BOOL
5593 is_startline(const uschar *code, unsigned int bracket_map,
5594 unsigned int backref_map)
5595 {
5596 do {
5597 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5598 NULL, 0, FALSE);
5599 register int op = *scode;
5600
5601 /* Non-capturing brackets */
5602
5603 if (op == OP_BRA)
5604 {
5605 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5606 }
5607
5608 /* Capturing brackets */
5609
5610 else if (op == OP_CBRA)
5611 {
5612 int n = GET2(scode, 1+LINK_SIZE);
5613 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5614 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5615 }
5616
5617 /* Other brackets */
5618
5619 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5620 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5621
5622 /* .* means "start at start or after \n" if it isn't in brackets that
5623 may be referenced. */
5624
5625 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5626 {
5627 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5628 }
5629
5630 /* Check for explicit circumflex */
5631
5632 else if (op != OP_CIRC) return FALSE;
5633
5634 /* Move on to the next alternative */
5635
5636 code += GET(code, 1);
5637 }
5638 while (*code == OP_ALT); /* Loop for each alternative */
5639 return TRUE;
5640 }
5641
5642
5643
5644 /*************************************************
5645 * Check for asserted fixed first char *
5646 *************************************************/
5647
5648 /* During compilation, the "first char" settings from forward assertions are
5649 discarded, because they can cause conflicts with actual literals that follow.
5650 However, if we end up without a first char setting for an unanchored pattern,
5651 it is worth scanning the regex to see if there is an initial asserted first
5652 char. If all branches start with the same asserted char, or with a bracket all
5653 of whose alternatives start with the same asserted char (recurse ad lib), then
5654 we return that char, otherwise -1.
5655
5656 Arguments:
5657 code points to start of expression (the bracket)
5658 options pointer to the options (used to check casing changes)
5659 inassert TRUE if in an assertion
5660
5661 Returns: -1 or the fixed first char
5662 */
5663
5664 static int
5665 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5666 {
5667 register int c = -1;
5668 do {
5669 int d;
5670 const uschar *scode =
5671 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5672 register int op = *scode;
5673
5674 switch(op)
5675 {
5676 default:
5677 return -1;
5678
5679 case OP_BRA:
5680 case OP_CBRA:
5681 case OP_ASSERT:
5682 case OP_ONCE:
5683 case OP_COND:
5684 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5685 return -1;
5686 if (c < 0) c = d; else if (c != d) return -1;
5687 break;
5688
5689 case OP_EXACT: /* Fall through */
5690 scode += 2;
5691
5692 case OP_CHAR:
5693 case OP_CHARNC:
5694 case OP_PLUS:
5695 case OP_MINPLUS:
5696 case OP_POSPLUS:
5697 if (!inassert) return -1;
5698 if (c < 0)
5699 {
5700 c = scode[1];
5701 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5702 }
5703 else if (c != scode[1]) return -1;
5704 break;
5705 }
5706
5707 code += GET(code, 1);
5708 }
5709 while (*code == OP_ALT);
5710 return c;
5711 }
5712
5713
5714
5715 /*************************************************
5716 * Compile a Regular Expression *
5717 *************************************************/
5718
5719 /* This function takes a string and returns a pointer to a block of store
5720 holding a compiled version of the expression. The original API for this
5721 function had no error code return variable; it is retained for backwards
5722 compatibility. The new function is given a new name.
5723
5724 Arguments:
5725 pattern the regular expression
5726 options various option bits
5727 errorcodeptr pointer to error code variable (pcre_compile2() only)
5728 can be NULL if you don't want a code value
5729 errorptr pointer to pointer to error text
5730 erroroffset ptr offset in pattern where error was detected
5731 tables pointer to character tables or NULL
5732
5733 Returns: pointer to compiled data block, or NULL on error,
5734 with errorptr and erroroffset set
5735 */
5736
5737 PCRE_EXP_DEFN pcre *
5738 pcre_compile(const char *pattern, int options, const char **errorptr,
5739 int *erroroffset, const unsigned char *tables)
5740 {
5741 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5742 }
5743
5744
5745 PCRE_EXP_DEFN pcre *
5746 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5747 const char **errorptr, int *erroroffset, const unsigned char *tables)
5748 {
5749 real_pcre *re;
5750 int length = 1; /* For final END opcode */
5751 int firstbyte, reqbyte, newline;
5752 int errorcode = 0;
5753 int skipatstart = 0;
5754 #ifdef SUPPORT_UTF8
5755 BOOL utf8;
5756 #endif
5757 size_t size;
5758 uschar *code;
5759 const uschar *codestart;
5760 const uschar *ptr;
5761 compile_data compile_block;
5762 compile_data *cd = &compile_block;
5763
5764 /* This space is used for "compiling" into during the first phase, when we are
5765 computing the amount of memory that is needed. Compiled items are thrown away
5766 as soon as possible, so that a fairly large buffer should be sufficient for
5767 this purpose. The same space is used in the second phase for remembering where
5768 to fill in forward references to subpatterns. */
5769
5770 uschar cworkspace[COMPILE_WORK_SIZE];
5771
5772
5773 /* Set this early so that early errors get offset 0. */
5774
5775 ptr = (const uschar *)pattern;
5776
5777 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5778 can do is just return NULL, but we can set a code value if there is a code
5779 pointer. */
5780
5781 if (errorptr == NULL)
5782 {
5783 if (errorcodeptr != NULL) *errorcodeptr = 99;
5784 return NULL;
5785 }
5786
5787 *errorptr = NULL;
5788 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5789
5790 /* However, we can give a message for this error */
5791
5792 if (erroroffset == NULL)
5793 {
5794 errorcode = ERR16;
5795 goto PCRE_EARLY_ERROR_RETURN2;
5796 }
5797
5798 *erroroffset = 0;
5799
5800 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5801
5802 #ifdef SUPPORT_UTF8
5803 utf8 = (options & PCRE_UTF8) != 0;
5804 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5805 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5806 {
5807 errorcode = ERR44;
5808 goto PCRE_EARLY_ERROR_RETURN2;
5809 }
5810 #else
5811 if ((options & PCRE_UTF8) != 0)
5812 {
5813 errorcode = ERR32;
5814 goto PCRE_EARLY_ERROR_RETURN;
5815 }
5816 #endif
5817
5818 if ((options & ~PUBLIC_OPTIONS) != 0)
5819 {
5820 errorcode = ERR17;
5821 goto PCRE_EARLY_ERROR_RETURN;
5822 }
5823
5824 /* Set up pointers to the individual character tables */
5825
5826 if (tables == NULL) tables = _pcre_default_tables;
5827 cd->lcc = tables + lcc_offset;
5828 cd->fcc = tables + fcc_offset;
5829 cd->cbits = tables + cbits_offset;
5830 cd->ctypes = tables + ctypes_offset;
5831
5832 /* Check for global one-time settings at the start of the pattern, and remember
5833 the offset for later. */
5834
5835 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5836 {
5837 int newnl = 0;
5838 int newbsr = 0;
5839
5840 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5841 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5842 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5843 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5844 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5845 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5846 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5847 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5848 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
5849 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5850
5851 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5852 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5853 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5854 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5855
5856 if (newnl != 0)
5857 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5858 else if (newbsr != 0)
5859 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5860 else break;
5861 }
5862
5863 /* Check validity of \R options. */
5864
5865 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5866 {
5867 case 0:
5868 case PCRE_BSR_ANYCRLF:
5869 case PCRE_BSR_UNICODE:
5870 break;
5871 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5872 }
5873
5874 /* Handle different types of newline. The three bits give seven cases. The
5875 current code allows for fixed one- or two-byte sequences, plus "any" and
5876 "anycrlf". */
5877
5878 switch (options & PCRE_NEWLINE_BITS)
5879 {
5880 case 0: newline = NEWLINE; break; /* Build-time default */
5881 case PCRE_NEWLINE_CR: newline = '\r'; break;
5882 case PCRE_NEWLINE_LF: newline = '\n'; break;
5883 case PCRE_NEWLINE_CR+
5884 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5885 case PCRE_NEWLINE_ANY: newline = -1; break;
5886 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5887 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5888 }
5889
5890 if (newline == -2)
5891 {
5892 cd->nltype = NLTYPE_ANYCRLF;
5893 }
5894 else if (newline < 0)
5895 {
5896 cd->nltype = NLTYPE_ANY;
5897 }
5898 else
5899 {
5900 cd->nltype = NLTYPE_FIXED;
5901 if (newline > 255)
5902 {
5903 cd->nllen = 2;
5904 cd->nl[0] = (newline >> 8) & 255;
5905 cd->nl[1] = newline & 255;
5906 }
5907 else
5908 {
5909 cd->nllen = 1;
5910 cd->nl[0] = newline;
5911 }
5912 }
5913
5914 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5915 references to help in deciding whether (.*) can be treated as anchored or not.
5916 */
5917
5918 cd->top_backref = 0;
5919 cd->backref_map = 0;
5920
5921 /* Reflect pattern for debugging output */
5922
5923 DPRINTF(("------------------------------------------------------------------\n"));
5924 DPRINTF(("%s\n", pattern));
5925
5926 /* Pretend to compile the pattern while actually just accumulating the length
5927 of memory required. This behaviour is triggered by passing a non-NULL final
5928 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5929 to compile parts of the pattern into; the compiled code is discarded when it is
5930 no longer needed, so hopefully this workspace will never overflow, though there
5931 is a test for its doing so. */
5932
5933 cd->bracount = 0;
5934 cd->names_found = 0;
5935 cd->name_entry_size = 0;
5936 cd->name_table = NULL;
5937 cd->start_workspace = cworkspace;
5938 cd->start_code = cworkspace;
5939 cd->hwm = cworkspace;
5940 cd->start_pattern = (const uschar *)pattern;
5941 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5942 cd->req_varyopt = 0;
5943 cd->external_options = options;
5944 cd->external_flags = 0;
5945
5946 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5947 don't need to look at the result of the function here. The initial options have
5948 been put into the cd block so that they can be changed if an option setting is
5949 found within the regex right at the beginning. Bringing initial option settings
5950 outside can help speed up starting point checks. */
5951
5952 ptr += skipatstart;
5953 code = cworkspace;
5954 *code = OP_BRA;
5955 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5956 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5957 &length);
5958 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5959
5960 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5961 cd->hwm - cworkspace));
5962
5963 if (length > MAX_PATTERN_SIZE)
5964 {
5965 errorcode = ERR20;
5966 goto PCRE_EARLY_ERROR_RETURN;
5967 }
5968
5969 /* Compute the size of data block needed and get it, either from malloc or
5970 externally provided function. Integer overflow should no longer be possible
5971 because nowadays we limit the maximum value of cd->names_found and
5972 cd->name_entry_size. */
5973
5974 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5975 re = (real_pcre *)(pcre_malloc)(size);
5976
5977 if (re == NULL)
5978 {
5979 errorcode = ERR21;
5980 goto PCRE_EARLY_ERROR_RETURN;
5981 }
5982
5983 /* Put in the magic number, and save the sizes, initial options, internal
5984 flags, and character table pointer. NULL is used for the default character
5985 tables. The nullpad field is at the end; it's there to help in the case when a
5986 regex compiled on a system with 4-byte pointers is run on another with 8-byte
5987 pointers. */
5988
5989 re->magic_number = MAGIC_NUMBER;
5990 re->size = size;
5991 re->options = cd->external_options;
5992 re->flags = cd->external_flags;
5993 re->dummy1 = 0;
5994 re->first_byte = 0;
5995 re->req_byte = 0;
5996 re->name_table_offset = sizeof(real_pcre);
5997 re->name_entry_size = cd->name_entry_size;
5998 re->name_count = cd->names_found;
5999 re->ref_count = 0;
6000 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6001 re->nullpad = NULL;
6002
6003 /* The starting points of the name/number translation table and of the code are
6004 passed around in the compile data block. The start/end pattern and initial
6005 options are already set from the pre-compile phase, as is the name_entry_size
6006 field. Reset the bracket count and the names_found field. Also reset the hwm
6007 field; this time it's used for remembering forward references to subpatterns.
6008 */
6009
6010 cd->bracount = 0;
6011 cd->names_found = 0;
6012 cd->name_table = (uschar *)re + re->name_table_offset;
6013 codestart = cd->name_table + re->name_entry_size * re->name_count;
6014 cd->start_code = codestart;
6015 cd->hwm = cworkspace;
6016 cd->req_varyopt = 0;
6017 cd->had_accept = FALSE;
6018
6019 /* Set up a starting, non-extracting bracket, then compile the expression. On
6020 error, errorcode will be set non-zero, so we don't need to look at the result
6021 of the function here. */
6022
6023 ptr = (const uschar *)pattern + skipatstart;
6024 code = (uschar *)codestart;
6025 *code = OP_BRA;
6026 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6027 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6028 re->top_bracket = cd->bracount;
6029 re->top_backref = cd->top_backref;
6030 re->flags = cd->external_flags;
6031
6032 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6033
6034 /* If not reached end of pattern on success, there's an excess bracket. */
6035
6036 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6037
6038 /* Fill in the terminating state and check for disastrous overflow, but
6039 if debugging, leave the test till after things are printed out. */
6040
6041 *code++ = OP_END;
6042
6043 #ifndef DEBUG
6044 if (code - codestart > length) errorcode = ERR23;
6045 #endif
6046
6047 /* Fill in any forward references that are required. */
6048
6049 while (errorcode == 0 && cd->hwm > cworkspace)
6050 {
6051 int offset, recno;
6052 const uschar *groupptr;
6053 cd->hwm -= LINK_SIZE;
6054 offset = GET(cd->hwm, 0);
6055 recno = GET(codestart, offset);
6056 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6057 if (groupptr == NULL) errorcode = ERR53;
6058 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6059 }
6060
6061 /* Give an error if there's back reference to a non-existent capturing
6062 subpattern. */
6063
6064 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6065
6066 /* Failed to compile, or error while post-processing */
6067
6068 if (errorcode != 0)
6069 {
6070 (pcre_free)(re);
6071 PCRE_EARLY_ERROR_RETURN:
6072 *erroroffset = ptr - (const uschar *)pattern;
6073 PCRE_EARLY_ERROR_RETURN2:
6074 *errorptr = find_error_text(errorcode);
6075 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6076 return NULL;
6077 }
6078
6079 /* If the anchored option was not passed, set the flag if we can determine that
6080 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6081 as starting with .* when DOTALL is set).
6082
6083 Otherwise, if we know what the first byte has to be, save it, because that
6084 speeds up unanchored matches no end. If not, see if we can set the
6085 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6086 start with ^. and also when all branches start with .* for non-DOTALL matches.
6087 */
6088
6089 if ((re->options & PCRE_ANCHORED) == 0)
6090 {
6091 int temp_options = re->options; /* May get changed during these scans */
6092 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6093 re->options |= PCRE_ANCHORED;
6094 else
6095 {
6096 if (firstbyte < 0)
6097 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6098 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6099 {
6100 int ch = firstbyte & 255;
6101 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6102 cd->fcc[ch] == ch)? ch : firstbyte;
6103 re->flags |= PCRE_FIRSTSET;
6104 }
6105 else if (is_startline(codestart, 0, cd->backref_map))
6106 re->flags |= PCRE_STARTLINE;
6107 }
6108 }
6109
6110 /* For an anchored pattern, we use the "required byte" only if it follows a
6111 variable length item in the regex. Remove the caseless flag for non-caseable
6112 bytes. */
6113
6114 if (reqbyte >= 0 &&
6115 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6116 {
6117 int ch = reqbyte & 255;
6118 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6119 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6120 re->flags |= PCRE_REQCHSET;
6121 }
6122
6123 /* Print out the compiled data if debugging is enabled. This is never the
6124 case when building a production library. */
6125
6126 #ifdef DEBUG
6127
6128 printf("Length = %d top_bracket = %d top_backref = %d\n",
6129 length, re->top_bracket, re->top_backref);
6130
6131 printf("Options=%08x\n", re->options);
6132
6133 if ((re->flags & PCRE_FIRSTSET) != 0)
6134 {
6135 int ch = re->first_byte & 255;
6136 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6137 "" : " (caseless)";
6138 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6139 else printf("First char = \\x%02x%s\n", ch, caseless);
6140 }
6141
6142 if ((re->flags & PCRE_REQCHSET) != 0)
6143 {
6144 int ch = re->req_byte & 255;
6145 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6146 "" : " (caseless)";
6147 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6148 else printf("Req char = \\x%02x%s\n", ch, caseless);
6149 }
6150
6151 pcre_printint(re, stdout, TRUE);
6152
6153 /* This check is done here in the debugging case so that the code that
6154 was compiled can be seen. */
6155
6156 if (code - codestart > length)
6157 {
6158 (pcre_free)(re);
6159 *errorptr = find_error_text(ERR23);
6160 *erroroffset = ptr - (uschar *)pattern;
6161 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6162 return NULL;
6163 }
6164 #endif /* DEBUG */
6165
6166 return (pcre *)re;
6167 }
6168
6169 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12