/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory | Revision Log


Revision 265 - (show annotations) (download)
Wed Nov 14 11:35:48 2007 UTC (6 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 194331 byte(s)
Fix negative POSIX class bug with Unicode characters.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the bit number. Both arguments and the complete
expansion are parenthesized so that an expression argument such as
SETBIT(map, c + 1) indexes and shifts by the intended value, and so that the
macro can be used safely inside a larger expression. Note that "b" is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
146
147 typedef struct verbitem {
148 int len;
149 int op;
150 } verbitem;
151
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
160
161 static verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
169 };
170
171 static int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence. */
179
180 static const char posix_names[] =
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
184
185 static const uschar posix_name_lengths[] = {
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
197
198 static const int posix_class_maps[] = {
199 cbit_word, cbit_digit, -2, /* alpha */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii */
204 cbit_space, -1, 1, /* blank - a GNU extension */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
213 };
214
215
216 #define STRING(a) # a
217 #define XSTRING(s) STRING(s)
218
219 /* The texts of compile-time error messages. These are "char *" because they
220 are passed to the outside world. Do not ever re-use any error number, because
221 they are documented. Always add a new error instead. Messages marked DEAD below
222 are no longer used. This used to be a table of strings, but in order to reduce
223 the number of relocations needed when a shared library is loaded dynamically,
224 it is now one long string. We cannot use a table of offsets, because the
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226 simply count through to the one we want - this isn't a performance issue
227 because these strings are used only when there is a compilation error. */
228
229 static const char error_texts[] =
230 "no error\0"
231 "\\ at end of pattern\0"
232 "\\c at end of pattern\0"
233 "unrecognized character follows \\\0"
234 "numbers out of order in {} quantifier\0"
235 /* 5 */
236 "number too big in {} quantifier\0"
237 "missing terminating ] for character class\0"
238 "invalid escape sequence in character class\0"
239 "range out of order in character class\0"
240 "nothing to repeat\0"
241 /* 10 */
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243 "internal error: unexpected repeat\0"
244 "unrecognized character after (?\0"
245 "POSIX named classes are supported only within a class\0"
246 "missing )\0"
247 /* 15 */
248 "reference to non-existent subpattern\0"
249 "erroffset passed as NULL\0"
250 "unknown option bit(s) set\0"
251 "missing ) after comment\0"
252 "parentheses nested too deeply\0" /** DEAD **/
253 /* 20 */
254 "regular expression is too large\0"
255 "failed to get memory\0"
256 "unmatched parentheses\0"
257 "internal error: code overflow\0"
258 "unrecognized character after (?<\0"
259 /* 25 */
260 "lookbehind assertion is not fixed length\0"
261 "malformed number or name after (?(\0"
262 "conditional group contains more than two branches\0"
263 "assertion expected after (?(\0"
264 "(?R or (?[+-]digits must be followed by )\0"
265 /* 30 */
266 "unknown POSIX class name\0"
267 "POSIX collating elements are not supported\0"
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269 "spare error\0" /** DEAD **/
270 "character value in \\x{...} sequence is too large\0"
271 /* 35 */
272 "invalid condition (?(0)\0"
273 "\\C not allowed in lookbehind assertion\0"
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275 "number after (?C is > 255\0"
276 "closing ) for (?C expected\0"
277 /* 40 */
278 "recursive call could loop indefinitely\0"
279 "unrecognized character after (?P\0"
280 "syntax error in subpattern name (missing terminator)\0"
281 "two named subpatterns have the same name\0"
282 "invalid UTF-8 string\0"
283 /* 45 */
284 "support for \\P, \\p, and \\X has not been compiled\0"
285 "malformed \\P or \\p sequence\0"
286 "unknown property name after \\P or \\p\0"
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 /* 50 */
290 "repeated subpattern is too long\0" /** DEAD **/
291 "octal value is greater than \\377 (not in UTF-8 mode)\0"
292 "internal error: overran compiling workspace\0"
293 "internal error: previously-checked referenced subpattern not found\0"
294 "DEFINE group contains more than one branch\0"
295 /* 55 */
296 "repeating a DEFINE group is not allowed\0"
297 "inconsistent NEWLINE options\0"
298 "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300 "(*VERB) with an argument is not supported\0"
301 /* 60 */
302 "(*VERB) not recognized\0"
303 "number is too big";
304
305
306 /* Table to identify digits and hex digits. This is used when compiling
307 patterns. Note that the tables in chartables are dependent on the locale, and
308 may mark arbitrary characters as digits - but the PCRE compiling code expects
309 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
310 a private table here. It costs 256 bytes, but it is a lot faster than doing
311 character value tests (at least in some simple cases I timed), and in some
312 applications one wants PCRE to compile efficiently as well as match
313 efficiently.
314
315 For convenience, we use the same bit definitions as in chartables:
316
317 0x04 decimal digit
318 0x08 hexadecimal digit
319
320 Then we can use ctype_digit and ctype_xdigit in the code. */
321
322 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
323 static const unsigned char digitab[] =
324 {
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
331 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
332 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
333 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
337 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
357
358 #else /* This is the "abnormal" case, for EBCDIC systems */
359 static const unsigned char digitab[] =
360 {
361 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
362 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
377 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
385 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
391 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
392 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
393
394 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
395 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
396 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
397 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
399 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
403 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
404 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
406 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
408 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
411 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
412 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
413 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
414 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
415 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
416 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
417 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
418 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
419 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
420 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
421 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
422 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
423 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
424 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
425 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
426 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
427 #endif
428
429
430 /* Definition to allow mutual recursion */
431
432 static BOOL
433 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
434 int *, int *, branch_chain *, compile_data *, int *);
435
436
437
438 /*************************************************
439 * Find an error text *
440 *************************************************/
441
442 /* The error texts are now all in one long string, to save on relocations. As
443 some of the text is of unknown length, we can't use a table of offsets.
444 Instead, just count through the strings. This is not a performance issue
445 because it happens only when there has been a compilation error.
446
447 Argument: the error number
448 Returns: pointer to the error string
449 */
450
451 static const char *
452 find_error_text(int n)
453 {
454 const char *s = error_texts;
455 for (; n > 0; n--) while (*s++ != 0);
456 return s;
457 }
458
459
460 /*************************************************
461 * Handle escapes *
462 *************************************************/
463
464 /* This function is called when a \ has been encountered. It either returns a
465 positive value for a simple escape such as \n, or a negative value which
466 encodes one of the more complicated things such as \d. A backreference to group
467 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
468 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
469 ptr is pointing at the \. On exit, it is on the final character of the escape
470 sequence.
471
472 Arguments:
473 ptrptr points to the pattern position pointer
474 errorcodeptr points to the errorcode variable
475 bracount number of previous extracting brackets
476 options the options bits
477 isclass TRUE if inside a character class
478
479 Returns: zero or positive => a data character
480 negative => a special escape sequence
481 on error, errorcodeptr is set
482 */
483
484 static int
485 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
486 int options, BOOL isclass)
487 {
488 BOOL utf8 = (options & PCRE_UTF8) != 0;
489 const uschar *ptr = *ptrptr + 1;
490 int c, i;
491
492 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
493 ptr--; /* Set pointer back to the last byte */
494
495 /* If backslash is at the end of the pattern, it's an error. */
496
497 if (c == 0) *errorcodeptr = ERR1;
498
499 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
500 a table. A non-zero result is something that can be returned immediately.
501 Otherwise further processing may be required. */
502
503 #ifndef EBCDIC /* ASCII coding */
504 else if (c < '0' || c > 'z') {} /* Not alphameric */
505 else if ((i = escapes[c - '0']) != 0) c = i;
506
507 #else /* EBCDIC coding */
508 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
509 else if ((i = escapes[c - 0x48]) != 0) c = i;
510 #endif
511
512 /* Escapes that need further processing, or are illegal. */
513
514 else
515 {
516 const uschar *oldptr;
517 BOOL braced, negated;
518
519 switch (c)
520 {
521 /* A number of Perl escapes are not handled by PCRE. We give an explicit
522 error. */
523
524 case 'l':
525 case 'L':
526 case 'N':
527 case 'u':
528 case 'U':
529 *errorcodeptr = ERR37;
530 break;
531
532 /* \g must be followed by a number, either plain or braced. If positive, it
533 is an absolute backreference. If negative, it is a relative backreference.
534 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
535 reference to a named group. This is part of Perl's movement towards a
536 unified syntax for back references. As this is synonymous with \k{name}, we
537 fudge it up by pretending it really was \k. */
538
539 case 'g':
540 if (ptr[1] == '{')
541 {
542 const uschar *p;
543 for (p = ptr+2; *p != 0 && *p != '}'; p++)
544 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
545 if (*p != 0 && *p != '}')
546 {
547 c = -ESC_k;
548 break;
549 }
550 braced = TRUE;
551 ptr++;
552 }
553 else braced = FALSE;
554
555 if (ptr[1] == '-')
556 {
557 negated = TRUE;
558 ptr++;
559 }
560 else negated = FALSE;
561
562 c = 0;
563 while ((digitab[ptr[1]] & ctype_digit) != 0)
564 c = c * 10 + *(++ptr) - '0';
565
566 if (c < 0)
567 {
568 *errorcodeptr = ERR61;
569 break;
570 }
571
572 if (c == 0 || (braced && *(++ptr) != '}'))
573 {
574 *errorcodeptr = ERR57;
575 break;
576 }
577
578 if (negated)
579 {
580 if (c > bracount)
581 {
582 *errorcodeptr = ERR15;
583 break;
584 }
585 c = bracount - (c - 1);
586 }
587
588 c = -(ESC_REF + c);
589 break;
590
591 /* The handling of escape sequences consisting of a string of digits
592 starting with one that is not zero is not straightforward. By experiment,
593 the way Perl works seems to be as follows:
594
595 Outside a character class, the digits are read as a decimal number. If the
596 number is less than 10, or if there are that many previous extracting
597 left brackets, then it is a back reference. Otherwise, up to three octal
598 digits are read to form an escaped byte. Thus \123 is likely to be octal
599 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
600 value is greater than 377, the least significant 8 bits are taken. Inside a
601 character class, \ followed by a digit is always an octal number. */
602
603 case '1': case '2': case '3': case '4': case '5':
604 case '6': case '7': case '8': case '9':
605
606 if (!isclass)
607 {
608 oldptr = ptr;
609 c -= '0';
610 while ((digitab[ptr[1]] & ctype_digit) != 0)
611 c = c * 10 + *(++ptr) - '0';
612 if (c < 0)
613 {
614 *errorcodeptr = ERR61;
615 break;
616 }
617 if (c < 10 || c <= bracount)
618 {
619 c = -(ESC_REF + c);
620 break;
621 }
622 ptr = oldptr; /* Put the pointer back and fall through */
623 }
624
625 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
626 generates a binary zero byte and treats the digit as a following literal.
627 Thus we have to pull back the pointer by one. */
628
629 if ((c = *ptr) >= '8')
630 {
631 ptr--;
632 c = 0;
633 break;
634 }
635
636 /* \0 always starts an octal number, but we may drop through to here with a
637 larger first octal digit. The original code used just to take the least
638 significant 8 bits of octal numbers (I think this is what early Perls used
639 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
640 than 3 octal digits. */
641
642 case '0':
643 c -= '0';
644 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
645 c = c * 8 + *(++ptr) - '0';
646 if (!utf8 && c > 255) *errorcodeptr = ERR51;
647 break;
648
649 /* \x is complicated. \x{ddd} is a character number which can be greater
650 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
651 treated as a data character. */
652
653 case 'x':
654 if (ptr[1] == '{')
655 {
656 const uschar *pt = ptr + 2;
657 int count = 0;
658
659 c = 0;
660 while ((digitab[*pt] & ctype_xdigit) != 0)
661 {
662 register int cc = *pt++;
663 if (c == 0 && cc == '0') continue; /* Leading zeroes */
664 count++;
665
666 #ifndef EBCDIC /* ASCII coding */
667 if (cc >= 'a') cc -= 32; /* Convert to upper case */
668 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
669 #else /* EBCDIC coding */
670 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
671 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
672 #endif
673 }
674
675 if (*pt == '}')
676 {
677 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
678 ptr = pt;
679 break;
680 }
681
682 /* If the sequence of hex digits does not end with '}', then we don't
683 recognize this construct; fall through to the normal \x handling. */
684 }
685
686 /* Read just a single-byte hex-defined char */
687
688 c = 0;
689 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
690 {
691 int cc; /* Some compilers don't like ++ */
692 cc = *(++ptr); /* in initializers */
693 #ifndef EBCDIC /* ASCII coding */
694 if (cc >= 'a') cc -= 32; /* Convert to upper case */
695 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
696 #else /* EBCDIC coding */
697 if (cc <= 'z') cc += 64; /* Convert to upper case */
698 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
699 #endif
700 }
701 break;
702
703 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
704 This coding is ASCII-specific, but then the whole concept of \cx is
705 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
706
707 case 'c':
708 c = *(++ptr);
709 if (c == 0)
710 {
711 *errorcodeptr = ERR2;
712 break;
713 }
714
715 #ifndef EBCDIC /* ASCII coding */
716 if (c >= 'a' && c <= 'z') c -= 32;
717 c ^= 0x40;
718 #else /* EBCDIC coding */
719 if (c >= 'a' && c <= 'z') c += 64;
720 c ^= 0xC0;
721 #endif
722 break;
723
724 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
725 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
726 for Perl compatibility, it is a literal. This code looks a bit odd, but
727 there used to be some cases other than the default, and there may be again
728 in future, so I haven't "optimized" it. */
729
730 default:
731 if ((options & PCRE_EXTRA) != 0) switch(c)
732 {
733 default:
734 *errorcodeptr = ERR3;
735 break;
736 }
737 break;
738 }
739 }
740
741 *ptrptr = ptr;
742 return c;
743 }
744
745
746
747 #ifdef SUPPORT_UCP
748 /*************************************************
749 * Handle \P and \p *
750 *************************************************/
751
752 /* This function is called after \P or \p has been encountered, provided that
753 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
754 pointing at the P or p. On exit, it is pointing at the final character of the
755 escape sequence.
756
757 Argument:
758 ptrptr points to the pattern position pointer
759 negptr points to a boolean that is set TRUE for negation else FALSE
760 dptr points to an int that is set to the detailed property value
761 errorcodeptr points to the error code variable
762
763 Returns: type value from ucp_type_table, or -1 for an invalid type
764 */
765
766 static int
767 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
768 {
769 int c, i, bot, top;
770 const uschar *ptr = *ptrptr;
771 char name[32];
772
773 c = *(++ptr);
774 if (c == 0) goto ERROR_RETURN;
775
776 *negptr = FALSE;
777
778 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
779 negation. */
780
781 if (c == '{')
782 {
783 if (ptr[1] == '^')
784 {
785 *negptr = TRUE;
786 ptr++;
787 }
788 for (i = 0; i < (int)sizeof(name) - 1; i++)
789 {
790 c = *(++ptr);
791 if (c == 0) goto ERROR_RETURN;
792 if (c == '}') break;
793 name[i] = c;
794 }
795 if (c !='}') goto ERROR_RETURN;
796 name[i] = 0;
797 }
798
799 /* Otherwise there is just one following character */
800
801 else
802 {
803 name[0] = c;
804 name[1] = 0;
805 }
806
807 *ptrptr = ptr;
808
809 /* Search for a recognized property name using binary chop */
810
811 bot = 0;
812 top = _pcre_utt_size;
813
814 while (bot < top)
815 {
816 i = (bot + top) >> 1;
817 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
818 if (c == 0)
819 {
820 *dptr = _pcre_utt[i].value;
821 return _pcre_utt[i].type;
822 }
823 if (c > 0) bot = i + 1; else top = i;
824 }
825
826 *errorcodeptr = ERR47;
827 *ptrptr = ptr;
828 return -1;
829
830 ERROR_RETURN:
831 *errorcodeptr = ERR46;
832 *ptrptr = ptr;
833 return -1;
834 }
835 #endif
836
837
838
839
840 /*************************************************
841 * Check for counted repeat *
842 *************************************************/
843
844 /* This function is called when a '{' is encountered in a place where it might
845 start a quantifier. It looks ahead to see if it really is a quantifier or not.
846 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
847 where the ddds are digits.
848
849 Arguments:
850 p pointer to the first char after '{'
851
852 Returns: TRUE or FALSE
853 */
854
855 static BOOL
856 is_counted_repeat(const uschar *p)
857 {
858 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
859 while ((digitab[*p] & ctype_digit) != 0) p++;
860 if (*p == '}') return TRUE;
861
862 if (*p++ != ',') return FALSE;
863 if (*p == '}') return TRUE;
864
865 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
866 while ((digitab[*p] & ctype_digit) != 0) p++;
867
868 return (*p == '}');
869 }
870
871
872
873 /*************************************************
874 * Read repeat counts *
875 *************************************************/
876
877 /* Read an item of the form {n,m} and return the values. This is called only
878 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
879 so the syntax is guaranteed to be correct, but we need to check the values.
880
881 Arguments:
882 p pointer to first char after '{'
883 minp pointer to int for min
884 maxp pointer to int for max
885 returned as -1 if no max
886 errorcodeptr points to error code variable
887
888 Returns: pointer to '}' on success;
889 current ptr on error, with errorcodeptr set non-zero
890 */
891
892 static const uschar *
893 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
894 {
895 int min = 0;
896 int max = -1;
897
898 /* Read the minimum value and do a paranoid check: a negative value indicates
899 an integer overflow. */
900
901 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
902 if (min < 0 || min > 65535)
903 {
904 *errorcodeptr = ERR5;
905 return p;
906 }
907
908 /* Read the maximum value if there is one, and again do a paranoid on its size.
909 Also, max must not be less than min. */
910
911 if (*p == '}') max = min; else
912 {
913 if (*(++p) != '}')
914 {
915 max = 0;
916 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
917 if (max < 0 || max > 65535)
918 {
919 *errorcodeptr = ERR5;
920 return p;
921 }
922 if (max < min)
923 {
924 *errorcodeptr = ERR4;
925 return p;
926 }
927 }
928 }
929
930 /* Fill in the required variables, and pass back the pointer to the terminating
931 '}'. */
932
933 *minp = min;
934 *maxp = max;
935 return p;
936 }
937
938
939
940 /*************************************************
941 * Find forward referenced subpattern *
942 *************************************************/
943
944 /* This function scans along a pattern's text looking for capturing
945 subpatterns, and counting them. If it finds a named pattern that matches the
946 name it is given, it returns its number. Alternatively, if the name is NULL, it
947 returns when it reaches a given numbered subpattern. This is used for forward
948 references to subpatterns. We know that if (?P< is encountered, the name will
949 be terminated by '>' because that is checked in the first pass.
950
951 Arguments:
952 ptr current position in the pattern
953 count current count of capturing parens so far encountered
954 name name to seek, or NULL if seeking a numbered subpattern
955 lorn name length, or subpattern number if name is NULL
956 xmode TRUE if we are in /x mode
957
958 Returns: the number of the named subpattern, or -1 if not found
959 */
960
961 static int
962 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
963 BOOL xmode)
964 {
965 const uschar *thisname;
966
967 for (; *ptr != 0; ptr++)
968 {
969 int term;
970
971 /* Skip over backslashed characters and also entire \Q...\E */
972
973 if (*ptr == '\\')
974 {
975 if (*(++ptr) == 0) return -1;
976 if (*ptr == 'Q') for (;;)
977 {
978 while (*(++ptr) != 0 && *ptr != '\\');
979 if (*ptr == 0) return -1;
980 if (*(++ptr) == 'E') break;
981 }
982 continue;
983 }
984
985 /* Skip over character classes */
986
987 if (*ptr == '[')
988 {
989 while (*(++ptr) != ']')
990 {
991 if (*ptr == 0) return -1;
992 if (*ptr == '\\')
993 {
994 if (*(++ptr) == 0) return -1;
995 if (*ptr == 'Q') for (;;)
996 {
997 while (*(++ptr) != 0 && *ptr != '\\');
998 if (*ptr == 0) return -1;
999 if (*(++ptr) == 'E') break;
1000 }
1001 continue;
1002 }
1003 }
1004 continue;
1005 }
1006
1007 /* Skip comments in /x mode */
1008
1009 if (xmode && *ptr == '#')
1010 {
1011 while (*(++ptr) != 0 && *ptr != '\n');
1012 if (*ptr == 0) return -1;
1013 continue;
1014 }
1015
1016 /* An opening parens must now be a real metacharacter */
1017
1018 if (*ptr != '(') continue;
1019 if (ptr[1] != '?' && ptr[1] != '*')
1020 {
1021 count++;
1022 if (name == NULL && count == lorn) return count;
1023 continue;
1024 }
1025
1026 ptr += 2;
1027 if (*ptr == 'P') ptr++; /* Allow optional P */
1028
1029 /* We have to disambiguate (?<! and (?<= from (?<name> */
1030
1031 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1032 *ptr != '\'')
1033 continue;
1034
1035 count++;
1036
1037 if (name == NULL && count == lorn) return count;
1038 term = *ptr++;
1039 if (term == '<') term = '>';
1040 thisname = ptr;
1041 while (*ptr != term) ptr++;
1042 if (name != NULL && lorn == ptr - thisname &&
1043 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1044 return count;
1045 }
1046
1047 return -1;
1048 }
1049
1050
1051
1052 /*************************************************
1053 * Find first significant op code *
1054 *************************************************/
1055
1056 /* This is called by several functions that scan a compiled expression looking
1057 for a fixed first character, or an anchoring op code etc. It skips over things
1058 that do not influence this. For some calls, a change of option is important.
1059 For some calls, it makes sense to skip negative forward and all backward
1060 assertions, and also the \b assertion; for others it does not.
1061
1062 Arguments:
1063 code pointer to the start of the group
1064 options pointer to external options
1065 optbit the option bit whose changing is significant, or
1066 zero if none are
1067 skipassert TRUE if certain assertions are to be skipped
1068
1069 Returns: pointer to the first significant opcode
1070 */
1071
1072 static const uschar*
1073 first_significant_code(const uschar *code, int *options, int optbit,
1074 BOOL skipassert)
1075 {
1076 for (;;)
1077 {
1078 switch ((int)*code)
1079 {
1080 case OP_OPT:
1081 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1082 *options = (int)code[1];
1083 code += 2;
1084 break;
1085
1086 case OP_ASSERT_NOT:
1087 case OP_ASSERTBACK:
1088 case OP_ASSERTBACK_NOT:
1089 if (!skipassert) return code;
1090 do code += GET(code, 1); while (*code == OP_ALT);
1091 code += _pcre_OP_lengths[*code];
1092 break;
1093
1094 case OP_WORD_BOUNDARY:
1095 case OP_NOT_WORD_BOUNDARY:
1096 if (!skipassert) return code;
1097 /* Fall through */
1098
1099 case OP_CALLOUT:
1100 case OP_CREF:
1101 case OP_RREF:
1102 case OP_DEF:
1103 code += _pcre_OP_lengths[*code];
1104 break;
1105
1106 default:
1107 return code;
1108 }
1109 }
1110 /* Control never reaches here */
1111 }
1112
1113
1114
1115
1116 /*************************************************
1117 * Find the fixed length of a pattern *
1118 *************************************************/
1119
1120 /* Scan a pattern and compute the fixed length of subject that will match it,
1121 if the length is fixed. This is needed for dealing with backward assertions.
1122 In UTF8 mode, the result is in characters rather than bytes.
1123
1124 Arguments:
1125 code points to the start of the pattern (the bracket)
1126 options the compiling options
1127
1128 Returns: the fixed length, or -1 if there is no fixed length,
1129 or -2 if \C was encountered
1130 */
1131
1132 static int
1133 find_fixedlength(uschar *code, int options)
1134 {
1135 int length = -1;
1136
1137 register int branchlength = 0;
1138 register uschar *cc = code + 1 + LINK_SIZE;
1139
1140 /* Scan along the opcodes for this branch. If we get to the end of the
1141 branch, check the length against that of the other branches. */
1142
1143 for (;;)
1144 {
1145 int d;
1146 register int op = *cc;
1147 switch (op)
1148 {
1149 case OP_CBRA:
1150 case OP_BRA:
1151 case OP_ONCE:
1152 case OP_COND:
1153 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1154 if (d < 0) return d;
1155 branchlength += d;
1156 do cc += GET(cc, 1); while (*cc == OP_ALT);
1157 cc += 1 + LINK_SIZE;
1158 break;
1159
1160 /* Reached end of a branch; if it's a ket it is the end of a nested
1161 call. If it's ALT it is an alternation in a nested call. If it is
1162 END it's the end of the outer call. All can be handled by the same code. */
1163
1164 case OP_ALT:
1165 case OP_KET:
1166 case OP_KETRMAX:
1167 case OP_KETRMIN:
1168 case OP_END:
1169 if (length < 0) length = branchlength;
1170 else if (length != branchlength) return -1;
1171 if (*cc != OP_ALT) return length;
1172 cc += 1 + LINK_SIZE;
1173 branchlength = 0;
1174 break;
1175
1176 /* Skip over assertive subpatterns */
1177
1178 case OP_ASSERT:
1179 case OP_ASSERT_NOT:
1180 case OP_ASSERTBACK:
1181 case OP_ASSERTBACK_NOT:
1182 do cc += GET(cc, 1); while (*cc == OP_ALT);
1183 /* Fall through */
1184
1185 /* Skip over things that don't match chars */
1186
1187 case OP_REVERSE:
1188 case OP_CREF:
1189 case OP_RREF:
1190 case OP_DEF:
1191 case OP_OPT:
1192 case OP_CALLOUT:
1193 case OP_SOD:
1194 case OP_SOM:
1195 case OP_EOD:
1196 case OP_EODN:
1197 case OP_CIRC:
1198 case OP_DOLL:
1199 case OP_NOT_WORD_BOUNDARY:
1200 case OP_WORD_BOUNDARY:
1201 cc += _pcre_OP_lengths[*cc];
1202 break;
1203
1204 /* Handle literal characters */
1205
1206 case OP_CHAR:
1207 case OP_CHARNC:
1208 case OP_NOT:
1209 branchlength++;
1210 cc += 2;
1211 #ifdef SUPPORT_UTF8
1212 if ((options & PCRE_UTF8) != 0)
1213 {
1214 while ((*cc & 0xc0) == 0x80) cc++;
1215 }
1216 #endif
1217 break;
1218
1219 /* Handle exact repetitions. The count is already in characters, but we
1220 need to skip over a multibyte character in UTF8 mode. */
1221
1222 case OP_EXACT:
1223 branchlength += GET2(cc,1);
1224 cc += 4;
1225 #ifdef SUPPORT_UTF8
1226 if ((options & PCRE_UTF8) != 0)
1227 {
1228 while((*cc & 0x80) == 0x80) cc++;
1229 }
1230 #endif
1231 break;
1232
1233 case OP_TYPEEXACT:
1234 branchlength += GET2(cc,1);
1235 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1236 cc += 4;
1237 break;
1238
1239 /* Handle single-char matchers */
1240
1241 case OP_PROP:
1242 case OP_NOTPROP:
1243 cc += 2;
1244 /* Fall through */
1245
1246 case OP_NOT_DIGIT:
1247 case OP_DIGIT:
1248 case OP_NOT_WHITESPACE:
1249 case OP_WHITESPACE:
1250 case OP_NOT_WORDCHAR:
1251 case OP_WORDCHAR:
1252 case OP_ANY:
1253 branchlength++;
1254 cc++;
1255 break;
1256
1257 /* The single-byte matcher isn't allowed */
1258
1259 case OP_ANYBYTE:
1260 return -2;
1261
1262 /* Check a class for variable quantification */
1263
1264 #ifdef SUPPORT_UTF8
1265 case OP_XCLASS:
1266 cc += GET(cc, 1) - 33;
1267 /* Fall through */
1268 #endif
1269
1270 case OP_CLASS:
1271 case OP_NCLASS:
1272 cc += 33;
1273
1274 switch (*cc)
1275 {
1276 case OP_CRSTAR:
1277 case OP_CRMINSTAR:
1278 case OP_CRQUERY:
1279 case OP_CRMINQUERY:
1280 return -1;
1281
1282 case OP_CRRANGE:
1283 case OP_CRMINRANGE:
1284 if (GET2(cc,1) != GET2(cc,3)) return -1;
1285 branchlength += GET2(cc,1);
1286 cc += 5;
1287 break;
1288
1289 default:
1290 branchlength++;
1291 }
1292 break;
1293
1294 /* Anything else is variable length */
1295
1296 default:
1297 return -1;
1298 }
1299 }
1300 /* Control never gets here */
1301 }
1302
1303
1304
1305
1306 /*************************************************
1307 * Scan compiled regex for numbered bracket *
1308 *************************************************/
1309
1310 /* This little function scans through a compiled pattern until it finds a
1311 capturing bracket with the given number.
1312
1313 Arguments:
1314 code points to start of expression
1315 utf8 TRUE in UTF-8 mode
1316 number the required bracket number
1317
1318 Returns: pointer to the opcode for the bracket, or NULL if not found
1319 */
1320
1321 static const uschar *
1322 find_bracket(const uschar *code, BOOL utf8, int number)
1323 {
1324 for (;;)
1325 {
1326 register int c = *code;
1327 if (c == OP_END) return NULL;
1328
1329 /* XCLASS is used for classes that cannot be represented just by a bit
1330 map. This includes negated single high-valued characters. The length in
1331 the table is zero; the actual length is stored in the compiled code. */
1332
1333 if (c == OP_XCLASS) code += GET(code, 1);
1334
1335 /* Handle capturing bracket */
1336
1337 else if (c == OP_CBRA)
1338 {
1339 int n = GET2(code, 1+LINK_SIZE);
1340 if (n == number) return (uschar *)code;
1341 code += _pcre_OP_lengths[c];
1342 }
1343
1344 /* Otherwise, we can get the item's length from the table, except that for
1345 repeated character types, we have to test for \p and \P, which have an extra
1346 two bytes of parameters. */
1347
1348 else
1349 {
1350 switch(c)
1351 {
1352 case OP_TYPESTAR:
1353 case OP_TYPEMINSTAR:
1354 case OP_TYPEPLUS:
1355 case OP_TYPEMINPLUS:
1356 case OP_TYPEQUERY:
1357 case OP_TYPEMINQUERY:
1358 case OP_TYPEPOSSTAR:
1359 case OP_TYPEPOSPLUS:
1360 case OP_TYPEPOSQUERY:
1361 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1362 break;
1363
1364 case OP_TYPEUPTO:
1365 case OP_TYPEMINUPTO:
1366 case OP_TYPEEXACT:
1367 case OP_TYPEPOSUPTO:
1368 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1369 break;
1370 }
1371
1372 /* Add in the fixed length from the table */
1373
1374 code += _pcre_OP_lengths[c];
1375
1376 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1377 a multi-byte character. The length in the table is a minimum, so we have to
1378 arrange to skip the extra bytes. */
1379
1380 #ifdef SUPPORT_UTF8
1381 if (utf8) switch(c)
1382 {
1383 case OP_CHAR:
1384 case OP_CHARNC:
1385 case OP_EXACT:
1386 case OP_UPTO:
1387 case OP_MINUPTO:
1388 case OP_POSUPTO:
1389 case OP_STAR:
1390 case OP_MINSTAR:
1391 case OP_POSSTAR:
1392 case OP_PLUS:
1393 case OP_MINPLUS:
1394 case OP_POSPLUS:
1395 case OP_QUERY:
1396 case OP_MINQUERY:
1397 case OP_POSQUERY:
1398 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1399 break;
1400 }
1401 #endif
1402 }
1403 }
1404 }
1405
1406
1407
1408 /*************************************************
1409 * Scan compiled regex for recursion reference *
1410 *************************************************/
1411
1412 /* This little function scans through a compiled pattern until it finds an
1413 instance of OP_RECURSE.
1414
1415 Arguments:
1416 code points to start of expression
1417 utf8 TRUE in UTF-8 mode
1418
1419 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1420 */
1421
1422 static const uschar *
1423 find_recurse(const uschar *code, BOOL utf8)
1424 {
1425 for (;;)
1426 {
1427 register int c = *code;
1428 if (c == OP_END) return NULL;
1429 if (c == OP_RECURSE) return code;
1430
1431 /* XCLASS is used for classes that cannot be represented just by a bit
1432 map. This includes negated single high-valued characters. The length in
1433 the table is zero; the actual length is stored in the compiled code. */
1434
1435 if (c == OP_XCLASS) code += GET(code, 1);
1436
1437 /* Otherwise, we can get the item's length from the table, except that for
1438 repeated character types, we have to test for \p and \P, which have an extra
1439 two bytes of parameters. */
1440
1441 else
1442 {
1443 switch(c)
1444 {
1445 case OP_TYPESTAR:
1446 case OP_TYPEMINSTAR:
1447 case OP_TYPEPLUS:
1448 case OP_TYPEMINPLUS:
1449 case OP_TYPEQUERY:
1450 case OP_TYPEMINQUERY:
1451 case OP_TYPEPOSSTAR:
1452 case OP_TYPEPOSPLUS:
1453 case OP_TYPEPOSQUERY:
1454 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1455 break;
1456
1457 case OP_TYPEPOSUPTO:
1458 case OP_TYPEUPTO:
1459 case OP_TYPEMINUPTO:
1460 case OP_TYPEEXACT:
1461 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1462 break;
1463 }
1464
1465 /* Add in the fixed length from the table */
1466
1467 code += _pcre_OP_lengths[c];
1468
1469 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1470 by a multi-byte character. The length in the table is a minimum, so we have
1471 to arrange to skip the extra bytes. */
1472
1473 #ifdef SUPPORT_UTF8
1474 if (utf8) switch(c)
1475 {
1476 case OP_CHAR:
1477 case OP_CHARNC:
1478 case OP_EXACT:
1479 case OP_UPTO:
1480 case OP_MINUPTO:
1481 case OP_POSUPTO:
1482 case OP_STAR:
1483 case OP_MINSTAR:
1484 case OP_POSSTAR:
1485 case OP_PLUS:
1486 case OP_MINPLUS:
1487 case OP_POSPLUS:
1488 case OP_QUERY:
1489 case OP_MINQUERY:
1490 case OP_POSQUERY:
1491 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1492 break;
1493 }
1494 #endif
1495 }
1496 }
1497 }
1498
1499
1500
1501 /*************************************************
1502 * Scan compiled branch for non-emptiness *
1503 *************************************************/
1504
1505 /* This function scans through a branch of a compiled pattern to see whether it
1506 can match the empty string or not. It is called from could_be_empty()
1507 below and from compile_branch() when checking for an unlimited repeat of a
1508 group that can match nothing. Note that first_significant_code() skips over
1509 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1510 struck an inner bracket whose current branch will already have been scanned.
1511
1512 Arguments:
1513 code points to start of search
1514 endcode points to where to stop
1515 utf8 TRUE if in UTF8 mode
1516
1517 Returns: TRUE if what is matched could be empty
1518 */
1519
1520 static BOOL
1521 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1522 {
1523 register int c;
1524 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1525 code < endcode;
1526 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1527 {
1528 const uschar *ccode;
1529
1530 c = *code;
1531
1532 /* Groups with zero repeats can of course be empty; skip them. */
1533
1534 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1535 {
1536 code += _pcre_OP_lengths[c];
1537 do code += GET(code, 1); while (*code == OP_ALT);
1538 c = *code;
1539 continue;
1540 }
1541
1542 /* For other groups, scan the branches. */
1543
1544 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1545 {
1546 BOOL empty_branch;
1547 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1548
1549 /* Scan a closed bracket */
1550
1551 empty_branch = FALSE;
1552 do
1553 {
1554 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1555 empty_branch = TRUE;
1556 code += GET(code, 1);
1557 }
1558 while (*code == OP_ALT);
1559 if (!empty_branch) return FALSE; /* All branches are non-empty */
1560 c = *code;
1561 continue;
1562 }
1563
1564 /* Handle the other opcodes */
1565
1566 switch (c)
1567 {
1568 /* Check for quantifiers after a class. XCLASS is used for classes that
1569 cannot be represented just by a bit map. This includes negated single
1570 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1571 actual length is stored in the compiled code, so we must update "code"
1572 here. */
1573
1574 #ifdef SUPPORT_UTF8
1575 case OP_XCLASS:
1576 ccode = code += GET(code, 1);
1577 goto CHECK_CLASS_REPEAT;
1578 #endif
1579
1580 case OP_CLASS:
1581 case OP_NCLASS:
1582 ccode = code + 33;
1583
1584 #ifdef SUPPORT_UTF8
1585 CHECK_CLASS_REPEAT:
1586 #endif
1587
1588 switch (*ccode)
1589 {
1590 case OP_CRSTAR: /* These could be empty; continue */
1591 case OP_CRMINSTAR:
1592 case OP_CRQUERY:
1593 case OP_CRMINQUERY:
1594 break;
1595
1596 default: /* Non-repeat => class must match */
1597 case OP_CRPLUS: /* These repeats aren't empty */
1598 case OP_CRMINPLUS:
1599 return FALSE;
1600
1601 case OP_CRRANGE:
1602 case OP_CRMINRANGE:
1603 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1604 break;
1605 }
1606 break;
1607
1608 /* Opcodes that must match a character */
1609
1610 case OP_PROP:
1611 case OP_NOTPROP:
1612 case OP_EXTUNI:
1613 case OP_NOT_DIGIT:
1614 case OP_DIGIT:
1615 case OP_NOT_WHITESPACE:
1616 case OP_WHITESPACE:
1617 case OP_NOT_WORDCHAR:
1618 case OP_WORDCHAR:
1619 case OP_ANY:
1620 case OP_ANYBYTE:
1621 case OP_CHAR:
1622 case OP_CHARNC:
1623 case OP_NOT:
1624 case OP_PLUS:
1625 case OP_MINPLUS:
1626 case OP_POSPLUS:
1627 case OP_EXACT:
1628 case OP_NOTPLUS:
1629 case OP_NOTMINPLUS:
1630 case OP_NOTPOSPLUS:
1631 case OP_NOTEXACT:
1632 case OP_TYPEPLUS:
1633 case OP_TYPEMINPLUS:
1634 case OP_TYPEPOSPLUS:
1635 case OP_TYPEEXACT:
1636 return FALSE;
1637
1638 /* These are going to continue, as they may be empty, but we have to
1639 fudge the length for the \p and \P cases. */
1640
1641 case OP_TYPESTAR:
1642 case OP_TYPEMINSTAR:
1643 case OP_TYPEPOSSTAR:
1644 case OP_TYPEQUERY:
1645 case OP_TYPEMINQUERY:
1646 case OP_TYPEPOSQUERY:
1647 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1648 break;
1649
1650 /* Same for these */
1651
1652 case OP_TYPEUPTO:
1653 case OP_TYPEMINUPTO:
1654 case OP_TYPEPOSUPTO:
1655 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1656 break;
1657
1658 /* End of branch */
1659
1660 case OP_KET:
1661 case OP_KETRMAX:
1662 case OP_KETRMIN:
1663 case OP_ALT:
1664 return TRUE;
1665
1666 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1667 MINUPTO, and POSUPTO may be followed by a multibyte character */
1668
1669 #ifdef SUPPORT_UTF8
1670 case OP_STAR:
1671 case OP_MINSTAR:
1672 case OP_POSSTAR:
1673 case OP_QUERY:
1674 case OP_MINQUERY:
1675 case OP_POSQUERY:
1676 case OP_UPTO:
1677 case OP_MINUPTO:
1678 case OP_POSUPTO:
1679 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1680 break;
1681 #endif
1682 }
1683 }
1684
1685 return TRUE;
1686 }
1687
1688
1689
1690 /*************************************************
1691 * Scan compiled regex for non-emptiness *
1692 *************************************************/
1693
1694 /* This function is called to check for left recursive calls. We want to check
1695 the current branch of the current pattern to see if it could match the empty
1696 string. If it could, we must look outwards for branches at other levels,
1697 stopping when we pass beyond the bracket which is the subject of the recursion.
1698
1699 Arguments:
1700 code points to start of the recursion
1701 endcode points to where to stop (current RECURSE item)
1702 bcptr points to the chain of current (unclosed) branch starts
1703 utf8 TRUE if in UTF-8 mode
1704
1705 Returns: TRUE if what is matched could be empty
1706 */
1707
1708 static BOOL
1709 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1710 BOOL utf8)
1711 {
1712 while (bcptr != NULL && bcptr->current >= code)
1713 {
1714 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1715 bcptr = bcptr->outer;
1716 }
1717 return TRUE;
1718 }
1719
1720
1721
1722 /*************************************************
1723 * Check for POSIX class syntax *
1724 *************************************************/
1725
1726 /* This function is called when the sequence "[:" or "[." or "[=" is
1727 encountered in a character class. It checks whether this is followed by an
1728 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1729 ".]" or "=]".
1730
1731 Argument:
1732 ptr pointer to the initial [
1733 endptr where to return the end pointer
1734 cd pointer to compile data
1735
1736 Returns: TRUE or FALSE
1737 */
1738
1739 static BOOL
1740 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1741 {
1742 int terminator; /* Don't combine these lines; the Solaris cc */
1743 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1744 if (*(++ptr) == '^') ptr++;
1745 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1746 if (*ptr == terminator && ptr[1] == ']')
1747 {
1748 *endptr = ptr;
1749 return TRUE;
1750 }
1751 return FALSE;
1752 }
1753
1754
1755
1756
1757 /*************************************************
1758 * Check POSIX class name *
1759 *************************************************/
1760
1761 /* This function is called to check the name given in a POSIX-style class entry
1762 such as [:alnum:].
1763
1764 Arguments:
1765 ptr points to the first letter
1766 len the length of the name
1767
1768 Returns: a value representing the name, or -1 if unknown
1769 */
1770
1771 static int
1772 check_posix_name(const uschar *ptr, int len)
1773 {
1774 const char *pn = posix_names;
1775 register int yield = 0;
1776 while (posix_name_lengths[yield] != 0)
1777 {
1778 if (len == posix_name_lengths[yield] &&
1779 strncmp((const char *)ptr, pn, len) == 0) return yield;
1780 pn += posix_name_lengths[yield] + 1;
1781 yield++;
1782 }
1783 return -1;
1784 }
1785
1786
1787 /*************************************************
1788 * Adjust OP_RECURSE items in repeated group *
1789 *************************************************/
1790
1791 /* OP_RECURSE items contain an offset from the start of the regex to the group
1792 that is referenced. This means that groups can be replicated for fixed
1793 repetition simply by copying (because the recursion is allowed to refer to
1794 earlier groups that are outside the current group). However, when a group is
1795 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1796 it, after it has been compiled. This means that any OP_RECURSE items within it
1797 that refer to the group itself or any contained groups have to have their
1798 offsets adjusted. That is one of the jobs of this function. Before it is called,
1799 the partially compiled regex must be temporarily terminated with OP_END.
1800
1801 This function has been extended with the possibility of forward references for
1802 recursions and subroutine calls. It must also check the list of such references
1803 for the group we are dealing with. If it finds that one of the recursions in
1804 the current group is on this list, it adjusts the offset in the list, not the
1805 value in the reference (which is a group number).
1806
1807 Arguments:
1808 group points to the start of the group
1809 adjust the amount by which the group is to be moved
1810 utf8 TRUE in UTF-8 mode
1811 cd contains pointers to tables etc.
1812 save_hwm the hwm forward reference pointer at the start of the group
1813
1814 Returns: nothing
1815 */
1816
1817 static void
1818 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1819 uschar *save_hwm)
1820 {
1821 uschar *ptr = group;
1822
1823 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1824 {
1825 int offset;
1826 uschar *hc;
1827
1828 /* See if this recursion is on the forward reference list. If so, adjust the
1829 reference. */
1830
1831 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1832 {
1833 offset = GET(hc, 0);
1834 if (cd->start_code + offset == ptr + 1)
1835 {
1836 PUT(hc, 0, offset + adjust);
1837 break;
1838 }
1839 }
1840
1841 /* Otherwise, adjust the recursion offset if it's after the start of this
1842 group. */
1843
1844 if (hc >= cd->hwm)
1845 {
1846 offset = GET(ptr, 1);
1847 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1848 }
1849
1850 ptr += 1 + LINK_SIZE;
1851 }
1852 }
1853
1854
1855
1856 /*************************************************
1857 * Insert an automatic callout point *
1858 *************************************************/
1859
1860 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1861 callout points before each pattern item.
1862
1863 Arguments:
1864 code current code pointer
1865 ptr current pattern pointer
1866 cd pointers to tables etc
1867
1868 Returns: new code pointer
1869 */
1870
1871 static uschar *
1872 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1873 {
1874 *code++ = OP_CALLOUT;
1875 *code++ = 255;
1876 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1877 PUT(code, LINK_SIZE, 0); /* Default length */
1878 return code + 2*LINK_SIZE;
1879 }
1880
1881
1882
1883 /*************************************************
1884 * Complete a callout item *
1885 *************************************************/
1886
1887 /* A callout item contains the length of the next item in the pattern, which
1888 we can't fill in till after we have reached the relevant point. This is used
1889 for both automatic and manual callouts.
1890
1891 Arguments:
1892 previous_callout points to previous callout item
1893 ptr current pattern pointer
1894 cd pointers to tables etc
1895
1896 Returns: nothing
1897 */
1898
1899 static void
1900 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1901 {
1902 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1903 PUT(previous_callout, 2 + LINK_SIZE, length);
1904 }
1905
1906
1907
1908 #ifdef SUPPORT_UCP
1909 /*************************************************
1910 * Get othercase range *
1911 *************************************************/
1912
1913 /* This function is passed the start and end of a class range, in UTF-8 mode
1914 with UCP support. It searches up the characters, looking for internal ranges of
1915 characters in the "other" case. Each call returns the next one, updating the
1916 start address.
1917
1918 Arguments:
1919 cptr points to starting character value; updated
1920 d end value
1921 ocptr where to put start of othercase range
1922 odptr where to put end of othercase range
1923
1924 Yield: TRUE when range returned; FALSE when no more
1925 */
1926
1927 static BOOL
1928 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1929 unsigned int *odptr)
1930 {
1931 unsigned int c, othercase, next;
1932
1933 for (c = *cptr; c <= d; c++)
1934 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1935
1936 if (c > d) return FALSE;
1937
1938 *ocptr = othercase;
1939 next = othercase + 1;
1940
1941 for (++c; c <= d; c++)
1942 {
1943 if (_pcre_ucp_othercase(c) != next) break;
1944 next++;
1945 }
1946
1947 *odptr = next - 1;
1948 *cptr = c;
1949
1950 return TRUE;
1951 }
1952 #endif /* SUPPORT_UCP */
1953
1954
1955
1956 /*************************************************
1957 * Check if auto-possessifying is possible *
1958 *************************************************/
1959
1960 /* This function is called for unlimited repeats of certain items, to see
1961 whether the next thing could possibly match the repeated item. If not, it makes
1962 sense to automatically possessify the repeated item.
1963
1964 Arguments:
1965 op_code the repeated op code
1966 item data for this item, depends on the opcode
1967 utf8 TRUE in UTF-8 mode
1968 utf8_char used for utf8 character bytes, NULL if not relevant
1969 ptr next character in pattern
1970 options options bits
1971 cd contains pointers to tables etc.
1972
1973 Returns: TRUE if possessifying is wanted
1974 */
1975
1976 static BOOL
1977 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1978 const uschar *ptr, int options, compile_data *cd)
1979 {
1980 int next;
1981
1982 /* Skip whitespace and comments in extended mode */
1983
1984 if ((options & PCRE_EXTENDED) != 0)
1985 {
1986 for (;;)
1987 {
1988 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1989 if (*ptr == '#')
1990 {
1991 while (*(++ptr) != 0)
1992 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1993 }
1994 else break;
1995 }
1996 }
1997
1998 /* If the next item is one that we can handle, get its value. A non-negative
1999 value is a character, a negative value is an escape value. */
2000
2001 if (*ptr == '\\')
2002 {
2003 int temperrorcode = 0;
2004 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2005 if (temperrorcode != 0) return FALSE;
2006 ptr++; /* Point after the escape sequence */
2007 }
2008
2009 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2010 {
2011 #ifdef SUPPORT_UTF8
2012 if (utf8) { GETCHARINC(next, ptr); } else
2013 #endif
2014 next = *ptr++;
2015 }
2016
2017 else return FALSE;
2018
2019 /* Skip whitespace and comments in extended mode */
2020
2021 if ((options & PCRE_EXTENDED) != 0)
2022 {
2023 for (;;)
2024 {
2025 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2026 if (*ptr == '#')
2027 {
2028 while (*(++ptr) != 0)
2029 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2030 }
2031 else break;
2032 }
2033 }
2034
2035 /* If the next thing is itself optional, we have to give up. */
2036
2037 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2038 return FALSE;
2039
2040 /* Now compare the next item with the previous opcode. If the previous is a
2041 positive single character match, "item" either contains the character or, if
2042 "item" is greater than 127 in utf8 mode, the character's bytes are in
2043 utf8_char. */
2044
2045
2046 /* Handle cases when the next item is a character. */
2047
2048 if (next >= 0) switch(op_code)
2049 {
2050 case OP_CHAR:
2051 #ifdef SUPPORT_UTF8
2052 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2053 #endif
2054 return item != next;
2055
2056 /* For CHARNC (caseless character) we must check the other case. If we have
2057 Unicode property support, we can use it to test the other case of
2058 high-valued characters. */
2059
2060 case OP_CHARNC:
2061 #ifdef SUPPORT_UTF8
2062 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2063 #endif
2064 if (item == next) return FALSE;
2065 #ifdef SUPPORT_UTF8
2066 if (utf8)
2067 {
2068 unsigned int othercase;
2069 if (next < 128) othercase = cd->fcc[next]; else
2070 #ifdef SUPPORT_UCP
2071 othercase = _pcre_ucp_othercase((unsigned int)next);
2072 #else
2073 othercase = NOTACHAR;
2074 #endif
2075 return (unsigned int)item != othercase;
2076 }
2077 else
2078 #endif /* SUPPORT_UTF8 */
2079 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2080
2081 /* For OP_NOT, "item" must be a single-byte character. */
2082
2083 case OP_NOT:
2084 if (next < 0) return FALSE; /* Not a character */
2085 if (item == next) return TRUE;
2086 if ((options & PCRE_CASELESS) == 0) return FALSE;
2087 #ifdef SUPPORT_UTF8
2088 if (utf8)
2089 {
2090 unsigned int othercase;
2091 if (next < 128) othercase = cd->fcc[next]; else
2092 #ifdef SUPPORT_UCP
2093 othercase = _pcre_ucp_othercase(next);
2094 #else
2095 othercase = NOTACHAR;
2096 #endif
2097 return (unsigned int)item == othercase;
2098 }
2099 else
2100 #endif /* SUPPORT_UTF8 */
2101 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2102
2103 case OP_DIGIT:
2104 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2105
2106 case OP_NOT_DIGIT:
2107 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2108
2109 case OP_WHITESPACE:
2110 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2111
2112 case OP_NOT_WHITESPACE:
2113 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2114
2115 case OP_WORDCHAR:
2116 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2117
2118 case OP_NOT_WORDCHAR:
2119 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2120
2121 case OP_HSPACE:
2122 case OP_NOT_HSPACE:
2123 switch(next)
2124 {
2125 case 0x09:
2126 case 0x20:
2127 case 0xa0:
2128 case 0x1680:
2129 case 0x180e:
2130 case 0x2000:
2131 case 0x2001:
2132 case 0x2002:
2133 case 0x2003:
2134 case 0x2004:
2135 case 0x2005:
2136 case 0x2006:
2137 case 0x2007:
2138 case 0x2008:
2139 case 0x2009:
2140 case 0x200A:
2141 case 0x202f:
2142 case 0x205f:
2143 case 0x3000:
2144 return op_code != OP_HSPACE;
2145 default:
2146 return op_code == OP_HSPACE;
2147 }
2148
2149 case OP_VSPACE:
2150 case OP_NOT_VSPACE:
2151 switch(next)
2152 {
2153 case 0x0a:
2154 case 0x0b:
2155 case 0x0c:
2156 case 0x0d:
2157 case 0x85:
2158 case 0x2028:
2159 case 0x2029:
2160 return op_code != OP_VSPACE;
2161 default:
2162 return op_code == OP_VSPACE;
2163 }
2164
2165 default:
2166 return FALSE;
2167 }
2168
2169
2170 /* Handle the case when the next item is \d, \s, etc. */
2171
2172 switch(op_code)
2173 {
2174 case OP_CHAR:
2175 case OP_CHARNC:
2176 #ifdef SUPPORT_UTF8
2177 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2178 #endif
2179 switch(-next)
2180 {
2181 case ESC_d:
2182 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2183
2184 case ESC_D:
2185 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2186
2187 case ESC_s:
2188 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2189
2190 case ESC_S:
2191 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2192
2193 case ESC_w:
2194 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2195
2196 case ESC_W:
2197 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2198
2199 case ESC_h:
2200 case ESC_H:
2201 switch(item)
2202 {
2203 case 0x09:
2204 case 0x20:
2205 case 0xa0:
2206 case 0x1680:
2207 case 0x180e:
2208 case 0x2000:
2209 case 0x2001:
2210 case 0x2002:
2211 case 0x2003:
2212 case 0x2004:
2213 case 0x2005:
2214 case 0x2006:
2215 case 0x2007:
2216 case 0x2008:
2217 case 0x2009:
2218 case 0x200A:
2219 case 0x202f:
2220 case 0x205f:
2221 case 0x3000:
2222 return -next != ESC_h;
2223 default:
2224 return -next == ESC_h;
2225 }
2226
2227 case ESC_v:
2228 case ESC_V:
2229 switch(item)
2230 {
2231 case 0x0a:
2232 case 0x0b:
2233 case 0x0c:
2234 case 0x0d:
2235 case 0x85:
2236 case 0x2028:
2237 case 0x2029:
2238 return -next != ESC_v;
2239 default:
2240 return -next == ESC_v;
2241 }
2242
2243 default:
2244 return FALSE;
2245 }
2246
2247 case OP_DIGIT:
2248 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2249 next == -ESC_h || next == -ESC_v;
2250
2251 case OP_NOT_DIGIT:
2252 return next == -ESC_d;
2253
2254 case OP_WHITESPACE:
2255 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2256
2257 case OP_NOT_WHITESPACE:
2258 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2259
2260 case OP_HSPACE:
2261 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2262
2263 case OP_NOT_HSPACE:
2264 return next == -ESC_h;
2265
2266 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2267 case OP_VSPACE:
2268 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2269
2270 case OP_NOT_VSPACE:
2271 return next == -ESC_v;
2272
2273 case OP_WORDCHAR:
2274 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2275
2276 case OP_NOT_WORDCHAR:
2277 return next == -ESC_w || next == -ESC_d;
2278
2279 default:
2280 return FALSE;
2281 }
2282
2283 /* Control does not reach here */
2284 }
2285
2286
2287
2288 /*************************************************
2289 * Compile one branch *
2290 *************************************************/
2291
2292 /* Scan the pattern, compiling it into a vector. If the options are
2293 changed during the branch, the pointer is used to change the external options
2294 bits. This function is used during the pre-compile phase when we are trying
2295 to find out the amount of memory needed, as well as during the real compile
2296 phase. The value of lengthptr distinguishes the two phases.
2297
2298 Arguments:
2299 optionsptr pointer to the option bits
2300 codeptr points to the pointer to the current code point
2301 ptrptr points to the current pattern pointer
2302 errorcodeptr points to error code variable
2303 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2304 reqbyteptr set to the last literal character required, else < 0
2305 bcptr points to current branch chain
2306 cd contains pointers to tables etc.
2307 lengthptr NULL during the real compile phase
2308 points to length accumulator during pre-compile phase
2309
2310 Returns: TRUE on success
2311 FALSE, with *errorcodeptr set non-zero on error
2312 */
2313
2314 static BOOL
2315 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2316 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2317 compile_data *cd, int *lengthptr)
2318 {
2319 int repeat_type, op_type;
2320 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2321 int bravalue = 0;
2322 int greedy_default, greedy_non_default;
2323 int firstbyte, reqbyte;
2324 int zeroreqbyte, zerofirstbyte;
2325 int req_caseopt, reqvary, tempreqvary;
2326 int options = *optionsptr;
2327 int after_manual_callout = 0;
2328 int length_prevgroup = 0;
2329 register int c;
2330 register uschar *code = *codeptr;
2331 uschar *last_code = code;
2332 uschar *orig_code = code;
2333 uschar *tempcode;
2334 BOOL inescq = FALSE;
2335 BOOL groupsetfirstbyte = FALSE;
2336 const uschar *ptr = *ptrptr;
2337 const uschar *tempptr;
2338 uschar *previous = NULL;
2339 uschar *previous_callout = NULL;
2340 uschar *save_hwm = NULL;
2341 uschar classbits[32];
2342
2343 #ifdef SUPPORT_UTF8
2344 BOOL class_utf8;
2345 BOOL utf8 = (options & PCRE_UTF8) != 0;
2346 uschar *class_utf8data;
2347 uschar utf8_char[6];
2348 #else
2349 BOOL utf8 = FALSE;
2350 uschar *utf8_char = NULL;
2351 #endif
2352
2353 #ifdef DEBUG
2354 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2355 #endif
2356
2357 /* Set up the default and non-default settings for greediness */
2358
2359 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2360 greedy_non_default = greedy_default ^ 1;
2361
2362 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2363 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2364 matches a non-fixed char first char; reqbyte just remains unset if we never
2365 find one.
2366
2367 When we hit a repeat whose minimum is zero, we may have to adjust these values
2368 to take the zero repeat into account. This is implemented by setting them to
2369 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2370 item types that can be repeated set these backoff variables appropriately. */
2371
2372 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2373
2374 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2375 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2376 value > 255. It is added into the firstbyte or reqbyte variables to record the
2377 case status of the value. This is used only for ASCII characters. */
2378
2379 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2380
2381 /* Switch on next character until the end of the branch */
2382
2383 for (;; ptr++)
2384 {
2385 BOOL negate_class;
2386 BOOL should_flip_negation;
2387 BOOL possessive_quantifier;
2388 BOOL is_quantifier;
2389 BOOL is_recurse;
2390 BOOL reset_bracount;
2391 int class_charcount;
2392 int class_lastchar;
2393 int newoptions;
2394 int recno;
2395 int refsign;
2396 int skipbytes;
2397 int subreqbyte;
2398 int subfirstbyte;
2399 int terminator;
2400 int mclength;
2401 uschar mcbuffer[8];
2402
2403 /* Get next byte in the pattern */
2404
2405 c = *ptr;
2406
2407 /* If we are in the pre-compile phase, accumulate the length used for the
2408 previous cycle of this loop. */
2409
2410 if (lengthptr != NULL)
2411 {
2412 #ifdef DEBUG
2413 if (code > cd->hwm) cd->hwm = code; /* High water info */
2414 #endif
2415 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2416 {
2417 *errorcodeptr = ERR52;
2418 goto FAILED;
2419 }
2420
2421 /* There is at least one situation where code goes backwards: this is the
2422 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2423 the class is simply eliminated. However, it is created first, so we have to
2424 allow memory for it. Therefore, don't ever reduce the length at this point.
2425 */
2426
2427 if (code < last_code) code = last_code;
2428
2429 /* Paranoid check for integer overflow */
2430
2431 if (OFLOW_MAX - *lengthptr < code - last_code)
2432 {
2433 *errorcodeptr = ERR20;
2434 goto FAILED;
2435 }
2436
2437 *lengthptr += code - last_code;
2438 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2439
2440 /* If "previous" is set and it is not at the start of the work space, move
2441 it back to there, in order to avoid filling up the work space. Otherwise,
2442 if "previous" is NULL, reset the current code pointer to the start. */
2443
2444 if (previous != NULL)
2445 {
2446 if (previous > orig_code)
2447 {
2448 memmove(orig_code, previous, code - previous);
2449 code -= previous - orig_code;
2450 previous = orig_code;
2451 }
2452 }
2453 else code = orig_code;
2454
2455 /* Remember where this code item starts so we can pick up the length
2456 next time round. */
2457
2458 last_code = code;
2459 }
2460
2461 /* In the real compile phase, just check the workspace used by the forward
2462 reference list. */
2463
2464 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2465 {
2466 *errorcodeptr = ERR52;
2467 goto FAILED;
2468 }
2469
2470 /* If in \Q...\E, check for the end; if not, we have a literal */
2471
2472 if (inescq && c != 0)
2473 {
2474 if (c == '\\' && ptr[1] == 'E')
2475 {
2476 inescq = FALSE;
2477 ptr++;
2478 continue;
2479 }
2480 else
2481 {
2482 if (previous_callout != NULL)
2483 {
2484 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2485 complete_callout(previous_callout, ptr, cd);
2486 previous_callout = NULL;
2487 }
2488 if ((options & PCRE_AUTO_CALLOUT) != 0)
2489 {
2490 previous_callout = code;
2491 code = auto_callout(code, ptr, cd);
2492 }
2493 goto NORMAL_CHAR;
2494 }
2495 }
2496
2497 /* Fill in length of a previous callout, except when the next thing is
2498 a quantifier. */
2499
2500 is_quantifier = c == '*' || c == '+' || c == '?' ||
2501 (c == '{' && is_counted_repeat(ptr+1));
2502
2503 if (!is_quantifier && previous_callout != NULL &&
2504 after_manual_callout-- <= 0)
2505 {
2506 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2507 complete_callout(previous_callout, ptr, cd);
2508 previous_callout = NULL;
2509 }
2510
2511 /* In extended mode, skip white space and comments */
2512
2513 if ((options & PCRE_EXTENDED) != 0)
2514 {
2515 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2516 if (c == '#')
2517 {
2518 while (*(++ptr) != 0)
2519 {
2520 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2521 }
2522 if (*ptr != 0) continue;
2523
2524 /* Else fall through to handle end of string */
2525 c = 0;
2526 }
2527 }
2528
2529 /* No auto callout for quantifiers. */
2530
2531 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2532 {
2533 previous_callout = code;
2534 code = auto_callout(code, ptr, cd);
2535 }
2536
2537 switch(c)
2538 {
2539 /* ===================================================================*/
2540 case 0: /* The branch terminates at string end */
2541 case '|': /* or | or ) */
2542 case ')':
2543 *firstbyteptr = firstbyte;
2544 *reqbyteptr = reqbyte;
2545 *codeptr = code;
2546 *ptrptr = ptr;
2547 if (lengthptr != NULL)
2548 {
2549 if (OFLOW_MAX - *lengthptr < code - last_code)
2550 {
2551 *errorcodeptr = ERR20;
2552 goto FAILED;
2553 }
2554 *lengthptr += code - last_code; /* To include callout length */
2555 DPRINTF((">> end branch\n"));
2556 }
2557 return TRUE;
2558
2559
2560 /* ===================================================================*/
2561 /* Handle single-character metacharacters. In multiline mode, ^ disables
2562 the setting of any following char as a first character. */
2563
2564 case '^':
2565 if ((options & PCRE_MULTILINE) != 0)
2566 {
2567 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2568 }
2569 previous = NULL;
2570 *code++ = OP_CIRC;
2571 break;
2572
2573 case '$':
2574 previous = NULL;
2575 *code++ = OP_DOLL;
2576 break;
2577
2578 /* There can never be a first char if '.' is first, whatever happens about
2579 repeats. The value of reqbyte doesn't change either. */
2580
2581 case '.':
2582 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2583 zerofirstbyte = firstbyte;
2584 zeroreqbyte = reqbyte;
2585 previous = code;
2586 *code++ = OP_ANY;
2587 break;
2588
2589
2590 /* ===================================================================*/
2591 /* Character classes. If the included characters are all < 256, we build a
2592 32-byte bitmap of the permitted characters, except in the special case
2593 where there is only one such character. For negated classes, we build the
2594 map as usual, then invert it at the end. However, we use a different opcode
2595 so that data characters > 255 can be handled correctly.
2596
2597 If the class contains characters outside the 0-255 range, a different
2598 opcode is compiled. It may optionally have a bit map for characters < 256,
2599 but those above are are explicitly listed afterwards. A flag byte tells
2600 whether the bitmap is present, and whether this is a negated class or not.
2601 */
2602
2603 case '[':
2604 previous = code;
2605
2606 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2607 they are encountered at the top level, so we'll do that too. */
2608
2609 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2610 check_posix_syntax(ptr, &tempptr, cd))
2611 {
2612 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2613 goto FAILED;
2614 }
2615
2616 /* If the first character is '^', set the negation flag and skip it. Also,
2617 if the first few characters (either before or after ^) are \Q\E or \E we
2618 skip them too. This makes for compatibility with Perl. */
2619
2620 negate_class = FALSE;
2621 for (;;)
2622 {
2623 c = *(++ptr);
2624 if (c == '\\')
2625 {
2626 if (ptr[1] == 'E') ptr++;
2627 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2628 else break;
2629 }
2630 else if (!negate_class && c == '^')
2631 negate_class = TRUE;
2632 else break;
2633 }
2634
2635 /* If a class contains a negative special such as \S, we need to flip the
2636 negation flag at the end, so that support for characters > 255 works
2637 correctly (they are all included in the class). */
2638
2639 should_flip_negation = FALSE;
2640
2641 /* Keep a count of chars with values < 256 so that we can optimize the case
2642 of just a single character (as long as it's < 256). However, For higher
2643 valued UTF-8 characters, we don't yet do any optimization. */
2644
2645 class_charcount = 0;
2646 class_lastchar = -1;
2647
2648 /* Initialize the 32-char bit map to all zeros. We build the map in a
2649 temporary bit of memory, in case the class contains only 1 character (less
2650 than 256), because in that case the compiled code doesn't use the bit map.
2651 */
2652
2653 memset(classbits, 0, 32 * sizeof(uschar));
2654
2655 #ifdef SUPPORT_UTF8
2656 class_utf8 = FALSE; /* No chars >= 256 */
2657 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2658 #endif
2659
2660 /* Process characters until ] is reached. By writing this as a "do" it
2661 means that an initial ] is taken as a data character. At the start of the
2662 loop, c contains the first byte of the character. */
2663
2664 if (c != 0) do
2665 {
2666 const uschar *oldptr;
2667
2668 #ifdef SUPPORT_UTF8
2669 if (utf8 && c > 127)
2670 { /* Braces are required because the */
2671 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2672 }
2673 #endif
2674
2675 /* Inside \Q...\E everything is literal except \E */
2676
2677 if (inescq)
2678 {
2679 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2680 {
2681 inescq = FALSE; /* Reset literal state */
2682 ptr++; /* Skip the 'E' */
2683 continue; /* Carry on with next */
2684 }
2685 goto CHECK_RANGE; /* Could be range if \E follows */
2686 }
2687
2688 /* Handle POSIX class names. Perl allows a negation extension of the
2689 form [:^name:]. A square bracket that doesn't match the syntax is
2690 treated as a literal. We also recognize the POSIX constructions
2691 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2692 5.6 and 5.8 do. */
2693
2694 if (c == '[' &&
2695 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2696 check_posix_syntax(ptr, &tempptr, cd))
2697 {
2698 BOOL local_negate = FALSE;
2699 int posix_class, taboffset, tabopt;
2700 register const uschar *cbits = cd->cbits;
2701 uschar pbits[32];
2702
2703 if (ptr[1] != ':')
2704 {
2705 *errorcodeptr = ERR31;
2706 goto FAILED;
2707 }
2708
2709 ptr += 2;
2710 if (*ptr == '^')
2711 {
2712 local_negate = TRUE;
2713 should_flip_negation = TRUE; /* Note negative special */
2714 ptr++;
2715 }
2716
2717 posix_class = check_posix_name(ptr, tempptr - ptr);
2718 if (posix_class < 0)
2719 {
2720 *errorcodeptr = ERR30;
2721 goto FAILED;
2722 }
2723
2724 /* If matching is caseless, upper and lower are converted to
2725 alpha. This relies on the fact that the class table starts with
2726 alpha, lower, upper as the first 3 entries. */
2727
2728 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2729 posix_class = 0;
2730
2731 /* We build the bit map for the POSIX class in a chunk of local store
2732 because we may be adding and subtracting from it, and we don't want to
2733 subtract bits that may be in the main map already. At the end we or the
2734 result into the bit map that is being built. */
2735
2736 posix_class *= 3;
2737
2738 /* Copy in the first table (always present) */
2739
2740 memcpy(pbits, cbits + posix_class_maps[posix_class],
2741 32 * sizeof(uschar));
2742
2743 /* If there is a second table, add or remove it as required. */
2744
2745 taboffset = posix_class_maps[posix_class + 1];
2746 tabopt = posix_class_maps[posix_class + 2];
2747
2748 if (taboffset >= 0)
2749 {
2750 if (tabopt >= 0)
2751 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2752 else
2753 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2754 }
2755
2756 /* Not see if we need to remove any special characters. An option
2757 value of 1 removes vertical space and 2 removes underscore. */
2758
2759 if (tabopt < 0) tabopt = -tabopt;
2760 if (tabopt == 1) pbits[1] &= ~0x3c;
2761 else if (tabopt == 2) pbits[11] &= 0x7f;
2762
2763 /* Add the POSIX table or its complement into the main table that is
2764 being built and we are done. */
2765
2766 if (local_negate)
2767 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2768 else
2769 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2770
2771 ptr = tempptr + 1;
2772 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2773 continue; /* End of POSIX syntax handling */
2774 }
2775
2776 /* Backslash may introduce a single character, or it may introduce one
2777 of the specials, which just set a flag. The sequence \b is a special
2778 case. Inside a class (and only there) it is treated as backspace.
2779 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2780 to 'or' into the one we are building. We assume they have more than one
2781 character in them, so set class_charcount bigger than one. */
2782
2783 if (c == '\\')
2784 {
2785 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2786 if (*errorcodeptr != 0) goto FAILED;
2787
2788 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2789 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2790 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2791 else if (-c == ESC_Q) /* Handle start of quoted string */
2792 {
2793 if (ptr[1] == '\\' && ptr[2] == 'E')
2794 {
2795 ptr += 2; /* avoid empty string */
2796 }
2797 else inescq = TRUE;
2798 continue;
2799 }
2800 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2801
2802 if (c < 0)
2803 {
2804 register const uschar *cbits = cd->cbits;
2805 class_charcount += 2; /* Greater than 1 is what matters */
2806
2807 /* Save time by not doing this in the pre-compile phase. */
2808
2809 if (lengthptr == NULL) switch (-c)
2810 {
2811 case ESC_d:
2812 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2813 continue;
2814
2815 case ESC_D:
2816 should_flip_negation = TRUE;
2817 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2818 continue;
2819
2820 case ESC_w:
2821 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2822 continue;
2823
2824 case ESC_W:
2825 should_flip_negation = TRUE;
2826 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2827 continue;
2828
2829 case ESC_s:
2830 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2831 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2832 continue;
2833
2834 case ESC_S:
2835 should_flip_negation = TRUE;
2836 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2837 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2838 continue;
2839
2840 case ESC_E: /* Perl ignores an orphan \E */
2841 continue;
2842
2843 default: /* Not recognized; fall through */
2844 break; /* Need "default" setting to stop compiler warning. */
2845 }
2846
2847 /* In the pre-compile phase, just do the recognition. */
2848
2849 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2850 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2851
2852 /* We need to deal with \H, \h, \V, and \v in both phases because
2853 they use extra memory. */
2854
2855 if (-c == ESC_h)
2856 {
2857 SETBIT(classbits, 0x09); /* VT */
2858 SETBIT(classbits, 0x20); /* SPACE */
2859 SETBIT(classbits, 0xa0); /* NSBP */
2860 #ifdef SUPPORT_UTF8
2861 if (utf8)
2862 {
2863 class_utf8 = TRUE;
2864 *class_utf8data++ = XCL_SINGLE;
2865 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2866 *class_utf8data++ = XCL_SINGLE;
2867 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2868 *class_utf8data++ = XCL_RANGE;
2869 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2870 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2871 *class_utf8data++ = XCL_SINGLE;
2872 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2873 *class_utf8data++ = XCL_SINGLE;
2874 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2875 *class_utf8data++ = XCL_SINGLE;
2876 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2877 }
2878 #endif
2879 continue;
2880 }
2881
2882 if (-c == ESC_H)
2883 {
2884 for (c = 0; c < 32; c++)
2885 {
2886 int x = 0xff;
2887 switch (c)
2888 {
2889 case 0x09/8: x ^= 1 << (0x09%8); break;
2890 case 0x20/8: x ^= 1 << (0x20%8); break;
2891 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2892 default: break;
2893 }
2894 classbits[c] |= x;
2895 }
2896
2897 #ifdef SUPPORT_UTF8
2898 if (utf8)
2899 {
2900 class_utf8 = TRUE;
2901 *class_utf8data++ = XCL_RANGE;
2902 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2903 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2904 *class_utf8data++ = XCL_RANGE;
2905 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2906 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2907 *class_utf8data++ = XCL_RANGE;
2908 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2909 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2910 *class_utf8data++ = XCL_RANGE;
2911 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2912 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2913 *class_utf8data++ = XCL_RANGE;
2914 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2915 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2916 *class_utf8data++ = XCL_RANGE;
2917 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2918 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2919 *class_utf8data++ = XCL_RANGE;
2920 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2921 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2922 }
2923 #endif
2924 continue;
2925 }
2926
2927 if (-c == ESC_v)
2928 {
2929 SETBIT(classbits, 0x0a); /* LF */
2930 SETBIT(classbits, 0x0b); /* VT */
2931 SETBIT(classbits, 0x0c); /* FF */
2932 SETBIT(classbits, 0x0d); /* CR */
2933 SETBIT(classbits, 0x85); /* NEL */
2934 #ifdef SUPPORT_UTF8
2935 if (utf8)
2936 {
2937 class_utf8 = TRUE;
2938 *class_utf8data++ = XCL_RANGE;
2939 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2940 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2941 }
2942 #endif
2943 continue;
2944 }
2945
2946 if (-c == ESC_V)
2947 {
2948 for (c = 0; c < 32; c++)
2949 {
2950 int x = 0xff;
2951 switch (c)
2952 {
2953 case 0x0a/8: x ^= 1 << (0x0a%8);
2954 x ^= 1 << (0x0b%8);
2955 x ^= 1 << (0x0c%8);
2956 x ^= 1 << (0x0d%8);
2957 break;
2958 case 0x85/8: x ^= 1 << (0x85%8); break;
2959 default: break;
2960 }
2961 classbits[c] |= x;
2962 }
2963
2964 #ifdef SUPPORT_UTF8
2965 if (utf8)
2966 {
2967 class_utf8 = TRUE;
2968 *class_utf8data++ = XCL_RANGE;
2969 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2970 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2971 *class_utf8data++ = XCL_RANGE;
2972 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2973 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2974 }
2975 #endif
2976 continue;
2977 }
2978
2979 /* We need to deal with \P and \p in both phases. */
2980
2981 #ifdef SUPPORT_UCP
2982 if (-c == ESC_p || -c == ESC_P)
2983 {
2984 BOOL negated;
2985 int pdata;
2986 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2987 if (ptype < 0) goto FAILED;
2988 class_utf8 = TRUE;
2989 *class_utf8data++ = ((-c == ESC_p) != negated)?
2990 XCL_PROP : XCL_NOTPROP;
2991 *class_utf8data++ = ptype;
2992 *class_utf8data++ = pdata;
2993 class_charcount -= 2; /* Not a < 256 character */
2994 continue;
2995 }
2996 #endif
2997 /* Unrecognized escapes are faulted if PCRE is running in its
2998 strict mode. By default, for compatibility with Perl, they are
2999 treated as literals. */
3000
3001 if ((options & PCRE_EXTRA) != 0)
3002 {
3003 *errorcodeptr = ERR7;
3004 goto FAILED;
3005 }
3006
3007 class_charcount -= 2; /* Undo the default count from above */
3008 c = *ptr; /* Get the final character and fall through */
3009 }
3010
3011 /* Fall through if we have a single character (c >= 0). This may be
3012 greater than 256 in UTF-8 mode. */
3013
3014 } /* End of backslash handling */
3015
3016 /* A single character may be followed by '-' to form a range. However,
3017 Perl does not permit ']' to be the end of the range. A '-' character
3018 at the end is treated as a literal. Perl ignores orphaned \E sequences
3019 entirely. The code for handling \Q and \E is messy. */
3020
3021 CHECK_RANGE:
3022 while (ptr[1] == '\\' && ptr[2] == 'E')
3023 {
3024 inescq = FALSE;
3025 ptr += 2;
3026 }
3027
3028 oldptr = ptr;
3029
3030 /* Remember \r or \n */
3031
3032 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3033
3034 /* Check for range */
3035
3036 if (!inescq && ptr[1] == '-')
3037 {
3038 int d;
3039 ptr += 2;
3040 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3041
3042 /* If we hit \Q (not followed by \E) at this point, go into escaped
3043 mode. */
3044
3045 while (*ptr == '\\' && ptr[1] == 'Q')
3046 {
3047 ptr += 2;
3048 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3049 inescq = TRUE;
3050 break;
3051 }
3052
3053 if (*ptr == 0 || (!inescq && *ptr == ']'))
3054 {
3055 ptr = oldptr;
3056 goto LONE_SINGLE_CHARACTER;
3057 }
3058
3059 #ifdef SUPPORT_UTF8
3060 if (utf8)
3061 { /* Braces are required because the */
3062 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3063 }
3064 else
3065 #endif
3066 d = *ptr; /* Not UTF-8 mode */
3067
3068 /* The second part of a range can be a single-character escape, but
3069 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3070 in such circumstances. */
3071
3072 if (!inescq && d == '\\')
3073 {
3074 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3075 if (*errorcodeptr != 0) goto FAILED;
3076
3077 /* \b is backslash; \X is literal X; \R is literal R; any other
3078 special means the '-' was literal */
3079
3080 if (d < 0)
3081 {
3082 if (d == -ESC_b) d = '\b';
3083 else if (d == -ESC_X) d = 'X';
3084 else if (d == -ESC_R) d = 'R'; else
3085 {
3086 ptr = oldptr;
3087 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3088 }
3089 }
3090 }
3091
3092 /* Check that the two values are in the correct order. Optimize
3093 one-character ranges */
3094
3095 if (d < c)
3096 {
3097 *errorcodeptr = ERR8;
3098 goto FAILED;
3099 }
3100
3101 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3102
3103 /* Remember \r or \n */
3104
3105 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3106
3107 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3108 matching, we have to use an XCLASS with extra data items. Caseless
3109 matching for characters > 127 is available only if UCP support is
3110 available. */
3111
3112 #ifdef SUPPORT_UTF8
3113 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3114 {
3115 class_utf8 = TRUE;
3116
3117 /* With UCP support, we can find the other case equivalents of
3118 the relevant characters. There may be several ranges. Optimize how
3119 they fit with the basic range. */
3120
3121 #ifdef SUPPORT_UCP
3122 if ((options & PCRE_CASELESS) != 0)
3123 {
3124 unsigned int occ, ocd;
3125 unsigned int cc = c;
3126 unsigned int origd = d;
3127 while (get_othercase_range(&cc, origd, &occ, &ocd))
3128 {
3129 if (occ >= (unsigned int)c &&
3130 ocd <= (unsigned int)d)
3131 continue; /* Skip embedded ranges */
3132
3133 if (occ < (unsigned int)c &&
3134 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3135 { /* if there is overlap, */
3136 c = occ; /* noting that if occ < c */
3137 continue; /* we can't have ocd > d */
3138 } /* because a subrange is */
3139 if (ocd > (unsigned int)d &&
3140 occ <= (unsigned int)d + 1) /* always shorter than */
3141 { /* the basic range. */
3142 d = ocd;
3143 continue;
3144 }
3145
3146 if (occ == ocd)
3147 {
3148 *class_utf8data++ = XCL_SINGLE;
3149 }
3150 else
3151 {
3152 *class_utf8data++ = XCL_RANGE;
3153 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3154 }
3155 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3156 }
3157 }
3158 #endif /* SUPPORT_UCP */
3159
3160 /* Now record the original range, possibly modified for UCP caseless
3161 overlapping ranges. */
3162
3163 *class_utf8data++ = XCL_RANGE;
3164 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3165 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3166
3167 /* With UCP support, we are done. Without UCP support, there is no
3168 caseless matching for UTF-8 characters > 127; we can use the bit map
3169 for the smaller ones. */
3170
3171 #ifdef SUPPORT_UCP
3172 continue; /* With next character in the class */
3173 #else
3174 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3175
3176 /* Adjust upper limit and fall through to set up the map */
3177
3178 d = 127;
3179
3180 #endif /* SUPPORT_UCP */
3181 }
3182 #endif /* SUPPORT_UTF8 */
3183
3184 /* We use the bit map for all cases when not in UTF-8 mode; else
3185 ranges that lie entirely within 0-127 when there is UCP support; else
3186 for partial ranges without UCP support. */
3187
3188 class_charcount += d - c + 1;
3189 class_lastchar = d;
3190
3191 /* We can save a bit of time by skipping this in the pre-compile. */
3192
3193 if (lengthptr == NULL) for (; c <= d; c++)
3194 {
3195 classbits[c/8] |= (1 << (c&7));
3196 if ((options & PCRE_CASELESS) != 0)
3197 {
3198 int uc = cd->fcc[c]; /* flip case */
3199 classbits[uc/8] |= (1 << (uc&7));
3200 }
3201 }
3202
3203 continue; /* Go get the next char in the class */
3204 }
3205
3206 /* Handle a lone single character - we can get here for a normal
3207 non-escape char, or after \ that introduces a single character or for an
3208 apparent range that isn't. */
3209
3210 LONE_SINGLE_CHARACTER:
3211
3212 /* Handle a character that cannot go in the bit map */
3213
3214 #ifdef SUPPORT_UTF8
3215 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3216 {
3217 class_utf8 = TRUE;
3218 *class_utf8data++ = XCL_SINGLE;
3219 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3220
3221 #ifdef SUPPORT_UCP
3222 if ((options & PCRE_CASELESS) != 0)
3223 {
3224 unsigned int othercase;
3225 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3226 {
3227 *class_utf8data++ = XCL_SINGLE;
3228 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3229 }
3230 }
3231 #endif /* SUPPORT_UCP */
3232
3233 }
3234 else
3235 #endif /* SUPPORT_UTF8 */
3236
3237 /* Handle a single-byte character */
3238 {
3239 classbits[c/8] |= (1 << (c&7));
3240 if ((options & PCRE_CASELESS) != 0)
3241 {
3242 c = cd->fcc[c]; /* flip case */
3243 classbits[c/8] |= (1 << (c&7));
3244 }
3245 class_charcount++;
3246 class_lastchar = c;
3247 }
3248 }
3249
3250 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3251
3252 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3253
3254 if (c == 0) /* Missing terminating ']' */
3255 {
3256 *errorcodeptr = ERR6;
3257 goto FAILED;
3258 }
3259
3260
3261 /* This code has been disabled because it would mean that \s counts as
3262 an explicit \r or \n reference, and that's not really what is wanted. Now
3263 we set the flag only if there is a literal "\r" or "\n" in the class. */
3264
3265 #if 0
3266 /* Remember whether \r or \n are in this class */
3267
3268 if (negate_class)
3269 {
3270 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3271 }
3272 else
3273 {
3274 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3275 }
3276 #endif
3277
3278
3279 /* If class_charcount is 1, we saw precisely one character whose value is
3280 less than 256. As long as there were no characters >= 128 and there was no
3281 use of \p or \P, in other words, no use of any XCLASS features, we can
3282 optimize.
3283
3284 In UTF-8 mode, we can optimize the negative case only if there were no
3285 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3286 operate on single-bytes only. This is an historical hangover. Maybe one day
3287 we can tidy these opcodes to handle multi-byte characters.
3288
3289 The optimization throws away the bit map. We turn the item into a
3290 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3291 that OP_NOT does not support multibyte characters. In the positive case, it
3292 can cause firstbyte to be set. Otherwise, there can be no first char if
3293 this item is first, whatever repeat count may follow. In the case of
3294 reqbyte, save the previous value for reinstating. */
3295
3296 #ifdef SUPPORT_UTF8
3297 if (class_charcount == 1 && !class_utf8 &&
3298 (!utf8 || !negate_class || class_lastchar < 128))
3299 #else
3300 if (class_charcount == 1)
3301 #endif
3302 {
3303 zeroreqbyte = reqbyte;
3304
3305 /* The OP_NOT opcode works on one-byte characters only. */
3306
3307 if (negate_class)
3308 {
3309 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3310 zerofirstbyte = firstbyte;
3311 *code++ = OP_NOT;
3312 *code++ = class_lastchar;
3313 break;
3314 }
3315
3316 /* For a single, positive character, get the value into mcbuffer, and
3317 then we can handle this with the normal one-character code. */
3318
3319 #ifdef SUPPORT_UTF8
3320 if (utf8 && class_lastchar > 127)
3321 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3322 else
3323 #endif
3324 {
3325 mcbuffer[0] = class_lastchar;
3326 mclength = 1;
3327 }
3328 goto ONE_CHAR;
3329 } /* End of 1-char optimization */
3330
3331 /* The general case - not the one-char optimization. If this is the first
3332 thing in the branch, there can be no first char setting, whatever the
3333 repeat count. Any reqbyte setting must remain unchanged after any kind of
3334 repeat. */
3335
3336 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3337 zerofirstbyte = firstbyte;
3338 zeroreqbyte = reqbyte;
3339
3340 /* If there are characters with values > 255, we have to compile an
3341 extended class, with its own opcode, unless there was a negated special
3342 such as \S in the class, because in that case all characters > 255 are in
3343 the class, so any that were explicitly given as well can be ignored. If
3344 (when there are explicit characters > 255 that must be listed) there are no
3345 characters < 256, we can omit the bitmap in the actual compiled code. */
3346
3347 #ifdef SUPPORT_UTF8
3348 if (class_utf8 && !should_flip_negation)
3349 {
3350 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3351 *code++ = OP_XCLASS;
3352 code += LINK_SIZE;
3353 *code = negate_class? XCL_NOT : 0;
3354
3355 /* If the map is required, move up the extra data to make room for it;
3356 otherwise just move the code pointer to the end of the extra data. */
3357
3358 if (class_charcount > 0)
3359 {
3360 *code++ |= XCL_MAP;
3361 memmove(code + 32, code, class_utf8data - code);
3362 memcpy(code, classbits, 32);
3363 code = class_utf8data + 32;
3364 }
3365 else code = class_utf8data;
3366
3367 /* Now fill in the complete length of the item */
3368
3369 PUT(previous, 1, code - previous);
3370 break; /* End of class handling */
3371 }
3372 #endif
3373
3374 /* If there are no characters > 255, set the opcode to OP_CLASS or
3375 OP_NCLASS, depending on whether the whole class was negated and whether
3376 there were negative specials such as \S in the class. Then copy the 32-byte
3377 map into the code vector, negating it if necessary. */
3378
3379 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3380 if (negate_class)
3381 {
3382 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3383 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3384 }
3385 else
3386 {
3387 memcpy(code, classbits, 32);
3388 }
3389 code += 32;
3390 break;
3391
3392
3393 /* ===================================================================*/
3394 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3395 has been tested above. */
3396
3397 case '{':
3398 if (!is_quantifier) goto NORMAL_CHAR;
3399 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3400 if (*errorcodeptr != 0) goto FAILED;
3401 goto REPEAT;
3402
3403 case '*':
3404 repeat_min = 0;
3405 repeat_max = -1;
3406 goto REPEAT;
3407
3408 case '+':
3409 repeat_min = 1;
3410 repeat_max = -1;
3411 goto REPEAT;
3412
3413 case '?':
3414 repeat_min = 0;
3415 repeat_max = 1;
3416
3417 REPEAT:
3418 if (previous == NULL)
3419 {
3420 *errorcodeptr = ERR9;
3421 goto FAILED;
3422 }
3423
3424 if (repeat_min == 0)
3425 {
3426 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3427 reqbyte = zeroreqbyte; /* Ditto */
3428 }
3429
3430 /* Remember whether this is a variable length repeat */
3431
3432 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3433
3434 op_type = 0; /* Default single-char op codes */
3435 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3436
3437 /* Save start of previous item, in case we have to move it up to make space
3438 for an inserted OP_ONCE for the additional '+' extension. */
3439
3440 tempcode = previous;
3441
3442 /* If the next character is '+', we have a possessive quantifier. This
3443 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3444 If the next character is '?' this is a minimizing repeat, by default,
3445 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3446 repeat type to the non-default. */
3447
3448 if (ptr[1] == '+')
3449 {
3450 repeat_type = 0; /* Force greedy */
3451 possessive_quantifier = TRUE;
3452 ptr++;
3453 }
3454 else if (ptr[1] == '?')
3455 {
3456 repeat_type = greedy_non_default;
3457 ptr++;
3458 }
3459 else repeat_type = greedy_default;
3460
3461 /* If previous was a character match, abolish the item and generate a
3462 repeat item instead. If a char item has a minimum of more than one, ensure
3463 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3464 the first thing in a branch because the x will have gone into firstbyte
3465 instead. */
3466
3467 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3468 {
3469 /* Deal with UTF-8 characters that take up more than one byte. It's
3470 easier to write this out separately than try to macrify it. Use c to
3471 hold the length of the character in bytes, plus 0x80 to flag that it's a
3472 length rather than a small character. */
3473
3474 #ifdef SUPPORT_UTF8
3475 if (utf8 && (code[-1] & 0x80) != 0)
3476 {
3477 uschar *lastchar = code - 1;
3478 while((*lastchar & 0xc0) == 0x80) lastchar--;
3479 c = code - lastchar; /* Length of UTF-8 character */
3480 memcpy(utf8_char, lastchar, c); /* Save the char */
3481 c |= 0x80; /* Flag c as a length */
3482 }
3483 else
3484 #endif
3485
3486 /* Handle the case of a single byte - either with no UTF8 support, or
3487 with UTF-8 disabled, or for a UTF-8 character < 128. */
3488
3489 {
3490 c = code[-1];
3491 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3492 }
3493
3494 /* If the repetition is unlimited, it pays to see if the next thing on
3495 the line is something that cannot possibly match this character. If so,
3496 automatically possessifying this item gains some performance in the case
3497 where the match fails. */
3498
3499 if (!possessive_quantifier &&
3500 repeat_max < 0 &&
3501 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3502 options, cd))
3503 {
3504 repeat_type = 0; /* Force greedy */
3505 possessive_quantifier = TRUE;
3506 }
3507
3508 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3509 }
3510
3511 /* If previous was a single negated character ([^a] or similar), we use
3512 one of the special opcodes, replacing it. The code is shared with single-
3513 character repeats by setting op_type to add a suitable offset into
3514 repeat_type. We can also test for auto-possessification. OP_NOT is
3515 currently used only for single-byte chars. */
3516
3517 else if (*previous == OP_NOT)
3518 {
3519 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3520 c = previous[1];
3521 if (!possessive_quantifier &&
3522 repeat_max < 0 &&
3523 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3524 {
3525 repeat_type = 0; /* Force greedy */
3526 possessive_quantifier = TRUE;
3527 }
3528 goto OUTPUT_SINGLE_REPEAT;
3529 }
3530
3531 /* If previous was a character type match (\d or similar), abolish it and
3532 create a suitable repeat item. The code is shared with single-character
3533 repeats by setting op_type to add a suitable offset into repeat_type. Note
3534 that the Unicode property types will be present only when SUPPORT_UCP is
3535 defined, but we don't wrap the little bits of code here because it just
3536 makes it horribly messy. */
3537
3538 else if (*previous < OP_EODN)
3539 {
3540 uschar *oldcode;
3541 int prop_type, prop_value;
3542 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3543 c = *previous;
3544
3545 if (!possessive_quantifier &&
3546 repeat_max < 0 &&
3547 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3548 {
3549 repeat_type = 0; /* Force greedy */
3550 possessive_quantifier = TRUE;
3551 }
3552
3553 OUTPUT_SINGLE_REPEAT:
3554 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3555 {
3556 prop_type = previous[1];
3557 prop_value = previous[2];
3558 }
3559 else prop_type = prop_value = -1;
3560
3561 oldcode = code;
3562 code = previous; /* Usually overwrite previous item */
3563
3564 /* If the maximum is zero then the minimum must also be zero; Perl allows
3565 this case, so we do too - by simply omitting the item altogether. */
3566
3567 if (repeat_max == 0) goto END_REPEAT;
3568
3569 /* All real repeats make it impossible to handle partial matching (maybe
3570 one day we will be able to remove this restriction). */
3571
3572 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3573
3574 /* Combine the op_type with the repeat_type */
3575
3576 repeat_type += op_type;
3577
3578 /* A minimum of zero is handled either as the special case * or ?, or as
3579 an UPTO, with the maximum given. */
3580
3581 if (repeat_min == 0)
3582 {
3583 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3584 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3585 else
3586 {
3587 *code++ = OP_UPTO + repeat_type;
3588 PUT2INC(code, 0, repeat_max);
3589 }
3590 }
3591
3592 /* A repeat minimum of 1 is optimized into some special cases. If the
3593 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3594 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3595 one less than the maximum. */
3596
3597 else if (repeat_min == 1)
3598 {
3599 if (repeat_max == -1)
3600 *code++ = OP_PLUS + repeat_type;
3601 else
3602 {
3603 code = oldcode; /* leave previous item in place */
3604 if (repeat_max == 1) goto END_REPEAT;
3605 *code++ = OP_UPTO + repeat_type;
3606 PUT2INC(code, 0, repeat_max - 1);
3607 }
3608 }
3609
3610 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3611 handled as an EXACT followed by an UPTO. */
3612
3613 else
3614 {
3615 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3616 PUT2INC(code, 0, repeat_min);
3617
3618 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3619 we have to insert the character for the previous code. For a repeated
3620 Unicode property match, there are two extra bytes that define the
3621 required property. In UTF-8 mode, long characters have their length in
3622 c, with the 0x80 bit as a flag. */
3623
3624 if (repeat_max < 0)
3625 {
3626 #ifdef SUPPORT_UTF8
3627 if (utf8 && c >= 128)
3628 {
3629 memcpy(code, utf8_char, c & 7);
3630 code += c & 7;
3631 }
3632 else
3633 #endif
3634 {
3635 *code++ = c;
3636 if (prop_type >= 0)
3637 {
3638 *code++ = prop_type;
3639 *code++ = prop_value;
3640 }
3641 }
3642 *code++ = OP_STAR + repeat_type;
3643 }
3644
3645 /* Else insert an UPTO if the max is greater than the min, again
3646 preceded by the character, for the previously inserted code. If the
3647 UPTO is just for 1 instance, we can use QUERY instead. */
3648
3649 else if (repeat_max != repeat_min)
3650 {
3651 #ifdef SUPPORT_UTF8
3652 if (utf8 && c >= 128)
3653 {
3654 memcpy(code, utf8_char, c & 7);
3655 code += c & 7;
3656 }
3657 else
3658 #endif
3659 *code++ = c;
3660 if (prop_type >= 0)
3661 {
3662 *code++ = prop_type;
3663 *code++ = prop_value;
3664 }
3665 repeat_max -= repeat_min;
3666
3667 if (repeat_max == 1)
3668 {
3669 *code++ = OP_QUERY + repeat_type;
3670 }
3671 else
3672 {
3673 *code++ = OP_UPTO + repeat_type;
3674 PUT2INC(code, 0, repeat_max);
3675 }
3676 }
3677 }
3678
3679 /* The character or character type itself comes last in all cases. */
3680
3681 #ifdef SUPPORT_UTF8
3682 if (utf8 && c >= 128)
3683 {
3684 memcpy(code, utf8_char, c & 7);
3685 code += c & 7;
3686 }
3687 else
3688 #endif
3689 *code++ = c;
3690
3691 /* For a repeated Unicode property match, there are two extra bytes that
3692 define the required property. */
3693
3694 #ifdef SUPPORT_UCP
3695 if (prop_type >= 0)
3696 {
3697 *code++ = prop_type;
3698 *code++ = prop_value;
3699 }
3700 #endif
3701 }
3702
3703 /* If previous was a character class or a back reference, we put the repeat
3704 stuff after it, but just skip the item if the repeat was {0,0}. */
3705
3706 else if (*previous == OP_CLASS ||
3707 *previous == OP_NCLASS ||
3708 #ifdef SUPPORT_UTF8
3709 *previous == OP_XCLASS ||
3710 #endif
3711 *previous == OP_REF)
3712 {
3713 if (repeat_max == 0)
3714 {
3715 code = previous;
3716 goto END_REPEAT;
3717 }
3718
3719 /* All real repeats make it impossible to handle partial matching (maybe
3720 one day we will be able to remove this restriction). */
3721
3722 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3723
3724 if (repeat_min == 0 && repeat_max == -1)
3725 *code++ = OP_CRSTAR + repeat_type;
3726 else if (repeat_min == 1 && repeat_max == -1)
3727 *code++ = OP_CRPLUS + repeat_type;
3728 else if (repeat_min == 0 && repeat_max == 1)
3729 *code++ = OP_CRQUERY + repeat_type;
3730 else
3731 {
3732 *code++ = OP_CRRANGE + repeat_type;
3733 PUT2INC(code, 0, repeat_min);
3734 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3735 PUT2INC(code, 0, repeat_max);
3736 }
3737 }
3738
3739 /* If previous was a bracket group, we may have to replicate it in certain
3740 cases. */
3741
3742 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3743 *previous == OP_ONCE || *previous == OP_COND)
3744 {
3745 register int i;
3746 int ketoffset = 0;
3747 int len = code - previous;
3748 uschar *bralink = NULL;
3749
3750 /* Repeating a DEFINE group is pointless */
3751
3752 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3753 {
3754 *errorcodeptr = ERR55;
3755 goto FAILED;
3756 }
3757
3758 /* If the maximum repeat count is unlimited, find the end of the bracket
3759 by scanning through from the start, and compute the offset back to it
3760 from the current code pointer. There may be an OP_OPT setting following
3761 the final KET, so we can't find the end just by going back from the code
3762 pointer. */
3763
3764 if (repeat_max == -1)
3765 {
3766 register uschar *ket = previous;
3767 do ket += GET(ket, 1); while (*ket != OP_KET);
3768 ketoffset = code - ket;
3769 }
3770
3771 /* The case of a zero minimum is special because of the need to stick
3772 OP_BRAZERO in front of it, and because the group appears once in the
3773 data, whereas in other cases it appears the minimum number of times. For
3774 this reason, it is simplest to treat this case separately, as otherwise
3775 the code gets far too messy. There are several special subcases when the
3776 minimum is zero. */
3777
3778 if (repeat_min == 0)
3779 {
3780 /* If the maximum is also zero, we just omit the group from the output
3781 altogether. */
3782
3783 if (repeat_max == 0)
3784 {
3785 code = previous;
3786 goto END_REPEAT;
3787 }
3788
3789 /* If the maximum is 1 or unlimited, we just have to stick in the
3790 BRAZERO and do no more at this point. However, we do need to adjust
3791 any OP_RECURSE calls inside the group that refer to the group itself or
3792 any internal or forward referenced group, because the offset is from
3793 the start of the whole regex. Temporarily terminate the pattern while
3794 doing this. */
3795
3796 if (repeat_max <= 1)
3797 {
3798 *code = OP_END;
3799 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3800 memmove(previous+1, previous, len);
3801 code++;
3802 *previous++ = OP_BRAZERO + repeat_type;
3803 }
3804
3805 /* If the maximum is greater than 1 and limited, we have to replicate
3806 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3807 The first one has to be handled carefully because it's the original
3808 copy, which has to be moved up. The remainder can be handled by code
3809 that is common with the non-zero minimum case below. We have to
3810 adjust the value of repeat_max, since one less copy is required. Once
3811 again, we may have to adjust any OP_RECURSE calls inside the group. */
3812
3813 else
3814 {
3815 int offset;
3816 *code = OP_END;
3817 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3818 memmove(previous + 2 + LINK_SIZE, previous, len);
3819 code += 2 + LINK_SIZE;
3820 *previous++ = OP_BRAZERO + repeat_type;
3821 *previous++ = OP_BRA;
3822
3823 /* We chain together the bracket offset fields that have to be
3824 filled in later when the ends of the brackets are reached. */
3825
3826 offset = (bralink == NULL)? 0 : previous - bralink;
3827 bralink = previous;
3828 PUTINC(previous, 0, offset);
3829 }
3830
3831 repeat_max--;
3832 }
3833
3834 /* If the minimum is greater than zero, replicate the group as many
3835 times as necessary, and adjust the maximum to the number of subsequent
3836 copies that we need. If we set a first char from the group, and didn't
3837 set a required char, copy the latter from the former. If there are any
3838 forward reference subroutine calls in the group, there will be entries on
3839 the workspace list; replicate these with an appropriate increment. */
3840
3841 else
3842 {
3843 if (repeat_min > 1)
3844 {
3845 /* In the pre-compile phase, we don't actually do the replication. We
3846 just adjust the length as if we had. Do some paranoid checks for
3847 potential integer overflow. */
3848
3849 if (lengthptr != NULL)
3850 {
3851 int delta = (repeat_min - 1)*length_prevgroup;
3852 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3853 (double)INT_MAX ||
3854 OFLOW_MAX - *lengthptr < delta)
3855 {
3856 *errorcodeptr = ERR20;
3857 goto FAILED;
3858 }
3859 *lengthptr += delta;
3860 }
3861
3862 /* This is compiling for real */
3863
3864 else
3865 {
3866 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3867 for (i = 1; i < repeat_min; i++)
3868 {
3869 uschar *hc;
3870 uschar *this_hwm = cd->hwm;
3871 memcpy(code, previous, len);
3872 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3873 {
3874 PUT(cd->hwm, 0, GET(hc, 0) + len);
3875 cd->hwm += LINK_SIZE;
3876 }
3877 save_hwm = this_hwm;
3878 code += len;
3879 }
3880 }
3881 }
3882
3883 if (repeat_max > 0) repeat_max -= repeat_min;
3884 }
3885
3886 /* This code is common to both the zero and non-zero minimum cases. If
3887 the maximum is limited, it replicates the group in a nested fashion,
3888 remembering the bracket starts on a stack. In the case of a zero minimum,
3889 the first one was set up above. In all cases the repeat_max now specifies
3890 the number of additional copies needed. Again, we must remember to
3891 replicate entries on the forward reference list. */
3892
3893 if (repeat_max >= 0)
3894 {
3895 /* In the pre-compile phase, we don't actually do the replication. We
3896 just adjust the length as if we had. For each repetition we must add 1
3897 to the length for BRAZERO and for all but the last repetition we must
3898 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3899 paranoid checks to avoid integer overflow. */
3900
3901 if (lengthptr != NULL && repeat_max > 0)
3902 {
3903 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3904 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3905 if ((double)repeat_max *
3906 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3907 > (double)INT_MAX ||
3908 OFLOW_MAX - *lengthptr < delta)
3909 {
3910 *errorcodeptr = ERR20;
3911 goto FAILED;
3912 }
3913 *lengthptr += delta;
3914 }
3915
3916 /* This is compiling for real */
3917
3918 else for (i = repeat_max - 1; i >= 0; i--)
3919 {
3920 uschar *hc;
3921 uschar *this_hwm = cd->hwm;
3922
3923 *code++ = OP_BRAZERO + repeat_type;
3924
3925 /* All but the final copy start a new nesting, maintaining the
3926 chain of brackets outstanding. */
3927
3928 if (i != 0)
3929 {
3930 int offset;
3931 *code++ = OP_BRA;
3932 offset = (bralink == NULL)? 0 : code - bralink;
3933 bralink = code;
3934 PUTINC(code, 0, offset);
3935 }
3936
3937 memcpy(code, previous, len);
3938 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3939 {
3940 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3941 cd->hwm += LINK_SIZE;
3942 }
3943 save_hwm = this_hwm;
3944 code += len;
3945 }
3946
3947 /* Now chain through the pending brackets, and fill in their length
3948 fields (which are holding the chain links pro tem). */
3949
3950 while (bralink != NULL)
3951 {
3952 int oldlinkoffset;
3953 int offset = code - bralink + 1;
3954 uschar *bra = code - offset;
3955 oldlinkoffset = GET(bra, 1);
3956 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3957 *code++ = OP_KET;
3958 PUTINC(code, 0, offset);
3959 PUT(bra, 1, offset);
3960 }
3961 }
3962
3963 /* If the maximum is unlimited, set a repeater in the final copy. We
3964 can't just offset backwards from the current code point, because we
3965 don't know if there's been an options resetting after the ket. The
3966 correct offset was computed above.
3967
3968 Then, when we are doing the actual compile phase, check to see whether
3969 this group is a non-atomic one that could match an empty string. If so,
3970 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3971 that runtime checking can be done. [This check is also applied to
3972 atomic groups at runtime, but in a different way.] */
3973
3974 else
3975 {
3976 uschar *ketcode = code - ketoffset;
3977 uschar *bracode = ketcode - GET(ketcode, 1);
3978 *ketcode = OP_KETRMAX + repeat_type;
3979 if (lengthptr == NULL && *bracode != OP_ONCE)
3980 {
3981 uschar *scode = bracode;
3982 do
3983 {
3984 if (could_be_empty_branch(scode, ketcode, utf8))
3985 {
3986 *bracode += OP_SBRA - OP_BRA;
3987 break;
3988 }
3989 scode += GET(scode, 1);
3990 }
3991 while (*scode == OP_ALT);
3992 }
3993 }
3994 }
3995
3996 /* Else there's some kind of shambles */
3997
3998 else
3999 {
4000 *errorcodeptr = ERR11;
4001 goto FAILED;
4002 }
4003
4004 /* If the character following a repeat is '+', or if certain optimization
4005 tests above succeeded, possessive_quantifier is TRUE. For some of the
4006 simpler opcodes, there is a special alternative opcode for this. For
4007 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4008 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4009 but the special opcodes can optimize it a bit. The repeated item starts at
4010 tempcode, not at previous, which might be the first part of a string whose
4011 (former) last char we repeated.
4012
4013 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4014 an 'upto' may follow. We skip over an 'exact' item, and then test the
4015 length of what remains before proceeding. */
4016
4017 if (possessive_quantifier)
4018 {
4019 int len;
4020 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4021 *tempcode == OP_NOTEXACT)
4022 tempcode += _pcre_OP_lengths[*tempcode];
4023 len = code - tempcode;
4024 if (len > 0) switch (*tempcode)
4025 {
4026 case OP_STAR: *tempcode = OP_POSSTAR; break;
4027 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4028 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4029 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4030
4031 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4032 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4033 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4034 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4035
4036 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4037 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4038 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4039 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4040
4041 default:
4042 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4043 code += 1 + LINK_SIZE;
4044 len += 1 + LINK_SIZE;
4045 tempcode[0] = OP_ONCE;
4046 *code++ = OP_KET;
4047 PUTINC(code, 0, len);
4048 PUT(tempcode, 1, len);
4049 break;
4050 }
4051 }
4052
4053 /* In all cases we no longer have a previous item. We also set the
4054 "follows varying string" flag for subsequently encountered reqbytes if
4055 it isn't already set and we have just passed a varying length item. */
4056
4057 END_REPEAT:
4058 previous = NULL;
4059 cd->req_varyopt |= reqvary;
4060 break;
4061
4062
4063 /* ===================================================================*/
4064 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4065 lookbehind or option setting or condition or all the other extended
4066 parenthesis forms. */
4067
4068 case '(':
4069 newoptions = options;
4070 skipbytes = 0;
4071 bravalue = OP_CBRA;
4072 save_hwm = cd->hwm;
4073 reset_bracount = FALSE;
4074
4075 /* First deal with various "verbs" that can be introduced by '*'. */
4076
4077 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4078 {
4079 int i, namelen;
4080 const char *vn = verbnames;
4081 const uschar *name = ++ptr;
4082 previous = NULL;
4083 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4084 if (*ptr == ':')
4085 {
4086 *errorcodeptr = ERR59; /* Not supported */
4087 goto FAILED;
4088 }
4089 if (*ptr != ')')
4090 {
4091 *errorcodeptr = ERR60;
4092 goto FAILED;
4093 }
4094 namelen = ptr - name;
4095 for (i = 0; i < verbcount; i++)
4096 {
4097 if (namelen == verbs[i].len &&
4098 strncmp((char *)name, vn, namelen) == 0)
4099 {
4100 *code = verbs[i].op;
4101 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4102 break;
4103 }
4104 vn += verbs[i].len + 1;
4105 }
4106 if (i < verbcount) continue;
4107 *errorcodeptr = ERR60;
4108 goto FAILED;
4109 }
4110
4111 /* Deal with the extended parentheses; all are introduced by '?', and the
4112 appearance of any of them means that this is not a capturing group. */
4113
4114 else if (*ptr == '?')
4115 {
4116 int i, set, unset, namelen;
4117 int *optset;
4118 const uschar *name;
4119 uschar *slot;
4120
4121 switch (*(++ptr))
4122 {
4123 case '#': /* Comment; skip to ket */
4124 ptr++;
4125 while (*ptr != 0 && *ptr != ')') ptr++;
4126 if (*ptr == 0)
4127 {
4128 *errorcodeptr = ERR18;
4129 goto FAILED;
4130 }
4131 continue;
4132
4133
4134 /* ------------------------------------------------------------ */
4135 case '|': /* Reset capture count for each branch */
4136 reset_bracount = TRUE;
4137 /* Fall through */
4138
4139 /* ------------------------------------------------------------ */
4140 case ':': /* Non-capturing bracket */
4141 bravalue = OP_BRA;
4142 ptr++;
4143 break;
4144
4145
4146 /* ------------------------------------------------------------ */
4147 case '(':
4148 bravalue = OP_COND; /* Conditional group */
4149
4150 /* A condition can be an assertion, a number (referring to a numbered
4151 group), a name (referring to a named group), or 'R', referring to
4152 recursion. R<digits> and R&name are also permitted for recursion tests.
4153
4154 There are several syntaxes for testing a named group: (?(name)) is used
4155 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4156
4157 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4158 be the recursive thing or the name 'R' (and similarly for 'R' followed
4159 by digits), and (b) a number could be a name that consists of digits.
4160 In both cases, we look for a name first; if not found, we try the other
4161 cases. */
4162
4163 /* For conditions that are assertions, check the syntax, and then exit
4164 the switch. This will take control down to where bracketed groups,
4165 including assertions, are processed. */
4166
4167 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4168 break;
4169
4170 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4171 below), and all need to skip 3 bytes at the start of the group. */
4172
4173 code[1+LINK_SIZE] = OP_CREF;
4174 skipbytes = 3;
4175 refsign = -1;
4176
4177 /* Check for a test for recursion in a named group. */
4178
4179 if (ptr[1] == 'R' && ptr[2] == '&')
4180 {
4181 terminator = -1;
4182 ptr += 2;
4183 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4184 }
4185
4186 /* Check for a test for a named group's having been set, using the Perl
4187 syntax (?(<name>) or (?('name') */
4188
4189 else if (ptr[1] == '<')
4190 {
4191 terminator = '>';
4192 ptr++;
4193 }
4194 else if (ptr[1] == '\'')
4195 {
4196 terminator = '\'';
4197 ptr++;
4198 }
4199 else
4200 {
4201 terminator = 0;
4202 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4203 }
4204
4205 /* We now expect to read a name; anything else is an error */
4206
4207 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4208 {
4209 ptr += 1; /* To get the right offset */
4210 *errorcodeptr = ERR28;
4211 goto FAILED;
4212 }
4213
4214 /* Read the name, but also get it as a number if it's all digits */
4215
4216 recno = 0;
4217 name = ++ptr;
4218 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4219 {
4220 if (recno >= 0)
4221 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4222 recno * 10 + *ptr - '0' : -1;
4223 ptr++;
4224 }
4225 namelen = ptr - name;
4226
4227 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4228 {
4229 ptr--; /* Error offset */
4230 *errorcodeptr = ERR26;
4231 goto FAILED;
4232 }
4233
4234 /* Do no further checking in the pre-compile phase. */
4235
4236 if (lengthptr != NULL) break;
4237
4238 /* In the real compile we do the work of looking for the actual
4239 reference. If the string started with "+" or "-" we require the rest to
4240 be digits, in which case recno will be set. */
4241
4242 if (refsign > 0)
4243 {
4244 if (recno <= 0)
4245 {
4246 *errorcodeptr = ERR58;
4247 goto FAILED;
4248 }
4249 if (refsign == '-')
4250 {
4251 recno = cd->bracount - recno + 1;
4252 if (recno <= 0)
4253 {
4254 *errorcodeptr = ERR15;
4255 goto FAILED;
4256 }
4257 }
4258 else recno += cd->bracount;
4259 PUT2(code, 2+LINK_SIZE, recno);
4260 break;
4261 }
4262
4263 /* Otherwise (did not start with "+" or "-"), start by looking for the
4264 name. */
4265
4266 slot = cd->name_table;
4267 for (i = 0; i < cd->names_found; i++)
4268 {
4269 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4270 slot += cd->name_entry_size;
4271 }
4272
4273 /* Found a previous named subpattern */
4274
4275 if (i < cd->names_found)
4276 {
4277 recno = GET2(slot, 0);
4278 PUT2(code, 2+LINK_SIZE, recno);
4279 }
4280
4281 /* Search the pattern for a forward reference */
4282
4283 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4284 (options & PCRE_EXTENDED) != 0)) > 0)
4285 {
4286 PUT2(code, 2+LINK_SIZE, i);
4287 }
4288
4289 /* If terminator == 0 it means that the name followed directly after
4290 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4291 some further alternatives to try. For the cases where terminator != 0
4292 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4293 now checked all the possibilities, so give an error. */
4294
4295 else if (terminator != 0)
4296 {
4297 *errorcodeptr = ERR15;
4298 goto FAILED;
4299 }
4300
4301 /* Check for (?(R) for recursion. Allow digits after R to specify a
4302 specific group number. */
4303
4304 else if (*name == 'R')
4305 {
4306 recno = 0;
4307 for (i = 1; i < namelen; i++)
4308 {
4309 if ((digitab[name[i]] & ctype_digit) == 0)
4310 {
4311 *errorcodeptr = ERR15;
4312 goto FAILED;
4313 }
4314 recno = recno * 10 + name[i] - '0';
4315 }
4316 if (recno == 0) recno = RREF_ANY;
4317 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4318 PUT2(code, 2+LINK_SIZE, recno);
4319 }
4320
4321 /* Similarly, check for the (?(DEFINE) "condition", which is always
4322 false. */
4323
4324 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4325 {
4326 code[1+LINK_SIZE] = OP_DEF;
4327 skipbytes = 1;
4328 }
4329
4330 /* Check for the "name" actually being a subpattern number. */
4331
4332 else if (recno > 0)
4333 {
4334 PUT2(code, 2+LINK_SIZE, recno);
4335 }
4336
4337 /* Either an unidentified subpattern, or a reference to (?(0) */
4338
4339 else
4340 {
4341 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4342 goto FAILED;
4343 }
4344 break;
4345
4346
4347 /* ------------------------------------------------------------ */
4348 case '=': /* Positive lookahead */
4349 bravalue = OP_ASSERT;
4350 ptr++;
4351 break;
4352
4353
4354 /* ------------------------------------------------------------ */
4355 case '!': /* Negative lookahead */
4356 ptr++;
4357 if (*ptr == ')') /* Optimize (?!) */
4358 {
4359 *code++ = OP_FAIL;
4360 previous = NULL;
4361 continue;
4362 }
4363 bravalue = OP_ASSERT_NOT;
4364 break;
4365
4366
4367 /* ------------------------------------------------------------ */
4368 case '<': /* Lookbehind or named define */
4369 switch (ptr[1])
4370 {
4371 case '=': /* Positive lookbehind */
4372 bravalue = OP_ASSERTBACK;
4373 ptr += 2;
4374 break;
4375
4376 case '!': /* Negative lookbehind */
4377 bravalue = OP_ASSERTBACK_NOT;
4378 ptr += 2;
4379 break;
4380
4381 default: /* Could be name define, else bad */
4382 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4383 ptr++; /* Correct offset for error */
4384 *errorcodeptr = ERR24;
4385 goto FAILED;
4386 }
4387 break;
4388
4389
4390 /* ------------------------------------------------------------ */
4391 case '>': /* One-time brackets */
4392 bravalue = OP_ONCE;
4393 ptr++;
4394 break;
4395
4396
4397 /* ------------------------------------------------------------ */
4398 case 'C': /* Callout - may be followed by digits; */
4399 previous_callout = code; /* Save for later completion */
4400 after_manual_callout = 1; /* Skip one item before completing */
4401 *code++ = OP_CALLOUT;
4402 {
4403 int n = 0;
4404 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4405 n = n * 10 + *ptr - '0';
4406 if (*ptr != ')')
4407 {
4408 *errorcodeptr = ERR39;
4409 goto FAILED;
4410 }
4411 if (n > 255)
4412 {
4413 *errorcodeptr = ERR38;
4414 goto FAILED;
4415 }
4416 *code++ = n;
4417 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4418 PUT(code, LINK_SIZE, 0); /* Default length */
4419 code += 2 * LINK_SIZE;
4420 }
4421 previous = NULL;
4422 continue;
4423
4424
4425 /* ------------------------------------------------------------ */
4426 case 'P': /* Python-style named subpattern handling */
4427 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4428 {
4429 is_recurse = *ptr == '>';
4430 terminator = ')';
4431 goto NAMED_REF_OR_RECURSE;
4432 }
4433 else if (*ptr != '<') /* Test for Python-style definition */
4434 {
4435 *errorcodeptr = ERR41;
4436 goto FAILED;
4437 }
4438 /* Fall through to handle (?P< as (?< is handled */
4439
4440
4441 /* ------------------------------------------------------------ */
4442 DEFINE_NAME: /* Come here from (?< handling */
4443 case '\'':
4444 {
4445 terminator = (*ptr == '<')? '>' : '\'';
4446 name = ++ptr;
4447
4448 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4449 namelen = ptr - name;
4450
4451 /* In the pre-compile phase, just do a syntax check. */
4452
4453 if (lengthptr != NULL)
4454 {
4455 if (*ptr != terminator)
4456 {
4457 *errorcodeptr = ERR42;
4458 goto FAILED;
4459 }
4460 if (cd->names_found >= MAX_NAME_COUNT)
4461 {
4462 *errorcodeptr = ERR49;
4463 goto FAILED;
4464 }
4465 if (namelen + 3 > cd->name_entry_size)
4466 {
4467 cd->name_entry_size = namelen + 3;
4468 if (namelen > MAX_NAME_SIZE)
4469 {
4470 *errorcodeptr = ERR48;
4471 goto FAILED;
4472 }
4473 }
4474 }
4475
4476 /* In the real compile, create the entry in the table */
4477
4478 else
4479 {
4480 slot = cd->name_table;
4481 for (i = 0; i < cd->names_found; i++)
4482 {
4483 int crc = memcmp(name, slot+2, namelen);
4484 if (crc == 0)
4485 {
4486 if (slot[2+namelen] == 0)
4487 {
4488 if ((options & PCRE_DUPNAMES) == 0)
4489 {
4490 *errorcodeptr = ERR43;
4491 goto FAILED;
4492 }
4493 }
4494 else crc = -1; /* Current name is substring */
4495 }
4496 if (crc < 0)
4497 {
4498 memmove(slot + cd->name_entry_size, slot,
4499 (cd->names_found - i) * cd->name_entry_size);
4500 break;
4501 }
4502 slot += cd->name_entry_size;
4503 }
4504
4505 PUT2(slot, 0, cd->bracount + 1);
4506 memcpy(slot + 2, name, namelen);
4507 slot[2+namelen] = 0;
4508 }
4509 }
4510
4511 /* In both cases, count the number of names we've encountered. */
4512
4513 ptr++; /* Move past > or ' */
4514 cd->names_found++;
4515 goto NUMBERED_GROUP;
4516
4517
4518 /* ------------------------------------------------------------ */
4519 case '&': /* Perl recursion/subroutine syntax */
4520 terminator = ')';
4521 is_recurse = TRUE;
4522 /* Fall through */
4523
4524 /* We come here from the Python syntax above that handles both
4525 references (?P=name) and recursion (?P>name), as well as falling
4526 through from the Perl recursion syntax (?&name). */
4527
4528 NAMED_REF_OR_RECURSE:
4529 name = ++ptr;
4530 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4531 namelen = ptr - name;
4532
4533 /* In the pre-compile phase, do a syntax check and set a dummy
4534 reference number. */
4535
4536 if (lengthptr != NULL)
4537 {
4538 if (*ptr != terminator)
4539 {
4540 *errorcodeptr = ERR42;
4541 goto FAILED;
4542 }
4543 if (namelen > MAX_NAME_SIZE)
4544 {
4545 *errorcodeptr = ERR48;
4546 goto FAILED;
4547 }
4548 recno = 0;
4549 }
4550
4551 /* In the real compile, seek the name in the table */
4552
4553 else
4554 {
4555 slot = cd->name_table;
4556 for (i = 0; i < cd->names_found; i++)
4557 {
4558 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4559 slot += cd->name_entry_size;
4560 }
4561
4562 if (i < cd->names_found) /* Back reference */
4563 {
4564 recno = GET2(slot, 0);
4565 }
4566 else if ((recno = /* Forward back reference */
4567 find_parens(ptr, cd->bracount, name, namelen,
4568 (options & PCRE_EXTENDED) != 0)) <= 0)
4569 {
4570 *errorcodeptr = ERR15;
4571 goto FAILED;
4572 }
4573 }
4574
4575 /* In both phases, we can now go to the code that handles numerical
4576 recursion or backreferences. */
4577
4578 if (is_recurse) goto HANDLE_RECURSION;
4579 else goto HANDLE_REFERENCE;
4580
4581
4582 /* ------------------------------------------------------------ */
4583 case 'R': /* Recursion */
4584 ptr++; /* Same as (?0) */
4585 /* Fall through */
4586
4587
4588 /* ------------------------------------------------------------ */
4589 case '-': case '+':
4590 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4591 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4592 {
4593 const uschar *called;
4594
4595 if ((refsign = *ptr) == '+') ptr++;
4596 else if (refsign == '-')
4597 {
4598 if ((digitab[ptr[1]] & ctype_digit) == 0)
4599 goto OTHER_CHAR_AFTER_QUERY;
4600 ptr++;
4601 }
4602
4603 recno = 0;
4604 while((digitab[*ptr] & ctype_digit) != 0)
4605 recno = recno * 10 + *ptr++ - '0';
4606
4607 if (*ptr != ')')
4608 {
4609 *errorcodeptr = ERR29;
4610 goto FAILED;
4611 }
4612
4613 if (refsign == '-')
4614 {
4615 if (recno == 0)
4616 {
4617 *errorcodeptr = ERR58;
4618 goto FAILED;
4619 }
4620 recno = cd->bracount - recno + 1;
4621 if (recno <= 0)
4622 {
4623 *errorcodeptr = ERR15;
4624 goto FAILED;
4625 }
4626 }
4627 else if (refsign == '+')
4628 {
4629 if (recno == 0)
4630 {
4631 *errorcodeptr = ERR58;
4632 goto FAILED;
4633 }
4634 recno += cd->bracount;
4635 }
4636
4637 /* Come here from code above that handles a named recursion */
4638
4639 HANDLE_RECURSION:
4640
4641 previous = code;
4642 called = cd->start_code;
4643
4644 /* When we are actually compiling, find the bracket that is being
4645 referenced. Temporarily end the regex in case it doesn't exist before
4646 this point. If we end up with a forward reference, first check that
4647 the bracket does occur later so we can give the error (and position)
4648 now. Then remember this forward reference in the workspace so it can
4649 be filled in at the end. */
4650
4651 if (lengthptr == NULL)
4652 {
4653 *code = OP_END;
4654 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4655
4656 /* Forward reference */
4657
4658 if (called == NULL)
4659 {
4660 if (find_parens(ptr, cd->bracount, NULL, recno,
4661 (options & PCRE_EXTENDED) != 0) < 0)
4662 {
4663 *errorcodeptr = ERR15;
4664 goto FAILED;
4665 }
4666 called = cd->start_code + recno;
4667 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4668 }
4669
4670 /* If not a forward reference, and the subpattern is still open,
4671 this is a recursive call. We check to see if this is a left
4672 recursion that could loop for ever, and diagnose that case. */
4673
4674 else if (GET(called, 1) == 0 &&
4675 could_be_empty(called, code, bcptr, utf8))
4676 {
4677 *errorcodeptr = ERR40;
4678 goto FAILED;
4679 }
4680 }
4681
4682 /* Insert the recursion/subroutine item, automatically wrapped inside
4683 "once" brackets. Set up a "previous group" length so that a
4684 subsequent quantifier will work. */
4685
4686 *code = OP_ONCE;
4687 PUT(code, 1, 2 + 2*LINK_SIZE);
4688 code += 1 + LINK_SIZE;
4689
4690 *code = OP_RECURSE;
4691 PUT(code, 1, called - cd->start_code);
4692 code += 1 + LINK_SIZE;
4693
4694 *code = OP_KET;
4695 PUT(code, 1, 2 + 2*LINK_SIZE);
4696 code += 1 + LINK_SIZE;
4697
4698 length_prevgroup = 3 + 3*LINK_SIZE;
4699 }
4700
4701 /* Can't determine a first byte now */
4702
4703 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4704 continue;
4705
4706
4707 /* ------------------------------------------------------------ */
4708 default: /* Other characters: check option setting */
4709 OTHER_CHAR_AFTER_QUERY:
4710 set = unset = 0;
4711 optset = &set;
4712
4713 while (*ptr != ')' && *ptr != ':')
4714 {
4715 switch (*ptr++)
4716 {
4717 case '-': optset = &unset; break;
4718
4719 case 'J': /* Record that it changed in the external options */
4720 *optset |= PCRE_DUPNAMES;
4721 cd->external_flags |= PCRE_JCHANGED;
4722 break;
4723
4724 case 'i': *optset |= PCRE_CASELESS; break;
4725 case 'm': *optset |= PCRE_MULTILINE; break;
4726 case 's': *optset |= PCRE_DOTALL; break;
4727 case 'x': *optset |= PCRE_EXTENDED; break;
4728 case 'U': *optset |= PCRE_UNGREEDY; break;
4729 case 'X': *optset |= PCRE_EXTRA; break;
4730
4731 default: *errorcodeptr = ERR12;
4732 ptr--; /* Correct the offset */
4733 goto FAILED;
4734 }
4735 }
4736
4737 /* Set up the changed option bits, but don't change anything yet. */
4738
4739 newoptions = (options | set) & (~unset);
4740
4741 /* If the options ended with ')' this is not the start of a nested
4742 group with option changes, so the options change at this level. If this
4743 item is right at the start of the pattern, the options can be
4744 abstracted and made external in the pre-compile phase, and ignored in
4745 the compile phase. This can be helpful when matching -- for instance in
4746 caseless checking of required bytes.
4747
4748 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4749 definitely *not* at the start of the pattern because something has been
4750 compiled. In the pre-compile phase, however, the code pointer can have
4751 that value after the start, because it gets reset as code is discarded
4752 during the pre-compile. However, this can happen only at top level - if
4753 we are within parentheses, the starting BRA will still be present. At
4754 any parenthesis level, the length value can be used to test if anything
4755 has been compiled at that level. Thus, a test for both these conditions
4756 is necessary to ensure we correctly detect the start of the pattern in
4757 both phases.
4758
4759 If we are not at the pattern start, compile code to change the ims
4760 options if this setting actually changes any of them. We also pass the
4761 new setting back so that it can be put at the start of any following
4762 branches, and when this group ends (if we are in a group), a resetting
4763 item can be compiled. */
4764
4765 if (*ptr == ')')
4766 {
4767 if (code == cd->start_code + 1 + LINK_SIZE &&
4768 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4769 {
4770 cd->external_options = newoptions;
4771 options = newoptions;
4772 }
4773 else
4774 {
4775 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4776 {
4777 *code++ = OP_OPT;
4778 *code++ = newoptions & PCRE_IMS;
4779 }
4780
4781 /* Change options at this level, and pass them back for use
4782 in subsequent branches. Reset the greedy defaults and the case
4783 value for firstbyte and reqbyte. */
4784
4785 *optionsptr = options = newoptions;
4786 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4787 greedy_non_default = greedy_default ^ 1;
4788 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4789 }
4790
4791 previous = NULL; /* This item can't be repeated */
4792 continue; /* It is complete */
4793 }
4794
4795 /* If the options ended with ':' we are heading into a nested group
4796 with possible change of options. Such groups are non-capturing and are
4797 not assertions of any kind. All we need to do is skip over the ':';
4798 the newoptions value is handled below. */
4799
4800 bravalue = OP_BRA;
4801 ptr++;
4802 } /* End of switch for character following (? */
4803 } /* End of (? handling */
4804
4805 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4806 all unadorned brackets become non-capturing and behave like (?:...)
4807 brackets. */
4808
4809 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4810 {
4811 bravalue = OP_BRA;
4812 }
4813
4814 /* Else we have a capturing group. */
4815
4816 else
4817 {
4818 NUMBERED_GROUP:
4819 cd->bracount += 1;
4820 PUT2(code, 1+LINK_SIZE, cd->bracount);
4821 skipbytes = 2;
4822 }
4823
4824 /* Process nested bracketed regex. Assertions may not be repeated, but
4825 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4826 non-register variable in order to be able to pass its address because some
4827 compilers complain otherwise. Pass in a new setting for the ims options if
4828 they have changed. */
4829
4830 previous = (bravalue >= OP_ONCE)? code : NULL;
4831 *code = bravalue;
4832 tempcode = code;
4833 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4834 length_prevgroup = 0; /* Initialize for pre-compile phase */
4835
4836 if (!compile_regex(
4837 newoptions, /* The complete new option state */
4838 options & PCRE_IMS, /* The previous ims option state */
4839 &tempcode, /* Where to put code (updated) */
4840 &ptr, /* Input pointer (updated) */
4841 errorcodeptr, /* Where to put an error message */
4842 (bravalue == OP_ASSERTBACK ||
4843 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4844 reset_bracount, /* True if (?| group */
4845 skipbytes, /* Skip over bracket number */
4846 &subfirstbyte, /* For possible first char */
4847 &subreqbyte, /* For possible last char */
4848 bcptr, /* Current branch chain */
4849 cd, /* Tables block */
4850 (lengthptr == NULL)? NULL : /* Actual compile phase */
4851 &length_prevgroup /* Pre-compile phase */
4852 ))
4853 goto FAILED;
4854
4855 /* At the end of compiling, code is still pointing to the start of the
4856 group, while tempcode has been updated to point past the end of the group
4857 and any option resetting that may follow it. The pattern pointer (ptr)
4858 is on the bracket. */
4859
4860 /* If this is a conditional bracket, check that there are no more than
4861 two branches in the group, or just one if it's a DEFINE group. We do this
4862 in the real compile phase, not in the pre-pass, where the whole group may
4863 not be available. */
4864
4865 if (bravalue == OP_COND && lengthptr == NULL)
4866 {
4867 uschar *tc = code;
4868 int condcount = 0;
4869
4870 do {
4871 condcount++;
4872 tc += GET(tc,1);
4873 }
4874 while (*tc != OP_KET);
4875
4876 /* A DEFINE group is never obeyed inline (the "condition" is always
4877 false). It must have only one branch. */
4878
4879 if (code[LINK_SIZE+1] == OP_DEF)
4880 {
4881 if (condcount > 1)
4882 {
4883 *errorcodeptr = ERR54;
4884 goto FAILED;
4885 }
4886 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4887 }
4888
4889 /* A "normal" conditional group. If there is just one branch, we must not
4890 make use of its firstbyte or reqbyte, because this is equivalent to an
4891 empty second branch. */
4892
4893 else
4894 {
4895 if (condcount > 2)
4896 {
4897 *errorcodeptr = ERR27;
4898 goto FAILED;
4899 }
4900 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4901 }
4902 }
4903
4904 /* Error if hit end of pattern */
4905
4906 if (*ptr != ')')
4907 {
4908 *errorcodeptr = ERR14;
4909 goto FAILED;
4910 }
4911
4912 /* In the pre-compile phase, update the length by the length of the group,
4913 less the brackets at either end. Then reduce the compiled code to just a
4914 set of non-capturing brackets so that it doesn't use much memory if it is
4915 duplicated by a quantifier.*/
4916
4917 if (lengthptr != NULL)
4918 {
4919 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4920 {
4921 *errorcodeptr = ERR20;
4922 goto FAILED;
4923 }
4924 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4925 *code++ = OP_BRA;
4926 PUTINC(code, 0, 1 + LINK_SIZE);
4927 *code++ = OP_KET;
4928 PUTINC(code, 0, 1 + LINK_SIZE);
4929 break; /* No need to waste time with special character handling */
4930 }
4931
4932 /* Otherwise update the main code pointer to the end of the group. */
4933
4934 code = tempcode;
4935
4936 /* For a DEFINE group, required and first character settings are not
4937 relevant. */
4938
4939 if (bravalue == OP_DEF) break;
4940
4941 /* Handle updating of the required and first characters for other types of
4942 group. Update for normal brackets of all kinds, and conditions with two
4943 branches (see code above). If the bracket is followed by a quantifier with
4944 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4945 zerofirstbyte outside the main loop so that they can be accessed for the
4946 back off. */
4947
4948 zeroreqbyte = reqbyte;
4949 zerofirstbyte = firstbyte;
4950 groupsetfirstbyte = FALSE;
4951
4952 if (bravalue >= OP_ONCE)
4953 {
4954 /* If we have not yet set a firstbyte in this branch, take it from the
4955 subpattern, remembering that it was set here so that a repeat of more
4956 than one can replicate it as reqbyte if necessary. If the subpattern has
4957 no firstbyte, set "none" for the whole branch. In both cases, a zero
4958 repeat forces firstbyte to "none". */
4959
4960 if (firstbyte == REQ_UNSET)
4961 {
4962 if (subfirstbyte >= 0)
4963 {
4964 firstbyte = subfirstbyte;
4965 groupsetfirstbyte = TRUE;
4966 }
4967 else firstbyte = REQ_NONE;
4968 zerofirstbyte = REQ_NONE;
4969 }
4970
4971 /* If firstbyte was previously set, convert the subpattern's firstbyte
4972 into reqbyte if there wasn't one, using the vary flag that was in
4973 existence beforehand. */
4974
4975 else if (subfirstbyte >= 0 && subreqbyte < 0)
4976 subreqbyte = subfirstbyte | tempreqvary;
4977
4978 /* If the subpattern set a required byte (or set a first byte that isn't
4979 really the first byte - see above), set it. */
4980
4981 if (subreqbyte >= 0) reqbyte = subreqbyte;
4982 }
4983
4984 /* For a forward assertion, we take the reqbyte, if set. This can be
4985 helpful if the pattern that follows the assertion doesn't set a different
4986 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4987 for an assertion, however, because it leads to an incorrect effect for patterns
4988 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4989 of a firstbyte. This is overcome by a scan at the end if there's no
4990 firstbyte, looking for an asserted first char. */
4991
4992 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4993 break; /* End of processing '(' */
4994
4995
4996 /* ===================================================================*/
4997 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4998 are arranged to be the negation of the corresponding OP_values. For the
4999 back references, the values are ESC_REF plus the reference number. Only
5000 back references and those types that consume a character may be repeated.
5001 We can test for values between ESC_b and ESC_Z for the latter; this may
5002 have to change if any new ones are ever created. */
5003
5004 case '\\':
5005 tempptr = ptr;
5006 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5007 if (*errorcodeptr != 0) goto FAILED;
5008
5009 if (c < 0)
5010 {
5011 if (-c == ESC_Q) /* Handle start of quoted string */
5012 {
5013 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5014 else inescq = TRUE;
5015 continue;
5016 }
5017
5018 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5019
5020 /* For metasequences that actually match a character, we disable the
5021 setting of a first character if it hasn't already been set. */
5022
5023 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5024 firstbyte = REQ_NONE;
5025
5026 /* Set values to reset to if this is followed by a zero repeat. */
5027
5028 zerofirstbyte = firstbyte;
5029 zeroreqbyte = reqbyte;
5030
5031 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5032 We also support \k{name} (.NET syntax) */
5033
5034 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5035 {
5036 is_recurse = FALSE;
5037 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5038 goto NAMED_REF_OR_RECURSE;
5039 }
5040
5041 /* Back references are handled specially; must disable firstbyte if
5042 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5043 ':' later. */
5044
5045 if (-c >= ESC_REF)
5046 {
5047 recno = -c - ESC_REF;
5048
5049 HANDLE_REFERENCE: /* Come here from named backref handling */
5050 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5051 previous = code;
5052 *code++ = OP_REF;
5053 PUT2INC(code, 0, recno);
5054 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5055 if (recno > cd->top_backref) cd->top_backref = recno;
5056 }
5057
5058 /* So are Unicode property matches, if supported. */
5059
5060 #ifdef SUPPORT_UCP
5061 else if (-c == ESC_P || -c == ESC_p)
5062 {
5063 BOOL negated;
5064 int pdata;
5065 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5066 if (ptype < 0) goto FAILED;
5067 previous = code;
5068 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5069 *code++ = ptype;
5070 *code++ = pdata;
5071 }
5072 #else
5073
5074 /* If Unicode properties are not supported, \X, \P, and \p are not
5075 allowed. */
5076
5077 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5078 {
5079 *errorcodeptr = ERR45;
5080 goto FAILED;
5081 }
5082 #endif
5083
5084 /* For the rest (including \X when Unicode properties are supported), we
5085 can obtain the OP value by negating the escape value. */
5086
5087 else
5088 {
5089 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5090 *code++ = -c;
5091 }
5092 continue;
5093 }
5094
5095 /* We have a data character whose value is in c. In UTF-8 mode it may have
5096 a value > 127. We set its representation in the length/buffer, and then
5097 handle it as a data character. */
5098
5099 #ifdef SUPPORT_UTF8
5100 if (utf8 && c > 127)
5101 mclength = _pcre_ord2utf8(c, mcbuffer);
5102 else
5103 #endif
5104
5105 {
5106 mcbuffer[0] = c;
5107 mclength = 1;
5108 }
5109 goto ONE_CHAR;
5110
5111
5112 /* ===================================================================*/
5113 /* Handle a literal character. It is guaranteed not to be whitespace or #
5114 when the extended flag is set. If we are in UTF-8 mode, it may be a
5115 multi-byte literal character. */
5116
5117 default:
5118 NORMAL_CHAR:
5119 mclength = 1;
5120 mcbuffer[0] = c;
5121
5122 #ifdef SUPPORT_UTF8
5123 if (utf8 && c >= 0xc0)
5124 {
5125 while ((ptr[1] & 0xc0) == 0x80)
5126 mcbuffer[mclength++] = *(++ptr);
5127 }
5128 #endif
5129
5130 /* At this point we have the character's bytes in mcbuffer, and the length
5131 in mclength. When not in UTF-8 mode, the length is always 1. */
5132
5133 ONE_CHAR:
5134 previous = code;
5135 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5136 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5137
5138 /* Remember if \r or \n were seen */
5139
5140 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5141 cd->external_flags |= PCRE_HASCRORLF;
5142
5143 /* Set the first and required bytes appropriately. If no previous first
5144 byte, set it from this character, but revert to none on a zero repeat.
5145 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5146 repeat. */
5147
5148 if (firstbyte == REQ_UNSET)
5149 {
5150 zerofirstbyte = REQ_NONE;
5151 zeroreqbyte = reqbyte;
5152
5153 /* If the character is more than one byte long, we can set firstbyte
5154 only if it is not to be matched caselessly. */
5155
5156 if (mclength == 1 || req_caseopt == 0)
5157 {
5158 firstbyte = mcbuffer[0] | req_caseopt;
5159 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5160 }
5161 else firstbyte = reqbyte = REQ_NONE;
5162 }
5163
5164 /* firstbyte was previously set; we can set reqbyte only if the length is
5165 1 or the matching is caseful. */
5166
5167 else
5168 {
5169 zerofirstbyte = firstbyte;
5170 zeroreqbyte = reqbyte;
5171 if (mclength == 1 || req_caseopt == 0)
5172 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5173 }
5174
5175 break; /* End of literal character handling */
5176 }
5177 } /* end of big loop */
5178
5179
5180 /* Control never reaches here by falling through, only by a goto for all the
5181 error states. Pass back the position in the pattern so that it can be displayed
5182 to the user for diagnosing the error. */
5183
5184 FAILED:
5185 *ptrptr = ptr;
5186 return FALSE;
5187 }
5188
5189
5190
5191
5192 /*************************************************
5193 * Compile sequence of alternatives *
5194 *************************************************/
5195
5196 /* On entry, ptr is pointing past the bracket character, but on return it
5197 points to the closing bracket, or vertical bar, or end of string. The code
5198 variable is pointing at the byte into which the BRA operator has been stored.
5199 If the ims options are changed at the start (for a (?ims: group) or during any
5200 branch, we need to insert an OP_OPT item at the start of every following branch
5201 to ensure they get set correctly at run time, and also pass the new options
5202 into every subsequent branch compile.
5203
5204 This function is used during the pre-compile phase when we are trying to find
5205 out the amount of memory needed, as well as during the real compile phase. The
5206 value of lengthptr distinguishes the two phases.
5207
5208 Arguments:
5209 options option bits, including any changes for this subpattern
5210 oldims previous settings of ims option bits
5211 codeptr -> the address of the current code pointer
5212 ptrptr -> the address of the current pattern pointer
5213 errorcodeptr -> pointer to error code variable
5214 lookbehind TRUE if this is a lookbehind assertion
5215 reset_bracount TRUE to reset the count for each branch
5216 skipbytes skip this many bytes at start (for brackets and OP_COND)
5217 firstbyteptr place to put the first required character, or a negative number
5218 reqbyteptr place to put the last required character, or a negative number
5219 bcptr pointer to the chain of currently open branches
5220 cd points to the data block with tables pointers etc.
5221 lengthptr NULL during the real compile phase
5222 points to length accumulator during pre-compile phase
5223
5224 Returns: TRUE on success
5225 */
5226
5227 static BOOL
5228 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5229 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5230 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5231 int *lengthptr)
5232 {
5233 const uschar *ptr = *ptrptr;
5234 uschar *code = *codeptr;
5235 uschar *last_branch = code;
5236 uschar *start_bracket = code;
5237 uschar *reverse_count = NULL;
5238 int firstbyte, reqbyte;
5239 int branchfirstbyte, branchreqbyte;
5240 int length;
5241 int orig_bracount;
5242 int max_bracount;
5243 branch_chain bc;
5244
5245 bc.outer = bcptr;
5246 bc.current = code;
5247
5248 firstbyte = reqbyte = REQ_UNSET;
5249
5250 /* Accumulate the length for use in the pre-compile phase. Start with the
5251 length of the BRA and KET and any extra bytes that are required at the
5252 beginning. We accumulate in a local variable to save frequent testing of
5253 lenthptr for NULL. We cannot do this by looking at the value of code at the
5254 start and end of each alternative, because compiled items are discarded during
5255 the pre-compile phase so that the work space is not exceeded. */
5256
5257 length = 2 + 2*LINK_SIZE + skipbytes;
5258
5259 /* WARNING: If the above line is changed for any reason, you must also change
5260 the code that abstracts option settings at the start of the pattern and makes
5261 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5262 pre-compile phase to find out whether anything has yet been compiled or not. */
5263
5264 /* Offset is set zero to mark that this bracket is still open */
5265
5266 PUT(code, 1, 0);
5267 code += 1 + LINK_SIZE + skipbytes;
5268
5269 /* Loop for each alternative branch */
5270
5271 orig_bracount = max_bracount = cd->bracount;
5272 for (;;)
5273 {
5274 /* For a (?| group, reset the capturing bracket count so that each branch
5275 uses the same numbers. */
5276
5277 if (reset_bracount) cd->bracount = orig_bracount;
5278
5279 /* Handle a change of ims options at the start of the branch */
5280
5281 if ((options & PCRE_IMS) != oldims)
5282 {
5283 *code++ = OP_OPT;
5284 *code++ = options & PCRE_IMS;
5285 length += 2;
5286 }
5287
5288 /* Set up dummy OP_REVERSE if lookbehind assertion */
5289
5290 if (lookbehind)
5291 {
5292 *code++ = OP_REVERSE;
5293 reverse_count = code;
5294 PUTINC(code, 0, 0);
5295 length += 1 + LINK_SIZE;
5296 }
5297
5298 /* Now compile the branch; in the pre-compile phase its length gets added
5299 into the length. */
5300
5301 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5302 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5303 {
5304 *ptrptr = ptr;
5305 return FALSE;
5306 }
5307
5308 /* Keep the highest bracket count in case (?| was used and some branch
5309 has fewer than the rest. */
5310
5311 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5312
5313 /* In the real compile phase, there is some post-processing to be done. */
5314
5315 if (lengthptr == NULL)
5316 {
5317 /* If this is the first branch, the firstbyte and reqbyte values for the
5318 branch become the values for the regex. */
5319
5320 if (*last_branch != OP_ALT)
5321 {
5322 firstbyte = branchfirstbyte;
5323 reqbyte = branchreqbyte;
5324 }
5325
5326 /* If this is not the first branch, the first char and reqbyte have to
5327 match the values from all the previous branches, except that if the
5328 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5329 and we set REQ_VARY for the regex. */
5330
5331 else
5332 {
5333 /* If we previously had a firstbyte, but it doesn't match the new branch,
5334 we have to abandon the firstbyte for the regex, but if there was
5335 previously no reqbyte, it takes on the value of the old firstbyte. */
5336
5337 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5338 {
5339 if (reqbyte < 0) reqbyte = firstbyte;
5340 firstbyte = REQ_NONE;
5341 }
5342
5343 /* If we (now or from before) have no firstbyte, a firstbyte from the
5344 branch becomes a reqbyte if there isn't a branch reqbyte. */
5345
5346 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5347 branchreqbyte = branchfirstbyte;
5348
5349 /* Now ensure that the reqbytes match */
5350
5351 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5352 reqbyte = REQ_NONE;
5353 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5354 }
5355
5356 /* If lookbehind, check that this branch matches a fixed-length string, and
5357 put the length into the OP_REVERSE item. Temporarily mark the end of the
5358 branch with OP_END. */
5359
5360 if (lookbehind)
5361 {
5362 int fixed_length;
5363 *code = OP_END;
5364 fixed_length = find_fixedlength(last_branch, options);
5365 DPRINTF(("fixed length = %d\n", fixed_length));
5366 if (fixed_length < 0)
5367 {
5368 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5369 *ptrptr = ptr;
5370 return FALSE;
5371 }
5372 PUT(reverse_count, 0, fixed_length);
5373 }
5374 }
5375
5376 /* Reached end of expression, either ')' or end of pattern. In the real
5377 compile phase, go back through the alternative branches and reverse the chain
5378 of offsets, with the field in the BRA item now becoming an offset to the
5379 first alternative. If there are no alternatives, it points to the end of the
5380 group. The length in the terminating ket is always the length of the whole
5381 bracketed item. If any of the ims options were changed inside the group,
5382 compile a resetting op-code following, except at the very end of the pattern.
5383 Return leaving the pointer at the terminating char. */
5384
5385 if (*ptr != '|')
5386 {
5387 if (lengthptr == NULL)
5388 {
5389 int branch_length = code - last_branch;
5390 do
5391 {
5392 int prev_length = GET(last_branch, 1);
5393 PUT(last_branch, 1, branch_length);
5394 branch_length = prev_length;
5395 last_branch -= branch_length;
5396 }
5397 while (branch_length > 0);
5398 }
5399
5400 /* Fill in the ket */
5401
5402 *code = OP_KET;
5403 PUT(code, 1, code - start_bracket);
5404 code += 1 + LINK_SIZE;
5405
5406 /* Resetting option if needed */
5407
5408 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5409 {
5410 *code++ = OP_OPT;
5411 *code++ = oldims;
5412 length += 2;
5413 }
5414
5415 /* Retain the highest bracket number, in case resetting was used. */
5416
5417 cd->bracount = max_bracount;
5418
5419 /* Set values to pass back */
5420
5421 *codeptr = code;
5422 *ptrptr = ptr;
5423 *firstbyteptr = firstbyte;
5424 *reqbyteptr = reqbyte;
5425 if (lengthptr != NULL)
5426 {
5427 if (OFLOW_MAX - *lengthptr < length)
5428 {
5429 *errorcodeptr = ERR20;
5430 return FALSE;
5431 }
5432 *lengthptr += length;
5433 }
5434 return TRUE;
5435 }
5436
5437 /* Another branch follows. In the pre-compile phase, we can move the code
5438 pointer back to where it was for the start of the first branch. (That is,
5439 pretend that each branch is the only one.)
5440
5441 In the real compile phase, insert an ALT node. Its length field points back
5442 to the previous branch while the bracket remains open. At the end the chain
5443 is reversed. It's done like this so that the start of the bracket has a
5444 zero offset until it is closed, making it possible to detect recursion. */
5445
5446 if (lengthptr != NULL)
5447 {
5448 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5449 length += 1 + LINK_SIZE;
5450 }
5451 else
5452 {
5453 *code = OP_ALT;
5454 PUT(code, 1, code - last_branch);
5455 bc.current = last_branch = code;
5456 code += 1 + LINK_SIZE;
5457 }
5458
5459 ptr++;
5460 }
5461 /* Control never reaches here */
5462 }
5463
5464
5465
5466
5467 /*************************************************
5468 * Check for anchored expression *
5469 *************************************************/
5470
5471 /* Try to find out if this is an anchored regular expression. Consider each
5472 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5473 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5474 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5475 counts, since OP_CIRC can match in the middle.
5476
5477 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5478 This is the code for \G, which means "match at start of match position, taking
5479 into account the match offset".
5480
5481 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5482 because that will try the rest of the pattern at all possible matching points,
5483 so there is no point trying again.... er ....
5484
5485 .... except when the .* appears inside capturing parentheses, and there is a
5486 subsequent back reference to those parentheses. We haven't enough information
5487 to catch that case precisely.
5488
5489 At first, the best we could do was to detect when .* was in capturing brackets
5490 and the highest back reference was greater than or equal to that level.
5491 However, by keeping a bitmap of the first 31 back references, we can catch some
5492 of the more common cases more precisely.
5493
5494 Arguments:
5495 code points to start of expression (the bracket)
5496 options points to the options setting
5497 bracket_map a bitmap of which brackets we are inside while testing; this
5498 handles up to substring 31; after that we just have to take
5499 the less precise approach
5500 backref_map the back reference bitmap
5501
5502 Returns: TRUE or FALSE
5503 */
5504
5505 static BOOL
5506 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5507 unsigned int backref_map)
5508 {
5509 do {
5510 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5511 options, PCRE_MULTILINE, FALSE);
5512 register int op = *scode;
5513
5514 /* Non-capturing brackets */
5515
5516 if (op == OP_BRA)
5517 {
5518 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5519 }
5520
5521 /* Capturing brackets */
5522
5523 else if (op == OP_CBRA)
5524 {
5525 int n = GET2(scode, 1+LINK_SIZE);
5526 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5527 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5528 }
5529
5530 /* Other brackets */
5531
5532 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5533 {
5534 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5535 }
5536
5537 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5538 are or may be referenced. */
5539
5540 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5541 op == OP_TYPEPOSSTAR) &&
5542 (*options & PCRE_DOTALL) != 0)
5543 {
5544 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5545 }
5546
5547 /* Check for explicit anchoring */
5548
5549 else if (op != OP_SOD && op != OP_SOM &&
5550 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5551 return FALSE;
5552 code += GET(code, 1);
5553 }
5554 while (*code == OP_ALT); /* Loop for each alternative */
5555 return TRUE;
5556 }
5557
5558
5559
5560 /*************************************************
5561 * Check for starting with ^ or .* *
5562 *************************************************/
5563
5564 /* This is called to find out if every branch starts with ^ or .* so that
5565 "first char" processing can be done to speed things up in multiline
5566 matching and for non-DOTALL patterns that start with .* (which must start at
5567 the beginning or after \n). As in the case of is_anchored() (see above), we
5568 have to take account of back references to capturing brackets that contain .*
5569 because in that case we can't make the assumption.
5570
5571 Arguments:
5572 code points to start of expression (the bracket)
5573 bracket_map a bitmap of which brackets we are inside while testing; this
5574 handles up to substring 31; after that we just have to take
5575 the less precise approach
5576 backref_map the back reference bitmap
5577
5578 Returns: TRUE or FALSE
5579 */
5580
5581 static BOOL
5582 is_startline(const uschar *code, unsigned int bracket_map,
5583 unsigned int backref_map)
5584 {
5585 do {
5586 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5587 NULL, 0, FALSE);
5588 register int op = *scode;
5589
5590 /* Non-capturing brackets */
5591
5592 if (op == OP_BRA)
5593 {
5594 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5595 }
5596
5597 /* Capturing brackets */
5598
5599 else if (op == OP_CBRA)
5600 {
5601 int n = GET2(scode, 1+LINK_SIZE);
5602 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5603 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5604 }
5605
5606 /* Other brackets */
5607
5608 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5609 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5610
5611 /* .* means "start at start or after \n" if it isn't in brackets that
5612 may be referenced. */
5613
5614 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5615 {
5616 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5617 }
5618
5619 /* Check for explicit circumflex */
5620
5621 else if (op != OP_CIRC) return FALSE;
5622
5623 /* Move on to the next alternative */
5624
5625 code += GET(code, 1);
5626 }
5627 while (*code == OP_ALT); /* Loop for each alternative */
5628 return TRUE;
5629 }
5630
5631
5632
5633 /*************************************************
5634 * Check for asserted fixed first char *
5635 *************************************************/
5636
5637 /* During compilation, the "first char" settings from forward assertions are
5638 discarded, because they can cause conflicts with actual literals that follow.
5639 However, if we end up without a first char setting for an unanchored pattern,
5640 it is worth scanning the regex to see if there is an initial asserted first
5641 char. If all branches start with the same asserted char, or with a bracket all
5642 of whose alternatives start with the same asserted char (recurse ad lib), then
5643 we return that char, otherwise -1.
5644
5645 Arguments:
5646 code points to start of expression (the bracket)
5647 options pointer to the options (used to check casing changes)
5648 inassert TRUE if in an assertion
5649
5650 Returns: -1 or the fixed first char
5651 */
5652
5653 static int
5654 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5655 {
5656 register int c = -1;
5657 do {
5658 int d;
5659 const uschar *scode =
5660 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5661 register int op = *scode;
5662
5663 switch(op)
5664 {
5665 default:
5666 return -1;
5667
5668 case OP_BRA:
5669 case OP_CBRA:
5670 case OP_ASSERT:
5671 case OP_ONCE:
5672 case OP_COND:
5673 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5674 return -1;
5675 if (c < 0) c = d; else if (c != d) return -1;
5676 break;
5677
5678 case OP_EXACT: /* Fall through */
5679 scode += 2;
5680
5681 case OP_CHAR:
5682 case OP_CHARNC:
5683 case OP_PLUS:
5684 case OP_MINPLUS:
5685 case OP_POSPLUS:
5686 if (!inassert) return -1;
5687 if (c < 0)
5688 {
5689 c = scode[1];
5690 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5691 }
5692 else if (c != scode[1]) return -1;
5693 break;
5694 }
5695
5696 code += GET(code, 1);
5697 }
5698 while (*code == OP_ALT);
5699 return c;
5700 }
5701
5702
5703
5704 /*************************************************
5705 * Compile a Regular Expression *
5706 *************************************************/
5707
5708 /* This function takes a string and returns a pointer to a block of store
5709 holding a compiled version of the expression. The original API for this
5710 function had no error code return variable; it is retained for backwards
5711 compatibility. The new function is given a new name.
5712
5713 Arguments:
5714 pattern the regular expression
5715 options various option bits
5716 errorcodeptr pointer to error code variable (pcre_compile2() only)
5717 can be NULL if you don't want a code value
5718 errorptr pointer to pointer to error text
5719 erroroffset ptr offset in pattern where error was detected
5720 tables pointer to character tables or NULL
5721
5722 Returns: pointer to compiled data block, or NULL on error,
5723 with errorptr and erroroffset set
5724 */
5725
5726 PCRE_EXP_DEFN pcre *
5727 pcre_compile(const char *pattern, int options, const char **errorptr,
5728 int *erroroffset, const unsigned char *tables)
5729 {
5730 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5731 }
5732
5733
5734 PCRE_EXP_DEFN pcre *
5735 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5736 const char **errorptr, int *erroroffset, const unsigned char *tables)
5737 {
5738 real_pcre *re;
5739 int length = 1; /* For final END opcode */
5740 int firstbyte, reqbyte, newline;
5741 int errorcode = 0;
5742 int skipatstart = 0;
5743 #ifdef SUPPORT_UTF8
5744 BOOL utf8;
5745 #endif
5746 size_t size;
5747 uschar *code;
5748 const uschar *codestart;
5749 const uschar *ptr;
5750 compile_data compile_block;
5751 compile_data *cd = &compile_block;
5752
5753 /* This space is used for "compiling" into during the first phase, when we are
5754 computing the amount of memory that is needed. Compiled items are thrown away
5755 as soon as possible, so that a fairly large buffer should be sufficient for
5756 this purpose. The same space is used in the second phase for remembering where
5757 to fill in forward references to subpatterns. */
5758
5759 uschar cworkspace[COMPILE_WORK_SIZE];
5760
5761
5762 /* Set this early so that early errors get offset 0. */
5763
5764 ptr = (const uschar *)pattern;
5765
5766 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5767 can do is just return NULL, but we can set a code value if there is a code
5768 pointer. */
5769
5770 if (errorptr == NULL)
5771 {
5772 if (errorcodeptr != NULL) *errorcodeptr = 99;
5773 return NULL;
5774 }
5775
5776 *errorptr = NULL;
5777 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5778
5779 /* However, we can give a message for this error */
5780
5781 if (erroroffset == NULL)
5782 {
5783 errorcode = ERR16;
5784 goto PCRE_EARLY_ERROR_RETURN2;
5785 }
5786
5787 *erroroffset = 0;
5788
5789 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5790
5791 #ifdef SUPPORT_UTF8
5792 utf8 = (options & PCRE_UTF8) != 0;
5793 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5794 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5795 {
5796 errorcode = ERR44;
5797 goto PCRE_EARLY_ERROR_RETURN2;
5798 }
5799 #else
5800 if ((options & PCRE_UTF8) != 0)
5801 {
5802 errorcode = ERR32;
5803 goto PCRE_EARLY_ERROR_RETURN;
5804 }
5805 #endif
5806
5807 if ((options & ~PUBLIC_OPTIONS) != 0)
5808 {
5809 errorcode = ERR17;
5810 goto PCRE_EARLY_ERROR_RETURN;
5811 }
5812
5813 /* Set up pointers to the individual character tables */
5814
5815 if (tables == NULL) tables = _pcre_default_tables;
5816 cd->lcc = tables + lcc_offset;
5817 cd->fcc = tables + fcc_offset;
5818 cd->cbits = tables + cbits_offset;
5819 cd->ctypes = tables + ctypes_offset;
5820
5821 /* Check for global one-time settings at the start of the pattern, and remember
5822 the offset for later. */
5823
5824 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5825 {
5826 int newnl = 0;
5827 int newbsr = 0;
5828
5829 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5830 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5831 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5832 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5833 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5834 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5835 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5836 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5837 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
5838 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5839
5840 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5841 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5842 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5843 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5844
5845 if (newnl != 0)
5846 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5847 else if (newbsr != 0)
5848 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5849 else break;
5850 }
5851
5852 /* Check validity of \R options. */
5853
5854 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5855 {
5856 case 0:
5857 case PCRE_BSR_ANYCRLF:
5858 case PCRE_BSR_UNICODE:
5859 break;
5860 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5861 }
5862
5863 /* Handle different types of newline. The three bits give seven cases. The
5864 current code allows for fixed one- or two-byte sequences, plus "any" and
5865 "anycrlf". */
5866
5867 switch (options & PCRE_NEWLINE_BITS)
5868 {
5869 case 0: newline = NEWLINE; break; /* Build-time default */
5870 case PCRE_NEWLINE_CR: newline = '\r'; break;
5871 case PCRE_NEWLINE_LF: newline = '\n'; break;
5872 case PCRE_NEWLINE_CR+
5873 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5874 case PCRE_NEWLINE_ANY: newline = -1; break;
5875 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5876 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5877 }
5878
5879 if (newline == -2)
5880 {
5881 cd->nltype = NLTYPE_ANYCRLF;
5882 }
5883 else if (newline < 0)
5884 {
5885 cd->nltype = NLTYPE_ANY;
5886 }
5887 else
5888 {
5889 cd->nltype = NLTYPE_FIXED;
5890 if (newline > 255)
5891 {
5892 cd->nllen = 2;
5893 cd->nl[0] = (newline >> 8) & 255;
5894 cd->nl[1] = newline & 255;
5895 }
5896 else
5897 {
5898 cd->nllen = 1;
5899 cd->nl[0] = newline;
5900 }
5901 }
5902
5903 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5904 references to help in deciding whether (.*) can be treated as anchored or not.
5905 */
5906
5907 cd->top_backref = 0;
5908 cd->backref_map = 0;
5909
5910 /* Reflect pattern for debugging output */
5911
5912 DPRINTF(("------------------------------------------------------------------\n"));
5913 DPRINTF(("%s\n", pattern));
5914
5915 /* Pretend to compile the pattern while actually just accumulating the length
5916 of memory required. This behaviour is triggered by passing a non-NULL final
5917 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5918 to compile parts of the pattern into; the compiled code is discarded when it is
5919 no longer needed, so hopefully this workspace will never overflow, though there
5920 is a test for its doing so. */
5921
5922 cd->bracount = 0;
5923 cd->names_found = 0;
5924 cd->name_entry_size = 0;
5925 cd->name_table = NULL;
5926 cd->start_workspace = cworkspace;
5927 cd->start_code = cworkspace;
5928 cd->hwm = cworkspace;
5929 cd->start_pattern = (const uschar *)pattern;
5930 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5931 cd->req_varyopt = 0;
5932 cd->external_options = options;
5933 cd->external_flags = 0;
5934
5935 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5936 don't need to look at the result of the function here. The initial options have
5937 been put into the cd block so that they can be changed if an option setting is
5938 found within the regex right at the beginning. Bringing initial option settings
5939 outside can help speed up starting point checks. */
5940
5941 ptr += skipatstart;
5942 code = cworkspace;
5943 *code = OP_BRA;
5944 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5945 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5946 &length);
5947 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5948
5949 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5950 cd->hwm - cworkspace));
5951
5952 if (length > MAX_PATTERN_SIZE)
5953 {
5954 errorcode = ERR20;
5955 goto PCRE_EARLY_ERROR_RETURN;
5956 }
5957
5958 /* Compute the size of data block needed and get it, either from malloc or
5959 externally provided function. Integer overflow should no longer be possible
5960 because nowadays we limit the maximum value of cd->names_found and
5961 cd->name_entry_size. */
5962
5963 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5964 re = (real_pcre *)(pcre_malloc)(size);
5965
5966 if (re == NULL)
5967 {
5968 errorcode = ERR21;
5969 goto PCRE_EARLY_ERROR_RETURN;
5970 }
5971
5972 /* Put in the magic number, and save the sizes, initial options, internal
5973 flags, and character table pointer. NULL is used for the default character
5974 tables. The nullpad field is at the end; it's there to help in the case when a
5975 regex compiled on a system with 4-byte pointers is run on another with 8-byte
5976 pointers. */
5977
5978 re->magic_number = MAGIC_NUMBER;
5979 re->size = size;
5980 re->options = cd->external_options;
5981 re->flags = cd->external_flags;
5982 re->dummy1 = 0;
5983 re->first_byte = 0;
5984 re->req_byte = 0;
5985 re->name_table_offset = sizeof(real_pcre);
5986 re->name_entry_size = cd->name_entry_size;
5987 re->name_count = cd->names_found;
5988 re->ref_count = 0;
5989 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5990 re->nullpad = NULL;
5991
5992 /* The starting points of the name/number translation table and of the code are
5993 passed around in the compile data block. The start/end pattern and initial
5994 options are already set from the pre-compile phase, as is the name_entry_size
5995 field. Reset the bracket count and the names_found field. Also reset the hwm
5996 field; this time it's used for remembering forward references to subpatterns.
5997 */
5998
5999 cd->bracount = 0;
6000 cd->names_found = 0;
6001 cd->name_table = (uschar *)re + re->name_table_offset;
6002 codestart = cd->name_table + re->name_entry_size * re->name_count;
6003 cd->start_code = codestart;
6004 cd->hwm = cworkspace;
6005 cd->req_varyopt = 0;
6006 cd->had_accept = FALSE;
6007
6008 /* Set up a starting, non-extracting bracket, then compile the expression. On
6009 error, errorcode will be set non-zero, so we don't need to look at the result
6010 of the function here. */
6011
6012 ptr = (const uschar *)pattern + skipatstart;
6013 code = (uschar *)codestart;
6014 *code = OP_BRA;
6015 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6016 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6017 re->top_bracket = cd->bracount;
6018 re->top_backref = cd->top_backref;
6019 re->flags = cd->external_flags;
6020
6021 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6022
6023 /* If not reached end of pattern on success, there's an excess bracket. */
6024
6025 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6026
6027 /* Fill in the terminating state and check for disastrous overflow, but
6028 if debugging, leave the test till after things are printed out. */
6029
6030 *code++ = OP_END;
6031
6032 #ifndef DEBUG
6033 if (code - codestart > length) errorcode = ERR23;
6034 #endif
6035
6036 /* Fill in any forward references that are required. */
6037
6038 while (errorcode == 0 && cd->hwm > cworkspace)
6039 {
6040 int offset, recno;
6041 const uschar *groupptr;
6042 cd->hwm -= LINK_SIZE;
6043 offset = GET(cd->hwm, 0);
6044 recno = GET(codestart, offset);
6045 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6046 if (groupptr == NULL) errorcode = ERR53;
6047 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6048 }
6049
6050 /* Give an error if there's back reference to a non-existent capturing
6051 subpattern. */
6052
6053 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6054
6055 /* Failed to compile, or error while post-processing */
6056
6057 if (errorcode != 0)
6058 {
6059 (pcre_free)(re);
6060 PCRE_EARLY_ERROR_RETURN:
6061 *erroroffset = ptr - (const uschar *)pattern;
6062 PCRE_EARLY_ERROR_RETURN2:
6063 *errorptr = find_error_text(errorcode);
6064 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6065 return NULL;
6066 }
6067
6068 /* If the anchored option was not passed, set the flag if we can determine that
6069 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6070 as starting with .* when DOTALL is set).
6071
6072 Otherwise, if we know what the first byte has to be, save it, because that
6073 speeds up unanchored matches no end. If not, see if we can set the
6074 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6075 start with ^. and also when all branches start with .* for non-DOTALL matches.
6076 */
6077
6078 if ((re->options & PCRE_ANCHORED) == 0)
6079 {
6080 int temp_options = re->options; /* May get changed during these scans */
6081 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6082 re->options |= PCRE_ANCHORED;
6083 else
6084 {
6085 if (firstbyte < 0)
6086 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6087 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6088 {
6089 int ch = firstbyte & 255;
6090 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6091 cd->fcc[ch] == ch)? ch : firstbyte;
6092 re->flags |= PCRE_FIRSTSET;
6093 }
6094 else if (is_startline(codestart, 0, cd->backref_map))
6095 re->flags |= PCRE_STARTLINE;
6096 }
6097 }
6098
6099 /* For an anchored pattern, we use the "required byte" only if it follows a
6100 variable length item in the regex. Remove the caseless flag for non-caseable
6101 bytes. */
6102
6103 if (reqbyte >= 0 &&
6104 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6105 {
6106 int ch = reqbyte & 255;
6107 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6108 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6109 re->flags |= PCRE_REQCHSET;
6110 }
6111
6112 /* Print out the compiled data if debugging is enabled. This is never the
6113 case when building a production library. */
6114
6115 #ifdef DEBUG
6116
6117 printf("Length = %d top_bracket = %d top_backref = %d\n",
6118 length, re->top_bracket, re->top_backref);
6119
6120 printf("Options=%08x\n", re->options);
6121
6122 if ((re->flags & PCRE_FIRSTSET) != 0)
6123 {
6124 int ch = re->first_byte & 255;
6125 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6126 "" : " (caseless)";
6127 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6128 else printf("First char = \\x%02x%s\n", ch, caseless);
6129 }
6130
6131 if ((re->flags & PCRE_REQCHSET) != 0)
6132 {
6133 int ch = re->req_byte & 255;
6134 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6135 "" : " (caseless)";
6136 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6137 else printf("Req char = \\x%02x%s\n", ch, caseless);
6138 }
6139
6140 pcre_printint(re, stdout, TRUE);
6141
6142 /* This check is done here in the debugging case so that the code that
6143 was compiled can be seen. */
6144
6145 if (code - codestart > length)
6146 {
6147 (pcre_free)(re);
6148 *errorptr = find_error_text(ERR23);
6149 *erroroffset = ptr - (uschar *)pattern;
6150 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6151 return NULL;
6152 }
6153 #endif /* DEBUG */
6154
6155 return (pcre *)re;
6156 }
6157
6158 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12