/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 334 - (show annotations) (download)
Fri Apr 11 15:48:14 2008 UTC (6 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 200213 byte(s)
Fix bug in Oniguruma \g support.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps. "a" is a byte array and
"b" is the bit number; bit b lives in byte b/8 at position b%8. Both arguments
are parenthesized so that expressions such as SETBIT(map, x + 1) address the
intended bit. Note that "b" is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
146
147 typedef struct verbitem {
148 int len;
149 int op;
150 } verbitem;
151
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
160
161 static const verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
169 };
170
171 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence. */
179
180 static const char posix_names[] =
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
184
185 static const uschar posix_name_lengths[] = {
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
197
198 static const int posix_class_maps[] = {
199 cbit_word, cbit_digit, -2, /* alpha */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii */
204 cbit_space, -1, 1, /* blank - a GNU extension */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
213 };
214
215
216 #define STRING(a) # a
217 #define XSTRING(s) STRING(s)
218
219 /* The texts of compile-time error messages. These are "char *" because they
220 are passed to the outside world. Do not ever re-use any error number, because
221 they are documented. Always add a new error instead. Messages marked DEAD below
222 are no longer used. This used to be a table of strings, but in order to reduce
223 the number of relocations needed when a shared library is loaded dynamically,
224 it is now one long string. We cannot use a table of offsets, because the
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226 simply count through to the one we want - this isn't a performance issue
227 because these strings are used only when there is a compilation error. */
228
229 static const char error_texts[] =
230 "no error\0"
231 "\\ at end of pattern\0"
232 "\\c at end of pattern\0"
233 "unrecognized character follows \\\0"
234 "numbers out of order in {} quantifier\0"
235 /* 5 */
236 "number too big in {} quantifier\0"
237 "missing terminating ] for character class\0"
238 "invalid escape sequence in character class\0"
239 "range out of order in character class\0"
240 "nothing to repeat\0"
241 /* 10 */
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243 "internal error: unexpected repeat\0"
244 "unrecognized character after (? or (?-\0"
245 "POSIX named classes are supported only within a class\0"
246 "missing )\0"
247 /* 15 */
248 "reference to non-existent subpattern\0"
249 "erroffset passed as NULL\0"
250 "unknown option bit(s) set\0"
251 "missing ) after comment\0"
252 "parentheses nested too deeply\0" /** DEAD **/
253 /* 20 */
254 "regular expression is too large\0"
255 "failed to get memory\0"
256 "unmatched parentheses\0"
257 "internal error: code overflow\0"
258 "unrecognized character after (?<\0"
259 /* 25 */
260 "lookbehind assertion is not fixed length\0"
261 "malformed number or name after (?(\0"
262 "conditional group contains more than two branches\0"
263 "assertion expected after (?(\0"
264 "(?R or (?[+-]digits must be followed by )\0"
265 /* 30 */
266 "unknown POSIX class name\0"
267 "POSIX collating elements are not supported\0"
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269 "spare error\0" /** DEAD **/
270 "character value in \\x{...} sequence is too large\0"
271 /* 35 */
272 "invalid condition (?(0)\0"
273 "\\C not allowed in lookbehind assertion\0"
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275 "number after (?C is > 255\0"
276 "closing ) for (?C expected\0"
277 /* 40 */
278 "recursive call could loop indefinitely\0"
279 "unrecognized character after (?P\0"
280 "syntax error in subpattern name (missing terminator)\0"
281 "two named subpatterns have the same name\0"
282 "invalid UTF-8 string\0"
283 /* 45 */
284 "support for \\P, \\p, and \\X has not been compiled\0"
285 "malformed \\P or \\p sequence\0"
286 "unknown property name after \\P or \\p\0"
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 /* 50 */
290 "repeated subpattern is too long\0" /** DEAD **/
291 "octal value is greater than \\377 (not in UTF-8 mode)\0"
292 "internal error: overran compiling workspace\0"
293 "internal error: previously-checked referenced subpattern not found\0"
294 "DEFINE group contains more than one branch\0"
295 /* 55 */
296 "repeating a DEFINE group is not allowed\0"
297 "inconsistent NEWLINE options\0"
298 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299 "a numbered reference must not be zero\0"
300 "(*VERB) with an argument is not supported\0"
301 /* 60 */
302 "(*VERB) not recognized\0"
303 "number is too big\0"
304 "subpattern name expected\0"
305 "digit expected after (?+";
306
307
308 /* Table to identify digits and hex digits. This is used when compiling
309 patterns. Note that the tables in chartables are dependent on the locale, and
310 may mark arbitrary characters as digits - but the PCRE compiling code expects
311 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
312 a private table here. It costs 256 bytes, but it is a lot faster than doing
313 character value tests (at least in some simple cases I timed), and in some
314 applications one wants PCRE to compile efficiently as well as match
315 efficiently.
316
317 For convenience, we use the same bit definitions as in chartables:
318
319 0x04 decimal digit
320 0x08 hexadecimal digit
321
322 Then we can use ctype_digit and ctype_xdigit in the code. */
323
324 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
325 static const unsigned char digitab[] =
326 {
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
333 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
334 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
335 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
339 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
359
360 #else /* This is the "abnormal" case, for EBCDIC systems */
361 static const unsigned char digitab[] =
362 {
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
379 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
387 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
393 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
394 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
395
396 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
397 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
398 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
399 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
401 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
405 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
406 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
408 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
410 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
413 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
414 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
415 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
416 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
417 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
418 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
419 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
420 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
421 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
422 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
423 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
424 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
425 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
426 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
427 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
428 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
429 #endif
430
431
432 /* Definition to allow mutual recursion */
433
434 static BOOL
435 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 int *, int *, branch_chain *, compile_data *, int *);
437
438
439
440 /*************************************************
441 * Find an error text *
442 *************************************************/
443
444 /* The error texts are now all in one long string, to save on relocations. As
445 some of the text is of unknown length, we can't use a table of offsets.
446 Instead, just count through the strings. This is not a performance issue
447 because it happens only when there has been a compilation error.
448
449 Argument: the error number
450 Returns: pointer to the error string
451 */
452
453 static const char *
454 find_error_text(int n)
455 {
456 const char *s = error_texts;
457 for (; n > 0; n--) while (*s++ != 0);
458 return s;
459 }
460
461
462 /*************************************************
463 * Handle escapes *
464 *************************************************/
465
466 /* This function is called when a \ has been encountered. It either returns a
467 positive value for a simple escape such as \n, or a negative value which
468 encodes one of the more complicated things such as \d. A backreference to group
469 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471 ptr is pointing at the \. On exit, it is on the final character of the escape
472 sequence.
473
474 Arguments:
475 ptrptr points to the pattern position pointer
476 errorcodeptr points to the errorcode variable
477 bracount number of previous extracting brackets
478 options the options bits
479 isclass TRUE if inside a character class
480
481 Returns: zero or positive => a data character
482 negative => a special escape sequence
483 on error, errorcodeptr is set
484 */
485
486 static int
487 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488 int options, BOOL isclass)
489 {
490 BOOL utf8 = (options & PCRE_UTF8) != 0;
491 const uschar *ptr = *ptrptr + 1;
492 int c, i;
493
494 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495 ptr--; /* Set pointer back to the last byte */
496
497 /* If backslash is at the end of the pattern, it's an error. */
498
499 if (c == 0) *errorcodeptr = ERR1;
500
501 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502 in a table. A non-zero result is something that can be returned immediately.
503 Otherwise further processing may be required. */
504
505 #ifndef EBCDIC /* ASCII coding */
506 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
507 else if ((i = escapes[c - '0']) != 0) c = i;
508
509 #else /* EBCDIC coding */
510 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
511 else if ((i = escapes[c - 0x48]) != 0) c = i;
512 #endif
513
514 /* Escapes that need further processing, or are illegal. */
515
516 else
517 {
518 const uschar *oldptr;
519 BOOL braced, negated;
520
521 switch (c)
522 {
523 /* A number of Perl escapes are not handled by PCRE. We give an explicit
524 error. */
525
526 case 'l':
527 case 'L':
528 case 'N':
529 case 'u':
530 case 'U':
531 *errorcodeptr = ERR37;
532 break;
533
534 /* \g must be followed by one of a number of specific things:
535
536 (1) A number, either plain or braced. If positive, it is an absolute
537 backreference. If negative, it is a relative backreference. This is a Perl
538 5.10 feature.
539
540 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
541 is part of Perl's movement towards a unified syntax for back references. As
542 this is synonymous with \k{name}, we fudge it up by pretending it really
543 was \k.
544
545 (3) For Oniguruma compatibility we also support \g followed by a name or a
546 number either in angle brackets or in single quotes. However, these are
547 (possibly recursive) subroutine calls, _not_ backreferences. Just return
548 the -ESC_g code (cf \k). */
549
550 case 'g':
551 if (ptr[1] == '<' || ptr[1] == '\'')
552 {
553 c = -ESC_g;
554 break;
555 }
556
557 /* Handle the Perl-compatible cases */
558
559 if (ptr[1] == '{')
560 {
561 const uschar *p;
562 for (p = ptr+2; *p != 0 && *p != '}'; p++)
563 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
564 if (*p != 0 && *p != '}')
565 {
566 c = -ESC_k;
567 break;
568 }
569 braced = TRUE;
570 ptr++;
571 }
572 else braced = FALSE;
573
574 if (ptr[1] == '-')
575 {
576 negated = TRUE;
577 ptr++;
578 }
579 else negated = FALSE;
580
581 c = 0;
582 while ((digitab[ptr[1]] & ctype_digit) != 0)
583 c = c * 10 + *(++ptr) - '0';
584
585 if (c < 0) /* Integer overflow */
586 {
587 *errorcodeptr = ERR61;
588 break;
589 }
590
591 if (braced && *(++ptr) != '}')
592 {
593 *errorcodeptr = ERR57;
594 break;
595 }
596
597 if (c == 0)
598 {
599 *errorcodeptr = ERR58;
600 break;
601 }
602
603 if (negated)
604 {
605 if (c > bracount)
606 {
607 *errorcodeptr = ERR15;
608 break;
609 }
610 c = bracount - (c - 1);
611 }
612
613 c = -(ESC_REF + c);
614 break;
615
616 /* The handling of escape sequences consisting of a string of digits
617 starting with one that is not zero is not straightforward. By experiment,
618 the way Perl works seems to be as follows:
619
620 Outside a character class, the digits are read as a decimal number. If the
621 number is less than 10, or if there are that many previous extracting
622 left brackets, then it is a back reference. Otherwise, up to three octal
623 digits are read to form an escaped byte. Thus \123 is likely to be octal
624 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
625 value is greater than 377, the least significant 8 bits are taken. Inside a
626 character class, \ followed by a digit is always an octal number. */
627
628 case '1': case '2': case '3': case '4': case '5':
629 case '6': case '7': case '8': case '9':
630
631 if (!isclass)
632 {
633 oldptr = ptr;
634 c -= '0';
635 while ((digitab[ptr[1]] & ctype_digit) != 0)
636 c = c * 10 + *(++ptr) - '0';
637 if (c < 0) /* Integer overflow */
638 {
639 *errorcodeptr = ERR61;
640 break;
641 }
642 if (c < 10 || c <= bracount)
643 {
644 c = -(ESC_REF + c);
645 break;
646 }
647 ptr = oldptr; /* Put the pointer back and fall through */
648 }
649
650 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
651 generates a binary zero byte and treats the digit as a following literal.
652 Thus we have to pull back the pointer by one. */
653
654 if ((c = *ptr) >= '8')
655 {
656 ptr--;
657 c = 0;
658 break;
659 }
660
661 /* \0 always starts an octal number, but we may drop through to here with a
662 larger first octal digit. The original code used just to take the least
663 significant 8 bits of octal numbers (I think this is what early Perls used
664 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
665 than 3 octal digits. */
666
667 case '0':
668 c -= '0';
669 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
670 c = c * 8 + *(++ptr) - '0';
671 if (!utf8 && c > 255) *errorcodeptr = ERR51;
672 break;
673
674 /* \x is complicated. \x{ddd} is a character number which can be greater
675 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
676 treated as a data character. */
677
678 case 'x':
679 if (ptr[1] == '{')
680 {
681 const uschar *pt = ptr + 2;
682 int count = 0;
683
684 c = 0;
685 while ((digitab[*pt] & ctype_xdigit) != 0)
686 {
687 register int cc = *pt++;
688 if (c == 0 && cc == '0') continue; /* Leading zeroes */
689 count++;
690
691 #ifndef EBCDIC /* ASCII coding */
692 if (cc >= 'a') cc -= 32; /* Convert to upper case */
693 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
694 #else /* EBCDIC coding */
695 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
696 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
697 #endif
698 }
699
700 if (*pt == '}')
701 {
702 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
703 ptr = pt;
704 break;
705 }
706
707 /* If the sequence of hex digits does not end with '}', then we don't
708 recognize this construct; fall through to the normal \x handling. */
709 }
710
711 /* Read just a single-byte hex-defined char */
712
713 c = 0;
714 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
715 {
716 int cc; /* Some compilers don't like ++ */
717 cc = *(++ptr); /* in initializers */
718 #ifndef EBCDIC /* ASCII coding */
719 if (cc >= 'a') cc -= 32; /* Convert to upper case */
720 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
721 #else /* EBCDIC coding */
722 if (cc <= 'z') cc += 64; /* Convert to upper case */
723 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
724 #endif
725 }
726 break;
727
728 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
729 This coding is ASCII-specific, but then the whole concept of \cx is
730 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
731
732 case 'c':
733 c = *(++ptr);
734 if (c == 0)
735 {
736 *errorcodeptr = ERR2;
737 break;
738 }
739
740 #ifndef EBCDIC /* ASCII coding */
741 if (c >= 'a' && c <= 'z') c -= 32;
742 c ^= 0x40;
743 #else /* EBCDIC coding */
744 if (c >= 'a' && c <= 'z') c += 64;
745 c ^= 0xC0;
746 #endif
747 break;
748
749 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
750 other alphanumeric following \ is an error if PCRE_EXTRA was set;
751 otherwise, for Perl compatibility, it is a literal. This code looks a bit
752 odd, but there used to be some cases other than the default, and there may
753 be again in future, so I haven't "optimized" it. */
754
755 default:
756 if ((options & PCRE_EXTRA) != 0) switch(c)
757 {
758 default:
759 *errorcodeptr = ERR3;
760 break;
761 }
762 break;
763 }
764 }
765
766 *ptrptr = ptr;
767 return c;
768 }
769
770
771
772 #ifdef SUPPORT_UCP
773 /*************************************************
774 * Handle \P and \p *
775 *************************************************/
776
777 /* This function is called after \P or \p has been encountered, provided that
778 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
779 pointing at the P or p. On exit, it is pointing at the final character of the
780 escape sequence.
781
782 Argument:
783 ptrptr points to the pattern position pointer
784 negptr points to a boolean that is set TRUE for negation else FALSE
785 dptr points to an int that is set to the detailed property value
786 errorcodeptr points to the error code variable
787
788 Returns: type value from ucp_type_table, or -1 for an invalid type
789 */
790
791 static int
792 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
793 {
794 int c, i, bot, top;
795 const uschar *ptr = *ptrptr;
796 char name[32];
797
798 c = *(++ptr);
799 if (c == 0) goto ERROR_RETURN;
800
801 *negptr = FALSE;
802
803 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
804 negation. */
805
806 if (c == '{')
807 {
808 if (ptr[1] == '^')
809 {
810 *negptr = TRUE;
811 ptr++;
812 }
813 for (i = 0; i < (int)sizeof(name) - 1; i++)
814 {
815 c = *(++ptr);
816 if (c == 0) goto ERROR_RETURN;
817 if (c == '}') break;
818 name[i] = c;
819 }
820 if (c !='}') goto ERROR_RETURN;
821 name[i] = 0;
822 }
823
824 /* Otherwise there is just one following character */
825
826 else
827 {
828 name[0] = c;
829 name[1] = 0;
830 }
831
832 *ptrptr = ptr;
833
834 /* Search for a recognized property name using binary chop */
835
836 bot = 0;
837 top = _pcre_utt_size;
838
839 while (bot < top)
840 {
841 i = (bot + top) >> 1;
842 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
843 if (c == 0)
844 {
845 *dptr = _pcre_utt[i].value;
846 return _pcre_utt[i].type;
847 }
848 if (c > 0) bot = i + 1; else top = i;
849 }
850
851 *errorcodeptr = ERR47;
852 *ptrptr = ptr;
853 return -1;
854
855 ERROR_RETURN:
856 *errorcodeptr = ERR46;
857 *ptrptr = ptr;
858 return -1;
859 }
860 #endif
861
862
863
864
865 /*************************************************
866 * Check for counted repeat *
867 *************************************************/
868
869 /* This function is called when a '{' is encountered in a place where it might
870 start a quantifier. It looks ahead to see if it really is a quantifier or not.
871 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
872 where the ddds are digits.
873
874 Arguments:
875 p pointer to the first char after '{'
876
877 Returns: TRUE or FALSE
878 */
879
880 static BOOL
881 is_counted_repeat(const uschar *p)
882 {
883 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
884 while ((digitab[*p] & ctype_digit) != 0) p++;
885 if (*p == '}') return TRUE;
886
887 if (*p++ != ',') return FALSE;
888 if (*p == '}') return TRUE;
889
890 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
891 while ((digitab[*p] & ctype_digit) != 0) p++;
892
893 return (*p == '}');
894 }
895
896
897
898 /*************************************************
899 * Read repeat counts *
900 *************************************************/
901
902 /* Read an item of the form {n,m} and return the values. This is called only
903 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
904 so the syntax is guaranteed to be correct, but we need to check the values.
905
906 Arguments:
907 p pointer to first char after '{'
908 minp pointer to int for min
909 maxp pointer to int for max
910 returned as -1 if no max
911 errorcodeptr points to error code variable
912
913 Returns: pointer to '}' on success;
914 current ptr on error, with errorcodeptr set non-zero
915 */
916
917 static const uschar *
918 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
919 {
920 int min = 0;
921 int max = -1;
922
923 /* Read the minimum value and do a paranoid check: a negative value indicates
924 an integer overflow. */
925
926 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
927 if (min < 0 || min > 65535)
928 {
929 *errorcodeptr = ERR5;
930 return p;
931 }
932
933 /* Read the maximum value if there is one, and again do a paranoid on its size.
934 Also, max must not be less than min. */
935
936 if (*p == '}') max = min; else
937 {
938 if (*(++p) != '}')
939 {
940 max = 0;
941 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
942 if (max < 0 || max > 65535)
943 {
944 *errorcodeptr = ERR5;
945 return p;
946 }
947 if (max < min)
948 {
949 *errorcodeptr = ERR4;
950 return p;
951 }
952 }
953 }
954
955 /* Fill in the required variables, and pass back the pointer to the terminating
956 '}'. */
957
958 *minp = min;
959 *maxp = max;
960 return p;
961 }
962
963
964
965 /*************************************************
966 * Find forward referenced subpattern *
967 *************************************************/
968
969 /* This function scans along a pattern's text looking for capturing
970 subpatterns, and counting them. If it finds a named pattern that matches the
971 name it is given, it returns its number. Alternatively, if the name is NULL, it
972 returns when it reaches a given numbered subpattern. This is used for forward
973 references to subpatterns. We know that if (?P< is encountered, the name will
974 be terminated by '>' because that is checked in the first pass.
975
976 Arguments:
977 ptr current position in the pattern
978 count current count of capturing parens so far encountered
979 name name to seek, or NULL if seeking a numbered subpattern
980 lorn name length, or subpattern number if name is NULL
981 xmode TRUE if we are in /x mode
982
983 Returns: the number of the named subpattern, or -1 if not found
984 */
985
986 static int
987 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
988 BOOL xmode)
989 {
990 const uschar *thisname;
991
992 for (; *ptr != 0; ptr++)
993 {
994 int term;
995
996 /* Skip over backslashed characters and also entire \Q...\E */
997
998 if (*ptr == '\\')
999 {
1000 if (*(++ptr) == 0) return -1;
1001 if (*ptr == 'Q') for (;;)
1002 {
1003 while (*(++ptr) != 0 && *ptr != '\\');
1004 if (*ptr == 0) return -1;
1005 if (*(++ptr) == 'E') break;
1006 }
1007 continue;
1008 }
1009
1010 /* Skip over character classes */
1011
1012 if (*ptr == '[')
1013 {
1014 while (*(++ptr) != ']')
1015 {
1016 if (*ptr == 0) return -1;
1017 if (*ptr == '\\')
1018 {
1019 if (*(++ptr) == 0) return -1;
1020 if (*ptr == 'Q') for (;;)
1021 {
1022 while (*(++ptr) != 0 && *ptr != '\\');
1023 if (*ptr == 0) return -1;
1024 if (*(++ptr) == 'E') break;
1025 }
1026 continue;
1027 }
1028 }
1029 continue;
1030 }
1031
1032 /* Skip comments in /x mode */
1033
1034 if (xmode && *ptr == '#')
1035 {
1036 while (*(++ptr) != 0 && *ptr != '\n');
1037 if (*ptr == 0) return -1;
1038 continue;
1039 }
1040
1041 /* An opening parens must now be a real metacharacter */
1042
1043 if (*ptr != '(') continue;
1044 if (ptr[1] != '?' && ptr[1] != '*')
1045 {
1046 count++;
1047 if (name == NULL && count == lorn) return count;
1048 continue;
1049 }
1050
1051 ptr += 2;
1052 if (*ptr == 'P') ptr++; /* Allow optional P */
1053
1054 /* We have to disambiguate (?<! and (?<= from (?<name> */
1055
1056 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1057 *ptr != '\'')
1058 continue;
1059
1060 count++;
1061
1062 if (name == NULL && count == lorn) return count;
1063 term = *ptr++;
1064 if (term == '<') term = '>';
1065 thisname = ptr;
1066 while (*ptr != term) ptr++;
1067 if (name != NULL && lorn == ptr - thisname &&
1068 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1069 return count;
1070 }
1071
1072 return -1;
1073 }
1074
1075
1076
1077 /*************************************************
1078 * Find first significant op code *
1079 *************************************************/
1080
1081 /* This is called by several functions that scan a compiled expression looking
1082 for a fixed first character, or an anchoring op code etc. It skips over things
1083 that do not influence this. For some calls, a change of option is important.
1084 For some calls, it makes sense to skip negative forward and all backward
1085 assertions, and also the \b assertion; for others it does not.
1086
1087 Arguments:
1088 code pointer to the start of the group
1089 options pointer to external options
1090 optbit the option bit whose changing is significant, or
1091 zero if none are
1092 skipassert TRUE if certain assertions are to be skipped
1093
1094 Returns: pointer to the first significant opcode
1095 */
1096
1097 static const uschar*
1098 first_significant_code(const uschar *code, int *options, int optbit,
1099 BOOL skipassert)
1100 {
1101 for (;;)
1102 {
1103 switch ((int)*code)
1104 {
1105 case OP_OPT:
1106 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1107 *options = (int)code[1];
1108 code += 2;
1109 break;
1110
1111 case OP_ASSERT_NOT:
1112 case OP_ASSERTBACK:
1113 case OP_ASSERTBACK_NOT:
1114 if (!skipassert) return code;
1115 do code += GET(code, 1); while (*code == OP_ALT);
1116 code += _pcre_OP_lengths[*code];
1117 break;
1118
1119 case OP_WORD_BOUNDARY:
1120 case OP_NOT_WORD_BOUNDARY:
1121 if (!skipassert) return code;
1122 /* Fall through */
1123
1124 case OP_CALLOUT:
1125 case OP_CREF:
1126 case OP_RREF:
1127 case OP_DEF:
1128 code += _pcre_OP_lengths[*code];
1129 break;
1130
1131 default:
1132 return code;
1133 }
1134 }
1135 /* Control never reaches here */
1136 }
1137
1138
1139
1140
1141 /*************************************************
1142 * Find the fixed length of a pattern *
1143 *************************************************/
1144
1145 /* Scan a pattern and compute the fixed length of subject that will match it,
1146 if the length is fixed. This is needed for dealing with backward assertions.
1147 In UTF8 mode, the result is in characters rather than bytes.
1148
1149 Arguments:
1150 code points to the start of the pattern (the bracket)
1151 options the compiling options
1152
1153 Returns: the fixed length, or -1 if there is no fixed length,
1154 or -2 if \C was encountered
1155 */
1156
1157 static int
1158 find_fixedlength(uschar *code, int options)
1159 {
1160 int length = -1;
1161
1162 register int branchlength = 0;
1163 register uschar *cc = code + 1 + LINK_SIZE;
1164
1165 /* Scan along the opcodes for this branch. If we get to the end of the
1166 branch, check the length against that of the other branches. */
1167
1168 for (;;)
1169 {
1170 int d;
1171 register int op = *cc;
1172 switch (op)
1173 {
1174 case OP_CBRA:
1175 case OP_BRA:
1176 case OP_ONCE:
1177 case OP_COND:
1178 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1179 if (d < 0) return d;
1180 branchlength += d;
1181 do cc += GET(cc, 1); while (*cc == OP_ALT);
1182 cc += 1 + LINK_SIZE;
1183 break;
1184
1185 /* Reached end of a branch; if it's a ket it is the end of a nested
1186 call. If it's ALT it is an alternation in a nested call. If it is
1187 END it's the end of the outer call. All can be handled by the same code. */
1188
1189 case OP_ALT:
1190 case OP_KET:
1191 case OP_KETRMAX:
1192 case OP_KETRMIN:
1193 case OP_END:
1194 if (length < 0) length = branchlength;
1195 else if (length != branchlength) return -1;
1196 if (*cc != OP_ALT) return length;
1197 cc += 1 + LINK_SIZE;
1198 branchlength = 0;
1199 break;
1200
1201 /* Skip over assertive subpatterns */
1202
1203 case OP_ASSERT:
1204 case OP_ASSERT_NOT:
1205 case OP_ASSERTBACK:
1206 case OP_ASSERTBACK_NOT:
1207 do cc += GET(cc, 1); while (*cc == OP_ALT);
1208 /* Fall through */
1209
1210 /* Skip over things that don't match chars */
1211
1212 case OP_REVERSE:
1213 case OP_CREF:
1214 case OP_RREF:
1215 case OP_DEF:
1216 case OP_OPT:
1217 case OP_CALLOUT:
1218 case OP_SOD:
1219 case OP_SOM:
1220 case OP_EOD:
1221 case OP_EODN:
1222 case OP_CIRC:
1223 case OP_DOLL:
1224 case OP_NOT_WORD_BOUNDARY:
1225 case OP_WORD_BOUNDARY:
1226 cc += _pcre_OP_lengths[*cc];
1227 break;
1228
1229 /* Handle literal characters */
1230
1231 case OP_CHAR:
1232 case OP_CHARNC:
1233 case OP_NOT:
1234 branchlength++;
1235 cc += 2;
1236 #ifdef SUPPORT_UTF8
1237 if ((options & PCRE_UTF8) != 0)
1238 {
1239 while ((*cc & 0xc0) == 0x80) cc++;
1240 }
1241 #endif
1242 break;
1243
1244 /* Handle exact repetitions. The count is already in characters, but we
1245 need to skip over a multibyte character in UTF8 mode. */
1246
1247 case OP_EXACT:
1248 branchlength += GET2(cc,1);
1249 cc += 4;
1250 #ifdef SUPPORT_UTF8
1251 if ((options & PCRE_UTF8) != 0)
1252 {
1253 while((*cc & 0x80) == 0x80) cc++;
1254 }
1255 #endif
1256 break;
1257
1258 case OP_TYPEEXACT:
1259 branchlength += GET2(cc,1);
1260 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1261 cc += 4;
1262 break;
1263
1264 /* Handle single-char matchers */
1265
1266 case OP_PROP:
1267 case OP_NOTPROP:
1268 cc += 2;
1269 /* Fall through */
1270
1271 case OP_NOT_DIGIT:
1272 case OP_DIGIT:
1273 case OP_NOT_WHITESPACE:
1274 case OP_WHITESPACE:
1275 case OP_NOT_WORDCHAR:
1276 case OP_WORDCHAR:
1277 case OP_ANY:
1278 branchlength++;
1279 cc++;
1280 break;
1281
1282 /* The single-byte matcher isn't allowed */
1283
1284 case OP_ANYBYTE:
1285 return -2;
1286
1287 /* Check a class for variable quantification */
1288
1289 #ifdef SUPPORT_UTF8
1290 case OP_XCLASS:
1291 cc += GET(cc, 1) - 33;
1292 /* Fall through */
1293 #endif
1294
1295 case OP_CLASS:
1296 case OP_NCLASS:
1297 cc += 33;
1298
1299 switch (*cc)
1300 {
1301 case OP_CRSTAR:
1302 case OP_CRMINSTAR:
1303 case OP_CRQUERY:
1304 case OP_CRMINQUERY:
1305 return -1;
1306
1307 case OP_CRRANGE:
1308 case OP_CRMINRANGE:
1309 if (GET2(cc,1) != GET2(cc,3)) return -1;
1310 branchlength += GET2(cc,1);
1311 cc += 5;
1312 break;
1313
1314 default:
1315 branchlength++;
1316 }
1317 break;
1318
1319 /* Anything else is variable length */
1320
1321 default:
1322 return -1;
1323 }
1324 }
1325 /* Control never gets here */
1326 }
1327
1328
1329
1330
1331 /*************************************************
1332 * Scan compiled regex for numbered bracket *
1333 *************************************************/
1334
1335 /* This little function scans through a compiled pattern until it finds a
1336 capturing bracket with the given number.
1337
1338 Arguments:
1339 code points to start of expression
1340 utf8 TRUE in UTF-8 mode
1341 number the required bracket number
1342
1343 Returns: pointer to the opcode for the bracket, or NULL if not found
1344 */
1345
1346 static const uschar *
1347 find_bracket(const uschar *code, BOOL utf8, int number)
1348 {
1349 for (;;)
1350 {
1351 register int c = *code;
1352 if (c == OP_END) return NULL;
1353
1354 /* XCLASS is used for classes that cannot be represented just by a bit
1355 map. This includes negated single high-valued characters. The length in
1356 the table is zero; the actual length is stored in the compiled code. */
1357
1358 if (c == OP_XCLASS) code += GET(code, 1);
1359
1360 /* Handle capturing bracket */
1361
1362 else if (c == OP_CBRA)
1363 {
1364 int n = GET2(code, 1+LINK_SIZE);
1365 if (n == number) return (uschar *)code;
1366 code += _pcre_OP_lengths[c];
1367 }
1368
1369 /* Otherwise, we can get the item's length from the table, except that for
1370 repeated character types, we have to test for \p and \P, which have an extra
1371 two bytes of parameters. */
1372
1373 else
1374 {
1375 switch(c)
1376 {
1377 case OP_TYPESTAR:
1378 case OP_TYPEMINSTAR:
1379 case OP_TYPEPLUS:
1380 case OP_TYPEMINPLUS:
1381 case OP_TYPEQUERY:
1382 case OP_TYPEMINQUERY:
1383 case OP_TYPEPOSSTAR:
1384 case OP_TYPEPOSPLUS:
1385 case OP_TYPEPOSQUERY:
1386 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1387 break;
1388
1389 case OP_TYPEUPTO:
1390 case OP_TYPEMINUPTO:
1391 case OP_TYPEEXACT:
1392 case OP_TYPEPOSUPTO:
1393 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1394 break;
1395 }
1396
1397 /* Add in the fixed length from the table */
1398
1399 code += _pcre_OP_lengths[c];
1400
1401 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1402 a multi-byte character. The length in the table is a minimum, so we have to
1403 arrange to skip the extra bytes. */
1404
1405 #ifdef SUPPORT_UTF8
1406 if (utf8) switch(c)
1407 {
1408 case OP_CHAR:
1409 case OP_CHARNC:
1410 case OP_EXACT:
1411 case OP_UPTO:
1412 case OP_MINUPTO:
1413 case OP_POSUPTO:
1414 case OP_STAR:
1415 case OP_MINSTAR:
1416 case OP_POSSTAR:
1417 case OP_PLUS:
1418 case OP_MINPLUS:
1419 case OP_POSPLUS:
1420 case OP_QUERY:
1421 case OP_MINQUERY:
1422 case OP_POSQUERY:
1423 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1424 break;
1425 }
1426 #endif
1427 }
1428 }
1429 }
1430
1431
1432
1433 /*************************************************
1434 * Scan compiled regex for recursion reference *
1435 *************************************************/
1436
1437 /* This little function scans through a compiled pattern until it finds an
1438 instance of OP_RECURSE.
1439
1440 Arguments:
1441 code points to start of expression
1442 utf8 TRUE in UTF-8 mode
1443
1444 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1445 */
1446
1447 static const uschar *
1448 find_recurse(const uschar *code, BOOL utf8)
1449 {
1450 for (;;)
1451 {
1452 register int c = *code;
1453 if (c == OP_END) return NULL;
1454 if (c == OP_RECURSE) return code;
1455
1456 /* XCLASS is used for classes that cannot be represented just by a bit
1457 map. This includes negated single high-valued characters. The length in
1458 the table is zero; the actual length is stored in the compiled code. */
1459
1460 if (c == OP_XCLASS) code += GET(code, 1);
1461
1462 /* Otherwise, we can get the item's length from the table, except that for
1463 repeated character types, we have to test for \p and \P, which have an extra
1464 two bytes of parameters. */
1465
1466 else
1467 {
1468 switch(c)
1469 {
1470 case OP_TYPESTAR:
1471 case OP_TYPEMINSTAR:
1472 case OP_TYPEPLUS:
1473 case OP_TYPEMINPLUS:
1474 case OP_TYPEQUERY:
1475 case OP_TYPEMINQUERY:
1476 case OP_TYPEPOSSTAR:
1477 case OP_TYPEPOSPLUS:
1478 case OP_TYPEPOSQUERY:
1479 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1480 break;
1481
1482 case OP_TYPEPOSUPTO:
1483 case OP_TYPEUPTO:
1484 case OP_TYPEMINUPTO:
1485 case OP_TYPEEXACT:
1486 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1487 break;
1488 }
1489
1490 /* Add in the fixed length from the table */
1491
1492 code += _pcre_OP_lengths[c];
1493
1494 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1495 by a multi-byte character. The length in the table is a minimum, so we have
1496 to arrange to skip the extra bytes. */
1497
1498 #ifdef SUPPORT_UTF8
1499 if (utf8) switch(c)
1500 {
1501 case OP_CHAR:
1502 case OP_CHARNC:
1503 case OP_EXACT:
1504 case OP_UPTO:
1505 case OP_MINUPTO:
1506 case OP_POSUPTO:
1507 case OP_STAR:
1508 case OP_MINSTAR:
1509 case OP_POSSTAR:
1510 case OP_PLUS:
1511 case OP_MINPLUS:
1512 case OP_POSPLUS:
1513 case OP_QUERY:
1514 case OP_MINQUERY:
1515 case OP_POSQUERY:
1516 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1517 break;
1518 }
1519 #endif
1520 }
1521 }
1522 }
1523
1524
1525
1526 /*************************************************
1527 * Scan compiled branch for non-emptiness *
1528 *************************************************/
1529
1530 /* This function scans through a branch of a compiled pattern to see whether it
1531 can match the empty string or not. It is called from could_be_empty()
1532 below and from compile_branch() when checking for an unlimited repeat of a
1533 group that can match nothing. Note that first_significant_code() skips over
1534 backward and negative forward assertions when its final argument is TRUE. If we
1535 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1536 bracket whose current branch will already have been scanned.
1537
1538 Arguments:
1539 code points to start of search
1540 endcode points to where to stop
1541 utf8 TRUE if in UTF8 mode
1542
1543 Returns: TRUE if what is matched could be empty
1544 */
1545
1546 static BOOL
1547 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1548 {
1549 register int c;
1550 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1551 code < endcode;
1552 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1553 {
1554 const uschar *ccode;
1555
1556 c = *code;
1557
1558 /* Skip over forward assertions; the other assertions are skipped by
1559 first_significant_code() with a TRUE final argument. */
1560
1561 if (c == OP_ASSERT)
1562 {
1563 do code += GET(code, 1); while (*code == OP_ALT);
1564 c = *code;
1565 continue;
1566 }
1567
1568 /* Groups with zero repeats can of course be empty; skip them. */
1569
1570 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1571 {
1572 code += _pcre_OP_lengths[c];
1573 do code += GET(code, 1); while (*code == OP_ALT);
1574 c = *code;
1575 continue;
1576 }
1577
1578 /* For other groups, scan the branches. */
1579
1580 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1581 {
1582 BOOL empty_branch;
1583 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1584
1585 /* Scan a closed bracket */
1586
1587 empty_branch = FALSE;
1588 do
1589 {
1590 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1591 empty_branch = TRUE;
1592 code += GET(code, 1);
1593 }
1594 while (*code == OP_ALT);
1595 if (!empty_branch) return FALSE; /* All branches are non-empty */
1596 c = *code;
1597 continue;
1598 }
1599
1600 /* Handle the other opcodes */
1601
1602 switch (c)
1603 {
1604 /* Check for quantifiers after a class. XCLASS is used for classes that
1605 cannot be represented just by a bit map. This includes negated single
1606 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1607 actual length is stored in the compiled code, so we must update "code"
1608 here. */
1609
1610 #ifdef SUPPORT_UTF8
1611 case OP_XCLASS:
1612 ccode = code += GET(code, 1);
1613 goto CHECK_CLASS_REPEAT;
1614 #endif
1615
1616 case OP_CLASS:
1617 case OP_NCLASS:
1618 ccode = code + 33;
1619
1620 #ifdef SUPPORT_UTF8
1621 CHECK_CLASS_REPEAT:
1622 #endif
1623
1624 switch (*ccode)
1625 {
1626 case OP_CRSTAR: /* These could be empty; continue */
1627 case OP_CRMINSTAR:
1628 case OP_CRQUERY:
1629 case OP_CRMINQUERY:
1630 break;
1631
1632 default: /* Non-repeat => class must match */
1633 case OP_CRPLUS: /* These repeats aren't empty */
1634 case OP_CRMINPLUS:
1635 return FALSE;
1636
1637 case OP_CRRANGE:
1638 case OP_CRMINRANGE:
1639 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1640 break;
1641 }
1642 break;
1643
1644 /* Opcodes that must match a character */
1645
1646 case OP_PROP:
1647 case OP_NOTPROP:
1648 case OP_EXTUNI:
1649 case OP_NOT_DIGIT:
1650 case OP_DIGIT:
1651 case OP_NOT_WHITESPACE:
1652 case OP_WHITESPACE:
1653 case OP_NOT_WORDCHAR:
1654 case OP_WORDCHAR:
1655 case OP_ANY:
1656 case OP_ANYBYTE:
1657 case OP_CHAR:
1658 case OP_CHARNC:
1659 case OP_NOT:
1660 case OP_PLUS:
1661 case OP_MINPLUS:
1662 case OP_POSPLUS:
1663 case OP_EXACT:
1664 case OP_NOTPLUS:
1665 case OP_NOTMINPLUS:
1666 case OP_NOTPOSPLUS:
1667 case OP_NOTEXACT:
1668 case OP_TYPEPLUS:
1669 case OP_TYPEMINPLUS:
1670 case OP_TYPEPOSPLUS:
1671 case OP_TYPEEXACT:
1672 return FALSE;
1673
1674 /* These are going to continue, as they may be empty, but we have to
1675 fudge the length for the \p and \P cases. */
1676
1677 case OP_TYPESTAR:
1678 case OP_TYPEMINSTAR:
1679 case OP_TYPEPOSSTAR:
1680 case OP_TYPEQUERY:
1681 case OP_TYPEMINQUERY:
1682 case OP_TYPEPOSQUERY:
1683 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1684 break;
1685
1686 /* Same for these */
1687
1688 case OP_TYPEUPTO:
1689 case OP_TYPEMINUPTO:
1690 case OP_TYPEPOSUPTO:
1691 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1692 break;
1693
1694 /* End of branch */
1695
1696 case OP_KET:
1697 case OP_KETRMAX:
1698 case OP_KETRMIN:
1699 case OP_ALT:
1700 return TRUE;
1701
1702 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1703 MINUPTO, and POSUPTO may be followed by a multibyte character */
1704
1705 #ifdef SUPPORT_UTF8
1706 case OP_STAR:
1707 case OP_MINSTAR:
1708 case OP_POSSTAR:
1709 case OP_QUERY:
1710 case OP_MINQUERY:
1711 case OP_POSQUERY:
1712 case OP_UPTO:
1713 case OP_MINUPTO:
1714 case OP_POSUPTO:
1715 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1716 break;
1717 #endif
1718 }
1719 }
1720
1721 return TRUE;
1722 }
1723
1724
1725
1726 /*************************************************
1727 * Scan compiled regex for non-emptiness *
1728 *************************************************/
1729
1730 /* This function is called to check for left recursive calls. We want to check
1731 the current branch of the current pattern to see if it could match the empty
1732 string. If it could, we must look outwards for branches at other levels,
1733 stopping when we pass beyond the bracket which is the subject of the recursion.
1734
1735 Arguments:
1736 code points to start of the recursion
1737 endcode points to where to stop (current RECURSE item)
1738 bcptr points to the chain of current (unclosed) branch starts
1739 utf8 TRUE if in UTF-8 mode
1740
1741 Returns: TRUE if what is matched could be empty
1742 */
1743
1744 static BOOL
1745 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1746 BOOL utf8)
1747 {
1748 while (bcptr != NULL && bcptr->current >= code)
1749 {
1750 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1751 bcptr = bcptr->outer;
1752 }
1753 return TRUE;
1754 }
1755
1756
1757
1758 /*************************************************
1759 * Check for POSIX class syntax *
1760 *************************************************/
1761
1762 /* This function is called when the sequence "[:" or "[." or "[=" is
1763 encountered in a character class. It checks whether this is followed by a
1764 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1765 reach an unescaped ']' without the special preceding character, return FALSE.
1766
1767 Originally, this function only recognized a sequence of letters between the
1768 terminators, but it seems that Perl recognizes any sequence of characters,
1769 though of course unknown POSIX names are subsequently rejected. Perl gives an
1770 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1771 didn't consider this to be a POSIX class. Likewise for [:1234:].
1772
1773 The problem in trying to be exactly like Perl is in the handling of escapes. We
1774 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1775 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1776 below handles the special case of \], but does not try to do any other escape
1777 processing. This makes it different from Perl for cases such as [:l\ower:]
1778 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1779 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1780 I think.
1781
1782 Arguments:
1783 ptr pointer to the initial [
1784 endptr where to return the end pointer
1785
1786 Returns: TRUE or FALSE
1787 */
1788
1789 static BOOL
1790 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1791 {
1792 int terminator; /* Don't combine these lines; the Solaris cc */
1793 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1794 for (++ptr; *ptr != 0; ptr++)
1795 {
1796 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1797 {
1798 if (*ptr == ']') return FALSE;
1799 if (*ptr == terminator && ptr[1] == ']')
1800 {
1801 *endptr = ptr;
1802 return TRUE;
1803 }
1804 }
1805 }
1806 return FALSE;
1807 }
1808
1809
1810
1811
1812 /*************************************************
1813 * Check POSIX class name *
1814 *************************************************/
1815
1816 /* This function is called to check the name given in a POSIX-style class entry
1817 such as [:alnum:].
1818
1819 Arguments:
1820 ptr points to the first letter
1821 len the length of the name
1822
1823 Returns: a value representing the name, or -1 if unknown
1824 */
1825
1826 static int
1827 check_posix_name(const uschar *ptr, int len)
1828 {
1829 const char *pn = posix_names;
1830 register int yield = 0;
1831 while (posix_name_lengths[yield] != 0)
1832 {
1833 if (len == posix_name_lengths[yield] &&
1834 strncmp((const char *)ptr, pn, len) == 0) return yield;
1835 pn += posix_name_lengths[yield] + 1;
1836 yield++;
1837 }
1838 return -1;
1839 }
1840
1841
1842 /*************************************************
1843 * Adjust OP_RECURSE items in repeated group *
1844 *************************************************/
1845
1846 /* OP_RECURSE items contain an offset from the start of the regex to the group
1847 that is referenced. This means that groups can be replicated for fixed
1848 repetition simply by copying (because the recursion is allowed to refer to
1849 earlier groups that are outside the current group). However, when a group is
1850 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1851 it, after it has been compiled. This means that any OP_RECURSE items within it
1852 that refer to the group itself or any contained groups have to have their
1853 offsets adjusted. That one of the jobs of this function. Before it is called,
1854 the partially compiled regex must be temporarily terminated with OP_END.
1855
1856 This function has been extended with the possibility of forward references for
1857 recursions and subroutine calls. It must also check the list of such references
1858 for the group we are dealing with. If it finds that one of the recursions in
1859 the current group is on this list, it adjusts the offset in the list, not the
1860 value in the reference (which is a group number).
1861
1862 Arguments:
1863 group points to the start of the group
1864 adjust the amount by which the group is to be moved
1865 utf8 TRUE in UTF-8 mode
1866 cd contains pointers to tables etc.
1867 save_hwm the hwm forward reference pointer at the start of the group
1868
1869 Returns: nothing
1870 */
1871
1872 static void
1873 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1874 uschar *save_hwm)
1875 {
1876 uschar *ptr = group;
1877
1878 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1879 {
1880 int offset;
1881 uschar *hc;
1882
1883 /* See if this recursion is on the forward reference list. If so, adjust the
1884 reference. */
1885
1886 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1887 {
1888 offset = GET(hc, 0);
1889 if (cd->start_code + offset == ptr + 1)
1890 {
1891 PUT(hc, 0, offset + adjust);
1892 break;
1893 }
1894 }
1895
1896 /* Otherwise, adjust the recursion offset if it's after the start of this
1897 group. */
1898
1899 if (hc >= cd->hwm)
1900 {
1901 offset = GET(ptr, 1);
1902 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1903 }
1904
1905 ptr += 1 + LINK_SIZE;
1906 }
1907 }
1908
1909
1910
1911 /*************************************************
1912 * Insert an automatic callout point *
1913 *************************************************/
1914
1915 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1916 callout points before each pattern item.
1917
1918 Arguments:
1919 code current code pointer
1920 ptr current pattern pointer
1921 cd pointers to tables etc
1922
1923 Returns: new code pointer
1924 */
1925
1926 static uschar *
1927 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1928 {
1929 *code++ = OP_CALLOUT;
1930 *code++ = 255;
1931 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1932 PUT(code, LINK_SIZE, 0); /* Default length */
1933 return code + 2*LINK_SIZE;
1934 }
1935
1936
1937
1938 /*************************************************
1939 * Complete a callout item *
1940 *************************************************/
1941
1942 /* A callout item contains the length of the next item in the pattern, which
1943 we can't fill in till after we have reached the relevant point. This is used
1944 for both automatic and manual callouts.
1945
1946 Arguments:
1947 previous_callout points to previous callout item
1948 ptr current pattern pointer
1949 cd pointers to tables etc
1950
1951 Returns: nothing
1952 */
1953
1954 static void
1955 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1956 {
1957 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1958 PUT(previous_callout, 2 + LINK_SIZE, length);
1959 }
1960
1961
1962
1963 #ifdef SUPPORT_UCP
1964 /*************************************************
1965 * Get othercase range *
1966 *************************************************/
1967
1968 /* This function is passed the start and end of a class range, in UTF-8 mode
1969 with UCP support. It searches up the characters, looking for internal ranges of
1970 characters in the "other" case. Each call returns the next one, updating the
1971 start address.
1972
1973 Arguments:
1974 cptr points to starting character value; updated
1975 d end value
1976 ocptr where to put start of othercase range
1977 odptr where to put end of othercase range
1978
1979 Yield: TRUE when range returned; FALSE when no more
1980 */
1981
1982 static BOOL
1983 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1984 unsigned int *odptr)
1985 {
1986 unsigned int c, othercase, next;
1987
1988 for (c = *cptr; c <= d; c++)
1989 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1990
1991 if (c > d) return FALSE;
1992
1993 *ocptr = othercase;
1994 next = othercase + 1;
1995
1996 for (++c; c <= d; c++)
1997 {
1998 if (_pcre_ucp_othercase(c) != next) break;
1999 next++;
2000 }
2001
2002 *odptr = next - 1;
2003 *cptr = c;
2004
2005 return TRUE;
2006 }
2007 #endif /* SUPPORT_UCP */
2008
2009
2010
2011 /*************************************************
2012 * Check if auto-possessifying is possible *
2013 *************************************************/
2014
2015 /* This function is called for unlimited repeats of certain items, to see
2016 whether the next thing could possibly match the repeated item. If not, it makes
2017 sense to automatically possessify the repeated item.
2018
2019 Arguments:
2020 op_code the repeated op code
2021 this data for this item, depends on the opcode
2022 utf8 TRUE in UTF-8 mode
2023 utf8_char used for utf8 character bytes, NULL if not relevant
2024 ptr next character in pattern
2025 options options bits
2026 cd contains pointers to tables etc.
2027
2028 Returns: TRUE if possessifying is wanted
2029 */
2030
2031 static BOOL
2032 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2033 const uschar *ptr, int options, compile_data *cd)
2034 {
2035 int next;
2036
2037 /* Skip whitespace and comments in extended mode */
2038
2039 if ((options & PCRE_EXTENDED) != 0)
2040 {
2041 for (;;)
2042 {
2043 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2044 if (*ptr == '#')
2045 {
2046 while (*(++ptr) != 0)
2047 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2048 }
2049 else break;
2050 }
2051 }
2052
2053 /* If the next item is one that we can handle, get its value. A non-negative
2054 value is a character, a negative value is an escape value. */
2055
2056 if (*ptr == '\\')
2057 {
2058 int temperrorcode = 0;
2059 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2060 if (temperrorcode != 0) return FALSE;
2061 ptr++; /* Point after the escape sequence */
2062 }
2063
2064 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2065 {
2066 #ifdef SUPPORT_UTF8
2067 if (utf8) { GETCHARINC(next, ptr); } else
2068 #endif
2069 next = *ptr++;
2070 }
2071
2072 else return FALSE;
2073
2074 /* Skip whitespace and comments in extended mode */
2075
2076 if ((options & PCRE_EXTENDED) != 0)
2077 {
2078 for (;;)
2079 {
2080 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2081 if (*ptr == '#')
2082 {
2083 while (*(++ptr) != 0)
2084 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2085 }
2086 else break;
2087 }
2088 }
2089
2090 /* If the next thing is itself optional, we have to give up. */
2091
2092 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2093 return FALSE;
2094
2095 /* Now compare the next item with the previous opcode. If the previous is a
2096 positive single character match, "item" either contains the character or, if
2097 "item" is greater than 127 in utf8 mode, the character's bytes are in
2098 utf8_char. */
2099
2100
2101 /* Handle cases when the next item is a character. */
2102
2103 if (next >= 0) switch(op_code)
2104 {
2105 case OP_CHAR:
2106 #ifdef SUPPORT_UTF8
2107 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2108 #endif
2109 return item != next;
2110
2111 /* For CHARNC (caseless character) we must check the other case. If we have
2112 Unicode property support, we can use it to test the other case of
2113 high-valued characters. */
2114
2115 case OP_CHARNC:
2116 #ifdef SUPPORT_UTF8
2117 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2118 #endif
2119 if (item == next) return FALSE;
2120 #ifdef SUPPORT_UTF8
2121 if (utf8)
2122 {
2123 unsigned int othercase;
2124 if (next < 128) othercase = cd->fcc[next]; else
2125 #ifdef SUPPORT_UCP
2126 othercase = _pcre_ucp_othercase((unsigned int)next);
2127 #else
2128 othercase = NOTACHAR;
2129 #endif
2130 return (unsigned int)item != othercase;
2131 }
2132 else
2133 #endif /* SUPPORT_UTF8 */
2134 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2135
2136 /* For OP_NOT, "item" must be a single-byte character. */
2137
2138 case OP_NOT:
2139 if (item == next) return TRUE;
2140 if ((options & PCRE_CASELESS) == 0) return FALSE;
2141 #ifdef SUPPORT_UTF8
2142 if (utf8)
2143 {
2144 unsigned int othercase;
2145 if (next < 128) othercase = cd->fcc[next]; else
2146 #ifdef SUPPORT_UCP
2147 othercase = _pcre_ucp_othercase(next);
2148 #else
2149 othercase = NOTACHAR;
2150 #endif
2151 return (unsigned int)item == othercase;
2152 }
2153 else
2154 #endif /* SUPPORT_UTF8 */
2155 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2156
2157 case OP_DIGIT:
2158 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2159
2160 case OP_NOT_DIGIT:
2161 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2162
2163 case OP_WHITESPACE:
2164 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2165
2166 case OP_NOT_WHITESPACE:
2167 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2168
2169 case OP_WORDCHAR:
2170 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2171
2172 case OP_NOT_WORDCHAR:
2173 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2174
2175 case OP_HSPACE:
2176 case OP_NOT_HSPACE:
2177 switch(next)
2178 {
2179 case 0x09:
2180 case 0x20:
2181 case 0xa0:
2182 case 0x1680:
2183 case 0x180e:
2184 case 0x2000:
2185 case 0x2001:
2186 case 0x2002:
2187 case 0x2003:
2188 case 0x2004:
2189 case 0x2005:
2190 case 0x2006:
2191 case 0x2007:
2192 case 0x2008:
2193 case 0x2009:
2194 case 0x200A:
2195 case 0x202f:
2196 case 0x205f:
2197 case 0x3000:
2198 return op_code != OP_HSPACE;
2199 default:
2200 return op_code == OP_HSPACE;
2201 }
2202
2203 case OP_VSPACE:
2204 case OP_NOT_VSPACE:
2205 switch(next)
2206 {
2207 case 0x0a:
2208 case 0x0b:
2209 case 0x0c:
2210 case 0x0d:
2211 case 0x85:
2212 case 0x2028:
2213 case 0x2029:
2214 return op_code != OP_VSPACE;
2215 default:
2216 return op_code == OP_VSPACE;
2217 }
2218
2219 default:
2220 return FALSE;
2221 }
2222
2223
2224 /* Handle the case when the next item is \d, \s, etc. */
2225
2226 switch(op_code)
2227 {
2228 case OP_CHAR:
2229 case OP_CHARNC:
2230 #ifdef SUPPORT_UTF8
2231 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2232 #endif
2233 switch(-next)
2234 {
2235 case ESC_d:
2236 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2237
2238 case ESC_D:
2239 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2240
2241 case ESC_s:
2242 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2243
2244 case ESC_S:
2245 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2246
2247 case ESC_w:
2248 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2249
2250 case ESC_W:
2251 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2252
2253 case ESC_h:
2254 case ESC_H:
2255 switch(item)
2256 {
2257 case 0x09:
2258 case 0x20:
2259 case 0xa0:
2260 case 0x1680:
2261 case 0x180e:
2262 case 0x2000:
2263 case 0x2001:
2264 case 0x2002:
2265 case 0x2003:
2266 case 0x2004:
2267 case 0x2005:
2268 case 0x2006:
2269 case 0x2007:
2270 case 0x2008:
2271 case 0x2009:
2272 case 0x200A:
2273 case 0x202f:
2274 case 0x205f:
2275 case 0x3000:
2276 return -next != ESC_h;
2277 default:
2278 return -next == ESC_h;
2279 }
2280
2281 case ESC_v:
2282 case ESC_V:
2283 switch(item)
2284 {
2285 case 0x0a:
2286 case 0x0b:
2287 case 0x0c:
2288 case 0x0d:
2289 case 0x85:
2290 case 0x2028:
2291 case 0x2029:
2292 return -next != ESC_v;
2293 default:
2294 return -next == ESC_v;
2295 }
2296
2297 default:
2298 return FALSE;
2299 }
2300
2301 case OP_DIGIT:
2302 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2303 next == -ESC_h || next == -ESC_v;
2304
2305 case OP_NOT_DIGIT:
2306 return next == -ESC_d;
2307
2308 case OP_WHITESPACE:
2309 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2310
2311 case OP_NOT_WHITESPACE:
2312 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2313
2314 case OP_HSPACE:
2315 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2316
2317 case OP_NOT_HSPACE:
2318 return next == -ESC_h;
2319
2320 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2321 case OP_VSPACE:
2322 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2323
2324 case OP_NOT_VSPACE:
2325 return next == -ESC_v;
2326
2327 case OP_WORDCHAR:
2328 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2329
2330 case OP_NOT_WORDCHAR:
2331 return next == -ESC_w || next == -ESC_d;
2332
2333 default:
2334 return FALSE;
2335 }
2336
2337 /* Control does not reach here */
2338 }
2339
2340
2341
2342 /*************************************************
2343 * Compile one branch *
2344 *************************************************/
2345
2346 /* Scan the pattern, compiling it into a vector. If the options are
2347 changed during the branch, the pointer is used to change the external options
2348 bits. This function is used during the pre-compile phase when we are trying
2349 to find out the amount of memory needed, as well as during the real compile
2350 phase. The value of lengthptr distinguishes the two phases.
2351
2352 Arguments:
2353 optionsptr pointer to the option bits
2354 codeptr points to the pointer to the current code point
2355 ptrptr points to the current pattern pointer
2356 errorcodeptr points to error code variable
2357 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2358 reqbyteptr set to the last literal character required, else < 0
2359 bcptr points to current branch chain
2360 cd contains pointers to tables etc.
2361 lengthptr NULL during the real compile phase
2362 points to length accumulator during pre-compile phase
2363
2364 Returns: TRUE on success
2365 FALSE, with *errorcodeptr set non-zero on error
2366 */
2367
2368 static BOOL
2369 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2370 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2371 compile_data *cd, int *lengthptr)
2372 {
2373 int repeat_type, op_type;
2374 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2375 int bravalue = 0;
2376 int greedy_default, greedy_non_default;
2377 int firstbyte, reqbyte;
2378 int zeroreqbyte, zerofirstbyte;
2379 int req_caseopt, reqvary, tempreqvary;
2380 int options = *optionsptr;
2381 int after_manual_callout = 0;
2382 int length_prevgroup = 0;
2383 register int c;
2384 register uschar *code = *codeptr;
2385 uschar *last_code = code;
2386 uschar *orig_code = code;
2387 uschar *tempcode;
2388 BOOL inescq = FALSE;
2389 BOOL groupsetfirstbyte = FALSE;
2390 const uschar *ptr = *ptrptr;
2391 const uschar *tempptr;
2392 uschar *previous = NULL;
2393 uschar *previous_callout = NULL;
2394 uschar *save_hwm = NULL;
2395 uschar classbits[32];
2396
2397 #ifdef SUPPORT_UTF8
2398 BOOL class_utf8;
2399 BOOL utf8 = (options & PCRE_UTF8) != 0;
2400 uschar *class_utf8data;
2401 uschar *class_utf8data_base;
2402 uschar utf8_char[6];
2403 #else
2404 BOOL utf8 = FALSE;
2405 uschar *utf8_char = NULL;
2406 #endif
2407
2408 #ifdef DEBUG
2409 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2410 #endif
2411
2412 /* Set up the default and non-default settings for greediness */
2413
2414 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2415 greedy_non_default = greedy_default ^ 1;
2416
2417 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2418 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2419 matches a non-fixed char first char; reqbyte just remains unset if we never
2420 find one.
2421
2422 When we hit a repeat whose minimum is zero, we may have to adjust these values
2423 to take the zero repeat into account. This is implemented by setting them to
2424 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2425 item types that can be repeated set these backoff variables appropriately. */
2426
2427 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2428
2429 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2430 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2431 value > 255. It is added into the firstbyte or reqbyte variables to record the
2432 case status of the value. This is used only for ASCII characters. */
2433
2434 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2435
2436 /* Switch on next character until the end of the branch */
2437
2438 for (;; ptr++)
2439 {
2440 BOOL negate_class;
2441 BOOL should_flip_negation;
2442 BOOL possessive_quantifier;
2443 BOOL is_quantifier;
2444 BOOL is_recurse;
2445 BOOL reset_bracount;
2446 int class_charcount;
2447 int class_lastchar;
2448 int newoptions;
2449 int recno;
2450 int refsign;
2451 int skipbytes;
2452 int subreqbyte;
2453 int subfirstbyte;
2454 int terminator;
2455 int mclength;
2456 uschar mcbuffer[8];
2457
2458 /* Get next byte in the pattern */
2459
2460 c = *ptr;
2461
2462 /* If we are in the pre-compile phase, accumulate the length used for the
2463 previous cycle of this loop. */
2464
2465 if (lengthptr != NULL)
2466 {
2467 #ifdef DEBUG
2468 if (code > cd->hwm) cd->hwm = code; /* High water info */
2469 #endif
2470 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2471 {
2472 *errorcodeptr = ERR52;
2473 goto FAILED;
2474 }
2475
2476 /* There is at least one situation where code goes backwards: this is the
2477 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2478 the class is simply eliminated. However, it is created first, so we have to
2479 allow memory for it. Therefore, don't ever reduce the length at this point.
2480 */
2481
2482 if (code < last_code) code = last_code;
2483
2484 /* Paranoid check for integer overflow */
2485
2486 if (OFLOW_MAX - *lengthptr < code - last_code)
2487 {
2488 *errorcodeptr = ERR20;
2489 goto FAILED;
2490 }
2491
2492 *lengthptr += code - last_code;
2493 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2494
2495 /* If "previous" is set and it is not at the start of the work space, move
2496 it back to there, in order to avoid filling up the work space. Otherwise,
2497 if "previous" is NULL, reset the current code pointer to the start. */
2498
2499 if (previous != NULL)
2500 {
2501 if (previous > orig_code)
2502 {
2503 memmove(orig_code, previous, code - previous);
2504 code -= previous - orig_code;
2505 previous = orig_code;
2506 }
2507 }
2508 else code = orig_code;
2509
2510 /* Remember where this code item starts so we can pick up the length
2511 next time round. */
2512
2513 last_code = code;
2514 }
2515
2516 /* In the real compile phase, just check the workspace used by the forward
2517 reference list. */
2518
2519 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2520 {
2521 *errorcodeptr = ERR52;
2522 goto FAILED;
2523 }
2524
2525 /* If in \Q...\E, check for the end; if not, we have a literal */
2526
2527 if (inescq && c != 0)
2528 {
2529 if (c == '\\' && ptr[1] == 'E')
2530 {
2531 inescq = FALSE;
2532 ptr++;
2533 continue;
2534 }
2535 else
2536 {
2537 if (previous_callout != NULL)
2538 {
2539 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2540 complete_callout(previous_callout, ptr, cd);
2541 previous_callout = NULL;
2542 }
2543 if ((options & PCRE_AUTO_CALLOUT) != 0)
2544 {
2545 previous_callout = code;
2546 code = auto_callout(code, ptr, cd);
2547 }
2548 goto NORMAL_CHAR;
2549 }
2550 }
2551
2552 /* Fill in length of a previous callout, except when the next thing is
2553 a quantifier. */
2554
2555 is_quantifier = c == '*' || c == '+' || c == '?' ||
2556 (c == '{' && is_counted_repeat(ptr+1));
2557
2558 if (!is_quantifier && previous_callout != NULL &&
2559 after_manual_callout-- <= 0)
2560 {
2561 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2562 complete_callout(previous_callout, ptr, cd);
2563 previous_callout = NULL;
2564 }
2565
2566 /* In extended mode, skip white space and comments */
2567
2568 if ((options & PCRE_EXTENDED) != 0)
2569 {
2570 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2571 if (c == '#')
2572 {
2573 while (*(++ptr) != 0)
2574 {
2575 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2576 }
2577 if (*ptr != 0) continue;
2578
2579 /* Else fall through to handle end of string */
2580 c = 0;
2581 }
2582 }
2583
2584 /* No auto callout for quantifiers. */
2585
2586 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2587 {
2588 previous_callout = code;
2589 code = auto_callout(code, ptr, cd);
2590 }
2591
2592 switch(c)
2593 {
2594 /* ===================================================================*/
2595 case 0: /* The branch terminates at string end */
2596 case '|': /* or | or ) */
2597 case ')':
2598 *firstbyteptr = firstbyte;
2599 *reqbyteptr = reqbyte;
2600 *codeptr = code;
2601 *ptrptr = ptr;
2602 if (lengthptr != NULL)
2603 {
2604 if (OFLOW_MAX - *lengthptr < code - last_code)
2605 {
2606 *errorcodeptr = ERR20;
2607 goto FAILED;
2608 }
2609 *lengthptr += code - last_code; /* To include callout length */
2610 DPRINTF((">> end branch\n"));
2611 }
2612 return TRUE;
2613
2614
2615 /* ===================================================================*/
2616 /* Handle single-character metacharacters. In multiline mode, ^ disables
2617 the setting of any following char as a first character. */
2618
2619 case '^':
2620 if ((options & PCRE_MULTILINE) != 0)
2621 {
2622 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2623 }
2624 previous = NULL;
2625 *code++ = OP_CIRC;
2626 break;
2627
2628 case '$':
2629 previous = NULL;
2630 *code++ = OP_DOLL;
2631 break;
2632
2633 /* There can never be a first char if '.' is first, whatever happens about
2634 repeats. The value of reqbyte doesn't change either. */
2635
2636 case '.':
2637 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2638 zerofirstbyte = firstbyte;
2639 zeroreqbyte = reqbyte;
2640 previous = code;
2641 *code++ = OP_ANY;
2642 break;
2643
2644
2645 /* ===================================================================*/
2646 /* Character classes. If the included characters are all < 256, we build a
2647 32-byte bitmap of the permitted characters, except in the special case
2648 where there is only one such character. For negated classes, we build the
2649 map as usual, then invert it at the end. However, we use a different opcode
2650 so that data characters > 255 can be handled correctly.
2651
2652 If the class contains characters outside the 0-255 range, a different
2653 opcode is compiled. It may optionally have a bit map for characters < 256,
2654 but those above are explicitly listed afterwards. A flag byte tells
2655 whether the bitmap is present, and whether this is a negated class or not.
2656 */
2657
2658 case '[':
2659 previous = code;
2660
2661 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2662 they are encountered at the top level, so we'll do that too. */
2663
2664 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2665 check_posix_syntax(ptr, &tempptr))
2666 {
2667 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2668 goto FAILED;
2669 }
2670
2671 /* If the first character is '^', set the negation flag and skip it. Also,
2672 if the first few characters (either before or after ^) are \Q\E or \E we
2673 skip them too. This makes for compatibility with Perl. */
2674
2675 negate_class = FALSE;
2676 for (;;)
2677 {
2678 c = *(++ptr);
2679 if (c == '\\')
2680 {
2681 if (ptr[1] == 'E') ptr++;
2682 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2683 else break;
2684 }
2685 else if (!negate_class && c == '^')
2686 negate_class = TRUE;
2687 else break;
2688 }
2689
2690 /* If a class contains a negative special such as \S, we need to flip the
2691 negation flag at the end, so that support for characters > 255 works
2692 correctly (they are all included in the class). */
2693
2694 should_flip_negation = FALSE;
2695
2696 /* Keep a count of chars with values < 256 so that we can optimize the case
2697 of just a single character (as long as it's < 256). However, For higher
2698 valued UTF-8 characters, we don't yet do any optimization. */
2699
2700 class_charcount = 0;
2701 class_lastchar = -1;
2702
2703 /* Initialize the 32-char bit map to all zeros. We build the map in a
2704 temporary bit of memory, in case the class contains only 1 character (less
2705 than 256), because in that case the compiled code doesn't use the bit map.
2706 */
2707
2708 memset(classbits, 0, 32 * sizeof(uschar));
2709
2710 #ifdef SUPPORT_UTF8
2711 class_utf8 = FALSE; /* No chars >= 256 */
2712 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2713 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2714 #endif
2715
2716 /* Process characters until ] is reached. By writing this as a "do" it
2717 means that an initial ] is taken as a data character. At the start of the
2718 loop, c contains the first byte of the character. */
2719
2720 if (c != 0) do
2721 {
2722 const uschar *oldptr;
2723
2724 #ifdef SUPPORT_UTF8
2725 if (utf8 && c > 127)
2726 { /* Braces are required because the */
2727 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2728 }
2729
2730 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2731 data and reset the pointer. This is so that very large classes that
2732 contain a zillion UTF-8 characters no longer overwrite the work space
2733 (which is on the stack). */
2734
2735 if (lengthptr != NULL)
2736 {
2737 *lengthptr += class_utf8data - class_utf8data_base;
2738 class_utf8data = class_utf8data_base;
2739 }
2740
2741 #endif
2742
2743 /* Inside \Q...\E everything is literal except \E */
2744
2745 if (inescq)
2746 {
2747 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2748 {
2749 inescq = FALSE; /* Reset literal state */
2750 ptr++; /* Skip the 'E' */
2751 continue; /* Carry on with next */
2752 }
2753 goto CHECK_RANGE; /* Could be range if \E follows */
2754 }
2755
2756 /* Handle POSIX class names. Perl allows a negation extension of the
2757 form [:^name:]. A square bracket that doesn't match the syntax is
2758 treated as a literal. We also recognize the POSIX constructions
2759 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2760 5.6 and 5.8 do. */
2761
2762 if (c == '[' &&
2763 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2764 check_posix_syntax(ptr, &tempptr))
2765 {
2766 BOOL local_negate = FALSE;
2767 int posix_class, taboffset, tabopt;
2768 register const uschar *cbits = cd->cbits;
2769 uschar pbits[32];
2770
2771 if (ptr[1] != ':')
2772 {
2773 *errorcodeptr = ERR31;
2774 goto FAILED;
2775 }
2776
2777 ptr += 2;
2778 if (*ptr == '^')
2779 {
2780 local_negate = TRUE;
2781 should_flip_negation = TRUE; /* Note negative special */
2782 ptr++;
2783 }
2784
2785 posix_class = check_posix_name(ptr, tempptr - ptr);
2786 if (posix_class < 0)
2787 {
2788 *errorcodeptr = ERR30;
2789 goto FAILED;
2790 }
2791
2792 /* If matching is caseless, upper and lower are converted to
2793 alpha. This relies on the fact that the class table starts with
2794 alpha, lower, upper as the first 3 entries. */
2795
2796 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2797 posix_class = 0;
2798
2799 /* We build the bit map for the POSIX class in a chunk of local store
2800 because we may be adding and subtracting from it, and we don't want to
2801 subtract bits that may be in the main map already. At the end we or the
2802 result into the bit map that is being built. */
2803
2804 posix_class *= 3;
2805
2806 /* Copy in the first table (always present) */
2807
2808 memcpy(pbits, cbits + posix_class_maps[posix_class],
2809 32 * sizeof(uschar));
2810
2811 /* If there is a second table, add or remove it as required. */
2812
2813 taboffset = posix_class_maps[posix_class + 1];
2814 tabopt = posix_class_maps[posix_class + 2];
2815
2816 if (taboffset >= 0)
2817 {
2818 if (tabopt >= 0)
2819 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2820 else
2821 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2822 }
2823
2824 /* Now see if we need to remove any special characters. An option
2825 value of 1 removes vertical space and 2 removes underscore. */
2826
2827 if (tabopt < 0) tabopt = -tabopt;
2828 if (tabopt == 1) pbits[1] &= ~0x3c;
2829 else if (tabopt == 2) pbits[11] &= 0x7f;
2830
2831 /* Add the POSIX table or its complement into the main table that is
2832 being built and we are done. */
2833
2834 if (local_negate)
2835 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2836 else
2837 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2838
2839 ptr = tempptr + 1;
2840 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2841 continue; /* End of POSIX syntax handling */
2842 }
2843
2844 /* Backslash may introduce a single character, or it may introduce one
2845 of the specials, which just set a flag. The sequence \b is a special
2846 case. Inside a class (and only there) it is treated as backspace.
2847 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2848 to 'or' into the one we are building. We assume they have more than one
2849 character in them, so set class_charcount bigger than one. */
2850
2851 if (c == '\\')
2852 {
2853 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2854 if (*errorcodeptr != 0) goto FAILED;
2855
2856 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2857 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2858 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2859 else if (-c == ESC_Q) /* Handle start of quoted string */
2860 {
2861 if (ptr[1] == '\\' && ptr[2] == 'E')
2862 {
2863 ptr += 2; /* avoid empty string */
2864 }
2865 else inescq = TRUE;
2866 continue;
2867 }
2868 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2869
2870 if (c < 0)
2871 {
2872 register const uschar *cbits = cd->cbits;
2873 class_charcount += 2; /* Greater than 1 is what matters */
2874
2875 /* Save time by not doing this in the pre-compile phase. */
2876
2877 if (lengthptr == NULL) switch (-c)
2878 {
2879 case ESC_d:
2880 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2881 continue;
2882
2883 case ESC_D:
2884 should_flip_negation = TRUE;
2885 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2886 continue;
2887
2888 case ESC_w:
2889 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2890 continue;
2891
2892 case ESC_W:
2893 should_flip_negation = TRUE;
2894 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2895 continue;
2896
2897 case ESC_s:
2898 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2899 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2900 continue;
2901
2902 case ESC_S:
2903 should_flip_negation = TRUE;
2904 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2905 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2906 continue;
2907
2908 default: /* Not recognized; fall through */
2909 break; /* Need "default" setting to stop compiler warning. */
2910 }
2911
2912 /* In the pre-compile phase, just do the recognition. */
2913
2914 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2915 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2916
2917 /* We need to deal with \H, \h, \V, and \v in both phases because
2918 they use extra memory. */
2919
2920 if (-c == ESC_h)
2921 {
2922 SETBIT(classbits, 0x09); /* VT */
2923 SETBIT(classbits, 0x20); /* SPACE */
2924 SETBIT(classbits, 0xa0); /* NSBP */
2925 #ifdef SUPPORT_UTF8
2926 if (utf8)
2927 {
2928 class_utf8 = TRUE;
2929 *class_utf8data++ = XCL_SINGLE;
2930 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2931 *class_utf8data++ = XCL_SINGLE;
2932 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2933 *class_utf8data++ = XCL_RANGE;
2934 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2935 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2936 *class_utf8data++ = XCL_SINGLE;
2937 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2938 *class_utf8data++ = XCL_SINGLE;
2939 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2940 *class_utf8data++ = XCL_SINGLE;
2941 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2942 }
2943 #endif
2944 continue;
2945 }
2946
2947 if (-c == ESC_H)
2948 {
2949 for (c = 0; c < 32; c++)
2950 {
2951 int x = 0xff;
2952 switch (c)
2953 {
2954 case 0x09/8: x ^= 1 << (0x09%8); break;
2955 case 0x20/8: x ^= 1 << (0x20%8); break;
2956 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2957 default: break;
2958 }
2959 classbits[c] |= x;
2960 }
2961
2962 #ifdef SUPPORT_UTF8
2963 if (utf8)
2964 {
2965 class_utf8 = TRUE;
2966 *class_utf8data++ = XCL_RANGE;
2967 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2968 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2969 *class_utf8data++ = XCL_RANGE;
2970 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2971 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2972 *class_utf8data++ = XCL_RANGE;
2973 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2974 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2975 *class_utf8data++ = XCL_RANGE;
2976 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2977 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2978 *class_utf8data++ = XCL_RANGE;
2979 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2980 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2981 *class_utf8data++ = XCL_RANGE;
2982 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2983 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2984 *class_utf8data++ = XCL_RANGE;
2985 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2986 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2987 }
2988 #endif
2989 continue;
2990 }
2991
2992 if (-c == ESC_v)
2993 {
2994 SETBIT(classbits, 0x0a); /* LF */
2995 SETBIT(classbits, 0x0b); /* VT */
2996 SETBIT(classbits, 0x0c); /* FF */
2997 SETBIT(classbits, 0x0d); /* CR */
2998 SETBIT(classbits, 0x85); /* NEL */
2999 #ifdef SUPPORT_UTF8
3000 if (utf8)
3001 {
3002 class_utf8 = TRUE;
3003 *class_utf8data++ = XCL_RANGE;
3004 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3005 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3006 }
3007 #endif
3008 continue;
3009 }
3010
3011 if (-c == ESC_V)
3012 {
3013 for (c = 0; c < 32; c++)
3014 {
3015 int x = 0xff;
3016 switch (c)
3017 {
3018 case 0x0a/8: x ^= 1 << (0x0a%8);
3019 x ^= 1 << (0x0b%8);
3020 x ^= 1 << (0x0c%8);
3021 x ^= 1 << (0x0d%8);
3022 break;
3023 case 0x85/8: x ^= 1 << (0x85%8); break;
3024 default: break;
3025 }
3026 classbits[c] |= x;
3027 }
3028
3029 #ifdef SUPPORT_UTF8
3030 if (utf8)
3031 {
3032 class_utf8 = TRUE;
3033 *class_utf8data++ = XCL_RANGE;
3034 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3035 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3036 *class_utf8data++ = XCL_RANGE;
3037 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3038 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3039 }
3040 #endif
3041 continue;
3042 }
3043
3044 /* We need to deal with \P and \p in both phases. */
3045
3046 #ifdef SUPPORT_UCP
3047 if (-c == ESC_p || -c == ESC_P)
3048 {
3049 BOOL negated;
3050 int pdata;
3051 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3052 if (ptype < 0) goto FAILED;
3053 class_utf8 = TRUE;
3054 *class_utf8data++ = ((-c == ESC_p) != negated)?
3055 XCL_PROP : XCL_NOTPROP;
3056 *class_utf8data++ = ptype;
3057 *class_utf8data++ = pdata;
3058 class_charcount -= 2; /* Not a < 256 character */
3059 continue;
3060 }
3061 #endif
3062 /* Unrecognized escapes are faulted if PCRE is running in its
3063 strict mode. By default, for compatibility with Perl, they are
3064 treated as literals. */
3065
3066 if ((options & PCRE_EXTRA) != 0)
3067 {
3068 *errorcodeptr = ERR7;
3069 goto FAILED;
3070 }
3071
3072 class_charcount -= 2; /* Undo the default count from above */
3073 c = *ptr; /* Get the final character and fall through */
3074 }
3075
3076 /* Fall through if we have a single character (c >= 0). This may be
3077 greater than 256 in UTF-8 mode. */
3078
3079 } /* End of backslash handling */
3080
3081 /* A single character may be followed by '-' to form a range. However,
3082 Perl does not permit ']' to be the end of the range. A '-' character
3083 at the end is treated as a literal. Perl ignores orphaned \E sequences
3084 entirely. The code for handling \Q and \E is messy. */
3085
3086 CHECK_RANGE:
3087 while (ptr[1] == '\\' && ptr[2] == 'E')
3088 {
3089 inescq = FALSE;
3090 ptr += 2;
3091 }
3092
3093 oldptr = ptr;
3094
3095 /* Remember \r or \n */
3096
3097 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3098
3099 /* Check for range */
3100
3101 if (!inescq && ptr[1] == '-')
3102 {
3103 int d;
3104 ptr += 2;
3105 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3106
3107 /* If we hit \Q (not followed by \E) at this point, go into escaped
3108 mode. */
3109
3110 while (*ptr == '\\' && ptr[1] == 'Q')
3111 {
3112 ptr += 2;
3113 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3114 inescq = TRUE;
3115 break;
3116 }
3117
3118 if (*ptr == 0 || (!inescq && *ptr == ']'))
3119 {
3120 ptr = oldptr;
3121 goto LONE_SINGLE_CHARACTER;
3122 }
3123
3124 #ifdef SUPPORT_UTF8
3125 if (utf8)
3126 { /* Braces are required because the */
3127 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3128 }
3129 else
3130 #endif
3131 d = *ptr; /* Not UTF-8 mode */
3132
3133 /* The second part of a range can be a single-character escape, but
3134 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3135 in such circumstances. */
3136
3137 if (!inescq && d == '\\')
3138 {
3139 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3140 if (*errorcodeptr != 0) goto FAILED;
3141
3142 /* \b is backspace; \X is literal X; \R is literal R; any other
3143 special means the '-' was literal */
3144
3145 if (d < 0)
3146 {
3147 if (d == -ESC_b) d = '\b';
3148 else if (d == -ESC_X) d = 'X';
3149 else if (d == -ESC_R) d = 'R'; else
3150 {
3151 ptr = oldptr;
3152 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3153 }
3154 }
3155 }
3156
3157 /* Check that the two values are in the correct order. Optimize
3158 one-character ranges */
3159
3160 if (d < c)
3161 {
3162 *errorcodeptr = ERR8;
3163 goto FAILED;
3164 }
3165
3166 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3167
3168 /* Remember \r or \n */
3169
3170 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3171
3172 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3173 matching, we have to use an XCLASS with extra data items. Caseless
3174 matching for characters > 127 is available only if UCP support is
3175 available. */
3176
3177 #ifdef SUPPORT_UTF8
3178 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3179 {
3180 class_utf8 = TRUE;
3181
3182 /* With UCP support, we can find the other case equivalents of
3183 the relevant characters. There may be several ranges. Optimize how
3184 they fit with the basic range. */
3185
3186 #ifdef SUPPORT_UCP
3187 if ((options & PCRE_CASELESS) != 0)
3188 {
3189 unsigned int occ, ocd;
3190 unsigned int cc = c;
3191 unsigned int origd = d;
3192 while (get_othercase_range(&cc, origd, &occ, &ocd))
3193 {
3194 if (occ >= (unsigned int)c &&
3195 ocd <= (unsigned int)d)
3196 continue; /* Skip embedded ranges */
3197
3198 if (occ < (unsigned int)c &&
3199 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3200 { /* if there is overlap, */
3201 c = occ; /* noting that if occ < c */
3202 continue; /* we can't have ocd > d */
3203 } /* because a subrange is */
3204 if (ocd > (unsigned int)d &&
3205 occ <= (unsigned int)d + 1) /* always shorter than */
3206 { /* the basic range. */
3207 d = ocd;
3208 continue;
3209 }
3210
3211 if (occ == ocd)
3212 {
3213 *class_utf8data++ = XCL_SINGLE;
3214 }
3215 else
3216 {
3217 *class_utf8data++ = XCL_RANGE;
3218 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3219 }
3220 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3221 }
3222 }
3223 #endif /* SUPPORT_UCP */
3224
3225 /* Now record the original range, possibly modified for UCP caseless
3226 overlapping ranges. */
3227
3228 *class_utf8data++ = XCL_RANGE;
3229 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3230 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3231
3232 /* With UCP support, we are done. Without UCP support, there is no
3233 caseless matching for UTF-8 characters > 127; we can use the bit map
3234 for the smaller ones. */
3235
3236 #ifdef SUPPORT_UCP
3237 continue; /* With next character in the class */
3238 #else
3239 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3240
3241 /* Adjust upper limit and fall through to set up the map */
3242
3243 d = 127;
3244
3245 #endif /* SUPPORT_UCP */
3246 }
3247 #endif /* SUPPORT_UTF8 */
3248
3249 /* We use the bit map for all cases when not in UTF-8 mode; else
3250 ranges that lie entirely within 0-127 when there is UCP support; else
3251 for partial ranges without UCP support. */
3252
3253 class_charcount += d - c + 1;
3254 class_lastchar = d;
3255
3256 /* We can save a bit of time by skipping this in the pre-compile. */
3257
3258 if (lengthptr == NULL) for (; c <= d; c++)
3259 {
3260 classbits[c/8] |= (1 << (c&7));
3261 if ((options & PCRE_CASELESS) != 0)
3262 {
3263 int uc = cd->fcc[c]; /* flip case */
3264 classbits[uc/8] |= (1 << (uc&7));
3265 }
3266 }
3267
3268 continue; /* Go get the next char in the class */
3269 }
3270
3271 /* Handle a lone single character - we can get here for a normal
3272 non-escape char, or after \ that introduces a single character or for an
3273 apparent range that isn't. */
3274
3275 LONE_SINGLE_CHARACTER:
3276
3277 /* Handle a character that cannot go in the bit map */
3278
3279 #ifdef SUPPORT_UTF8
3280 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3281 {
3282 class_utf8 = TRUE;
3283 *class_utf8data++ = XCL_SINGLE;
3284 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3285
3286 #ifdef SUPPORT_UCP
3287 if ((options & PCRE_CASELESS) != 0)
3288 {
3289 unsigned int othercase;
3290 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3291 {
3292 *class_utf8data++ = XCL_SINGLE;
3293 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3294 }
3295 }
3296 #endif /* SUPPORT_UCP */
3297
3298 }
3299 else
3300 #endif /* SUPPORT_UTF8 */
3301
3302 /* Handle a single-byte character */
3303 {
3304 classbits[c/8] |= (1 << (c&7));
3305 if ((options & PCRE_CASELESS) != 0)
3306 {
3307 c = cd->fcc[c]; /* flip case */
3308 classbits[c/8] |= (1 << (c&7));
3309 }
3310 class_charcount++;
3311 class_lastchar = c;
3312 }
3313 }
3314
3315 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3316
3317 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3318
3319 if (c == 0) /* Missing terminating ']' */
3320 {
3321 *errorcodeptr = ERR6;
3322 goto FAILED;
3323 }
3324
3325
3326 /* This code has been disabled because it would mean that \s counts as
3327 an explicit \r or \n reference, and that's not really what is wanted. Now
3328 we set the flag only if there is a literal "\r" or "\n" in the class. */
3329
3330 #if 0
3331 /* Remember whether \r or \n are in this class */
3332
3333 if (negate_class)
3334 {
3335 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3336 }
3337 else
3338 {
3339 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3340 }
3341 #endif
3342
3343
3344 /* If class_charcount is 1, we saw precisely one character whose value is
3345 less than 256. As long as there were no characters >= 128 and there was no
3346 use of \p or \P, in other words, no use of any XCLASS features, we can
3347 optimize.
3348
3349 In UTF-8 mode, we can optimize the negative case only if there were no
3350 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3351 operate on single-bytes only. This is an historical hangover. Maybe one day
3352 we can tidy these opcodes to handle multi-byte characters.
3353
3354 The optimization throws away the bit map. We turn the item into a
3355 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3356 that OP_NOT does not support multibyte characters. In the positive case, it
3357 can cause firstbyte to be set. Otherwise, there can be no first char if
3358 this item is first, whatever repeat count may follow. In the case of
3359 reqbyte, save the previous value for reinstating. */
3360
3361 #ifdef SUPPORT_UTF8
3362 if (class_charcount == 1 && !class_utf8 &&
3363 (!utf8 || !negate_class || class_lastchar < 128))
3364 #else
3365 if (class_charcount == 1)
3366 #endif
3367 {
3368 zeroreqbyte = reqbyte;
3369
3370 /* The OP_NOT opcode works on one-byte characters only. */
3371
3372 if (negate_class)
3373 {
3374 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3375 zerofirstbyte = firstbyte;
3376 *code++ = OP_NOT;
3377 *code++ = class_lastchar;
3378 break;
3379 }
3380
3381 /* For a single, positive character, get the value into mcbuffer, and
3382 then we can handle this with the normal one-character code. */
3383
3384 #ifdef SUPPORT_UTF8
3385 if (utf8 && class_lastchar > 127)
3386 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3387 else
3388 #endif
3389 {
3390 mcbuffer[0] = class_lastchar;
3391 mclength = 1;
3392 }
3393 goto ONE_CHAR;
3394 } /* End of 1-char optimization */
3395
3396 /* The general case - not the one-char optimization. If this is the first
3397 thing in the branch, there can be no first char setting, whatever the
3398 repeat count. Any reqbyte setting must remain unchanged after any kind of
3399 repeat. */
3400
3401 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3402 zerofirstbyte = firstbyte;
3403 zeroreqbyte = reqbyte;
3404
3405 /* If there are characters with values > 255, we have to compile an
3406 extended class, with its own opcode, unless there was a negated special
3407 such as \S in the class, because in that case all characters > 255 are in
3408 the class, so any that were explicitly given as well can be ignored. If
3409 (when there are explicit characters > 255 that must be listed) there are no
3410 characters < 256, we can omit the bitmap in the actual compiled code. */
3411
3412 #ifdef SUPPORT_UTF8
3413 if (class_utf8 && !should_flip_negation)
3414 {
3415 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3416 *code++ = OP_XCLASS;
3417 code += LINK_SIZE;
3418 *code = negate_class? XCL_NOT : 0;
3419
3420 /* If the map is required, move up the extra data to make room for it;
3421 otherwise just move the code pointer to the end of the extra data. */
3422
3423 if (class_charcount > 0)
3424 {
3425 *code++ |= XCL_MAP;
3426 memmove(code + 32, code, class_utf8data - code);
3427 memcpy(code, classbits, 32);
3428 code = class_utf8data + 32;
3429 }
3430 else code = class_utf8data;
3431
3432 /* Now fill in the complete length of the item */
3433
3434 PUT(previous, 1, code - previous);
3435 break; /* End of class handling */
3436 }
3437 #endif
3438
3439 /* If there are no characters > 255, set the opcode to OP_CLASS or
3440 OP_NCLASS, depending on whether the whole class was negated and whether
3441 there were negative specials such as \S in the class. Then copy the 32-byte
3442 map into the code vector, negating it if necessary. */
3443
3444 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3445 if (negate_class)
3446 {
3447 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3448 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3449 }
3450 else
3451 {
3452 memcpy(code, classbits, 32);
3453 }
3454 code += 32;
3455 break;
3456
3457
3458 /* ===================================================================*/
3459 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3460 has been tested above. */
3461
3462 case '{':
3463 if (!is_quantifier) goto NORMAL_CHAR;
3464 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3465 if (*errorcodeptr != 0) goto FAILED;
3466 goto REPEAT;
3467
3468 case '*':
3469 repeat_min = 0;
3470 repeat_max = -1;
3471 goto REPEAT;
3472
3473 case '+':
3474 repeat_min = 1;
3475 repeat_max = -1;
3476 goto REPEAT;
3477
3478 case '?':
3479 repeat_min = 0;
3480 repeat_max = 1;
3481
3482 REPEAT:
3483 if (previous == NULL)
3484 {
3485 *errorcodeptr = ERR9;
3486 goto FAILED;
3487 }
3488
3489 if (repeat_min == 0)
3490 {
3491 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3492 reqbyte = zeroreqbyte; /* Ditto */
3493 }
3494
3495 /* Remember whether this is a variable length repeat */
3496
3497 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3498
3499 op_type = 0; /* Default single-char op codes */
3500 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3501
3502 /* Save start of previous item, in case we have to move it up to make space
3503 for an inserted OP_ONCE for the additional '+' extension. */
3504
3505 tempcode = previous;
3506
3507 /* If the next character is '+', we have a possessive quantifier. This
3508 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3509 If the next character is '?' this is a minimizing repeat, by default,
3510 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3511 repeat type to the non-default. */
3512
3513 if (ptr[1] == '+')
3514 {
3515 repeat_type = 0; /* Force greedy */
3516 possessive_quantifier = TRUE;
3517 ptr++;
3518 }
3519 else if (ptr[1] == '?')
3520 {
3521 repeat_type = greedy_non_default;
3522 ptr++;
3523 }
3524 else repeat_type = greedy_default;
3525
3526 /* If previous was a character match, abolish the item and generate a
3527 repeat item instead. If a char item has a minimum of more than one, ensure
3528 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3529 the first thing in a branch because the x will have gone into firstbyte
3530 instead. */
3531
3532 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3533 {
3534 /* Deal with UTF-8 characters that take up more than one byte. It's
3535 easier to write this out separately than try to macrify it. Use c to
3536 hold the length of the character in bytes, plus 0x80 to flag that it's a
3537 length rather than a small character. */
3538
3539 #ifdef SUPPORT_UTF8
3540 if (utf8 && (code[-1] & 0x80) != 0)
3541 {
3542 uschar *lastchar = code - 1;
3543 while((*lastchar & 0xc0) == 0x80) lastchar--;
3544 c = code - lastchar; /* Length of UTF-8 character */
3545 memcpy(utf8_char, lastchar, c); /* Save the char */
3546 c |= 0x80; /* Flag c as a length */
3547 }
3548 else
3549 #endif
3550
3551 /* Handle the case of a single byte - either with no UTF8 support, or
3552 with UTF-8 disabled, or for a UTF-8 character < 128. */
3553
3554 {
3555 c = code[-1];
3556 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3557 }
3558
3559 /* If the repetition is unlimited, it pays to see if the next thing on
3560 the line is something that cannot possibly match this character. If so,
3561 automatically possessifying this item gains some performance in the case
3562 where the match fails. */
3563
3564 if (!possessive_quantifier &&
3565 repeat_max < 0 &&
3566 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3567 options, cd))
3568 {
3569 repeat_type = 0; /* Force greedy */
3570 possessive_quantifier = TRUE;
3571 }
3572
3573 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3574 }
3575
3576 /* If previous was a single negated character ([^a] or similar), we use
3577 one of the special opcodes, replacing it. The code is shared with single-
3578 character repeats by setting op_type to add a suitable offset into
3579 repeat_type. We can also test for auto-possessification. OP_NOT is
3580 currently used only for single-byte chars. */
3581
3582 else if (*previous == OP_NOT)
3583 {
3584 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3585 c = previous[1];
3586 if (!possessive_quantifier &&
3587 repeat_max < 0 &&
3588 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3589 {
3590 repeat_type = 0; /* Force greedy */
3591 possessive_quantifier = TRUE;
3592 }
3593 goto OUTPUT_SINGLE_REPEAT;
3594 }
3595
3596 /* If previous was a character type match (\d or similar), abolish it and
3597 create a suitable repeat item. The code is shared with single-character
3598 repeats by setting op_type to add a suitable offset into repeat_type. Note
3599 that the Unicode property types will be present only when SUPPORT_UCP is
3600 defined, but we don't wrap the little bits of code here because it just
3601 makes it horribly messy. */
3602
3603 else if (*previous < OP_EODN)
3604 {
3605 uschar *oldcode;
3606 int prop_type, prop_value;
3607 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3608 c = *previous;
3609
3610 if (!possessive_quantifier &&
3611 repeat_max < 0 &&
3612 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3613 {
3614 repeat_type = 0; /* Force greedy */
3615 possessive_quantifier = TRUE;
3616 }
3617
3618 OUTPUT_SINGLE_REPEAT:
3619 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3620 {
3621 prop_type = previous[1];
3622 prop_value = previous[2];
3623 }
3624 else prop_type = prop_value = -1;
3625
3626 oldcode = code;
3627 code = previous; /* Usually overwrite previous item */
3628
3629 /* If the maximum is zero then the minimum must also be zero; Perl allows
3630 this case, so we do too - by simply omitting the item altogether. */
3631
3632 if (repeat_max == 0) goto END_REPEAT;
3633
3634 /* All real repeats make it impossible to handle partial matching (maybe
3635 one day we will be able to remove this restriction). */
3636
3637 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3638
3639 /* Combine the op_type with the repeat_type */
3640
3641 repeat_type += op_type;
3642
3643 /* A minimum of zero is handled either as the special case * or ?, or as
3644 an UPTO, with the maximum given. */
3645
3646 if (repeat_min == 0)
3647 {
3648 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3649 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3650 else
3651 {
3652 *code++ = OP_UPTO + repeat_type;
3653 PUT2INC(code, 0, repeat_max);
3654 }
3655 }
3656
3657 /* A repeat minimum of 1 is optimized into some special cases. If the
3658 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3659 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3660 one less than the maximum. */
3661
3662 else if (repeat_min == 1)
3663 {
3664 if (repeat_max == -1)
3665 *code++ = OP_PLUS + repeat_type;
3666 else
3667 {
3668 code = oldcode; /* leave previous item in place */
3669 if (repeat_max == 1) goto END_REPEAT;
3670 *code++ = OP_UPTO + repeat_type;
3671 PUT2INC(code, 0, repeat_max - 1);
3672 }
3673 }
3674
3675 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3676 handled as an EXACT followed by an UPTO. */
3677
3678 else
3679 {
3680 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3681 PUT2INC(code, 0, repeat_min);
3682
3683 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3684 we have to insert the character for the previous code. For a repeated
3685 Unicode property match, there are two extra bytes that define the
3686 required property. In UTF-8 mode, long characters have their length in
3687 c, with the 0x80 bit as a flag. */
3688
3689 if (repeat_max < 0)
3690 {
3691 #ifdef SUPPORT_UTF8
3692 if (utf8 && c >= 128)
3693 {
3694 memcpy(code, utf8_char, c & 7);
3695 code += c & 7;
3696 }
3697 else
3698 #endif
3699 {
3700 *code++ = c;
3701 if (prop_type >= 0)
3702 {
3703 *code++ = prop_type;
3704 *code++ = prop_value;
3705 }
3706 }
3707 *code++ = OP_STAR + repeat_type;
3708 }
3709
3710 /* Else insert an UPTO if the max is greater than the min, again
3711 preceded by the character, for the previously inserted code. If the
3712 UPTO is just for 1 instance, we can use QUERY instead. */
3713
3714 else if (repeat_max != repeat_min)
3715 {
3716 #ifdef SUPPORT_UTF8
3717 if (utf8 && c >= 128)
3718 {
3719 memcpy(code, utf8_char, c & 7);
3720 code += c & 7;
3721 }
3722 else
3723 #endif
3724 *code++ = c;
3725 if (prop_type >= 0)
3726 {
3727 *code++ = prop_type;
3728 *code++ = prop_value;
3729 }
3730 repeat_max -= repeat_min;
3731
3732 if (repeat_max == 1)
3733 {
3734 *code++ = OP_QUERY + repeat_type;
3735 }
3736 else
3737 {
3738 *code++ = OP_UPTO + repeat_type;
3739 PUT2INC(code, 0, repeat_max);
3740 }
3741 }
3742 }
3743
3744 /* The character or character type itself comes last in all cases. */
3745
3746 #ifdef SUPPORT_UTF8
3747 if (utf8 && c >= 128)
3748 {
3749 memcpy(code, utf8_char, c & 7);
3750 code += c & 7;
3751 }
3752 else
3753 #endif
3754 *code++ = c;
3755
3756 /* For a repeated Unicode property match, there are two extra bytes that
3757 define the required property. */
3758
3759 #ifdef SUPPORT_UCP
3760 if (prop_type >= 0)
3761 {
3762 *code++ = prop_type;
3763 *code++ = prop_value;
3764 }
3765 #endif
3766 }
3767
3768 /* If previous was a character class or a back reference, we put the repeat
3769 stuff after it, but just skip the item if the repeat was {0,0}. */
3770
3771 else if (*previous == OP_CLASS ||
3772 *previous == OP_NCLASS ||
3773 #ifdef SUPPORT_UTF8
3774 *previous == OP_XCLASS ||
3775 #endif
3776 *previous == OP_REF)
3777 {
3778 if (repeat_max == 0)
3779 {
3780 code = previous;
3781 goto END_REPEAT;
3782 }
3783
3784 /* All real repeats make it impossible to handle partial matching (maybe
3785 one day we will be able to remove this restriction). */
3786
3787 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3788
3789 if (repeat_min == 0 && repeat_max == -1)
3790 *code++ = OP_CRSTAR + repeat_type;
3791 else if (repeat_min == 1 && repeat_max == -1)
3792 *code++ = OP_CRPLUS + repeat_type;
3793 else if (repeat_min == 0 && repeat_max == 1)
3794 *code++ = OP_CRQUERY + repeat_type;
3795 else
3796 {
3797 *code++ = OP_CRRANGE + repeat_type;
3798 PUT2INC(code, 0, repeat_min);
3799 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3800 PUT2INC(code, 0, repeat_max);
3801 }
3802 }
3803
3804 /* If previous was a bracket group, we may have to replicate it in certain
3805 cases. */
3806
3807 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3808 *previous == OP_ONCE || *previous == OP_COND)
3809 {
3810 register int i;
3811 int ketoffset = 0;
3812 int len = code - previous;
3813 uschar *bralink = NULL;
3814
3815 /* Repeating a DEFINE group is pointless */
3816
3817 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3818 {
3819 *errorcodeptr = ERR55;
3820 goto FAILED;
3821 }
3822
3823 /* If the maximum repeat count is unlimited, find the end of the bracket
3824 by scanning through from the start, and compute the offset back to it
3825 from the current code pointer. There may be an OP_OPT setting following
3826 the final KET, so we can't find the end just by going back from the code
3827 pointer. */
3828
3829 if (repeat_max == -1)
3830 {
3831 register uschar *ket = previous;
3832 do ket += GET(ket, 1); while (*ket != OP_KET);
3833 ketoffset = code - ket;
3834 }
3835
3836 /* The case of a zero minimum is special because of the need to stick
3837 OP_BRAZERO in front of it, and because the group appears once in the
3838 data, whereas in other cases it appears the minimum number of times. For
3839 this reason, it is simplest to treat this case separately, as otherwise
3840 the code gets far too messy. There are several special subcases when the
3841 minimum is zero. */
3842
3843 if (repeat_min == 0)
3844 {
3845 /* If the maximum is also zero, we just omit the group from the output
3846 altogether. */
3847
3848 if (repeat_max == 0)
3849 {
3850 code = previous;
3851 goto END_REPEAT;
3852 }
3853
3854 /* If the maximum is 1 or unlimited, we just have to stick in the
3855 BRAZERO and do no more at this point. However, we do need to adjust
3856 any OP_RECURSE calls inside the group that refer to the group itself or
3857 any internal or forward referenced group, because the offset is from
3858 the start of the whole regex. Temporarily terminate the pattern while
3859 doing this. */
3860
3861 if (repeat_max <= 1)
3862 {
3863 *code = OP_END;
3864 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3865 memmove(previous+1, previous, len);
3866 code++;
3867 *previous++ = OP_BRAZERO + repeat_type;
3868 }
3869
3870 /* If the maximum is greater than 1 and limited, we have to replicate
3871 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3872 The first one has to be handled carefully because it's the original
3873 copy, which has to be moved up. The remainder can be handled by code
3874 that is common with the non-zero minimum case below. We have to
3875 adjust the value of repeat_max, since one less copy is required. Once
3876 again, we may have to adjust any OP_RECURSE calls inside the group. */
3877
3878 else
3879 {
3880 int offset;
3881 *code = OP_END;
3882 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3883 memmove(previous + 2 + LINK_SIZE, previous, len);
3884 code += 2 + LINK_SIZE;
3885 *previous++ = OP_BRAZERO + repeat_type;
3886 *previous++ = OP_BRA;
3887
3888 /* We chain together the bracket offset fields that have to be
3889 filled in later when the ends of the brackets are reached. */
3890
3891 offset = (bralink == NULL)? 0 : previous - bralink;
3892 bralink = previous;
3893 PUTINC(previous, 0, offset);
3894 }
3895
3896 repeat_max--;
3897 }
3898
3899 /* If the minimum is greater than zero, replicate the group as many
3900 times as necessary, and adjust the maximum to the number of subsequent
3901 copies that we need. If we set a first char from the group, and didn't
3902 set a required char, copy the latter from the former. If there are any
3903 forward reference subroutine calls in the group, there will be entries on
3904 the workspace list; replicate these with an appropriate increment. */
3905
3906 else
3907 {
3908 if (repeat_min > 1)
3909 {
3910 /* In the pre-compile phase, we don't actually do the replication. We
3911 just adjust the length as if we had. Do some paranoid checks for
3912 potential integer overflow. */
3913
3914 if (lengthptr != NULL)
3915 {
3916 int delta = (repeat_min - 1)*length_prevgroup;
3917 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3918 (double)INT_MAX ||
3919 OFLOW_MAX - *lengthptr < delta)
3920 {
3921 *errorcodeptr = ERR20;
3922 goto FAILED;
3923 }
3924 *lengthptr += delta;
3925 }
3926
3927 /* This is compiling for real */
3928
3929 else
3930 {
3931 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3932 for (i = 1; i < repeat_min; i++)
3933 {
3934 uschar *hc;
3935 uschar *this_hwm = cd->hwm;
3936 memcpy(code, previous, len);
3937 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3938 {
3939 PUT(cd->hwm, 0, GET(hc, 0) + len);
3940 cd->hwm += LINK_SIZE;
3941 }
3942 save_hwm = this_hwm;
3943 code += len;
3944 }
3945 }
3946 }
3947
3948 if (repeat_max > 0) repeat_max -= repeat_min;
3949 }
3950
3951 /* This code is common to both the zero and non-zero minimum cases. If
3952 the maximum is limited, it replicates the group in a nested fashion,
3953 remembering the bracket starts on a stack. In the case of a zero minimum,
3954 the first one was set up above. In all cases the repeat_max now specifies
3955 the number of additional copies needed. Again, we must remember to
3956 replicate entries on the forward reference list. */
3957
3958 if (repeat_max >= 0)
3959 {
3960 /* In the pre-compile phase, we don't actually do the replication. We
3961 just adjust the length as if we had. For each repetition we must add 1
3962 to the length for BRAZERO and for all but the last repetition we must
3963 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3964 paranoid checks to avoid integer overflow. */
3965
3966 if (lengthptr != NULL && repeat_max > 0)
3967 {
3968 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3969 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3970 if ((double)repeat_max *
3971 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3972 > (double)INT_MAX ||
3973 OFLOW_MAX - *lengthptr < delta)
3974 {
3975 *errorcodeptr = ERR20;
3976 goto FAILED;
3977 }
3978 *lengthptr += delta;
3979 }
3980
3981 /* This is compiling for real */
3982
3983 else for (i = repeat_max - 1; i >= 0; i--)
3984 {
3985 uschar *hc;
3986 uschar *this_hwm = cd->hwm;
3987
3988 *code++ = OP_BRAZERO + repeat_type;
3989
3990 /* All but the final copy start a new nesting, maintaining the
3991 chain of brackets outstanding. */
3992
3993 if (i != 0)
3994 {
3995 int offset;
3996 *code++ = OP_BRA;
3997 offset = (bralink == NULL)? 0 : code - bralink;
3998 bralink = code;
3999 PUTINC(code, 0, offset);
4000 }
4001
4002 memcpy(code, previous, len);
4003 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4004 {
4005 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4006 cd->hwm += LINK_SIZE;
4007 }
4008 save_hwm = this_hwm;
4009 code += len;
4010 }
4011
4012 /* Now chain through the pending brackets, and fill in their length
4013 fields (which are holding the chain links pro tem). */
4014
4015 while (bralink != NULL)
4016 {
4017 int oldlinkoffset;
4018 int offset = code - bralink + 1;
4019 uschar *bra = code - offset;
4020 oldlinkoffset = GET(bra, 1);
4021 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4022 *code++ = OP_KET;
4023 PUTINC(code, 0, offset);
4024 PUT(bra, 1, offset);
4025 }
4026 }
4027
4028 /* If the maximum is unlimited, set a repeater in the final copy. We
4029 can't just offset backwards from the current code point, because we
4030 don't know if there's been an options resetting after the ket. The
4031 correct offset was computed above.
4032
4033 Then, when we are doing the actual compile phase, check to see whether
4034 this group is a non-atomic one that could match an empty string. If so,
4035 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4036 that runtime checking can be done. [This check is also applied to
4037 atomic groups at runtime, but in a different way.] */
4038
4039 else
4040 {
4041 uschar *ketcode = code - ketoffset;
4042 uschar *bracode = ketcode - GET(ketcode, 1);
4043 *ketcode = OP_KETRMAX + repeat_type;
4044 if (lengthptr == NULL && *bracode != OP_ONCE)
4045 {
4046 uschar *scode = bracode;
4047 do
4048 {
4049 if (could_be_empty_branch(scode, ketcode, utf8))
4050 {
4051 *bracode += OP_SBRA - OP_BRA;
4052 break;
4053 }
4054 scode += GET(scode, 1);
4055 }
4056 while (*scode == OP_ALT);
4057 }
4058 }
4059 }
4060
4061 /* Else there's some kind of shambles */
4062
4063 else
4064 {
4065 *errorcodeptr = ERR11;
4066 goto FAILED;
4067 }
4068
4069 /* If the character following a repeat is '+', or if certain optimization
4070 tests above succeeded, possessive_quantifier is TRUE. For some of the
4071 simpler opcodes, there is a special alternative opcode for this. For
4072 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4073 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4074 but the special opcodes can optimize it a bit. The repeated item starts at
4075 tempcode, not at previous, which might be the first part of a string whose
4076 (former) last char we repeated.
4077
4078 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4079 an 'upto' may follow. We skip over an 'exact' item, and then test the
4080 length of what remains before proceeding. */
4081
4082 if (possessive_quantifier)
4083 {
4084 int len;
4085 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4086 *tempcode == OP_NOTEXACT)
4087 tempcode += _pcre_OP_lengths[*tempcode] +
4088 ((*tempcode == OP_TYPEEXACT &&
4089 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4090 len = code - tempcode;
4091 if (len > 0) switch (*tempcode)
4092 {
4093 case OP_STAR: *tempcode = OP_POSSTAR; break;
4094 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4095 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4096 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4097
4098 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4099 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4100 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4101 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4102
4103 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4104 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4105 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4106 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4107
4108 default:
4109 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4110 code += 1 + LINK_SIZE;
4111 len += 1 + LINK_SIZE;
4112 tempcode[0] = OP_ONCE;
4113 *code++ = OP_KET;
4114 PUTINC(code, 0, len);
4115 PUT(tempcode, 1, len);
4116 break;
4117 }
4118 }
4119
4120 /* In all cases we no longer have a previous item. We also set the
4121 "follows varying string" flag for subsequently encountered reqbytes if
4122 it isn't already set and we have just passed a varying length item. */
4123
4124 END_REPEAT:
4125 previous = NULL;
4126 cd->req_varyopt |= reqvary;
4127 break;
4128
4129
4130 /* ===================================================================*/
4131 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4132 lookbehind or option setting or condition or all the other extended
4133 parenthesis forms. */
4134
4135 case '(':
4136 newoptions = options;
4137 skipbytes = 0;
4138 bravalue = OP_CBRA;
4139 save_hwm = cd->hwm;
4140 reset_bracount = FALSE;
4141
4142 /* First deal with various "verbs" that can be introduced by '*'. */
4143
4144 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4145 {
4146 int i, namelen;
4147 const char *vn = verbnames;
4148 const uschar *name = ++ptr;
4149 previous = NULL;
4150 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4151 if (*ptr == ':')
4152 {
4153 *errorcodeptr = ERR59; /* Not supported */
4154 goto FAILED;
4155 }
4156 if (*ptr != ')')
4157 {
4158 *errorcodeptr = ERR60;
4159 goto FAILED;
4160 }
4161 namelen = ptr - name;
4162 for (i = 0; i < verbcount; i++)
4163 {
4164 if (namelen == verbs[i].len &&
4165 strncmp((char *)name, vn, namelen) == 0)
4166 {
4167 *code = verbs[i].op;
4168 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4169 break;
4170 }
4171 vn += verbs[i].len + 1;
4172 }
4173 if (i < verbcount) continue;
4174 *errorcodeptr = ERR60;
4175 goto FAILED;
4176 }
4177
4178 /* Deal with the extended parentheses; all are introduced by '?', and the
4179 appearance of any of them means that this is not a capturing group. */
4180
4181 else if (*ptr == '?')
4182 {
4183 int i, set, unset, namelen;
4184 int *optset;
4185 const uschar *name;
4186 uschar *slot;
4187
4188 switch (*(++ptr))
4189 {
4190 case '#': /* Comment; skip to ket */
4191 ptr++;
4192 while (*ptr != 0 && *ptr != ')') ptr++;
4193 if (*ptr == 0)
4194 {
4195 *errorcodeptr = ERR18;
4196 goto FAILED;
4197 }
4198 continue;
4199
4200
4201 /* ------------------------------------------------------------ */
4202 case '|': /* Reset capture count for each branch */
4203 reset_bracount = TRUE;
4204 /* Fall through */
4205
4206 /* ------------------------------------------------------------ */
4207 case ':': /* Non-capturing bracket */
4208 bravalue = OP_BRA;
4209 ptr++;
4210 break;
4211
4212
4213 /* ------------------------------------------------------------ */
4214 case '(':
4215 bravalue = OP_COND; /* Conditional group */
4216
4217 /* A condition can be an assertion, a number (referring to a numbered
4218 group), a name (referring to a named group), or 'R', referring to
4219 recursion. R<digits> and R&name are also permitted for recursion tests.
4220
4221 There are several syntaxes for testing a named group: (?(name)) is used
4222 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4223
4224 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4225 be the recursive thing or the name 'R' (and similarly for 'R' followed
4226 by digits), and (b) a number could be a name that consists of digits.
4227 In both cases, we look for a name first; if not found, we try the other
4228 cases. */
4229
4230 /* For conditions that are assertions, check the syntax, and then exit
4231 the switch. This will take control down to where bracketed groups,
4232 including assertions, are processed. */
4233
4234 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4235 break;
4236
4237 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4238 below), and all need to skip 3 bytes at the start of the group. */
4239
4240 code[1+LINK_SIZE] = OP_CREF;
4241 skipbytes = 3;
4242 refsign = -1;
4243
4244 /* Check for a test for recursion in a named group. */
4245
4246 if (ptr[1] == 'R' && ptr[2] == '&')
4247 {
4248 terminator = -1;
4249 ptr += 2;
4250 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4251 }
4252
4253 /* Check for a test for a named group's having been set, using the Perl
4254 syntax (?(<name>) or (?('name') */
4255
4256 else if (ptr[1] == '<')
4257 {
4258 terminator = '>';
4259 ptr++;
4260 }
4261 else if (ptr[1] == '\'')
4262 {
4263 terminator = '\'';
4264 ptr++;
4265 }
4266 else
4267 {
4268 terminator = 0;
4269 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4270 }
4271
4272 /* We now expect to read a name; anything else is an error */
4273
4274 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4275 {
4276 ptr += 1; /* To get the right offset */
4277 *errorcodeptr = ERR28;
4278 goto FAILED;
4279 }
4280
4281 /* Read the name, but also get it as a number if it's all digits */
4282
4283 recno = 0;
4284 name = ++ptr;
4285 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4286 {
4287 if (recno >= 0)
4288 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4289 recno * 10 + *ptr - '0' : -1;
4290 ptr++;
4291 }
4292 namelen = ptr - name;
4293
4294 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4295 {
4296 ptr--; /* Error offset */
4297 *errorcodeptr = ERR26;
4298 goto FAILED;
4299 }
4300
4301 /* Do no further checking in the pre-compile phase. */
4302
4303 if (lengthptr != NULL) break;
4304
4305 /* In the real compile we do the work of looking for the actual
4306 reference. If the string started with "+" or "-" we require the rest to
4307 be digits, in which case recno will be set. */
4308
4309 if (refsign > 0)
4310 {
4311 if (recno <= 0)
4312 {
4313 *errorcodeptr = ERR58;
4314 goto FAILED;
4315 }
4316 recno = (refsign == '-')?
4317 cd->bracount - recno + 1 : recno +cd->bracount;
4318 if (recno <= 0 || recno > cd->final_bracount)
4319 {
4320 *errorcodeptr = ERR15;
4321 goto FAILED;
4322 }
4323 PUT2(code, 2+LINK_SIZE, recno);
4324 break;
4325 }
4326
4327 /* Otherwise (did not start with "+" or "-"), start by looking for the
4328 name. */
4329
4330 slot = cd->name_table;
4331 for (i = 0; i < cd->names_found; i++)
4332 {
4333 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4334 slot += cd->name_entry_size;
4335 }
4336
4337 /* Found a previous named subpattern */
4338
4339 if (i < cd->names_found)
4340 {
4341 recno = GET2(slot, 0);
4342 PUT2(code, 2+LINK_SIZE, recno);
4343 }
4344
4345 /* Search the pattern for a forward reference */
4346
4347 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4348 (options & PCRE_EXTENDED) != 0)) > 0)
4349 {
4350 PUT2(code, 2+LINK_SIZE, i);
4351 }
4352
4353 /* If terminator == 0 it means that the name followed directly after
4354 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4355 some further alternatives to try. For the cases where terminator != 0
4356 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4357 now checked all the possibilities, so give an error. */
4358
4359 else if (terminator != 0)
4360 {
4361 *errorcodeptr = ERR15;
4362 goto FAILED;
4363 }
4364
4365 /* Check for (?(R) for recursion. Allow digits after R to specify a
4366 specific group number. */
4367
4368 else if (*name == 'R')
4369 {
4370 recno = 0;
4371 for (i = 1; i < namelen; i++)
4372 {
4373 if ((digitab[name[i]] & ctype_digit) == 0)
4374 {
4375 *errorcodeptr = ERR15;
4376 goto FAILED;
4377 }
4378 recno = recno * 10 + name[i] - '0';
4379 }
4380 if (recno == 0) recno = RREF_ANY;
4381 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4382 PUT2(code, 2+LINK_SIZE, recno);
4383 }
4384
4385 /* Similarly, check for the (?(DEFINE) "condition", which is always
4386 false. */
4387
4388 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4389 {
4390 code[1+LINK_SIZE] = OP_DEF;
4391 skipbytes = 1;
4392 }
4393
4394 /* Check for the "name" actually being a subpattern number. We are
4395 in the second pass here, so final_bracount is set. */
4396
4397 else if (recno > 0 && recno <= cd->final_bracount)
4398 {
4399 PUT2(code, 2+LINK_SIZE, recno);
4400 }
4401
4402 /* Either an unidentified subpattern, or a reference to (?(0) */
4403
4404 else
4405 {
4406 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4407 goto FAILED;
4408 }
4409 break;
4410
4411
4412 /* ------------------------------------------------------------ */
4413 case '=': /* Positive lookahead */
4414 bravalue = OP_ASSERT;
4415 ptr++;
4416 break;
4417
4418
4419 /* ------------------------------------------------------------ */
4420 case '!': /* Negative lookahead */
4421 ptr++;
4422 if (*ptr == ')') /* Optimize (?!) */
4423 {
4424 *code++ = OP_FAIL;
4425 previous = NULL;
4426 continue;
4427 }
4428 bravalue = OP_ASSERT_NOT;
4429 break;
4430
4431
4432 /* ------------------------------------------------------------ */
4433 case '<': /* Lookbehind or named define */
4434 switch (ptr[1])
4435 {
4436 case '=': /* Positive lookbehind */
4437 bravalue = OP_ASSERTBACK;
4438 ptr += 2;
4439 break;
4440
4441 case '!': /* Negative lookbehind */
4442 bravalue = OP_ASSERTBACK_NOT;
4443 ptr += 2;
4444 break;
4445
4446 default: /* Could be name define, else bad */
4447 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4448 ptr++; /* Correct offset for error */
4449 *errorcodeptr = ERR24;
4450 goto FAILED;
4451 }
4452 break;
4453
4454
4455 /* ------------------------------------------------------------ */
4456 case '>': /* One-time brackets */
4457 bravalue = OP_ONCE;
4458 ptr++;
4459 break;
4460
4461
4462 /* ------------------------------------------------------------ */
4463 case 'C': /* Callout - may be followed by digits; */
4464 previous_callout = code; /* Save for later completion */
4465 after_manual_callout = 1; /* Skip one item before completing */
4466 *code++ = OP_CALLOUT;
4467 {
4468 int n = 0;
4469 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4470 n = n * 10 + *ptr - '0';
4471 if (*ptr != ')')
4472 {
4473 *errorcodeptr = ERR39;
4474 goto FAILED;
4475 }
4476 if (n > 255)
4477 {
4478 *errorcodeptr = ERR38;
4479 goto FAILED;
4480 }
4481 *code++ = n;
4482 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4483 PUT(code, LINK_SIZE, 0); /* Default length */
4484 code += 2 * LINK_SIZE;
4485 }
4486 previous = NULL;
4487 continue;
4488
4489
4490 /* ------------------------------------------------------------ */
4491 case 'P': /* Python-style named subpattern handling */
4492 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4493 {
4494 is_recurse = *ptr == '>';
4495 terminator = ')';
4496 goto NAMED_REF_OR_RECURSE;
4497 }
4498 else if (*ptr != '<') /* Test for Python-style definition */
4499 {
4500 *errorcodeptr = ERR41;
4501 goto FAILED;
4502 }
4503 /* Fall through to handle (?P< as (?< is handled */
4504
4505
4506 /* ------------------------------------------------------------ */
4507 DEFINE_NAME: /* Come here from (?< handling */
4508 case '\'':
4509 {
4510 terminator = (*ptr == '<')? '>' : '\'';
4511 name = ++ptr;
4512
4513 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4514 namelen = ptr - name;
4515
4516 /* In the pre-compile phase, just do a syntax check. */
4517
4518 if (lengthptr != NULL)
4519 {
4520 if (*ptr != terminator)
4521 {
4522 *errorcodeptr = ERR42;
4523 goto FAILED;
4524 }
4525 if (cd->names_found >= MAX_NAME_COUNT)
4526 {
4527 *errorcodeptr = ERR49;
4528 goto FAILED;
4529 }
4530 if (namelen + 3 > cd->name_entry_size)
4531 {
4532 cd->name_entry_size = namelen + 3;
4533 if (namelen > MAX_NAME_SIZE)
4534 {
4535 *errorcodeptr = ERR48;
4536 goto FAILED;
4537 }
4538 }
4539 }
4540
4541 /* In the real compile, create the entry in the table */
4542
4543 else
4544 {
4545 slot = cd->name_table;
4546 for (i = 0; i < cd->names_found; i++)
4547 {
4548 int crc = memcmp(name, slot+2, namelen);
4549 if (crc == 0)
4550 {
4551 if (slot[2+namelen] == 0)
4552 {
4553 if ((options & PCRE_DUPNAMES) == 0)
4554 {
4555 *errorcodeptr = ERR43;
4556 goto FAILED;
4557 }
4558 }
4559 else crc = -1; /* Current name is substring */
4560 }
4561 if (crc < 0)
4562 {
4563 memmove(slot + cd->name_entry_size, slot,
4564 (cd->names_found - i) * cd->name_entry_size);
4565 break;
4566 }
4567 slot += cd->name_entry_size;
4568 }
4569
4570 PUT2(slot, 0, cd->bracount + 1);
4571 memcpy(slot + 2, name, namelen);
4572 slot[2+namelen] = 0;
4573 }
4574 }
4575
4576 /* In both cases, count the number of names we've encountered. */
4577
4578 ptr++; /* Move past > or ' */
4579 cd->names_found++;
4580 goto NUMBERED_GROUP;
4581
4582
4583 /* ------------------------------------------------------------ */
4584 case '&': /* Perl recursion/subroutine syntax */
4585 terminator = ')';
4586 is_recurse = TRUE;
4587 /* Fall through */
4588
4589 /* We come here from the Python syntax above that handles both
4590 references (?P=name) and recursion (?P>name), as well as falling
4591 through from the Perl recursion syntax (?&name). We also come here from
4592 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4593 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4594
4595 NAMED_REF_OR_RECURSE:
4596 name = ++ptr;
4597 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4598 namelen = ptr - name;
4599
4600 /* In the pre-compile phase, do a syntax check and set a dummy
4601 reference number. */
4602
4603 if (lengthptr != NULL)
4604 {
4605 if (namelen == 0)
4606 {
4607 *errorcodeptr = ERR62;
4608 goto FAILED;
4609 }
4610 if (*ptr != terminator)
4611 {
4612 *errorcodeptr = ERR42;
4613 goto FAILED;
4614 }
4615 if (namelen > MAX_NAME_SIZE)
4616 {
4617 *errorcodeptr = ERR48;
4618 goto FAILED;
4619 }
4620 recno = 0;
4621 }
4622
4623 /* In the real compile, seek the name in the table. We check the name
4624 first, and then check that we have reached the end of the name in the
4625 table. That way, if the name is longer than any in the table,
4626 the comparison will fail without reading beyond the table entry. */
4627
4628 else
4629 {
4630 slot = cd->name_table;
4631 for (i = 0; i < cd->names_found; i++)
4632 {
4633 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4634 slot[2+namelen] == 0)
4635 break;
4636 slot += cd->name_entry_size;
4637 }
4638
4639 if (i < cd->names_found) /* Back reference */
4640 {
4641 recno = GET2(slot, 0);
4642 }
4643 else if ((recno = /* Forward back reference */
4644 find_parens(ptr, cd->bracount, name, namelen,
4645 (options & PCRE_EXTENDED) != 0)) <= 0)
4646 {
4647 *errorcodeptr = ERR15;
4648 goto FAILED;
4649 }
4650 }
4651
4652 /* In both phases, we can now go to the code that handles numerical
4653 recursion or backreferences. */
4654
4655 if (is_recurse) goto HANDLE_RECURSION;
4656 else goto HANDLE_REFERENCE;
4657
4658
4659 /* ------------------------------------------------------------ */
4660 case 'R': /* Recursion */
4661 ptr++; /* Same as (?0) */
4662 /* Fall through */
4663
4664
4665 /* ------------------------------------------------------------ */
4666 case '-': case '+':
4667 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4668 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4669 {
4670 const uschar *called;
4671 terminator = ')';
4672
4673 /* Come here from the \g<...> and \g'...' code (Oniguruma
4674 compatibility). However, the syntax has been checked to ensure that
4675 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4676 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4677 ever be taken. */
4678
4679 HANDLE_NUMERICAL_RECURSION:
4680
4681 if ((refsign = *ptr) == '+')
4682 {
4683 ptr++;
4684 if ((digitab[*ptr] & ctype_digit) == 0)
4685 {
4686 *errorcodeptr = ERR63;
4687 goto FAILED;
4688 }
4689 }
4690 else if (refsign == '-')
4691 {
4692 if ((digitab[ptr[1]] & ctype_digit) == 0)
4693 goto OTHER_CHAR_AFTER_QUERY;
4694 ptr++;
4695 }
4696
4697 recno = 0;
4698 while((digitab[*ptr] & ctype_digit) != 0)
4699 recno = recno * 10 + *ptr++ - '0';
4700
4701 if (*ptr != terminator)
4702 {
4703 *errorcodeptr = ERR29;
4704 goto FAILED;
4705 }
4706
4707 if (refsign == '-')
4708 {
4709 if (recno == 0)
4710 {
4711 *errorcodeptr = ERR58;
4712 goto FAILED;
4713 }
4714 recno = cd->bracount - recno + 1;
4715 if (recno <= 0)
4716 {
4717 *errorcodeptr = ERR15;
4718 goto FAILED;
4719 }
4720 }
4721 else if (refsign == '+')
4722 {
4723 if (recno == 0)
4724 {
4725 *errorcodeptr = ERR58;
4726 goto FAILED;
4727 }
4728 recno += cd->bracount;
4729 }
4730
4731 /* Come here from code above that handles a named recursion */
4732
4733 HANDLE_RECURSION:
4734
4735 previous = code;
4736 called = cd->start_code;
4737
4738 /* When we are actually compiling, find the bracket that is being
4739 referenced. Temporarily end the regex in case it doesn't exist before
4740 this point. If we end up with a forward reference, first check that
4741 the bracket does occur later so we can give the error (and position)
4742 now. Then remember this forward reference in the workspace so it can
4743 be filled in at the end. */
4744
4745 if (lengthptr == NULL)
4746 {
4747 *code = OP_END;
4748 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4749
4750 /* Forward reference */
4751
4752 if (called == NULL)
4753 {
4754 if (find_parens(ptr, cd->bracount, NULL, recno,
4755 (options & PCRE_EXTENDED) != 0) < 0)
4756 {
4757 *errorcodeptr = ERR15;
4758 goto FAILED;
4759 }
4760 called = cd->start_code + recno;
4761 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4762 }
4763
4764 /* If not a forward reference, and the subpattern is still open,
4765 this is a recursive call. We check to see if this is a left
4766 recursion that could loop for ever, and diagnose that case. */
4767
4768 else if (GET(called, 1) == 0 &&
4769 could_be_empty(called, code, bcptr, utf8))
4770 {
4771 *errorcodeptr = ERR40;
4772 goto FAILED;
4773 }
4774 }
4775
4776 /* Insert the recursion/subroutine item, automatically wrapped inside
4777 "once" brackets. Set up a "previous group" length so that a
4778 subsequent quantifier will work. */
4779
4780 *code = OP_ONCE;
4781 PUT(code, 1, 2 + 2*LINK_SIZE);
4782 code += 1 + LINK_SIZE;
4783
4784 *code = OP_RECURSE;
4785 PUT(code, 1, called - cd->start_code);
4786 code += 1 + LINK_SIZE;
4787
4788 *code = OP_KET;
4789 PUT(code, 1, 2 + 2*LINK_SIZE);
4790 code += 1 + LINK_SIZE;
4791
4792 length_prevgroup = 3 + 3*LINK_SIZE;
4793 }
4794
4795 /* Can't determine a first byte now */
4796
4797 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4798 continue;
4799
4800
4801 /* ------------------------------------------------------------ */
4802 default: /* Other characters: check option setting */
4803 OTHER_CHAR_AFTER_QUERY:
4804 set = unset = 0;
4805 optset = &set;
4806
4807 while (*ptr != ')' && *ptr != ':')
4808 {
4809 switch (*ptr++)
4810 {
4811 case '-': optset = &unset; break;
4812
4813 case 'J': /* Record that it changed in the external options */
4814 *optset |= PCRE_DUPNAMES;
4815 cd->external_flags |= PCRE_JCHANGED;
4816 break;
4817
4818 case 'i': *optset |= PCRE_CASELESS; break;
4819 case 'm': *optset |= PCRE_MULTILINE; break;
4820 case 's': *optset |= PCRE_DOTALL; break;
4821 case 'x': *optset |= PCRE_EXTENDED; break;
4822 case 'U': *optset |= PCRE_UNGREEDY; break;
4823 case 'X': *optset |= PCRE_EXTRA; break;
4824
4825 default: *errorcodeptr = ERR12;
4826 ptr--; /* Correct the offset */
4827 goto FAILED;
4828 }
4829 }
4830
4831 /* Set up the changed option bits, but don't change anything yet. */
4832
4833 newoptions = (options | set) & (~unset);
4834
4835 /* If the options ended with ')' this is not the start of a nested
4836 group with option changes, so the options change at this level. If this
4837 item is right at the start of the pattern, the options can be
4838 abstracted and made external in the pre-compile phase, and ignored in
4839 the compile phase. This can be helpful when matching -- for instance in
4840 caseless checking of required bytes.
4841
4842 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4843 definitely *not* at the start of the pattern because something has been
4844 compiled. In the pre-compile phase, however, the code pointer can have
4845 that value after the start, because it gets reset as code is discarded
4846 during the pre-compile. However, this can happen only at top level - if
4847 we are within parentheses, the starting BRA will still be present. At
4848 any parenthesis level, the length value can be used to test if anything
4849 has been compiled at that level. Thus, a test for both these conditions
4850 is necessary to ensure we correctly detect the start of the pattern in
4851 both phases.
4852
4853 If we are not at the pattern start, compile code to change the ims
4854 options if this setting actually changes any of them. We also pass the
4855 new setting back so that it can be put at the start of any following
4856 branches, and when this group ends (if we are in a group), a resetting
4857 item can be compiled. */
4858
4859 if (*ptr == ')')
4860 {
4861 if (code == cd->start_code + 1 + LINK_SIZE &&
4862 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4863 {
4864 cd->external_options = newoptions;
4865 options = newoptions;
4866 }
4867 else
4868 {
4869 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4870 {
4871 *code++ = OP_OPT;
4872 *code++ = newoptions & PCRE_IMS;
4873 }
4874
4875 /* Change options at this level, and pass them back for use
4876 in subsequent branches. Reset the greedy defaults and the case
4877 value for firstbyte and reqbyte. */
4878
4879 *optionsptr = options = newoptions;
4880 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4881 greedy_non_default = greedy_default ^ 1;
4882 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4883 }
4884
4885 previous = NULL; /* This item can't be repeated */
4886 continue; /* It is complete */
4887 }
4888
4889 /* If the options ended with ':' we are heading into a nested group
4890 with possible change of options. Such groups are non-capturing and are
4891 not assertions of any kind. All we need to do is skip over the ':';
4892 the newoptions value is handled below. */
4893
4894 bravalue = OP_BRA;
4895 ptr++;
4896 } /* End of switch for character following (? */
4897 } /* End of (? handling */
4898
4899 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4900 all unadorned brackets become non-capturing and behave like (?:...)
4901 brackets. */
4902
4903 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4904 {
4905 bravalue = OP_BRA;
4906 }
4907
4908 /* Else we have a capturing group. */
4909
4910 else
4911 {
4912 NUMBERED_GROUP:
4913 cd->bracount += 1;
4914 PUT2(code, 1+LINK_SIZE, cd->bracount);
4915 skipbytes = 2;
4916 }
4917
4918 /* Process nested bracketed regex. Assertions may not be repeated, but
4919 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4920 non-register variable in order to be able to pass its address because some
4921 compilers complain otherwise. Pass in a new setting for the ims options if
4922 they have changed. */
4923
4924 previous = (bravalue >= OP_ONCE)? code : NULL;
4925 *code = bravalue;
4926 tempcode = code;
4927 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4928 length_prevgroup = 0; /* Initialize for pre-compile phase */
4929
4930 if (!compile_regex(
4931 newoptions, /* The complete new option state */
4932 options & PCRE_IMS, /* The previous ims option state */
4933 &tempcode, /* Where to put code (updated) */
4934 &ptr, /* Input pointer (updated) */
4935 errorcodeptr, /* Where to put an error message */
4936 (bravalue == OP_ASSERTBACK ||
4937 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4938 reset_bracount, /* True if (?| group */
4939 skipbytes, /* Skip over bracket number */
4940 &subfirstbyte, /* For possible first char */
4941 &subreqbyte, /* For possible last char */
4942 bcptr, /* Current branch chain */
4943 cd, /* Tables block */
4944 (lengthptr == NULL)? NULL : /* Actual compile phase */
4945 &length_prevgroup /* Pre-compile phase */
4946 ))
4947 goto FAILED;
4948
4949 /* At the end of compiling, code is still pointing to the start of the
4950 group, while tempcode has been updated to point past the end of the group
4951 and any option resetting that may follow it. The pattern pointer (ptr)
4952 is on the bracket. */
4953
4954 /* If this is a conditional bracket, check that there are no more than
4955 two branches in the group, or just one if it's a DEFINE group. We do this
4956 in the real compile phase, not in the pre-pass, where the whole group may
4957 not be available. */
4958
4959 if (bravalue == OP_COND && lengthptr == NULL)
4960 {
4961 uschar *tc = code;
4962 int condcount = 0;
4963
4964 do {
4965 condcount++;
4966 tc += GET(tc,1);
4967 }
4968 while (*tc != OP_KET);
4969
4970 /* A DEFINE group is never obeyed inline (the "condition" is always
4971 false). It must have only one branch. */
4972
4973 if (code[LINK_SIZE+1] == OP_DEF)
4974 {
4975 if (condcount > 1)
4976 {
4977 *errorcodeptr = ERR54;
4978 goto FAILED;
4979 }
4980 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4981 }
4982
4983 /* A "normal" conditional group. If there is just one branch, we must not
4984 make use of its firstbyte or reqbyte, because this is equivalent to an
4985 empty second branch. */
4986
4987 else
4988 {
4989 if (condcount > 2)
4990 {
4991 *errorcodeptr = ERR27;
4992 goto FAILED;
4993 }
4994 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4995 }
4996 }
4997
4998 /* Error if hit end of pattern */
4999
5000 if (*ptr != ')')
5001 {
5002 *errorcodeptr = ERR14;
5003 goto FAILED;
5004 }
5005
5006 /* In the pre-compile phase, update the length by the length of the group,
5007 less the brackets at either end. Then reduce the compiled code to just a
5008 set of non-capturing brackets so that it doesn't use much memory if it is
5009 duplicated by a quantifier.*/
5010
5011 if (lengthptr != NULL)
5012 {
5013 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5014 {
5015 *errorcodeptr = ERR20;
5016 goto FAILED;
5017 }
5018 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5019 *code++ = OP_BRA;
5020 PUTINC(code, 0, 1 + LINK_SIZE);
5021 *code++ = OP_KET;
5022 PUTINC(code, 0, 1 + LINK_SIZE);
5023 break; /* No need to waste time with special character handling */
5024 }
5025
5026 /* Otherwise update the main code pointer to the end of the group. */
5027
5028 code = tempcode;
5029
5030 /* For a DEFINE group, required and first character settings are not
5031 relevant. */
5032
5033 if (bravalue == OP_DEF) break;
5034
5035 /* Handle updating of the required and first characters for other types of
5036 group. Update for normal brackets of all kinds, and conditions with two
5037 branches (see code above). If the bracket is followed by a quantifier with
5038 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5039 zerofirstbyte outside the main loop so that they can be accessed for the
5040 back off. */
5041
5042 zeroreqbyte = reqbyte;
5043 zerofirstbyte = firstbyte;
5044 groupsetfirstbyte = FALSE;
5045
5046 if (bravalue >= OP_ONCE)
5047 {
5048 /* If we have not yet set a firstbyte in this branch, take it from the
5049 subpattern, remembering that it was set here so that a repeat of more
5050 than one can replicate it as reqbyte if necessary. If the subpattern has
5051 no firstbyte, set "none" for the whole branch. In both cases, a zero
5052 repeat forces firstbyte to "none". */
5053
5054 if (firstbyte == REQ_UNSET)
5055 {
5056 if (subfirstbyte >= 0)
5057 {
5058 firstbyte = subfirstbyte;
5059 groupsetfirstbyte = TRUE;
5060 }
5061 else firstbyte = REQ_NONE;
5062 zerofirstbyte = REQ_NONE;
5063 }
5064
5065 /* If firstbyte was previously set, convert the subpattern's firstbyte
5066 into reqbyte if there wasn't one, using the vary flag that was in
5067 existence beforehand. */
5068
5069 else if (subfirstbyte >= 0 && subreqbyte < 0)
5070 subreqbyte = subfirstbyte | tempreqvary;
5071
5072 /* If the subpattern set a required byte (or set a first byte that isn't
5073 really the first byte - see above), set it. */
5074
5075 if (subreqbyte >= 0) reqbyte = subreqbyte;
5076 }
5077
5078 /* For a forward assertion, we take the reqbyte, if set. This can be
5079 helpful if the pattern that follows the assertion doesn't set a different
5080 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5081 for an assertion, however, because it leads to incorrect effects for patterns
5082 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5083 of a firstbyte. This is overcome by a scan at the end if there's no
5084 firstbyte, looking for an asserted first char. */
5085
5086 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5087 break; /* End of processing '(' */
5088
5089
5090 /* ===================================================================*/
5091 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5092 are arranged to be the negation of the corresponding OP_values. For the
5093 back references, the values are ESC_REF plus the reference number. Only
5094 back references and those types that consume a character may be repeated.
5095 We can test for values between ESC_b and ESC_Z for the latter; this may
5096 have to change if any new ones are ever created. */
5097
5098 case '\\':
5099 tempptr = ptr;
5100 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5101 if (*errorcodeptr != 0) goto FAILED;
5102
5103 if (c < 0)
5104 {
5105 if (-c == ESC_Q) /* Handle start of quoted string */
5106 {
5107 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5108 else inescq = TRUE;
5109 continue;
5110 }
5111
5112 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5113
5114 /* For metasequences that actually match a character, we disable the
5115 setting of a first character if it hasn't already been set. */
5116
5117 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5118 firstbyte = REQ_NONE;
5119
5120 /* Set values to reset to if this is followed by a zero repeat. */
5121
5122 zerofirstbyte = firstbyte;
5123 zeroreqbyte = reqbyte;
5124
5125 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5126 is a subroutine call by number (Oniguruma syntax). In fact, the value
5127 -ESC_g is returned only for these cases. So we don't need to check for <
5128 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5129 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5130 that is a synonym for a named back reference). */
5131
5132 if (-c == ESC_g)
5133 {
5134 const uschar *p;
5135 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5136 terminator = (*(++ptr) == '<')? '>' : '\'';
5137
5138 /* These two statements stop the compiler from warning about possibly
5139 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5140 fact, because we actually check for a number below, the paths that
5141 would actually be in error are never taken. */
5142
5143 skipbytes = 0;
5144 reset_bracount = FALSE;
5145
5146 /* Test for a name */
5147
5148 if (ptr[1] != '+' && ptr[1] != '-')
5149 {
5150 BOOL isnumber = TRUE;
5151 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5152 {
5153 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5154 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5155 }
5156 if (*p != terminator)
5157 {
5158 *errorcodeptr = ERR57;
5159 break;
5160 }
5161 if (isnumber)
5162 {
5163 ptr++;
5164 goto HANDLE_NUMERICAL_RECURSION;
5165 }
5166 is_recurse = TRUE;
5167 goto NAMED_REF_OR_RECURSE;
5168 }
5169
5170 /* Test a signed number in angle brackets or quotes. */
5171
5172 p = ptr + 2;
5173 while ((digitab[*p] & ctype_digit) != 0) p++;
5174 if (*p != terminator)
5175 {
5176 *errorcodeptr = ERR57;
5177 break;
5178 }
5179 ptr++;
5180 goto HANDLE_NUMERICAL_RECURSION;
5181 }
5182
5183 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5184 We also support \k{name} (.NET syntax) */
5185
5186 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5187 {
5188 is_recurse = FALSE;
5189 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5190 goto NAMED_REF_OR_RECURSE;
5191 }
5192
5193 /* Back references are handled specially; must disable firstbyte if
5194 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5195 ':' later. */
5196
5197 if (-c >= ESC_REF)
5198 {
5199 recno = -c - ESC_REF;
5200
5201 HANDLE_REFERENCE: /* Come here from named backref handling */
5202 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5203 previous = code;
5204 *code++ = OP_REF;
5205 PUT2INC(code, 0, recno);
5206 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5207 if (recno > cd->top_backref) cd->top_backref = recno;
5208 }
5209
5210 /* So are Unicode property matches, if supported. */
5211
5212 #ifdef SUPPORT_UCP
5213 else if (-c == ESC_P || -c == ESC_p)
5214 {
5215 BOOL negated;
5216 int pdata;
5217 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5218 if (ptype < 0) goto FAILED;
5219 previous = code;
5220 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5221 *code++ = ptype;
5222 *code++ = pdata;
5223 }
5224 #else
5225
5226 /* If Unicode properties are not supported, \X, \P, and \p are not
5227 allowed. */
5228
5229 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5230 {
5231 *errorcodeptr = ERR45;
5232 goto FAILED;
5233 }
5234 #endif
5235
5236 /* For the rest (including \X when Unicode properties are supported), we
5237 can obtain the OP value by negating the escape value. */
5238
5239 else
5240 {
5241 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5242 *code++ = -c;
5243 }
5244 continue;
5245 }
5246
5247 /* We have a data character whose value is in c. In UTF-8 mode it may have
5248 a value > 127. We set its representation in the length/buffer, and then
5249 handle it as a data character. */
5250
5251 #ifdef SUPPORT_UTF8
5252 if (utf8 && c > 127)
5253 mclength = _pcre_ord2utf8(c, mcbuffer);
5254 else
5255 #endif
5256
5257 {
5258 mcbuffer[0] = c;
5259 mclength = 1;
5260 }
5261 goto ONE_CHAR;
5262
5263
5264 /* ===================================================================*/
5265 /* Handle a literal character. It is guaranteed not to be whitespace or #
5266 when the extended flag is set. If we are in UTF-8 mode, it may be a
5267 multi-byte literal character. */
5268
5269 default:
5270 NORMAL_CHAR:
5271 mclength = 1;
5272 mcbuffer[0] = c;
5273
5274 #ifdef SUPPORT_UTF8
5275 if (utf8 && c >= 0xc0)
5276 {
5277 while ((ptr[1] & 0xc0) == 0x80)
5278 mcbuffer[mclength++] = *(++ptr);
5279 }
5280 #endif
5281
5282 /* At this point we have the character's bytes in mcbuffer, and the length
5283 in mclength. When not in UTF-8 mode, the length is always 1. */
5284
5285 ONE_CHAR:
5286 previous = code;
5287 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5288 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5289
5290 /* Remember if \r or \n were seen */
5291
5292 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5293 cd->external_flags |= PCRE_HASCRORLF;
5294
5295 /* Set the first and required bytes appropriately. If no previous first
5296 byte, set it from this character, but revert to none on a zero repeat.
5297 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5298 repeat. */
5299
5300 if (firstbyte == REQ_UNSET)
5301 {
5302 zerofirstbyte = REQ_NONE;
5303 zeroreqbyte = reqbyte;
5304
5305 /* If the character is more than one byte long, we can set firstbyte
5306 only if it is not to be matched caselessly. */
5307
5308 if (mclength == 1 || req_caseopt == 0)
5309 {
5310 firstbyte = mcbuffer[0] | req_caseopt;
5311 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5312 }
5313 else firstbyte = reqbyte = REQ_NONE;
5314 }
5315
5316 /* firstbyte was previously set; we can set reqbyte only if the length is
5317 1 or the matching is caseful. */
5318
5319 else
5320 {
5321 zerofirstbyte = firstbyte;
5322 zeroreqbyte = reqbyte;
5323 if (mclength == 1 || req_caseopt == 0)
5324 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5325 }
5326
5327 break; /* End of literal character handling */
5328 }
5329 } /* end of big loop */
5330
5331
5332 /* Control never reaches here by falling through, only by a goto for all the
5333 error states. Pass back the position in the pattern so that it can be displayed
5334 to the user for diagnosing the error. */
5335
5336 FAILED:
5337 *ptrptr = ptr;
5338 return FALSE;
5339 }
5340
5341
5342
5343
5344 /*************************************************
5345 * Compile sequence of alternatives *
5346 *************************************************/
5347
5348 /* On entry, ptr is pointing past the bracket character, but on return it
5349 points to the closing bracket, or vertical bar, or end of string. The code
5350 variable is pointing at the byte into which the BRA operator has been stored.
5351 If the ims options are changed at the start (for a (?ims: group) or during any
5352 branch, we need to insert an OP_OPT item at the start of every following branch
5353 to ensure they get set correctly at run time, and also pass the new options
5354 into every subsequent branch compile.
5355
5356 This function is used during the pre-compile phase when we are trying to find
5357 out the amount of memory needed, as well as during the real compile phase. The
5358 value of lengthptr distinguishes the two phases.
5359
5360 Arguments:
5361 options option bits, including any changes for this subpattern
5362 oldims previous settings of ims option bits
5363 codeptr -> the address of the current code pointer
5364 ptrptr -> the address of the current pattern pointer
5365 errorcodeptr -> pointer to error code variable
5366 lookbehind TRUE if this is a lookbehind assertion
5367 reset_bracount TRUE to reset the count for each branch
5368 skipbytes skip this many bytes at start (for brackets and OP_COND)
5369 firstbyteptr place to put the first required character, or a negative number
5370 reqbyteptr place to put the last required character, or a negative number
5371 bcptr pointer to the chain of currently open branches
5372 cd points to the data block with tables pointers etc.
5373 lengthptr NULL during the real compile phase
5374 points to length accumulator during pre-compile phase
5375
5376 Returns: TRUE on success
5377 */
5378
5379 static BOOL
5380 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5381 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5382 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5383 int *lengthptr)
5384 {
5385 const uschar *ptr = *ptrptr;
5386 uschar *code = *codeptr;
5387 uschar *last_branch = code;
5388 uschar *start_bracket = code;
5389 uschar *reverse_count = NULL;
5390 int firstbyte, reqbyte;
5391 int branchfirstbyte, branchreqbyte;
5392 int length;
5393 int orig_bracount;
5394 int max_bracount;
5395 branch_chain bc;
5396
5397 bc.outer = bcptr;
5398 bc.current = code;
5399
5400 firstbyte = reqbyte = REQ_UNSET;
5401
5402 /* Accumulate the length for use in the pre-compile phase. Start with the
5403 length of the BRA and KET and any extra bytes that are required at the
5404 beginning. We accumulate in a local variable to save frequent testing of
5405 lenthptr for NULL. We cannot do this by looking at the value of code at the
5406 start and end of each alternative, because compiled items are discarded during
5407 the pre-compile phase so that the work space is not exceeded. */
5408
5409 length = 2 + 2*LINK_SIZE + skipbytes;
5410
5411 /* WARNING: If the above line is changed for any reason, you must also change
5412 the code that abstracts option settings at the start of the pattern and makes
5413 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5414 pre-compile phase to find out whether anything has yet been compiled or not. */
5415
5416 /* Offset is set zero to mark that this bracket is still open */
5417
5418 PUT(code, 1, 0);
5419 code += 1 + LINK_SIZE + skipbytes;
5420
5421 /* Loop for each alternative branch */
5422
5423 orig_bracount = max_bracount = cd->bracount;
5424 for (;;)
5425 {
5426 /* For a (?| group, reset the capturing bracket count so that each branch
5427 uses the same numbers. */
5428
5429 if (reset_bracount) cd->bracount = orig_bracount;
5430
5431 /* Handle a change of ims options at the start of the branch */
5432
5433 if ((options & PCRE_IMS) != oldims)
5434 {
5435 *code++ = OP_OPT;
5436 *code++ = options & PCRE_IMS;
5437 length += 2;
5438 }
5439
5440 /* Set up dummy OP_REVERSE if lookbehind assertion */
5441
5442 if (lookbehind)
5443 {
5444 *code++ = OP_REVERSE;
5445 reverse_count = code;
5446 PUTINC(code, 0, 0);
5447 length += 1 + LINK_SIZE;
5448 }
5449
5450 /* Now compile the branch; in the pre-compile phase its length gets added
5451 into the length. */
5452
5453 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5454 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5455 {
5456 *ptrptr = ptr;
5457 return FALSE;
5458 }
5459
5460 /* Keep the highest bracket count in case (?| was used and some branch
5461 has fewer than the rest. */
5462
5463 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5464
5465 /* In the real compile phase, there is some post-processing to be done. */
5466
5467 if (lengthptr == NULL)
5468 {
5469 /* If this is the first branch, the firstbyte and reqbyte values for the
5470 branch become the values for the regex. */
5471
5472 if (*last_branch != OP_ALT)
5473 {
5474 firstbyte = branchfirstbyte;
5475 reqbyte = branchreqbyte;
5476 }
5477
5478 /* If this is not the first branch, the first char and reqbyte have to
5479 match the values from all the previous branches, except that if the
5480 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5481 and we set REQ_VARY for the regex. */
5482
5483 else
5484 {
5485 /* If we previously had a firstbyte, but it doesn't match the new branch,
5486 we have to abandon the firstbyte for the regex, but if there was
5487 previously no reqbyte, it takes on the value of the old firstbyte. */
5488
5489 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5490 {
5491 if (reqbyte < 0) reqbyte = firstbyte;
5492 firstbyte = REQ_NONE;
5493 }
5494
5495 /* If we (now or from before) have no firstbyte, a firstbyte from the
5496 branch becomes a reqbyte if there isn't a branch reqbyte. */
5497
5498 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5499 branchreqbyte = branchfirstbyte;
5500
5501 /* Now ensure that the reqbytes match */
5502
5503 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5504 reqbyte = REQ_NONE;
5505 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5506 }
5507
5508 /* If lookbehind, check that this branch matches a fixed-length string, and
5509 put the length into the OP_REVERSE item. Temporarily mark the end of the
5510 branch with OP_END. */
5511
5512 if (lookbehind)
5513 {
5514 int fixed_length;
5515 *code = OP_END;
5516 fixed_length = find_fixedlength(last_branch, options);
5517 DPRINTF(("fixed length = %d\n", fixed_length));
5518 if (fixed_length < 0)
5519 {
5520 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5521 *ptrptr = ptr;
5522 return FALSE;
5523 }
5524 PUT(reverse_count, 0, fixed_length);
5525 }
5526 }
5527
5528 /* Reached end of expression, either ')' or end of pattern. In the real
5529 compile phase, go back through the alternative branches and reverse the chain
5530 of offsets, with the field in the BRA item now becoming an offset to the
5531 first alternative. If there are no alternatives, it points to the end of the
5532 group. The length in the terminating ket is always the length of the whole
5533 bracketed item. If any of the ims options were changed inside the group,
5534 compile a resetting op-code following, except at the very end of the pattern.
5535 Return leaving the pointer at the terminating char. */
5536
5537 if (*ptr != '|')
5538 {
5539 if (lengthptr == NULL)
5540 {
5541 int branch_length = code - last_branch;
5542 do
5543 {
5544 int prev_length = GET(last_branch, 1);
5545 PUT(last_branch, 1, branch_length);
5546 branch_length = prev_length;
5547 last_branch -= branch_length;
5548 }
5549 while (branch_length > 0);
5550 }
5551
5552 /* Fill in the ket */
5553
5554 *code = OP_KET;
5555 PUT(code, 1, code - start_bracket);
5556 code += 1 + LINK_SIZE;
5557
5558 /* Resetting option if needed */
5559
5560 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5561 {
5562 *code++ = OP_OPT;
5563 *code++ = oldims;
5564 length += 2;
5565 }
5566
5567 /* Retain the highest bracket number, in case resetting was used. */
5568
5569 cd->bracount = max_bracount;
5570
5571 /* Set values to pass back */
5572
5573 *codeptr = code;
5574 *ptrptr = ptr;
5575 *firstbyteptr = firstbyte;
5576 *reqbyteptr = reqbyte;
5577 if (lengthptr != NULL)
5578 {
5579 if (OFLOW_MAX - *lengthptr < length)
5580 {
5581 *errorcodeptr = ERR20;
5582 return FALSE;
5583 }
5584 *lengthptr += length;
5585 }
5586 return TRUE;
5587 }
5588
5589 /* Another branch follows. In the pre-compile phase, we can move the code
5590 pointer back to where it was for the start of the first branch. (That is,
5591 pretend that each branch is the only one.)
5592
5593 In the real compile phase, insert an ALT node. Its length field points back
5594 to the previous branch while the bracket remains open. At the end the chain
5595 is reversed. It's done like this so that the start of the bracket has a
5596 zero offset until it is closed, making it possible to detect recursion. */
5597
5598 if (lengthptr != NULL)
5599 {
5600 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5601 length += 1 + LINK_SIZE;
5602 }
5603 else
5604 {
5605 *code = OP_ALT;
5606 PUT(code, 1, code - last_branch);
5607 bc.current = last_branch = code;
5608 code += 1 + LINK_SIZE;
5609 }
5610
5611 ptr++;
5612 }
5613 /* Control never reaches here */
5614 }
5615
5616
5617
5618
5619 /*************************************************
5620 * Check for anchored expression *
5621 *************************************************/
5622
5623 /* Try to find out if this is an anchored regular expression. Consider each
5624 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5625 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5626 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5627 counts, since OP_CIRC can match in the middle.
5628
5629 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5630 This is the code for \G, which means "match at start of match position, taking
5631 into account the match offset".
5632
5633 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5634 because that will try the rest of the pattern at all possible matching points,
5635 so there is no point trying again.... er ....
5636
5637 .... except when the .* appears inside capturing parentheses, and there is a
5638 subsequent back reference to those parentheses. We haven't enough information
5639 to catch that case precisely.
5640
5641 At first, the best we could do was to detect when .* was in capturing brackets
5642 and the highest back reference was greater than or equal to that level.
5643 However, by keeping a bitmap of the first 31 back references, we can catch some
5644 of the more common cases more precisely.
5645
5646 Arguments:
5647 code points to start of expression (the bracket)
5648 options points to the options setting
5649 bracket_map a bitmap of which brackets we are inside while testing; this
5650 handles up to substring 31; after that we just have to take
5651 the less precise approach
5652 backref_map the back reference bitmap
5653
5654 Returns: TRUE or FALSE
5655 */
5656
5657 static BOOL
5658 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5659 unsigned int backref_map)
5660 {
5661 do {
5662 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5663 options, PCRE_MULTILINE, FALSE);
5664 register int op = *scode;
5665
5666 /* Non-capturing brackets */
5667
5668 if (op == OP_BRA)
5669 {
5670 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5671 }
5672
5673 /* Capturing brackets */
5674
5675 else if (op == OP_CBRA)
5676 {
5677 int n = GET2(scode, 1+LINK_SIZE);
5678 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5679 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5680 }
5681
5682 /* Other brackets */
5683
5684 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5685 {
5686 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5687 }
5688
5689 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5690 are or may be referenced. */
5691
5692 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5693 op == OP_TYPEPOSSTAR) &&
5694 (*options & PCRE_DOTALL) != 0)
5695 {
5696 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5697 }
5698
5699 /* Check for explicit anchoring */
5700
5701 else if (op != OP_SOD && op != OP_SOM &&
5702 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5703 return FALSE;
5704 code += GET(code, 1);
5705 }
5706 while (*code == OP_ALT); /* Loop for each alternative */
5707 return TRUE;
5708 }
5709
5710
5711
5712 /*************************************************
5713 * Check for starting with ^ or .* *
5714 *************************************************/
5715
5716 /* This is called to find out if every branch starts with ^ or .* so that
5717 "first char" processing can be done to speed things up in multiline
5718 matching and for non-DOTALL patterns that start with .* (which must start at
5719 the beginning or after \n). As in the case of is_anchored() (see above), we
5720 have to take account of back references to capturing brackets that contain .*
5721 because in that case we can't make the assumption.
5722
5723 Arguments:
5724 code points to start of expression (the bracket)
5725 bracket_map a bitmap of which brackets we are inside while testing; this
5726 handles up to substring 31; after that we just have to take
5727 the less precise approach
5728 backref_map the back reference bitmap
5729
5730 Returns: TRUE or FALSE
5731 */
5732
5733 static BOOL
5734 is_startline(const uschar *code, unsigned int bracket_map,
5735 unsigned int backref_map)
5736 {
5737 do {
5738 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5739 NULL, 0, FALSE);
5740 register int op = *scode;
5741
5742 /* Non-capturing brackets */
5743
5744 if (op == OP_BRA)
5745 {
5746 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5747 }
5748
5749 /* Capturing brackets */
5750
5751 else if (op == OP_CBRA)
5752 {
5753 int n = GET2(scode, 1+LINK_SIZE);
5754 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5755 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5756 }
5757
5758 /* Other brackets */
5759
5760 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5761 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5762
5763 /* .* means "start at start or after \n" if it isn't in brackets that
5764 may be referenced. */
5765
5766 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5767 {
5768 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5769 }
5770
5771 /* Check for explicit circumflex */
5772
5773 else if (op != OP_CIRC) return FALSE;
5774
5775 /* Move on to the next alternative */
5776
5777 code += GET(code, 1);
5778 }
5779 while (*code == OP_ALT); /* Loop for each alternative */
5780 return TRUE;
5781 }
5782
5783
5784
5785 /*************************************************
5786 * Check for asserted fixed first char *
5787 *************************************************/
5788
5789 /* During compilation, the "first char" settings from forward assertions are
5790 discarded, because they can cause conflicts with actual literals that follow.
5791 However, if we end up without a first char setting for an unanchored pattern,
5792 it is worth scanning the regex to see if there is an initial asserted first
5793 char. If all branches start with the same asserted char, or with a bracket all
5794 of whose alternatives start with the same asserted char (recurse ad lib), then
5795 we return that char, otherwise -1.
5796
5797 Arguments:
5798 code points to start of expression (the bracket)
5799 options pointer to the options (used to check casing changes)
5800 inassert TRUE if in an assertion
5801
5802 Returns: -1 or the fixed first char
5803 */
5804
5805 static int
5806 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5807 {
5808 register int c = -1;
5809 do {
5810 int d;
5811 const uschar *scode =
5812 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5813 register int op = *scode;
5814
5815 switch(op)
5816 {
5817 default:
5818 return -1;
5819
5820 case OP_BRA:
5821 case OP_CBRA:
5822 case OP_ASSERT:
5823 case OP_ONCE:
5824 case OP_COND:
5825 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5826 return -1;
5827 if (c < 0) c = d; else if (c != d) return -1;
5828 break;
5829
5830 case OP_EXACT: /* Fall through */
5831 scode += 2;
5832
5833 case OP_CHAR:
5834 case OP_CHARNC:
5835 case OP_PLUS:
5836 case OP_MINPLUS:
5837 case OP_POSPLUS:
5838 if (!inassert) return -1;
5839 if (c < 0)
5840 {
5841 c = scode[1];
5842 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5843 }
5844 else if (c != scode[1]) return -1;
5845 break;
5846 }
5847
5848 code += GET(code, 1);
5849 }
5850 while (*code == OP_ALT);
5851 return c;
5852 }
5853
5854
5855
5856 /*************************************************
5857 * Compile a Regular Expression *
5858 *************************************************/
5859
5860 /* This function takes a string and returns a pointer to a block of store
5861 holding a compiled version of the expression. The original API for this
5862 function had no error code return variable; it is retained for backwards
5863 compatibility. The new function is given a new name.
5864
5865 Arguments:
5866 pattern the regular expression
5867 options various option bits
5868 errorcodeptr pointer to error code variable (pcre_compile2() only)
5869 can be NULL if you don't want a code value
5870 errorptr pointer to pointer to error text
5871 erroroffset ptr offset in pattern where error was detected
5872 tables pointer to character tables or NULL
5873
5874 Returns: pointer to compiled data block, or NULL on error,
5875 with errorptr and erroroffset set
5876 */
5877
5878 PCRE_EXP_DEFN pcre *
5879 pcre_compile(const char *pattern, int options, const char **errorptr,
5880 int *erroroffset, const unsigned char *tables)
5881 {
5882 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5883 }
5884
5885
5886 PCRE_EXP_DEFN pcre *
5887 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5888 const char **errorptr, int *erroroffset, const unsigned char *tables)
5889 {
5890 real_pcre *re;
5891 int length = 1; /* For final END opcode */
5892 int firstbyte, reqbyte, newline;
5893 int errorcode = 0;
5894 int skipatstart = 0;
5895 #ifdef SUPPORT_UTF8
5896 BOOL utf8;
5897 #endif
5898 size_t size;
5899 uschar *code;
5900 const uschar *codestart;
5901 const uschar *ptr;
5902 compile_data compile_block;
5903 compile_data *cd = &compile_block;
5904
5905 /* This space is used for "compiling" into during the first phase, when we are
5906 computing the amount of memory that is needed. Compiled items are thrown away
5907 as soon as possible, so that a fairly large buffer should be sufficient for
5908 this purpose. The same space is used in the second phase for remembering where
5909 to fill in forward references to subpatterns. */
5910
5911 uschar cworkspace[COMPILE_WORK_SIZE];
5912
5913 /* Set this early so that early errors get offset 0. */
5914
5915 ptr = (const uschar *)pattern;
5916
5917 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5918 can do is just return NULL, but we can set a code value if there is a code
5919 pointer. */
5920
5921 if (errorptr == NULL)
5922 {
5923 if (errorcodeptr != NULL) *errorcodeptr = 99;
5924 return NULL;
5925 }
5926
5927 *errorptr = NULL;
5928 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5929
5930 /* However, we can give a message for this error */
5931
5932 if (erroroffset == NULL)
5933 {
5934 errorcode = ERR16;
5935 goto PCRE_EARLY_ERROR_RETURN2;
5936 }
5937
5938 *erroroffset = 0;
5939
5940 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5941
5942 #ifdef SUPPORT_UTF8
5943 utf8 = (options & PCRE_UTF8) != 0;
5944 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5945 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5946 {
5947 errorcode = ERR44;
5948 goto PCRE_EARLY_ERROR_RETURN2;
5949 }
5950 #else
5951 if ((options & PCRE_UTF8) != 0)
5952 {
5953 errorcode = ERR32;
5954 goto PCRE_EARLY_ERROR_RETURN;
5955 }
5956 #endif
5957
5958 if ((options & ~PUBLIC_OPTIONS) != 0)
5959 {
5960 errorcode = ERR17;
5961 goto PCRE_EARLY_ERROR_RETURN;
5962 }
5963
5964 /* Set up pointers to the individual character tables */
5965
5966 if (tables == NULL) tables = _pcre_default_tables;
5967 cd->lcc = tables + lcc_offset;
5968 cd->fcc = tables + fcc_offset;
5969 cd->cbits = tables + cbits_offset;
5970 cd->ctypes = tables + ctypes_offset;
5971
5972 /* Check for global one-time settings at the start of the pattern, and remember
5973 the offset for later. */
5974
5975 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5976 {
5977 int newnl = 0;
5978 int newbsr = 0;
5979
5980 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5981 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5982 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5983 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5984 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5985 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5986 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5987 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5988 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
5989 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5990
5991 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5992 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5993 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5994 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5995
5996 if (newnl != 0)
5997 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5998 else if (newbsr != 0)
5999 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6000 else break;
6001 }
6002
6003 /* Check validity of \R options. */
6004
6005 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6006 {
6007 case 0:
6008 case PCRE_BSR_ANYCRLF:
6009 case PCRE_BSR_UNICODE:
6010 break;
6011 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6012 }
6013
6014 /* Handle different types of newline. The three bits give seven cases. The
6015 current code allows for fixed one- or two-byte sequences, plus "any" and
6016 "anycrlf". */
6017
6018 switch (options & PCRE_NEWLINE_BITS)
6019 {
6020 case 0: newline = NEWLINE; break; /* Build-time default */
6021 case PCRE_NEWLINE_CR: newline = '\r'; break;
6022 case PCRE_NEWLINE_LF: newline = '\n'; break;
6023 case PCRE_NEWLINE_CR+
6024 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
6025 case PCRE_NEWLINE_ANY: newline = -1; break;
6026 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6027 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6028 }
6029
6030 if (newline == -2)
6031 {
6032 cd->nltype = NLTYPE_ANYCRLF;
6033 }
6034 else if (newline < 0)
6035 {
6036 cd->nltype = NLTYPE_ANY;
6037 }
6038 else
6039 {
6040 cd->nltype = NLTYPE_FIXED;
6041 if (newline > 255)
6042 {
6043 cd->nllen = 2;
6044 cd->nl[0] = (newline >> 8) & 255;
6045 cd->nl[1] = newline & 255;
6046 }
6047 else
6048 {
6049 cd->nllen = 1;
6050 cd->nl[0] = newline;
6051 }
6052 }
6053
6054 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6055 references to help in deciding whether (.*) can be treated as anchored or not.
6056 */
6057
6058 cd->top_backref = 0;
6059 cd->backref_map = 0;
6060
6061 /* Reflect pattern for debugging output */
6062
6063 DPRINTF(("------------------------------------------------------------------\n"));
6064 DPRINTF(("%s\n", pattern));
6065
6066 /* Pretend to compile the pattern while actually just accumulating the length
6067 of memory required. This behaviour is triggered by passing a non-NULL final
6068 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6069 to compile parts of the pattern into; the compiled code is discarded when it is
6070 no longer needed, so hopefully this workspace will never overflow, though there
6071 is a test for its doing so. */
6072
6073 cd->bracount = cd->final_bracount = 0;
6074 cd->names_found = 0;
6075 cd->name_entry_size = 0;
6076 cd->name_table = NULL;
6077 cd->start_workspace = cworkspace;
6078 cd->start_code = cworkspace;
6079 cd->hwm = cworkspace;
6080 cd->start_pattern = (const uschar *)pattern;
6081 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6082 cd->req_varyopt = 0;
6083 cd->external_options = options;
6084 cd->external_flags = 0;
6085
6086 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6087 don't need to look at the result of the function here. The initial options have
6088 been put into the cd block so that they can be changed if an option setting is
6089 found within the regex right at the beginning. Bringing initial option settings
6090 outside can help speed up starting point checks. */
6091
6092 ptr += skipatstart;
6093 code = cworkspace;
6094 *code = OP_BRA;
6095 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6096 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6097 &length);
6098 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6099
6100 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6101 cd->hwm - cworkspace));
6102
6103 if (length > MAX_PATTERN_SIZE)
6104 {
6105 errorcode = ERR20;
6106 goto PCRE_EARLY_ERROR_RETURN;
6107 }
6108
6109 /* Compute the size of data block needed and get it, either from malloc or
6110 externally provided function. Integer overflow should no longer be possible
6111 because nowadays we limit the maximum value of cd->names_found and
6112 cd->name_entry_size. */
6113
6114 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6115 re = (real_pcre *)(pcre_malloc)(size);
6116
6117 if (re == NULL)
6118 {
6119 errorcode = ERR21;
6120 goto PCRE_EARLY_ERROR_RETURN;
6121 }
6122
6123 /* Put in the magic number, and save the sizes, initial options, internal
6124 flags, and character table pointer. NULL is used for the default character
6125 tables. The nullpad field is at the end; it's there to help in the case when a
6126 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6127 pointers. */
6128
6129 re->magic_number = MAGIC_NUMBER;
6130 re->size = size;
6131 re->options = cd->external_options;
6132 re->flags = cd->external_flags;
6133 re->dummy1 = 0;
6134 re->first_byte = 0;
6135 re->req_byte = 0;
6136 re->name_table_offset = sizeof(real_pcre);
6137 re->name_entry_size = cd->name_entry_size;
6138 re->name_count = cd->names_found;
6139 re->ref_count = 0;
6140 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6141 re->nullpad = NULL;
6142
6143 /* The starting points of the name/number translation table and of the code are
6144 passed around in the compile data block. The start/end pattern and initial
6145 options are already set from the pre-compile phase, as is the name_entry_size
6146 field. Reset the bracket count and the names_found field. Also reset the hwm
6147 field; this time it's used for remembering forward references to subpatterns.
6148 */
6149
6150 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6151 cd->bracount = 0;
6152 cd->names_found = 0;
6153 cd->name_table = (uschar *)re + re->name_table_offset;
6154 codestart = cd->name_table + re->name_entry_size * re->name_count;
6155 cd->start_code = codestart;
6156 cd->hwm = cworkspace;
6157 cd->req_varyopt = 0;
6158 cd->had_accept = FALSE;
6159
6160 /* Set up a starting, non-extracting bracket, then compile the expression. On
6161 error, errorcode will be set non-zero, so we don't need to look at the result
6162 of the function here. */
6163
6164 ptr = (const uschar *)pattern + skipatstart;
6165 code = (uschar *)codestart;
6166 *code = OP_BRA;
6167 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6168 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6169 re->top_bracket = cd->bracount;
6170 re->top_backref = cd->top_backref;
6171 re->flags = cd->external_flags;
6172
6173 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6174
6175 /* If not reached end of pattern on success, there's an excess bracket. */
6176
6177 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6178
6179 /* Fill in the terminating state and check for disastrous overflow, but
6180 if debugging, leave the test till after things are printed out. */
6181
6182 *code++ = OP_END;
6183
6184 #ifndef DEBUG
6185 if (code - codestart > length) errorcode = ERR23;
6186 #endif
6187
6188 /* Fill in any forward references that are required. */
6189
6190 while (errorcode == 0 && cd->hwm > cworkspace)
6191 {
6192 int offset, recno;
6193 const uschar *groupptr;
6194 cd->hwm -= LINK_SIZE;
6195 offset = GET(cd->hwm, 0);
6196 recno = GET(codestart, offset);
6197 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6198 if (groupptr == NULL) errorcode = ERR53;
6199 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6200 }
6201
6202 /* Give an error if there's a back reference to a non-existent capturing
6203 subpattern. */
6204
6205 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6206
6207 /* Failed to compile, or error while post-processing */
6208
6209 if (errorcode != 0)
6210 {
6211 (pcre_free)(re);
6212 PCRE_EARLY_ERROR_RETURN:
6213 *erroroffset = ptr - (const uschar *)pattern;
6214 PCRE_EARLY_ERROR_RETURN2:
6215 *errorptr = find_error_text(errorcode);
6216 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6217 return NULL;
6218 }
6219
6220 /* If the anchored option was not passed, set the flag if we can determine that
6221 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6222 as starting with .* when DOTALL is set).
6223
6224 Otherwise, if we know what the first byte has to be, save it, because that
6225 speeds up unanchored matches no end. If not, see if we can set the
6226 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6227 start with ^, and also when all branches start with .* for non-DOTALL matches.
6228 */
6229
6230 if ((re->options & PCRE_ANCHORED) == 0)
6231 {
6232 int temp_options = re->options; /* May get changed during these scans */
6233 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6234 re->options |= PCRE_ANCHORED;
6235 else
6236 {
6237 if (firstbyte < 0)
6238 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6239 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6240 {
6241 int ch = firstbyte & 255;
6242 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6243 cd->fcc[ch] == ch)? ch : firstbyte;
6244 re->flags |= PCRE_FIRSTSET;
6245 }
6246 else if (is_startline(codestart, 0, cd->backref_map))
6247 re->flags |= PCRE_STARTLINE;
6248 }
6249 }
6250
6251 /* For an anchored pattern, we use the "required byte" only if it follows a
6252 variable length item in the regex. Remove the caseless flag for non-caseable
6253 bytes. */
6254
6255 if (reqbyte >= 0 &&
6256 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6257 {
6258 int ch = reqbyte & 255;
6259 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6260 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6261 re->flags |= PCRE_REQCHSET;
6262 }
6263
6264 /* Print out the compiled data if debugging is enabled. This is never the
6265 case when building a production library. */
6266
6267 #ifdef DEBUG
6268
6269 printf("Length = %d top_bracket = %d top_backref = %d\n",
6270 length, re->top_bracket, re->top_backref);
6271
6272 printf("Options=%08x\n", re->options);
6273
6274 if ((re->flags & PCRE_FIRSTSET) != 0)
6275 {
6276 int ch = re->first_byte & 255;
6277 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6278 "" : " (caseless)";
6279 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6280 else printf("First char = \\x%02x%s\n", ch, caseless);
6281 }
6282
6283 if ((re->flags & PCRE_REQCHSET) != 0)
6284 {
6285 int ch = re->req_byte & 255;
6286 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6287 "" : " (caseless)";
6288 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6289 else printf("Req char = \\x%02x%s\n", ch, caseless);
6290 }
6291
6292 pcre_printint(re, stdout, TRUE);
6293
6294 /* This check is done here in the debugging case so that the code that
6295 was compiled can be seen. */
6296
6297 if (code - codestart > length)
6298 {
6299 (pcre_free)(re);
6300 *errorptr = find_error_text(ERR23);
6301 *erroroffset = ptr - (uschar *)pattern;
6302 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6303 return NULL;
6304 }
6305 #endif /* DEBUG */
6306
6307 return (pcre *)re;
6308 }
6309
6310 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12