/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 150 - (show annotations) (download)
Tue Apr 17 08:22:40 2007 UTC (7 years ago) by ph10
File MIME type: text/plain
File size: 172661 byte(s)
Update HTML documentation.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
/* These three macros name identifiers (cd, start_pattern, end_pattern) and are
defined before pcre_internal.h is included below. NOTE(review): presumably
macros in that header expand them - confirm against pcre_internal.h. */
45 #define NLBLOCK cd /* Block containing newline information */
46 #define PSSTART start_pattern /* Field containing processed string start */
47 #define PSEND end_pattern /* Field containing processed string end */
48
49
#include <limits.h>

#include "pcre_internal.h"
51
52
53 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54 used by pcretest. DEBUG is not defined when building a production library. */
55
56 #ifdef DEBUG
57 #include "pcre_printint.src"
58 #endif
59
60
61 /*************************************************
62 * Code parameters and static tables *
63 *************************************************/
64
65 /* This value specifies the size of stack workspace that is used during the
66 first pre-compile phase that determines how much memory is required. The regex
67 is partly compiled into this space, but the compiled parts are discarded as
68 soon as they can be, so that hopefully there will never be an overrun. The code
69 does, however, check for an overrun. The largest amount I've seen used is 218,
70 so this number is very generous.
71
72 The same workspace is used during the second, actual compile phase for
73 remembering forward references to groups so that they can be filled in at the
74 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75 is 4 there is plenty of room. */
76
/* Size of the compile-time workspace described above. NOTE(review): the unit
is not stated at this definition; the preceding comment (entries of LINK_SIZE
bytes) suggests bytes - confirm at the point of use. */
77 #define COMPILE_WORK_SIZE (4096)
78
79
80 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81 are simple data values; negative values are for special things like \d and so
82 on. Zero means further processing is needed (for things like \x), or the escape
83 is invalid. */
84
/* check_escape() indexes the ASCII table with (c - '0') after checking that c
lies in the range '0' to 'z', so the table has one entry for every character
code in that range, in code order. */
85 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
86 static const short int escapes[] = {
87 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
91 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96 0, 0, -ESC_z /* x - z */
97 };
98
/* check_escape() indexes the EBCDIC table with (c - 0x48); the hexadecimal
value in the comment at the start of each row is the character code of that
row's first entry. */
99 #else /* This is the "abnormal" table for EBCDIC systems */
100 static const short int escapes[] = {
101 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
124 };
125 #endif
126
127
128 /* Tables of names of POSIX character classes and their lengths. The list is
129 terminated by a zero length entry. The first three must be alpha, lower, upper,
130 as this is assumed for handling case independence. */
131
/* The fourteen POSIX class names, in table order. */
132 static const char *const posix_names[] = {
133 "alpha", "lower", "upper",
134 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
135 "print", "punct", "space", "word", "xdigit" };
136
/* posix_name_lengths[i] is the length of posix_names[i]; the extra final zero
entry is the list terminator. The two tables must be kept in step. */
137 static const uschar posix_name_lengths[] = {
138 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139
140 /* Table of class bit maps for each POSIX class. Each class is formed from a
141 base map, with an optional addition or removal of another map. Then, for some
142 classes, there is some additional tweaking: for [:blank:] the vertical space
143 characters are removed, and for [:alpha:] and [:alnum:] the underscore
144 character is removed. The triples in the table consist of the base map offset,
145 second map offset or -1 if no second map, and a non-negative value for map
146 addition or a negative value for map subtraction (if there are two maps). The
147 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148 remove vertical space characters, 2 => remove underscore. */
149
/* One triple per class, in the same order as the names in posix_names:
(base map offset, second map offset or -1, add/subtract-and-tweak code). */
150 static const int posix_class_maps[] = {
151 cbit_word, cbit_digit, -2, /* alpha */
152 cbit_lower, -1, 0, /* lower */
153 cbit_upper, -1, 0, /* upper */
154 cbit_word, -1, 2, /* alnum - word without underscore */
155 cbit_print, cbit_cntrl, 0, /* ascii */
156 cbit_space, -1, 1, /* blank - a GNU extension */
157 cbit_cntrl, -1, 0, /* cntrl */
158 cbit_digit, -1, 0, /* digit */
159 cbit_graph, -1, 0, /* graph */
160 cbit_print, -1, 0, /* print */
161 cbit_punct, -1, 0, /* punct */
162 cbit_space, -1, 0, /* space */
163 cbit_word, -1, 0, /* word - a Perl extension */
164 cbit_xdigit,-1, 0 /* xdigit */
165 };
166
167
/* Two-level stringizing, so that a macro argument (MAX_NAME_SIZE,
MAX_NAME_COUNT) is expanded before being turned into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used.

The array itself is const as well as the strings (like posix_names), so that
the table of pointers cannot be modified at run time. The index of each message
is its error number; the numeric comments mark every fifth entry. */

static const char * const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by an (optionally braced) non-zero number"
};
247
248
249 /* Table to identify digits and hex digits. This is used when compiling
250 patterns. Note that the tables in chartables are dependent on the locale, and
251 may mark arbitrary characters as digits - but the PCRE compiling code expects
252 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
253 a private table here. It costs 256 bytes, but it is a lot faster than doing
254 character value tests (at least in some simple cases I timed), and in some
255 applications one wants PCRE to compile efficiently as well as match
256 efficiently.
257
258 For convenience, we use the same bit definitions as in chartables:
259
260 0x04 decimal digit
261 0x08 hexadecimal digit
262
263 Then we can use ctype_digit and ctype_xdigit in the code. */
264
/* Both versions of digitab have 256 entries and are indexed by character
code. An entry of 0x0c (0x04 | 0x08) marks a decimal digit, which is also a
hexadecimal digit; 0x08 alone marks a letter that is a hexadecimal digit only;
0x00 marks everything else. */
265 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
266 static const unsigned char digitab[] =
267 {
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
274 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
275 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
276 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
280 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
300
301 #else /* This is the "abnormal" case, for EBCDIC systems */
302 static const unsigned char digitab[] =
303 {
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
315 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
316 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
317 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
320 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
321 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
328 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
334 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
335 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
336
/* EBCDIC-only table, indexed by character code. In the code visible here it is
read only by check_escape(), which treats a character as alphameric when
(ebcdic_chartab[c] & 0x0E) is non-zero. NOTE(review): the meanings of the
individual bits are not defined in this file - presumably they follow the
ctype bits of the main character tables; confirm before relying on them. */
337 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
338 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
339 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
340 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
342 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
346 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
347 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
349 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
351 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
354 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
355 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
356 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
357 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
358 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
359 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
360 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
361 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
362 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
363 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
364 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
365 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
366 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
367 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
368 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
369 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
370 #endif
371
372
373 /* Definition to allow mutual recursion */
374
/* Forward declaration only: the parameters are unnamed here, so see the
definition of compile_regex() for the meaning of each argument. */
375 static BOOL
376 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
377 int *, branch_chain *, compile_data *, int *);
378
379
380
381 /*************************************************
382 * Handle escapes *
383 *************************************************/
384
385 /* This function is called when a \ has been encountered. It either returns a
386 positive value for a simple escape such as \n, or a negative value which
387 encodes one of the more complicated things such as \d. A backreference to group
388 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
389 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
390 ptr is pointing at the \. On exit, it is on the final character of the escape
391 sequence.
392
393 Arguments:
394 ptrptr points to the pattern position pointer
395 errorcodeptr points to the errorcode variable
396 bracount number of previous extracting brackets
397 options the options bits
398 isclass TRUE if inside a character class
399
400 Returns: zero or positive => a data character
401 negative => a special escape sequence
402 on error, *errorcodeptr is set
403 */
404
405 static int
406 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
407 int options, BOOL isclass)
408 {
409 BOOL utf8 = (options & PCRE_UTF8) != 0;
410 const uschar *ptr = *ptrptr + 1;
411 int c, i;
412
413 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
414 ptr--; /* Set pointer back to the last byte */
415
416 /* If backslash is at the end of the pattern, it's an error. */
417
418 if (c == 0) *errorcodeptr = ERR1;
419
420 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
421 a table. A non-zero result is something that can be returned immediately.
422 Otherwise further processing may be required. */
423
424 #ifndef EBCDIC /* ASCII coding */
425 else if (c < '0' || c > 'z') {} /* Not alphameric */
426 else if ((i = escapes[c - '0']) != 0) c = i;
427
428 #else /* EBCDIC coding */
429 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
430 else if ((i = escapes[c - 0x48]) != 0) c = i;
431 #endif
432
433 /* Escapes that need further processing, or are illegal. */
434
435 else
436 {
437 const uschar *oldptr;
438 BOOL braced, negated;
439
440 switch (c)
441 {
442 /* A number of Perl escapes are not handled by PCRE. We give an explicit
443 error. */
444
445 case 'l':
446 case 'L':
447 case 'N':
448 case 'u':
449 case 'U':
450 *errorcodeptr = ERR37;
451 break;
452
453 /* \g must be followed by a number, either plain or braced. If positive, it
454 is an absolute backreference. If negative, it is a relative backreference.
455 This is a Perl 5.10 feature. */
456
457 case 'g':
458 if (ptr[1] == '{')
459 {
460 braced = TRUE;
461 ptr++;
462 }
463 else braced = FALSE;
464
465 if (ptr[1] == '-')
466 {
467 negated = TRUE;
468 ptr++;
469 }
470 else negated = FALSE;
471
472 c = 0;
473 while ((digitab[ptr[1]] & ctype_digit) != 0)
474 c = c * 10 + *(++ptr) - '0';
475
476 if (c == 0 || (braced && *(++ptr) != '}'))
477 {
478 *errorcodeptr = ERR57;
479 return 0;
480 }
481
482 if (negated)
483 {
484 if (c > bracount)
485 {
486 *errorcodeptr = ERR15;
487 return 0;
488 }
489 c = bracount - (c - 1);
490 }
491
492 c = -(ESC_REF + c);
493 break;
494
495 /* The handling of escape sequences consisting of a string of digits
496 starting with one that is not zero is not straightforward. By experiment,
497 the way Perl works seems to be as follows:
498
499 Outside a character class, the digits are read as a decimal number. If the
500 number is less than 10, or if there are that many previous extracting
501 left brackets, then it is a back reference. Otherwise, up to three octal
502 digits are read to form an escaped byte. Thus \123 is likely to be octal
503 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
504 value is greater than 377, the least significant 8 bits are taken. Inside a
505 character class, \ followed by a digit is always an octal number. */
506
507 case '1': case '2': case '3': case '4': case '5':
508 case '6': case '7': case '8': case '9':
509
510 if (!isclass)
511 {
512 oldptr = ptr;
513 c -= '0';
514 while ((digitab[ptr[1]] & ctype_digit) != 0)
515 c = c * 10 + *(++ptr) - '0';
516 if (c < 10 || c <= bracount)
517 {
518 c = -(ESC_REF + c);
519 break;
520 }
521 ptr = oldptr; /* Put the pointer back and fall through */
522 }
523
524 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
525 generates a binary zero byte and treats the digit as a following literal.
526 Thus we have to pull back the pointer by one. */
527
528 if ((c = *ptr) >= '8')
529 {
530 ptr--;
531 c = 0;
532 break;
533 }
534
535 /* \0 always starts an octal number, but we may drop through to here with a
536 larger first octal digit. The original code used just to take the least
537 significant 8 bits of octal numbers (I think this is what early Perls used
538 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
539 than 3 octal digits. */
540
541 case '0':
542 c -= '0';
543 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
544 c = c * 8 + *(++ptr) - '0';
545 if (!utf8 && c > 255) *errorcodeptr = ERR51;
546 break;
547
548 /* \x is complicated. \x{ddd} is a character number which can be greater
549 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
550 treated as a data character. */
551
552 case 'x':
553 if (ptr[1] == '{')
554 {
555 const uschar *pt = ptr + 2;
556 int count = 0;
557
558 c = 0;
559 while ((digitab[*pt] & ctype_xdigit) != 0)
560 {
561 register int cc = *pt++;
562 if (c == 0 && cc == '0') continue; /* Leading zeroes */
563 count++;
564
565 #ifndef EBCDIC /* ASCII coding */
566 if (cc >= 'a') cc -= 32; /* Convert to upper case */
567 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
568 #else /* EBCDIC coding */
569 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
570 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
571 #endif
572 }
573
574 if (*pt == '}')
575 {
576 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
577 ptr = pt;
578 break;
579 }
580
581 /* If the sequence of hex digits does not end with '}', then we don't
582 recognize this construct; fall through to the normal \x handling. */
583 }
584
585 /* Read just a single-byte hex-defined char */
586
587 c = 0;
588 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
589 {
590 int cc; /* Some compilers don't like ++ */
591 cc = *(++ptr); /* in initializers */
592 #ifndef EBCDIC /* ASCII coding */
593 if (cc >= 'a') cc -= 32; /* Convert to upper case */
594 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
595 #else /* EBCDIC coding */
596 if (cc <= 'z') cc += 64; /* Convert to upper case */
597 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
598 #endif
599 }
600 break;
601
602 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
603 This coding is ASCII-specific, but then the whole concept of \cx is
604 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
605
606 case 'c':
607 c = *(++ptr);
608 if (c == 0)
609 {
610 *errorcodeptr = ERR2;
611 return 0;
612 }
613
614 #ifndef EBCDIC /* ASCII coding */
615 if (c >= 'a' && c <= 'z') c -= 32;
616 c ^= 0x40;
617 #else /* EBCDIC coding */
618 if (c >= 'a' && c <= 'z') c += 64;
619 c ^= 0xC0;
620 #endif
621 break;
622
623 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
624 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
625 for Perl compatibility, it is a literal. This code looks a bit odd, but
626 there used to be some cases other than the default, and there may be again
627 in future, so I haven't "optimized" it. */
628
629 default:
630 if ((options & PCRE_EXTRA) != 0) switch(c)
631 {
632 default:
633 *errorcodeptr = ERR3;
634 break;
635 }
636 break;
637 }
638 }
639
640 *ptrptr = ptr;
641 return c;
642 }
643
644
645
646 #ifdef SUPPORT_UCP
647 /*************************************************
648 * Handle \P and \p *
649 *************************************************/
650
651 /* This function is called after \P or \p has been encountered, provided that
652 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
653 pointing at the P or p. On exit, it is pointing at the final character of the
654 escape sequence.
655
656 Argument:
657 ptrptr points to the pattern position pointer
658 negptr points to a boolean that is set TRUE for negation else FALSE
659 dptr points to an int that is set to the detailed property value
660 errorcodeptr points to the error code variable
661
662 Returns: type value from the _pcre_utt table, or -1 for an invalid type
663 */
664
665 static int
666 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
667 {
668 int c, i, bot, top;
669 const uschar *ptr = *ptrptr;
670 char name[32];
671
672 c = *(++ptr);
673 if (c == 0) goto ERROR_RETURN;
674
675 *negptr = FALSE;
676
677 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
678 negation. */
679
680 if (c == '{')
681 {
682 if (ptr[1] == '^')
683 {
684 *negptr = TRUE;
685 ptr++;
686 }
687 for (i = 0; i < sizeof(name) - 1; i++)
688 {
689 c = *(++ptr);
690 if (c == 0) goto ERROR_RETURN;
691 if (c == '}') break;
692 name[i] = c;
693 }
694 if (c !='}') goto ERROR_RETURN;
695 name[i] = 0;
696 }
697
698 /* Otherwise there is just one following character */
699
700 else
701 {
702 name[0] = c;
703 name[1] = 0;
704 }
705
706 *ptrptr = ptr;
707
708 /* Search for a recognized property name using binary chop */
709
710 bot = 0;
711 top = _pcre_utt_size;
712
713 while (bot < top)
714 {
715 i = (bot + top) >> 1;
716 c = strcmp(name, _pcre_utt[i].name);
717 if (c == 0)
718 {
719 *dptr = _pcre_utt[i].value;
720 return _pcre_utt[i].type;
721 }
722 if (c > 0) bot = i + 1; else top = i;
723 }
724
725 *errorcodeptr = ERR47;
726 *ptrptr = ptr;
727 return -1;
728
729 ERROR_RETURN:
730 *errorcodeptr = ERR46;
731 *ptrptr = ptr;
732 return -1;
733 }
734 #endif
735
736
737
738
739 /*************************************************
740 * Check for counted repeat *
741 *************************************************/
742
743 /* This function is called when a '{' is encountered in a place where it might
744 start a quantifier. It looks ahead to see if it really is a quantifier or not.
745 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
746 where the ddds are digits.
747
748 Arguments:
749 p pointer to the first char after '{'
750
751 Returns: TRUE or FALSE
752 */
753
754 static BOOL
755 is_counted_repeat(const uschar *p)
756 {
757 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
758 while ((digitab[*p] & ctype_digit) != 0) p++;
759 if (*p == '}') return TRUE;
760
761 if (*p++ != ',') return FALSE;
762 if (*p == '}') return TRUE;
763
764 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
765 while ((digitab[*p] & ctype_digit) != 0) p++;
766
767 return (*p == '}');
768 }
769
770
771
772 /*************************************************
773 * Read repeat counts *
774 *************************************************/
775
776 /* Read an item of the form {n,m} and return the values. This is called only
777 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
778 so the syntax is guaranteed to be correct, but we need to check the values.
779
780 Arguments:
781 p pointer to first char after '{'
782 minp pointer to int for min
783 maxp pointer to int for max
784 returned as -1 if no max
785 errorcodeptr points to error code variable
786
787 Returns: pointer to '}' on success;
788 current ptr on error, with errorcodeptr set non-zero
789 */
790
791 static const uschar *
792 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
793 {
794 int min = 0;
795 int max = -1;
796
797 /* Read the minimum value and do a paranoid check: a negative value indicates
798 an integer overflow. */
799
800 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
801 if (min < 0 || min > 65535)
802 {
803 *errorcodeptr = ERR5;
804 return p;
805 }
806
807 /* Read the maximum value if there is one, and again do a paranoid on its size.
808 Also, max must not be less than min. */
809
810 if (*p == '}') max = min; else
811 {
812 if (*(++p) != '}')
813 {
814 max = 0;
815 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
816 if (max < 0 || max > 65535)
817 {
818 *errorcodeptr = ERR5;
819 return p;
820 }
821 if (max < min)
822 {
823 *errorcodeptr = ERR4;
824 return p;
825 }
826 }
827 }
828
829 /* Fill in the required variables, and pass back the pointer to the terminating
830 '}'. */
831
832 *minp = min;
833 *maxp = max;
834 return p;
835 }
836
837
838
839 /*************************************************
840 * Find forward referenced subpattern *
841 *************************************************/
842
843 /* This function scans along a pattern's text looking for capturing
844 subpatterns, and counting them. If it finds a named pattern that matches the
845 name it is given, it returns its number. Alternatively, if the name is NULL, it
846 returns when it reaches a given numbered subpattern. This is used for forward
847 references to subpatterns. We know that if (?P< is encountered, the name will
848 be terminated by '>' because that is checked in the first pass.
849
850 Arguments:
851 ptr current position in the pattern
852 count current count of capturing parens so far encountered
853 name name to seek, or NULL if seeking a numbered subpattern
854 lorn name length, or subpattern number if name is NULL
855 xmode TRUE if we are in /x mode
856
857 Returns: the number of the named subpattern, or -1 if not found
858 */
859
860 static int
861 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
862 BOOL xmode)
863 {
864 const uschar *thisname;
865
866 for (; *ptr != 0; ptr++)
867 {
868 int term;
869
870 /* Skip over backslashed characters and also entire \Q...\E */
871
872 if (*ptr == '\\')
873 {
874 if (*(++ptr) == 0) return -1;
875 if (*ptr == 'Q') for (;;)
876 {
877 while (*(++ptr) != 0 && *ptr != '\\');
878 if (*ptr == 0) return -1;
879 if (*(++ptr) == 'E') break;
880 }
881 continue;
882 }
883
884 /* Skip over character classes */
885
886 if (*ptr == '[')
887 {
888 while (*(++ptr) != ']')
889 {
890 if (*ptr == '\\')
891 {
892 if (*(++ptr) == 0) return -1;
893 if (*ptr == 'Q') for (;;)
894 {
895 while (*(++ptr) != 0 && *ptr != '\\');
896 if (*ptr == 0) return -1;
897 if (*(++ptr) == 'E') break;
898 }
899 continue;
900 }
901 }
902 continue;
903 }
904
905 /* Skip comments in /x mode */
906
907 if (xmode && *ptr == '#')
908 {
909 while (*(++ptr) != 0 && *ptr != '\n');
910 if (*ptr == 0) return -1;
911 continue;
912 }
913
914 /* An opening parens must now be a real metacharacter */
915
916 if (*ptr != '(') continue;
917 if (ptr[1] != '?')
918 {
919 count++;
920 if (name == NULL && count == lorn) return count;
921 continue;
922 }
923
924 ptr += 2;
925 if (*ptr == 'P') ptr++; /* Allow optional P */
926
927 /* We have to disambiguate (?<! and (?<= from (?<name> */
928
929 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
930 *ptr != '\'')
931 continue;
932
933 count++;
934
935 if (name == NULL && count == lorn) return count;
936 term = *ptr++;
937 if (term == '<') term = '>';
938 thisname = ptr;
939 while (*ptr != term) ptr++;
940 if (name != NULL && lorn == ptr - thisname &&
941 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
942 return count;
943 }
944
945 return -1;
946 }
947
948
949
950 /*************************************************
951 * Find first significant op code *
952 *************************************************/
953
954 /* This is called by several functions that scan a compiled expression looking
955 for a fixed first character, or an anchoring op code etc. It skips over things
956 that do not influence this. For some calls, a change of option is important.
957 For some calls, it makes sense to skip negative forward and all backward
958 assertions, and also the \b assertion; for others it does not.
959
960 Arguments:
961 code pointer to the start of the group
962 options pointer to external options
963 optbit the option bit whose changing is significant, or
964 zero if none are
965 skipassert TRUE if certain assertions are to be skipped
966
967 Returns: pointer to the first significant opcode
968 */
969
970 static const uschar*
971 first_significant_code(const uschar *code, int *options, int optbit,
972 BOOL skipassert)
973 {
974 for (;;)
975 {
976 switch ((int)*code)
977 {
978 case OP_OPT:
979 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
980 *options = (int)code[1];
981 code += 2;
982 break;
983
984 case OP_ASSERT_NOT:
985 case OP_ASSERTBACK:
986 case OP_ASSERTBACK_NOT:
987 if (!skipassert) return code;
988 do code += GET(code, 1); while (*code == OP_ALT);
989 code += _pcre_OP_lengths[*code];
990 break;
991
992 case OP_WORD_BOUNDARY:
993 case OP_NOT_WORD_BOUNDARY:
994 if (!skipassert) return code;
995 /* Fall through */
996
997 case OP_CALLOUT:
998 case OP_CREF:
999 case OP_RREF:
1000 case OP_DEF:
1001 code += _pcre_OP_lengths[*code];
1002 break;
1003
1004 default:
1005 return code;
1006 }
1007 }
1008 /* Control never reaches here */
1009 }
1010
1011
1012
1013
1014 /*************************************************
1015 * Find the fixed length of a pattern *
1016 *************************************************/
1017
1018 /* Scan a pattern and compute the fixed length of subject that will match it,
1019 if the length is fixed. This is needed for dealing with backward assertions.
1020 In UTF8 mode, the result is in characters rather than bytes.
1021
1022 Arguments:
1023 code points to the start of the pattern (the bracket)
1024 options the compiling options
1025
1026 Returns: the fixed length, or -1 if there is no fixed length,
1027 or -2 if \C was encountered
1028 */
1029
1030 static int
1031 find_fixedlength(uschar *code, int options)
1032 {
1033 int length = -1;
1034
1035 register int branchlength = 0;
1036 register uschar *cc = code + 1 + LINK_SIZE;
1037
1038 /* Scan along the opcodes for this branch. If we get to the end of the
1039 branch, check the length against that of the other branches. */
1040
1041 for (;;)
1042 {
1043 int d;
1044 register int op = *cc;
1045
1046 switch (op)
1047 {
1048 case OP_CBRA:
1049 case OP_BRA:
1050 case OP_ONCE:
1051 case OP_COND:
1052 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1053 if (d < 0) return d;
1054 branchlength += d;
1055 do cc += GET(cc, 1); while (*cc == OP_ALT);
1056 cc += 1 + LINK_SIZE;
1057 break;
1058
1059 /* Reached end of a branch; if it's a ket it is the end of a nested
1060 call. If it's ALT it is an alternation in a nested call. If it is
1061 END it's the end of the outer call. All can be handled by the same code. */
1062
1063 case OP_ALT:
1064 case OP_KET:
1065 case OP_KETRMAX:
1066 case OP_KETRMIN:
1067 case OP_END:
1068 if (length < 0) length = branchlength;
1069 else if (length != branchlength) return -1;
1070 if (*cc != OP_ALT) return length;
1071 cc += 1 + LINK_SIZE;
1072 branchlength = 0;
1073 break;
1074
1075 /* Skip over assertive subpatterns */
1076
1077 case OP_ASSERT:
1078 case OP_ASSERT_NOT:
1079 case OP_ASSERTBACK:
1080 case OP_ASSERTBACK_NOT:
1081 do cc += GET(cc, 1); while (*cc == OP_ALT);
1082 /* Fall through */
1083
1084 /* Skip over things that don't match chars */
1085
1086 case OP_REVERSE:
1087 case OP_CREF:
1088 case OP_RREF:
1089 case OP_DEF:
1090 case OP_OPT:
1091 case OP_CALLOUT:
1092 case OP_SOD:
1093 case OP_SOM:
1094 case OP_EOD:
1095 case OP_EODN:
1096 case OP_CIRC:
1097 case OP_DOLL:
1098 case OP_NOT_WORD_BOUNDARY:
1099 case OP_WORD_BOUNDARY:
1100 cc += _pcre_OP_lengths[*cc];
1101 break;
1102
1103 /* Handle literal characters */
1104
1105 case OP_CHAR:
1106 case OP_CHARNC:
1107 case OP_NOT:
1108 branchlength++;
1109 cc += 2;
1110 #ifdef SUPPORT_UTF8
1111 if ((options & PCRE_UTF8) != 0)
1112 {
1113 while ((*cc & 0xc0) == 0x80) cc++;
1114 }
1115 #endif
1116 break;
1117
1118 /* Handle exact repetitions. The count is already in characters, but we
1119 need to skip over a multibyte character in UTF8 mode. */
1120
1121 case OP_EXACT:
1122 branchlength += GET2(cc,1);
1123 cc += 4;
1124 #ifdef SUPPORT_UTF8
1125 if ((options & PCRE_UTF8) != 0)
1126 {
1127 while((*cc & 0x80) == 0x80) cc++;
1128 }
1129 #endif
1130 break;
1131
1132 case OP_TYPEEXACT:
1133 branchlength += GET2(cc,1);
1134 cc += 4;
1135 break;
1136
1137 /* Handle single-char matchers */
1138
1139 case OP_PROP:
1140 case OP_NOTPROP:
1141 cc += 2;
1142 /* Fall through */
1143
1144 case OP_NOT_DIGIT:
1145 case OP_DIGIT:
1146 case OP_NOT_WHITESPACE:
1147 case OP_WHITESPACE:
1148 case OP_NOT_WORDCHAR:
1149 case OP_WORDCHAR:
1150 case OP_ANY:
1151 branchlength++;
1152 cc++;
1153 break;
1154
1155 /* The single-byte matcher isn't allowed */
1156
1157 case OP_ANYBYTE:
1158 return -2;
1159
1160 /* Check a class for variable quantification */
1161
1162 #ifdef SUPPORT_UTF8
1163 case OP_XCLASS:
1164 cc += GET(cc, 1) - 33;
1165 /* Fall through */
1166 #endif
1167
1168 case OP_CLASS:
1169 case OP_NCLASS:
1170 cc += 33;
1171
1172 switch (*cc)
1173 {
1174 case OP_CRSTAR:
1175 case OP_CRMINSTAR:
1176 case OP_CRQUERY:
1177 case OP_CRMINQUERY:
1178 return -1;
1179
1180 case OP_CRRANGE:
1181 case OP_CRMINRANGE:
1182 if (GET2(cc,1) != GET2(cc,3)) return -1;
1183 branchlength += GET2(cc,1);
1184 cc += 5;
1185 break;
1186
1187 default:
1188 branchlength++;
1189 }
1190 break;
1191
1192 /* Anything else is variable length */
1193
1194 default:
1195 return -1;
1196 }
1197 }
1198 /* Control never gets here */
1199 }
1200
1201
1202
1203
1204 /*************************************************
1205 * Scan compiled regex for numbered bracket *
1206 *************************************************/
1207
1208 /* This little function scans through a compiled pattern until it finds a
1209 capturing bracket with the given number.
1210
1211 Arguments:
1212 code points to start of expression
1213 utf8 TRUE in UTF-8 mode
1214 number the required bracket number
1215
1216 Returns: pointer to the opcode for the bracket, or NULL if not found
1217 */
1218
1219 static const uschar *
1220 find_bracket(const uschar *code, BOOL utf8, int number)
1221 {
1222 for (;;)
1223 {
1224 register int c = *code;
1225 if (c == OP_END) return NULL;
1226
1227 /* XCLASS is used for classes that cannot be represented just by a bit
1228 map. This includes negated single high-valued characters. The length in
1229 the table is zero; the actual length is stored in the compiled code. */
1230
1231 if (c == OP_XCLASS) code += GET(code, 1);
1232
1233 /* Handle capturing bracket */
1234
1235 else if (c == OP_CBRA)
1236 {
1237 int n = GET2(code, 1+LINK_SIZE);
1238 if (n == number) return (uschar *)code;
1239 code += _pcre_OP_lengths[c];
1240 }
1241
1242 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1243 a multi-byte character. The length in the table is a minimum, so we have to
1244 arrange to skip the extra bytes. */
1245
1246 else
1247 {
1248 code += _pcre_OP_lengths[c];
1249 #ifdef SUPPORT_UTF8
1250 if (utf8) switch(c)
1251 {
1252 case OP_CHAR:
1253 case OP_CHARNC:
1254 case OP_EXACT:
1255 case OP_UPTO:
1256 case OP_MINUPTO:
1257 case OP_POSUPTO:
1258 case OP_STAR:
1259 case OP_MINSTAR:
1260 case OP_POSSTAR:
1261 case OP_PLUS:
1262 case OP_MINPLUS:
1263 case OP_POSPLUS:
1264 case OP_QUERY:
1265 case OP_MINQUERY:
1266 case OP_POSQUERY:
1267 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1268 break;
1269 }
1270 #endif
1271 }
1272 }
1273 }
1274
1275
1276
1277 /*************************************************
1278 * Scan compiled regex for recursion reference *
1279 *************************************************/
1280
1281 /* This little function scans through a compiled pattern until it finds an
1282 instance of OP_RECURSE.
1283
1284 Arguments:
1285 code points to start of expression
1286 utf8 TRUE in UTF-8 mode
1287
1288 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1289 */
1290
1291 static const uschar *
1292 find_recurse(const uschar *code, BOOL utf8)
1293 {
1294 for (;;)
1295 {
1296 register int c = *code;
1297 if (c == OP_END) return NULL;
1298 if (c == OP_RECURSE) return code;
1299
1300 /* XCLASS is used for classes that cannot be represented just by a bit
1301 map. This includes negated single high-valued characters. The length in
1302 the table is zero; the actual length is stored in the compiled code. */
1303
1304 if (c == OP_XCLASS) code += GET(code, 1);
1305
1306 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1307 that are followed by a character may be followed by a multi-byte character.
1308 The length in the table is a minimum, so we have to arrange to skip the extra
1309 bytes. */
1310
1311 else
1312 {
1313 code += _pcre_OP_lengths[c];
1314 #ifdef SUPPORT_UTF8
1315 if (utf8) switch(c)
1316 {
1317 case OP_CHAR:
1318 case OP_CHARNC:
1319 case OP_EXACT:
1320 case OP_UPTO:
1321 case OP_MINUPTO:
1322 case OP_POSUPTO:
1323 case OP_STAR:
1324 case OP_MINSTAR:
1325 case OP_POSSTAR:
1326 case OP_PLUS:
1327 case OP_MINPLUS:
1328 case OP_POSPLUS:
1329 case OP_QUERY:
1330 case OP_MINQUERY:
1331 case OP_POSQUERY:
1332 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1333 break;
1334 }
1335 #endif
1336 }
1337 }
1338 }
1339
1340
1341
1342 /*************************************************
1343 * Scan compiled branch for non-emptiness *
1344 *************************************************/
1345
1346 /* This function scans through a branch of a compiled pattern to see whether it
1347 can match the empty string or not. It is called from could_be_empty()
1348 below and from compile_branch() when checking for an unlimited repeat of a
1349 group that can match nothing. Note that first_significant_code() skips over
1350 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1351 struck an inner bracket whose current branch will already have been scanned.
1352
1353 Arguments:
1354 code points to start of search
1355 endcode points to where to stop
1356 utf8 TRUE if in UTF8 mode
1357
1358 Returns: TRUE if what is matched could be empty
1359 */
1360
1361 static BOOL
1362 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1363 {
1364 register int c;
1365 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1366 code < endcode;
1367 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1368 {
1369 const uschar *ccode;
1370
1371 c = *code;
1372
1373 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1374 {
1375 BOOL empty_branch;
1376 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1377
1378 /* Scan a closed bracket */
1379
1380 empty_branch = FALSE;
1381 do
1382 {
1383 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1384 empty_branch = TRUE;
1385 code += GET(code, 1);
1386 }
1387 while (*code == OP_ALT);
1388 if (!empty_branch) return FALSE; /* All branches are non-empty */
1389
1390 /* Move past the KET and fudge things so that the increment in the "for"
1391 above has no effect. */
1392
1393 c = OP_END;
1394 code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1395 continue;
1396 }
1397
1398 /* Handle the other opcodes */
1399
1400 switch (c)
1401 {
1402 /* Check for quantifiers after a class */
1403
1404 #ifdef SUPPORT_UTF8
1405 case OP_XCLASS:
1406 ccode = code + GET(code, 1);
1407 goto CHECK_CLASS_REPEAT;
1408 #endif
1409
1410 case OP_CLASS:
1411 case OP_NCLASS:
1412 ccode = code + 33;
1413
1414 #ifdef SUPPORT_UTF8
1415 CHECK_CLASS_REPEAT:
1416 #endif
1417
1418 switch (*ccode)
1419 {
1420 case OP_CRSTAR: /* These could be empty; continue */
1421 case OP_CRMINSTAR:
1422 case OP_CRQUERY:
1423 case OP_CRMINQUERY:
1424 break;
1425
1426 default: /* Non-repeat => class must match */
1427 case OP_CRPLUS: /* These repeats aren't empty */
1428 case OP_CRMINPLUS:
1429 return FALSE;
1430
1431 case OP_CRRANGE:
1432 case OP_CRMINRANGE:
1433 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1434 break;
1435 }
1436 break;
1437
1438 /* Opcodes that must match a character */
1439
1440 case OP_PROP:
1441 case OP_NOTPROP:
1442 case OP_EXTUNI:
1443 case OP_NOT_DIGIT:
1444 case OP_DIGIT:
1445 case OP_NOT_WHITESPACE:
1446 case OP_WHITESPACE:
1447 case OP_NOT_WORDCHAR:
1448 case OP_WORDCHAR:
1449 case OP_ANY:
1450 case OP_ANYBYTE:
1451 case OP_CHAR:
1452 case OP_CHARNC:
1453 case OP_NOT:
1454 case OP_PLUS:
1455 case OP_MINPLUS:
1456 case OP_POSPLUS:
1457 case OP_EXACT:
1458 case OP_NOTPLUS:
1459 case OP_NOTMINPLUS:
1460 case OP_NOTPOSPLUS:
1461 case OP_NOTEXACT:
1462 case OP_TYPEPLUS:
1463 case OP_TYPEMINPLUS:
1464 case OP_TYPEPOSPLUS:
1465 case OP_TYPEEXACT:
1466 return FALSE;
1467
1468 /* End of branch */
1469
1470 case OP_KET:
1471 case OP_KETRMAX:
1472 case OP_KETRMIN:
1473 case OP_ALT:
1474 return TRUE;
1475
1476 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1477 MINUPTO, and POSUPTO may be followed by a multibyte character */
1478
1479 #ifdef SUPPORT_UTF8
1480 case OP_STAR:
1481 case OP_MINSTAR:
1482 case OP_POSSTAR:
1483 case OP_QUERY:
1484 case OP_MINQUERY:
1485 case OP_POSQUERY:
1486 case OP_UPTO:
1487 case OP_MINUPTO:
1488 case OP_POSUPTO:
1489 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1490 break;
1491 #endif
1492 }
1493 }
1494
1495 return TRUE;
1496 }
1497
1498
1499
1500 /*************************************************
1501 * Scan compiled regex for non-emptiness *
1502 *************************************************/
1503
1504 /* This function is called to check for left recursive calls. We want to check
1505 the current branch of the current pattern to see if it could match the empty
1506 string. If it could, we must look outwards for branches at other levels,
1507 stopping when we pass beyond the bracket which is the subject of the recursion.
1508
1509 Arguments:
1510 code points to start of the recursion
1511 endcode points to where to stop (current RECURSE item)
1512 bcptr points to the chain of current (unclosed) branch starts
1513 utf8 TRUE if in UTF-8 mode
1514
1515 Returns: TRUE if what is matched could be empty
1516 */
1517
1518 static BOOL
1519 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1520 BOOL utf8)
1521 {
1522 while (bcptr != NULL && bcptr->current >= code)
1523 {
1524 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1525 bcptr = bcptr->outer;
1526 }
1527 return TRUE;
1528 }
1529
1530
1531
1532 /*************************************************
1533 * Check for POSIX class syntax *
1534 *************************************************/
1535
1536 /* This function is called when the sequence "[:" or "[." or "[=" is
1537 encountered in a character class. It checks whether this is followed by an
1538 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1539 ".]" or "=]".
1540
1541 Argument:
1542 ptr pointer to the initial [
1543 endptr where to return the end pointer
1544 cd pointer to compile data
1545
1546 Returns: TRUE or FALSE
1547 */
1548
1549 static BOOL
1550 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1551 {
1552 int terminator; /* Don't combine these lines; the Solaris cc */
1553 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1554 if (*(++ptr) == '^') ptr++;
1555 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1556 if (*ptr == terminator && ptr[1] == ']')
1557 {
1558 *endptr = ptr;
1559 return TRUE;
1560 }
1561 return FALSE;
1562 }
1563
1564
1565
1566
1567 /*************************************************
1568 * Check POSIX class name *
1569 *************************************************/
1570
1571 /* This function is called to check the name given in a POSIX-style class entry
1572 such as [:alnum:].
1573
1574 Arguments:
1575 ptr points to the first letter
1576 len the length of the name
1577
1578 Returns: a value representing the name, or -1 if unknown
1579 */
1580
1581 static int
1582 check_posix_name(const uschar *ptr, int len)
1583 {
1584 register int yield = 0;
1585 while (posix_name_lengths[yield] != 0)
1586 {
1587 if (len == posix_name_lengths[yield] &&
1588 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1589 yield++;
1590 }
1591 return -1;
1592 }
1593
1594
1595 /*************************************************
1596 * Adjust OP_RECURSE items in repeated group *
1597 *************************************************/
1598
1599 /* OP_RECURSE items contain an offset from the start of the regex to the group
1600 that is referenced. This means that groups can be replicated for fixed
1601 repetition simply by copying (because the recursion is allowed to refer to
1602 earlier groups that are outside the current group). However, when a group is
1603 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1604 it, after it has been compiled. This means that any OP_RECURSE items within it
1605 that refer to the group itself or any contained groups have to have their
1606 offsets adjusted. That one of the jobs of this function. Before it is called,
1607 the partially compiled regex must be temporarily terminated with OP_END.
1608
1609 This function has been extended with the possibility of forward references for
1610 recursions and subroutine calls. It must also check the list of such references
1611 for the group we are dealing with. If it finds that one of the recursions in
1612 the current group is on this list, it adjusts the offset in the list, not the
1613 value in the reference (which is a group number).
1614
1615 Arguments:
1616 group points to the start of the group
1617 adjust the amount by which the group is to be moved
1618 utf8 TRUE in UTF-8 mode
1619 cd contains pointers to tables etc.
1620 save_hwm the hwm forward reference pointer at the start of the group
1621
1622 Returns: nothing
1623 */
1624
1625 static void
1626 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1627 uschar *save_hwm)
1628 {
1629 uschar *ptr = group;
1630 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1631 {
1632 int offset;
1633 uschar *hc;
1634
1635 /* See if this recursion is on the forward reference list. If so, adjust the
1636 reference. */
1637
1638 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1639 {
1640 offset = GET(hc, 0);
1641 if (cd->start_code + offset == ptr + 1)
1642 {
1643 PUT(hc, 0, offset + adjust);
1644 break;
1645 }
1646 }
1647
1648 /* Otherwise, adjust the recursion offset if it's after the start of this
1649 group. */
1650
1651 if (hc >= cd->hwm)
1652 {
1653 offset = GET(ptr, 1);
1654 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1655 }
1656
1657 ptr += 1 + LINK_SIZE;
1658 }
1659 }
1660
1661
1662
1663 /*************************************************
1664 * Insert an automatic callout point *
1665 *************************************************/
1666
1667 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1668 callout points before each pattern item.
1669
1670 Arguments:
1671 code current code pointer
1672 ptr current pattern pointer
1673 cd pointers to tables etc
1674
1675 Returns: new code pointer
1676 */
1677
1678 static uschar *
1679 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1680 {
1681 *code++ = OP_CALLOUT;
1682 *code++ = 255;
1683 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1684 PUT(code, LINK_SIZE, 0); /* Default length */
1685 return code + 2*LINK_SIZE;
1686 }
1687
1688
1689
1690 /*************************************************
1691 * Complete a callout item *
1692 *************************************************/
1693
1694 /* A callout item contains the length of the next item in the pattern, which
1695 we can't fill in till after we have reached the relevant point. This is used
1696 for both automatic and manual callouts.
1697
1698 Arguments:
1699 previous_callout points to previous callout item
1700 ptr current pattern pointer
1701 cd pointers to tables etc
1702
1703 Returns: nothing
1704 */
1705
1706 static void
1707 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1708 {
1709 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1710 PUT(previous_callout, 2 + LINK_SIZE, length);
1711 }
1712
1713
1714
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It first skips characters that have no other case. Starting
from the first character that does have one, it then collects the following
characters for as long as their other cases are consecutive values. The result
is one contiguous range of other-case characters; the start value is updated
so that a further call carries on from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_oc = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has another case. */

while (ch <= d && (first_oc = _pcre_ucp_othercase(ch)) == NOTACHAR) ch++;
if (ch > d) return FALSE;

/* Extend for as long as the other cases run consecutively. */

expected = first_oc + 1;
for (ch++; ch <= d && _pcre_ucp_othercase(ch) == expected; ch++)
  expected++;

*ocptr = first_oc;
*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1760
1761
1762
1763 /*************************************************
1764 * Check if auto-possessifying is possible *
1765 *************************************************/
1766
1767 /* This function is called for unlimited repeats of certain items, to see
1768 whether the next thing could possibly match the repeated item. If not, it makes
1769 sense to automatically possessify the repeated item.
1770
1771 Arguments:
1772 op_code the repeated op code
1773 this data for this item, depends on the opcode
1774 utf8 TRUE in UTF-8 mode
1775 utf8_char used for utf8 character bytes, NULL if not relevant
1776 ptr next character in pattern
1777 options options bits
1778 cd contains pointers to tables etc.
1779
1780 Returns: TRUE if possessifying is wanted
1781 */
1782
1783 static BOOL
1784 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1785 const uschar *ptr, int options, compile_data *cd)
1786 {
1787 int next;
1788
1789 /* Skip whitespace and comments in extended mode */
1790
1791 if ((options & PCRE_EXTENDED) != 0)
1792 {
1793 for (;;)
1794 {
1795 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1796 if (*ptr == '#')
1797 {
1798 while (*(++ptr) != 0)
1799 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1800 }
1801 else break;
1802 }
1803 }
1804
1805 /* If the next item is one that we can handle, get its value. A non-negative
1806 value is a character, a negative value is an escape value. */
1807
1808 if (*ptr == '\\')
1809 {
1810 int temperrorcode = 0;
1811 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1812 if (temperrorcode != 0) return FALSE;
1813 ptr++; /* Point after the escape sequence */
1814 }
1815
1816 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1817 {
1818 #ifdef SUPPORT_UTF8
1819 if (utf8) { GETCHARINC(next, ptr); } else
1820 #endif
1821 next = *ptr++;
1822 }
1823
1824 else return FALSE;
1825
1826 /* Skip whitespace and comments in extended mode */
1827
1828 if ((options & PCRE_EXTENDED) != 0)
1829 {
1830 for (;;)
1831 {
1832 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1833 if (*ptr == '#')
1834 {
1835 while (*(++ptr) != 0)
1836 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1837 }
1838 else break;
1839 }
1840 }
1841
1842 /* If the next thing is itself optional, we have to give up. */
1843
1844 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1845 return FALSE;
1846
1847 /* Now compare the next item with the previous opcode. If the previous is a
1848 positive single character match, "item" either contains the character or, if
1849 "item" is greater than 127 in utf8 mode, the character's bytes are in
1850 utf8_char. */
1851
1852
1853 /* Handle cases when the next item is a character. */
1854
1855 if (next >= 0) switch(op_code)
1856 {
1857 case OP_CHAR:
1858 #ifdef SUPPORT_UTF8
1859 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1860 #endif
1861 return item != next;
1862
1863 /* For CHARNC (caseless character) we must check the other case. If we have
1864 Unicode property support, we can use it to test the other case of
1865 high-valued characters. */
1866
1867 case OP_CHARNC:
1868 #ifdef SUPPORT_UTF8
1869 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1870 #endif
1871 if (item == next) return FALSE;
1872 #ifdef SUPPORT_UTF8
1873 if (utf8)
1874 {
1875 unsigned int othercase;
1876 if (next < 128) othercase = cd->fcc[next]; else
1877 #ifdef SUPPORT_UCP
1878 othercase = _pcre_ucp_othercase((unsigned int)next);
1879 #else
1880 othercase = NOTACHAR;
1881 #endif
1882 return (unsigned int)item != othercase;
1883 }
1884 else
1885 #endif /* SUPPORT_UTF8 */
1886 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1887
1888 /* For OP_NOT, "item" must be a single-byte character. */
1889
1890 case OP_NOT:
1891 if (next < 0) return FALSE; /* Not a character */
1892 if (item == next) return TRUE;
1893 if ((options & PCRE_CASELESS) == 0) return FALSE;
1894 #ifdef SUPPORT_UTF8
1895 if (utf8)
1896 {
1897 unsigned int othercase;
1898 if (next < 128) othercase = cd->fcc[next]; else
1899 #ifdef SUPPORT_UCP
1900 othercase = _pcre_ucp_othercase(next);
1901 #else
1902 othercase = NOTACHAR;
1903 #endif
1904 return (unsigned int)item == othercase;
1905 }
1906 else
1907 #endif /* SUPPORT_UTF8 */
1908 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1909
1910 case OP_DIGIT:
1911 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1912
1913 case OP_NOT_DIGIT:
1914 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1915
1916 case OP_WHITESPACE:
1917 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1918
1919 case OP_NOT_WHITESPACE:
1920 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1921
1922 case OP_WORDCHAR:
1923 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1924
1925 case OP_NOT_WORDCHAR:
1926 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1927
1928 default:
1929 return FALSE;
1930 }
1931
1932
1933 /* Handle the case when the next item is \d, \s, etc. */
1934
1935 switch(op_code)
1936 {
1937 case OP_CHAR:
1938 case OP_CHARNC:
1939 #ifdef SUPPORT_UTF8
1940 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1941 #endif
1942 switch(-next)
1943 {
1944 case ESC_d:
1945 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1946
1947 case ESC_D:
1948 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1949
1950 case ESC_s:
1951 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1952
1953 case ESC_S:
1954 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1955
1956 case ESC_w:
1957 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1958
1959 case ESC_W:
1960 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1961
1962 default:
1963 return FALSE;
1964 }
1965
1966 case OP_DIGIT:
1967 return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1968
1969 case OP_NOT_DIGIT:
1970 return next == -ESC_d;
1971
1972 case OP_WHITESPACE:
1973 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1974
1975 case OP_NOT_WHITESPACE:
1976 return next == -ESC_s;
1977
1978 case OP_WORDCHAR:
1979 return next == -ESC_W || next == -ESC_s;
1980
1981 case OP_NOT_WORDCHAR:
1982 return next == -ESC_w || next == -ESC_d;
1983
1984 default:
1985 return FALSE;
1986 }
1987
1988 /* Control does not reach here */
1989 }
1990
1991
1992
1993 /*************************************************
1994 * Compile one branch *
1995 *************************************************/
1996
1997 /* Scan the pattern, compiling it into a vector. If the options are
1998 changed during the branch, the pointer is used to change the external options
1999 bits. This function is used during the pre-compile phase when we are trying
2000 to find out the amount of memory needed, as well as during the real compile
2001 phase. The value of lengthptr distinguishes the two phases.
2002
2003 Arguments:
2004 optionsptr pointer to the option bits
2005 codeptr points to the pointer to the current code point
2006 ptrptr points to the current pattern pointer
2007 errorcodeptr points to error code variable
2008 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2009 reqbyteptr set to the last literal character required, else < 0
2010 bcptr points to current branch chain
2011 cd contains pointers to tables etc.
2012 lengthptr NULL during the real compile phase
2013 points to length accumulator during pre-compile phase
2014
2015 Returns: TRUE on success
2016 FALSE, with *errorcodeptr set non-zero on error
2017 */
2018
2019 static BOOL
2020 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2021 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2022 compile_data *cd, int *lengthptr)
2023 {
2024 int repeat_type, op_type;
2025 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2026 int bravalue = 0;
2027 int greedy_default, greedy_non_default;
2028 int firstbyte, reqbyte;
2029 int zeroreqbyte, zerofirstbyte;
2030 int req_caseopt, reqvary, tempreqvary;
2031 int options = *optionsptr;
2032 int after_manual_callout = 0;
2033 int length_prevgroup = 0;
2034 register int c;
2035 register uschar *code = *codeptr;
2036 uschar *last_code = code;
2037 uschar *orig_code = code;
2038 uschar *tempcode;
2039 BOOL inescq = FALSE;
2040 BOOL groupsetfirstbyte = FALSE;
2041 const uschar *ptr = *ptrptr;
2042 const uschar *tempptr;
2043 uschar *previous = NULL;
2044 uschar *previous_callout = NULL;
2045 uschar *save_hwm = NULL;
2046 uschar classbits[32];
2047
2048 #ifdef SUPPORT_UTF8
2049 BOOL class_utf8;
2050 BOOL utf8 = (options & PCRE_UTF8) != 0;
2051 uschar *class_utf8data;
2052 uschar utf8_char[6];
2053 #else
2054 BOOL utf8 = FALSE;
2055 uschar *utf8_char = NULL;
2056 #endif
2057
2058 #ifdef DEBUG
2059 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2060 #endif
2061
2062 /* Set up the default and non-default settings for greediness */
2063
2064 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2065 greedy_non_default = greedy_default ^ 1;
2066
2067 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2068 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2069 matches a non-fixed first char; reqbyte just remains unset if we never
2070 find one.
2071
2072 When we hit a repeat whose minimum is zero, we may have to adjust these values
2073 to take the zero repeat into account. This is implemented by setting them to
2074 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2075 item types that can be repeated set these backoff variables appropriately. */
2076
2077 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2078
2079 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2080 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2081 value > 255. It is added into the firstbyte or reqbyte variables to record the
2082 case status of the value. This is used only for ASCII characters. */
2083
2084 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2085
2086 /* Switch on next character until the end of the branch */
2087
2088 for (;; ptr++)
2089 {
2090 BOOL negate_class;
2091 BOOL possessive_quantifier;
2092 BOOL is_quantifier;
2093 BOOL is_recurse;
2094 int class_charcount;
2095 int class_lastchar;
2096 int newoptions;
2097 int recno;
2098 int skipbytes;
2099 int subreqbyte;
2100 int subfirstbyte;
2101 int terminator;
2102 int mclength;
2103 uschar mcbuffer[8];
2104
2105 /* Get next byte in the pattern */
2106
2107 c = *ptr;
2108
2109 /* If we are in the pre-compile phase, accumulate the length used for the
2110 previous cycle of this loop. */
2111
2112 if (lengthptr != NULL)
2113 {
2114 #ifdef DEBUG
2115 if (code > cd->hwm) cd->hwm = code; /* High water info */
2116 #endif
2117 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2118 {
2119 *errorcodeptr = ERR52;
2120 goto FAILED;
2121 }
2122
2123 /* There is at least one situation where code goes backwards: this is the
2124 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2125 the class is simply eliminated. However, it is created first, so we have to
2126 allow memory for it. Therefore, don't ever reduce the length at this point.
2127 */
2128
2129 if (code < last_code) code = last_code;
2130 *lengthptr += code - last_code;
2131 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2132
2133 /* If "previous" is set and it is not at the start of the work space, move
2134 it back to there, in order to avoid filling up the work space. Otherwise,
2135 if "previous" is NULL, reset the current code pointer to the start. */
2136
2137 if (previous != NULL)
2138 {
2139 if (previous > orig_code)
2140 {
2141 memmove(orig_code, previous, code - previous);
2142 code -= previous - orig_code;
2143 previous = orig_code;
2144 }
2145 }
2146 else code = orig_code;
2147
2148 /* Remember where this code item starts so we can pick up the length
2149 next time round. */
2150
2151 last_code = code;
2152 }
2153
2154 /* In the real compile phase, just check the workspace used by the forward
2155 reference list. */
2156
2157 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2158 {
2159 *errorcodeptr = ERR52;
2160 goto FAILED;
2161 }
2162
2163 /* If in \Q...\E, check for the end; if not, we have a literal */
2164
2165 if (inescq && c != 0)
2166 {
2167 if (c == '\\' && ptr[1] == 'E')
2168 {
2169 inescq = FALSE;
2170 ptr++;
2171 continue;
2172 }
2173 else
2174 {
2175 if (previous_callout != NULL)
2176 {
2177 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2178 complete_callout(previous_callout, ptr, cd);
2179 previous_callout = NULL;
2180 }
2181 if ((options & PCRE_AUTO_CALLOUT) != 0)
2182 {
2183 previous_callout = code;
2184 code = auto_callout(code, ptr, cd);
2185 }
2186 goto NORMAL_CHAR;
2187 }
2188 }
2189
2190 /* Fill in length of a previous callout, except when the next thing is
2191 a quantifier. */
2192
2193 is_quantifier = c == '*' || c == '+' || c == '?' ||
2194 (c == '{' && is_counted_repeat(ptr+1));
2195
2196 if (!is_quantifier && previous_callout != NULL &&
2197 after_manual_callout-- <= 0)
2198 {
2199 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2200 complete_callout(previous_callout, ptr, cd);
2201 previous_callout = NULL;
2202 }
2203
2204 /* In extended mode, skip white space and comments */
2205
2206 if ((options & PCRE_EXTENDED) != 0)
2207 {
2208 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2209 if (c == '#')
2210 {
2211 while (*(++ptr) != 0)
2212 {
2213 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2214 }
2215 if (*ptr != 0) continue;
2216
2217 /* Else fall through to handle end of string */
2218 c = 0;
2219 }
2220 }
2221
2222 /* No auto callout for quantifiers. */
2223
2224 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2225 {
2226 previous_callout = code;
2227 code = auto_callout(code, ptr, cd);
2228 }
2229
2230 switch(c)
2231 {
2232 /* ===================================================================*/
2233 case 0: /* The branch terminates at string end */
2234 case '|': /* or | or ) */
2235 case ')':
2236 *firstbyteptr = firstbyte;
2237 *reqbyteptr = reqbyte;
2238 *codeptr = code;
2239 *ptrptr = ptr;
2240 if (lengthptr != NULL)
2241 {
2242 *lengthptr += code - last_code; /* To include callout length */
2243 DPRINTF((">> end branch\n"));
2244 }
2245 return TRUE;
2246
2247
2248 /* ===================================================================*/
2249 /* Handle single-character metacharacters. In multiline mode, ^ disables
2250 the setting of any following char as a first character. */
2251
2252 case '^':
2253 if ((options & PCRE_MULTILINE) != 0)
2254 {
2255 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2256 }
2257 previous = NULL;
2258 *code++ = OP_CIRC;
2259 break;
2260
2261 case '$':
2262 previous = NULL;
2263 *code++ = OP_DOLL;
2264 break;
2265
2266 /* There can never be a first char if '.' is first, whatever happens about
2267 repeats. The value of reqbyte doesn't change either. */
2268
2269 case '.':
2270 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2271 zerofirstbyte = firstbyte;
2272 zeroreqbyte = reqbyte;
2273 previous = code;
2274 *code++ = OP_ANY;
2275 break;
2276
2277
2278 /* ===================================================================*/
2279 /* Character classes. If the included characters are all < 256, we build a
2280 32-byte bitmap of the permitted characters, except in the special case
2281 where there is only one such character. For negated classes, we build the
2282 map as usual, then invert it at the end. However, we use a different opcode
2283 so that data characters > 255 can be handled correctly.
2284
2285 If the class contains characters outside the 0-255 range, a different
2286 opcode is compiled. It may optionally have a bit map for characters < 256,
2287 but those above are explicitly listed afterwards. A flag byte tells
2288 whether the bitmap is present, and whether this is a negated class or not.
2289 */
2290
2291 case '[':
2292 previous = code;
2293
2294 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2295 they are encountered at the top level, so we'll do that too. */
2296
2297 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2298 check_posix_syntax(ptr, &tempptr, cd))
2299 {
2300 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2301 goto FAILED;
2302 }
2303
2304 /* If the first character is '^', set the negation flag and skip it. */
2305
2306 if ((c = *(++ptr)) == '^')
2307 {
2308 negate_class = TRUE;
2309 c = *(++ptr);
2310 }
2311 else
2312 {
2313 negate_class = FALSE;
2314 }
2315
2316 /* Keep a count of chars with values < 256 so that we can optimize the case
2317 of just a single character (as long as it's < 256). However, for higher
2318 valued UTF-8 characters, we don't yet do any optimization. */
2319
2320 class_charcount = 0;
2321 class_lastchar = -1;
2322
2323 /* Initialize the 32-char bit map to all zeros. We build the map in a
2324 temporary bit of memory, in case the class contains only 1 character (less
2325 than 256), because in that case the compiled code doesn't use the bit map.
2326 */
2327
2328 memset(classbits, 0, 32 * sizeof(uschar));
2329
2330 #ifdef SUPPORT_UTF8
2331 class_utf8 = FALSE; /* No chars >= 256 */
2332 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2333 #endif
2334
2335 /* Process characters until ] is reached. By writing this as a "do" it
2336 means that an initial ] is taken as a data character. At the start of the
2337 loop, c contains the first byte of the character. */
2338
2339 if (c != 0) do
2340 {
2341 const uschar *oldptr;
2342
2343 #ifdef SUPPORT_UTF8
2344 if (utf8 && c > 127)
2345 { /* Braces are required because the */
2346 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2347 }
2348 #endif
2349
2350 /* Inside \Q...\E everything is literal except \E */
2351
2352 if (inescq)
2353 {
2354 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2355 {
2356 inescq = FALSE; /* Reset literal state */
2357 ptr++; /* Skip the 'E' */
2358 continue; /* Carry on with next */
2359 }
2360 goto CHECK_RANGE; /* Could be range if \E follows */
2361 }
2362
2363 /* Handle POSIX class names. Perl allows a negation extension of the
2364 form [:^name:]. A square bracket that doesn't match the syntax is
2365 treated as a literal. We also recognize the POSIX constructions
2366 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2367 5.6 and 5.8 do. */
2368
2369 if (c == '[' &&
2370 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2371 check_posix_syntax(ptr, &tempptr, cd))
2372 {
2373 BOOL local_negate = FALSE;
2374 int posix_class, taboffset, tabopt;
2375 register const uschar *cbits = cd->cbits;
2376 uschar pbits[32];
2377
2378 if (ptr[1] != ':')
2379 {
2380 *errorcodeptr = ERR31;
2381 goto FAILED;
2382 }
2383
2384 ptr += 2;
2385 if (*ptr == '^')
2386 {
2387 local_negate = TRUE;
2388 ptr++;
2389 }
2390
2391 posix_class = check_posix_name(ptr, tempptr - ptr);
2392 if (posix_class < 0)
2393 {
2394 *errorcodeptr = ERR30;
2395 goto FAILED;
2396 }
2397
2398 /* If matching is caseless, upper and lower are converted to
2399 alpha. This relies on the fact that the class table starts with
2400 alpha, lower, upper as the first 3 entries. */
2401
2402 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2403 posix_class = 0;
2404
2405 /* We build the bit map for the POSIX class in a chunk of local store
2406 because we may be adding and subtracting from it, and we don't want to
2407 subtract bits that may be in the main map already. At the end we or the
2408 result into the bit map that is being built. */
2409
2410 posix_class *= 3;
2411
2412 /* Copy in the first table (always present) */
2413
2414 memcpy(pbits, cbits + posix_class_maps[posix_class],
2415 32 * sizeof(uschar));
2416
2417 /* If there is a second table, add or remove it as required. */
2418
2419 taboffset = posix_class_maps[posix_class + 1];
2420 tabopt = posix_class_maps[posix_class + 2];
2421
2422 if (taboffset >= 0)
2423 {
2424 if (tabopt >= 0)
2425 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2426 else
2427 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2428 }
2429
2430 /* Now see if we need to remove any special characters. An option
2431 value of 1 removes vertical space and 2 removes underscore. */
2432
2433 if (tabopt < 0) tabopt = -tabopt;
2434 if (tabopt == 1) pbits[1] &= ~0x3c;
2435 else if (tabopt == 2) pbits[11] &= 0x7f;
2436
2437 /* Add the POSIX table or its complement into the main table that is
2438 being built and we are done. */
2439
2440 if (local_negate)
2441 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2442 else
2443 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2444
2445 ptr = tempptr + 1;
2446 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2447 continue; /* End of POSIX syntax handling */
2448 }
2449
2450 /* Backslash may introduce a single character, or it may introduce one
2451 of the specials, which just set a flag. The sequence \b is a special
2452 case. Inside a class (and only there) it is treated as backspace.
2453 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2454 to or into the one we are building. We assume they have more than one
2455 character in them, so set class_charcount bigger than one. */
2456
2457 if (c == '\\')
2458 {
2459 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2460 if (*errorcodeptr != 0) goto FAILED;
2461
2462 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2463 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2464 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2465 else if (-c == ESC_Q) /* Handle start of quoted string */
2466 {
2467 if (ptr[1] == '\\' && ptr[2] == 'E')
2468 {
2469 ptr += 2; /* avoid empty string */
2470 }
2471 else inescq = TRUE;
2472 continue;
2473 }
2474
2475 if (c < 0)
2476 {
2477 register const uschar *cbits = cd->cbits;
2478 class_charcount += 2; /* Greater than 1 is what matters */
2479
2480 /* Save time by not doing this in the pre-compile phase. */
2481
2482 if (lengthptr == NULL) switch (-c)
2483 {
2484 case ESC_d:
2485 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2486 continue;
2487
2488 case ESC_D:
2489 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2490 continue;
2491
2492 case ESC_w:
2493 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2494 continue;
2495
2496 case ESC_W:
2497 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2498 continue;
2499
2500 case ESC_s:
2501 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2502 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2503 continue;
2504
2505 case ESC_S:
2506 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2507 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2508 continue;
2509
2510 case ESC_E: /* Perl ignores an orphan \E */
2511 continue;
2512
2513 default: /* Not recognized; fall through */
2514 break; /* Need "default" setting to stop compiler warning. */
2515 }
2516
2517 /* In the pre-compile phase, just do the recognition. */
2518
2519 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2520 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2521
2522 /* We need to deal with \P and \p in both phases. */
2523
2524 #ifdef SUPPORT_UCP
2525 if (-c == ESC_p || -c == ESC_P)
2526 {
2527 BOOL negated;
2528 int pdata;
2529 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2530 if (ptype < 0) goto FAILED;
2531 class_utf8 = TRUE;
2532 *class_utf8data++ = ((-c == ESC_p) != negated)?
2533 XCL_PROP : XCL_NOTPROP;
2534 *class_utf8data++ = ptype;
2535 *class_utf8data++ = pdata;
2536 class_charcount -= 2; /* Not a < 256 character */
2537 continue;
2538 }
2539 #endif
2540 /* Unrecognized escapes are faulted if PCRE is running in its
2541 strict mode. By default, for compatibility with Perl, they are
2542 treated as literals. */
2543
2544 if ((options & PCRE_EXTRA) != 0)
2545 {
2546 *errorcodeptr = ERR7;
2547 goto FAILED;
2548 }
2549
2550 class_charcount -= 2; /* Undo the default count from above */
2551 c = *ptr; /* Get the final character and fall through */
2552 }
2553
2554 /* Fall through if we have a single character (c >= 0). This may be
2555 greater than 256 in UTF-8 mode. */
2556
2557 } /* End of backslash handling */
2558
2559 /* A single character may be followed by '-' to form a range. However,
2560 Perl does not permit ']' to be the end of the range. A '-' character
2561 at the end is treated as a literal. Perl ignores orphaned \E sequences
2562 entirely. The code for handling \Q and \E is messy. */
2563
2564 CHECK_RANGE:
2565 while (ptr[1] == '\\' && ptr[2] == 'E')
2566 {
2567 inescq = FALSE;
2568 ptr += 2;
2569 }
2570
2571 oldptr = ptr;
2572
2573 if (!inescq && ptr[1] == '-')
2574 {
2575 int d;
2576 ptr += 2;
2577 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2578
2579 /* If we hit \Q (not followed by \E) at this point, go into escaped
2580 mode. */
2581
2582 while (*ptr == '\\' && ptr[1] == 'Q')
2583 {
2584 ptr += 2;
2585 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2586 inescq = TRUE;
2587 break;
2588 }
2589
2590 if (*ptr == 0 || (!inescq && *ptr == ']'))
2591 {
2592 ptr = oldptr;
2593 goto LONE_SINGLE_CHARACTER;
2594 }
2595
2596 #ifdef SUPPORT_UTF8
2597 if (utf8)
2598 { /* Braces are required because the */
2599 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2600 }
2601 else
2602 #endif
2603 d = *ptr; /* Not UTF-8 mode */
2604
2605 /* The second part of a range can be a single-character escape, but
2606 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2607 in such circumstances. */
2608
2609 if (!inescq && d == '\\')
2610 {
2611 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2612 if (*errorcodeptr != 0) goto FAILED;
2613
2614 /* \b is backspace; \X is literal X; \R is literal R; any other
2615 special means the '-' was literal */
2616
2617 if (d < 0)
2618 {
2619 if (d == -ESC_b) d = '\b';
2620 else if (d == -ESC_X) d = 'X';
2621 else if (d == -ESC_R) d = 'R'; else
2622 {
2623 ptr = oldptr;
2624 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2625 }
2626 }
2627 }
2628
2629 /* Check that the two values are in the correct order. Optimize
2630 one-character ranges */
2631
2632 if (d < c)
2633 {
2634 *errorcodeptr = ERR8;
2635 goto FAILED;
2636 }
2637
2638 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2639
2640 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2641 matching, we have to use an XCLASS with extra data items. Caseless
2642 matching for characters > 127 is available only if UCP support is
2643 available. */
2644
2645 #ifdef SUPPORT_UTF8
2646 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2647 {
2648 class_utf8 = TRUE;
2649
2650 /* With UCP support, we can find the other case equivalents of
2651 the relevant characters. There may be several ranges. Optimize how
2652 they fit with the basic range. */
2653
2654 #ifdef SUPPORT_UCP
2655 if ((options & PCRE_CASELESS) != 0)
2656 {
2657 unsigned int occ, ocd;
2658 unsigned int cc = c;
2659 unsigned int origd = d;
2660 while (get_othercase_range(&cc, origd, &occ, &ocd))
2661 {
2662 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2663
2664 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2665 { /* if there is overlap, */
2666 c = occ; /* noting that if occ < c */
2667 continue; /* we can't have ocd > d */
2668 } /* because a subrange is */
2669 if (ocd > d && occ <= d + 1) /* always shorter than */
2670 { /* the basic range. */
2671 d = ocd;
2672 continue;
2673 }
2674
2675 if (occ == ocd)
2676 {
2677 *class_utf8data++ = XCL_SINGLE;
2678 }
2679 else
2680 {
2681 *class_utf8data++ = XCL_RANGE;
2682 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2683 }
2684 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2685 }
2686 }
2687 #endif /* SUPPORT_UCP */
2688
2689 /* Now record the original range, possibly modified for UCP caseless
2690 overlapping ranges. */
2691
2692 *class_utf8data++ = XCL_RANGE;
2693 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2694 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2695
2696 /* With UCP support, we are done. Without UCP support, there is no
2697 caseless matching for UTF-8 characters > 127; we can use the bit map
2698 for the smaller ones. */
2699
2700 #ifdef SUPPORT_UCP
2701 continue; /* With next character in the class */
2702 #else
2703 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2704
2705 /* Adjust upper limit and fall through to set up the map */
2706
2707 d = 127;
2708
2709 #endif /* SUPPORT_UCP */
2710 }
2711 #endif /* SUPPORT_UTF8 */
2712
2713 /* We use the bit map for all cases when not in UTF-8 mode; else
2714 ranges that lie entirely within 0-127 when there is UCP support; else
2715 for partial ranges without UCP support. */
2716
2717 class_charcount += d - c + 1;
2718 class_lastchar = d;
2719
2720 /* We can save a bit of time by skipping this in the pre-compile. */
2721
2722 if (lengthptr == NULL) for (; c <= d; c++)
2723 {
2724 classbits[c/8] |= (1 << (c&7));
2725 if ((options & PCRE_CASELESS) != 0)
2726 {
2727 int uc = cd->fcc[c]; /* flip case */
2728 classbits[uc/8] |= (1 << (uc&7));
2729 }
2730 }
2731
2732 continue; /* Go get the next char in the class */
2733 }
2734
2735 /* Handle a lone single character - we can get here for a normal
2736 non-escape char, or after \ that introduces a single character or for an
2737 apparent range that isn't. */
2738
2739 LONE_SINGLE_CHARACTER:
2740
2741 /* Handle a character that cannot go in the bit map */
2742
2743 #ifdef SUPPORT_UTF8
2744 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2745 {
2746 class_utf8 = TRUE;
2747 *class_utf8data++ = XCL_SINGLE;
2748 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2749
2750 #ifdef SUPPORT_UCP
2751 if ((options & PCRE_CASELESS) != 0)
2752 {
2753 unsigned int othercase;
2754 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2755 {
2756 *class_utf8data++ = XCL_SINGLE;
2757 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2758 }
2759 }
2760 #endif /* SUPPORT_UCP */
2761
2762 }
2763 else
2764 #endif /* SUPPORT_UTF8 */
2765
2766 /* Handle a single-byte character */
2767 {
2768 classbits[c/8] |= (1 << (c&7));
2769 if ((options & PCRE_CASELESS) != 0)
2770 {
2771 c = cd->fcc[c]; /* flip case */
2772 classbits[c/8] |= (1 << (c&7));
2773 }
2774 class_charcount++;
2775 class_lastchar = c;
2776 }
2777 }
2778
2779 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2780
2781 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2782
2783 if (c == 0) /* Missing terminating ']' */
2784 {
2785 *errorcodeptr = ERR6;
2786 goto FAILED;
2787 }
2788
2789 /* If class_charcount is 1, we saw precisely one character whose value is
2790 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2791 can optimize the negative case only if there were no characters >= 128
2792 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2793 single-bytes only. This is an historical hangover. Maybe one day we can
2794 tidy these opcodes to handle multi-byte characters.
2795
2796 The optimization throws away the bit map. We turn the item into a
2797 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2798 that OP_NOT does not support multibyte characters. In the positive case, it
2799 can cause firstbyte to be set. Otherwise, there can be no first char if
2800 this item is first, whatever repeat count may follow. In the case of
2801 reqbyte, save the previous value for reinstating. */
2802
2803 #ifdef SUPPORT_UTF8
2804 if (class_charcount == 1 &&
2805 (!utf8 ||
2806 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2807
2808 #else
2809 if (class_charcount == 1)
2810 #endif
2811 {
2812 zeroreqbyte = reqbyte;
2813
2814 /* The OP_NOT opcode works on one-byte characters only. */
2815
2816 if (negate_class)
2817 {
2818 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2819 zerofirstbyte = firstbyte;
2820 *code++ = OP_NOT;
2821 *code++ = class_lastchar;
2822 break;
2823 }
2824
2825 /* For a single, positive character, get the value into mcbuffer, and
2826 then we can handle this with the normal one-character code. */
2827
2828 #ifdef SUPPORT_UTF8
2829 if (utf8 && class_lastchar > 127)
2830 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2831 else
2832 #endif
2833 {
2834 mcbuffer[0] = class_lastchar;
2835 mclength = 1;
2836 }
2837 goto ONE_CHAR;
2838 } /* End of 1-char optimization */
2839
2840 /* The general case - not the one-char optimization. If this is the first
2841 thing in the branch, there can be no first char setting, whatever the
2842 repeat count. Any reqbyte setting must remain unchanged after any kind of
2843 repeat. */
2844
2845 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2846 zerofirstbyte = firstbyte;
2847 zeroreqbyte = reqbyte;
2848
2849 /* If there are characters with values > 255, we have to compile an
2850 extended class, with its own opcode. If there are no characters < 256,
2851 we can omit the bitmap in the actual compiled code. */
2852
2853 #ifdef SUPPORT_UTF8
2854 if (class_utf8)
2855 {
2856 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2857 *code++ = OP_XCLASS;
2858 code += LINK_SIZE;
2859 *code = negate_class? XCL_NOT : 0;
2860
2861 /* If the map is required, move up the extra data to make room for it;
2862 otherwise just move the code pointer to the end of the extra data. */
2863
2864 if (class_charcount > 0)
2865 {
2866 *code++ |= XCL_MAP;
2867 memmove(code + 32, code, class_utf8data - code);
2868 memcpy(code, classbits, 32);
2869 code = class_utf8data + 32;
2870 }
2871 else code = class_utf8data;
2872
2873 /* Now fill in the complete length of the item */
2874
2875 PUT(previous, 1, code - previous);
2876 break; /* End of class handling */
2877 }
2878 #endif
2879
2880 /* If there are no characters > 255, negate the 32-byte map if necessary,
2881 and copy it into the code vector. If this is the first thing in the branch,
2882 there can be no first char setting, whatever the repeat count. Any reqbyte
2883 setting must remain unchanged after any kind of repeat. */
2884
2885 if (negate_class)
2886 {
2887 *code++ = OP_NCLASS;
2888 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2889 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2890 }
2891 else
2892 {
2893 *code++ = OP_CLASS;
2894 memcpy(code, classbits, 32);
2895 }
2896 code += 32;
2897 break;
2898
2899
2900 /* ===================================================================*/
2901 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2902 has been tested above. */
2903
2904 case '{':
2905 if (!is_quantifier) goto NORMAL_CHAR;
2906 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2907 if (*errorcodeptr != 0) goto FAILED;
2908 goto REPEAT;
2909
2910 case '*':
2911 repeat_min = 0;
2912 repeat_max = -1;
2913 goto REPEAT;
2914
2915 case '+':
2916 repeat_min = 1;
2917 repeat_max = -1;
2918 goto REPEAT;
2919
2920 case '?':
2921 repeat_min = 0;
2922 repeat_max = 1;
2923
2924 REPEAT:
2925 if (previous == NULL)
2926 {
2927 *errorcodeptr = ERR9;
2928 goto FAILED;
2929 }
2930
2931 if (repeat_min == 0)
2932 {
2933 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2934 reqbyte = zeroreqbyte; /* Ditto */
2935 }
2936
2937 /* Remember whether this is a variable length repeat */
2938
2939 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2940
2941 op_type = 0; /* Default single-char op codes */
2942 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2943
2944 /* Save start of previous item, in case we have to move it up to make space
2945 for an inserted OP_ONCE for the additional '+' extension. */
2946
2947 tempcode = previous;
2948
2949 /* If the next character is '+', we have a possessive quantifier. This
2950 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2951 If the next character is '?' this is a minimizing repeat, by default,
2952 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2953 repeat type to the non-default. */
2954
2955 if (ptr[1] == '+')
2956 {
2957 repeat_type = 0; /* Force greedy */
2958 possessive_quantifier = TRUE;
2959 ptr++;
2960 }
2961 else if (ptr[1] == '?')
2962 {
2963 repeat_type = greedy_non_default;
2964 ptr++;
2965 }
2966 else repeat_type = greedy_default;
2967
2968 /* If previous was a character match, abolish the item and generate a
2969 repeat item instead. If a char item has a minimum of more than one, ensure
2970 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2971 the first thing in a branch because the x will have gone into firstbyte
2972 instead. */
2973
2974 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2975 {
2976 /* Deal with UTF-8 characters that take up more than one byte. It's
2977 easier to write this out separately than try to macrify it. Use c to
2978 hold the length of the character in bytes, plus 0x80 to flag that it's a
2979 length rather than a small character. */
2980
2981 #ifdef SUPPORT_UTF8
2982 if (utf8 && (code[-1] & 0x80) != 0)
2983 {
2984 uschar *lastchar = code - 1;
2985 while((*lastchar & 0xc0) == 0x80) lastchar--;
2986 c = code - lastchar; /* Length of UTF-8 character */
2987 memcpy(utf8_char, lastchar, c); /* Save the char */
2988 c |= 0x80; /* Flag c as a length */
2989 }
2990 else
2991 #endif
2992
2993 /* Handle the case of a single byte - either with no UTF8 support, or
2994 with UTF-8 disabled, or for a UTF-8 character < 128. */
2995
2996 {
2997 c = code[-1];
2998 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2999 }
3000
3001 /* If the repetition is unlimited, it pays to see if the next thing on
3002 the line is something that cannot possibly match this character. If so,
3003 automatically possessifying this item gains some performance in the case
3004 where the match fails. */
3005
3006 if (!possessive_quantifier &&
3007 repeat_max < 0 &&
3008 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3009 options, cd))
3010 {
3011 repeat_type = 0; /* Force greedy */
3012 possessive_quantifier = TRUE;
3013 }
3014
3015 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3016 }
3017
3018 /* If previous was a single negated character ([^a] or similar), we use
3019 one of the special opcodes, replacing it. The code is shared with single-
3020 character repeats by setting op_type to add a suitable offset into
3021 repeat_type. We can also test for auto-possessification. OP_NOT is
3022 currently used only for single-byte chars. */
3023
3024 else if (*previous == OP_NOT)
3025 {
3026 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3027 c = previous[1];
3028 if (!possessive_quantifier &&
3029 repeat_max < 0 &&
3030 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3031 {
3032 repeat_type = 0; /* Force greedy */
3033 possessive_quantifier = TRUE;
3034 }
3035 goto OUTPUT_SINGLE_REPEAT;
3036 }
3037
3038 /* If previous was a character type match (\d or similar), abolish it and
3039 create a suitable repeat item. The code is shared with single-character
3040 repeats by setting op_type to add a suitable offset into repeat_type. Note
3041 that the Unicode property types will be present only when SUPPORT_UCP is
3042 defined, but we don't wrap the little bits of code here because it just
3043 makes it horribly messy. */
3044
3045 else if (*previous < OP_EODN)
3046 {
3047 uschar *oldcode;
3048 int prop_type, prop_value;
3049 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3050 c = *previous;
3051
3052 if (!possessive_quantifier &&
3053 repeat_max < 0 &&
3054 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3055 {
3056 repeat_type = 0; /* Force greedy */
3057 possessive_quantifier = TRUE;
3058 }
3059
3060 OUTPUT_SINGLE_REPEAT:
3061 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3062 {
3063 prop_type = previous[1];
3064 prop_value = previous[2];
3065 }
3066 else prop_type = prop_value = -1;
3067
3068 oldcode = code;
3069 code = previous; /* Usually overwrite previous item */
3070
3071 /* If the maximum is zero then the minimum must also be zero; Perl allows
3072 this case, so we do too - by simply omitting the item altogether. */
3073
3074 if (repeat_max == 0) goto END_REPEAT;
3075
3076 /* All real repeats make it impossible to handle partial matching (maybe
3077 one day we will be able to remove this restriction). */
3078
3079 if (repeat_max != 1) cd->nopartial = TRUE;
3080
3081 /* Combine the op_type with the repeat_type */
3082
3083 repeat_type += op_type;
3084
3085 /* A minimum of zero is handled either as the special case * or ?, or as
3086 an UPTO, with the maximum given. */
3087
3088 if (repeat_min == 0)
3089 {
3090 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3091 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3092 else
3093 {
3094 *code++ = OP_UPTO + repeat_type;
3095 PUT2INC(code, 0, repeat_max);
3096 }
3097 }
3098
3099 /* A repeat minimum of 1 is optimized into some special cases. If the
3100 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3101 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3102 one less than the maximum. */
3103
3104 else if (repeat_min == 1)
3105 {
3106 if (repeat_max == -1)
3107 *code++ = OP_PLUS + repeat_type;
3108 else
3109 {
3110 code = oldcode; /* leave previous item in place */
3111 if (repeat_max == 1) goto END_REPEAT;
3112 *code++ = OP_UPTO + repeat_type;
3113 PUT2INC(code, 0, repeat_max - 1);
3114 }
3115 }
3116
3117 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3118 handled as an EXACT followed by an UPTO. */
3119
3120 else
3121 {
3122 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3123 PUT2INC(code, 0, repeat_min);
3124
3125 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3126 we have to insert the character for the previous code. For a repeated
3127 Unicode property match, there are two extra bytes that define the
3128 required property. In UTF-8 mode, long characters have their length in
3129 c, with the 0x80 bit as a flag. */
3130
3131 if (repeat_max < 0)
3132 {
3133 #ifdef SUPPORT_UTF8
3134 if (utf8 && c >= 128)
3135 {
3136 memcpy(code, utf8_char, c & 7);
3137 code += c & 7;
3138 }
3139 else
3140 #endif
3141 {
3142 *code++ = c;
3143 if (prop_type >= 0)
3144 {
3145 *code++ = prop_type;
3146 *code++ = prop_value;
3147 }
3148 }
3149 *code++ = OP_STAR + repeat_type;
3150 }
3151
3152 /* Else insert an UPTO if the max is greater than the min, again
3153 preceded by the character, for the previously inserted code. If the
3154 UPTO is just for 1 instance, we can use QUERY instead. */
3155
3156 else if (repeat_max != repeat_min)
3157 {
3158 #ifdef SUPPORT_UTF8
3159 if (utf8 && c >= 128)
3160 {
3161 memcpy(code, utf8_char, c & 7);
3162 code += c & 7;
3163 }
3164 else
3165 #endif
3166 *code++ = c;
3167 if (prop_type >= 0)
3168 {
3169 *code++ = prop_type;
3170 *code++ = prop_value;
3171 }
3172 repeat_max -= repeat_min;
3173
3174 if (repeat_max == 1)
3175 {
3176 *code++ = OP_QUERY + repeat_type;
3177 }
3178 else
3179 {
3180 *code++ = OP_UPTO + repeat_type;
3181 PUT2INC(code, 0, repeat_max);
3182 }
3183 }
3184 }
3185
3186 /* The character or character type itself comes last in all cases. */
3187
3188 #ifdef SUPPORT_UTF8
3189 if (utf8 && c >= 128)
3190 {
3191 memcpy(code, utf8_char, c & 7);
3192 code += c & 7;
3193 }
3194 else
3195 #endif
3196 *code++ = c;
3197
3198 /* For a repeated Unicode property match, there are two extra bytes that
3199 define the required property. */
3200
3201 #ifdef SUPPORT_UCP
3202 if (prop_type >= 0)
3203 {
3204 *code++ = prop_type;
3205 *code++ = prop_value;
3206 }
3207 #endif
3208 }
3209
3210 /* If previous was a character class or a back reference, we put the repeat
3211 stuff after it, but just skip the item if the repeat was {0,0}. */
3212
3213 else if (*previous == OP_CLASS ||
3214 *previous == OP_NCLASS ||
3215 #ifdef SUPPORT_UTF8
3216 *previous == OP_XCLASS ||
3217 #endif
3218 *previous == OP_REF)
3219 {
3220 if (repeat_max == 0)
3221 {
3222 code = previous;
3223 goto END_REPEAT;
3224 }
3225
3226 /* All real repeats make it impossible to handle partial matching (maybe
3227 one day we will be able to remove this restriction). */
3228
3229 if (repeat_max != 1) cd->nopartial = TRUE;
3230
3231 if (repeat_min == 0 && repeat_max == -1)
3232 *code++ = OP_CRSTAR + repeat_type;
3233 else if (repeat_min == 1 && repeat_max == -1)
3234 *code++ = OP_CRPLUS + repeat_type;
3235 else if (repeat_min == 0 && repeat_max == 1)
3236 *code++ = OP_CRQUERY + repeat_type;
3237 else
3238 {
3239 *code++ = OP_CRRANGE + repeat_type;
3240 PUT2INC(code, 0, repeat_min);
3241 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3242 PUT2INC(code, 0, repeat_max);
3243 }
3244 }
3245
3246 /* If previous was a bracket group, we may have to replicate it in certain
3247 cases. */
3248
3249 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3250 *previous == OP_ONCE || *previous == OP_COND)
3251 {
3252 register int i;
3253 int ketoffset = 0;
3254 int len = code - previous;
3255 uschar *bralink = NULL;
3256
3257 /* Repeating a DEFINE group is pointless */
3258
3259 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3260 {
3261 *errorcodeptr = ERR55;
3262 goto FAILED;
3263 }
3264
3265 /* This is a paranoid check to stop integer overflow later on */
3266
3267 if (len > MAX_DUPLENGTH)
3268 {
3269 *errorcodeptr = ERR50;
3270 goto FAILED;
3271 }
3272
3273 /* If the maximum repeat count is unlimited, find the end of the bracket
3274 by scanning through from the start, and compute the offset back to it
3275 from the current code pointer. There may be an OP_OPT setting following
3276 the final KET, so we can't find the end just by going back from the code
3277 pointer. */
3278
3279 if (repeat_max == -1)
3280 {
3281 register uschar *ket = previous;
3282 do ket += GET(ket, 1); while (*ket != OP_KET);
3283 ketoffset = code - ket;
3284 }
3285
3286 /* The case of a zero minimum is special because of the need to stick
3287 OP_BRAZERO in front of it, and because the group appears once in the
3288 data, whereas in other cases it appears the minimum number of times. For
3289 this reason, it is simplest to treat this case separately, as otherwise
3290 the code gets far too messy. There are several special subcases when the
3291 minimum is zero. */
3292
3293 if (repeat_min == 0)
3294 {
3295 /* If the maximum is also zero, we just omit the group from the output
3296 altogether. */
3297
3298 if (repeat_max == 0)
3299 {
3300 code = previous;
3301 goto END_REPEAT;
3302 }
3303
3304 /* If the maximum is 1 or unlimited, we just have to stick in the
3305 BRAZERO and do no more at this point. However, we do need to adjust
3306 any OP_RECURSE calls inside the group that refer to the group itself or
3307 any internal or forward referenced group, because the offset is from
3308 the start of the whole regex. Temporarily terminate the pattern while
3309 doing this. */
3310
3311 if (repeat_max <= 1)
3312 {
3313 *code = OP_END;
3314 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3315 memmove(previous+1, previous, len);
3316 code++;
3317 *previous++ = OP_BRAZERO + repeat_type;
3318 }
3319
3320 /* If the maximum is greater than 1 and limited, we have to replicate
3321 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3322 The first one has to be handled carefully because it's the original
3323 copy, which has to be moved up. The remainder can be handled by code
3324 that is common with the non-zero minimum case below. We have to
3325 adjust the value of repeat_max, since one less copy is required. Once
3326 again, we may have to adjust any OP_RECURSE calls inside the group. */
3327
3328 else
3329 {
3330 int offset;
3331 *code = OP_END;
3332 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3333 memmove(previous + 2 + LINK_SIZE, previous, len);
3334 code += 2 + LINK_SIZE;
3335 *previous++ = OP_BRAZERO + repeat_type;
3336 *previous++ = OP_BRA;
3337
3338 /* We chain together the bracket offset fields that have to be
3339 filled in later when the ends of the brackets are reached. */
3340
3341 offset = (bralink == NULL)? 0 : previous - bralink;
3342 bralink = previous;
3343 PUTINC(previous, 0, offset);
3344 }
3345
3346 repeat_max--;
3347 }
3348
3349 /* If the minimum is greater than zero, replicate the group as many
3350 times as necessary, and adjust the maximum to the number of subsequent
3351 copies that we need. If we set a first char from the group, and didn't
3352 set a required char, copy the latter from the former. If there are any
3353 forward reference subroutine calls in the group, there will be entries on
3354 the workspace list; replicate these with an appropriate increment. */
3355
3356 else
3357 {
3358 if (repeat_min > 1)
3359 {
3360 /* In the pre-compile phase, we don't actually do the replication. We
3361 just adjust the length as if we had. */
3362
3363 if (lengthptr != NULL)
3364 *lengthptr += (repeat_min - 1)*length_prevgroup;
3365
3366 /* This is compiling for real */
3367
3368 else
3369 {
3370 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3371 for (i = 1; i < repeat_min; i++)
3372 {
3373 uschar *hc;
3374 uschar *this_hwm = cd->hwm;
3375 memcpy(code, previous, len);
3376 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3377 {
3378 PUT(cd->hwm, 0, GET(hc, 0) + len);
3379 cd->hwm += LINK_SIZE;
3380 }
3381 save_hwm = this_hwm;
3382 code += len;
3383 }
3384 }
3385 }
3386
3387 if (repeat_max > 0) repeat_max -= repeat_min;
3388 }
3389
3390 /* This code is common to both the zero and non-zero minimum cases. If
3391 the maximum is limited, it replicates the group in a nested fashion,
3392 remembering the bracket starts on a stack. In the case of a zero minimum,
3393 the first one was set up above. In all cases the repeat_max now specifies
3394 the number of additional copies needed. Again, we must remember to
3395 replicate entries on the forward reference list. */
3396
3397 if (repeat_max >= 0)
3398 {
3399 /* In the pre-compile phase, we don't actually do the replication. We
3400 just adjust the length as if we had. For each repetition we must add 1
3401 to the length for BRAZERO and for all but the last repetition we must
3402 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3403
3404 if (lengthptr != NULL && repeat_max > 0)
3405 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3406 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3407
3408 /* This is compiling for real */
3409
3410 else for (i = repeat_max - 1; i >= 0; i--)
3411 {
3412 uschar *hc;
3413 uschar *this_hwm = cd->hwm;
3414
3415 *code++ = OP_BRAZERO + repeat_type;
3416
3417 /* All but the final copy start a new nesting, maintaining the
3418 chain of brackets outstanding. */
3419
3420 if (i != 0)
3421 {
3422 int offset;
3423 *code++ = OP_BRA;
3424 offset = (bralink == NULL)? 0 : code - bralink;
3425 bralink = code;
3426 PUTINC(code, 0, offset);
3427 }
3428
3429 memcpy(code, previous, len);
3430 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3431 {
3432 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3433 cd->hwm += LINK_SIZE;
3434 }
3435 save_hwm = this_hwm;
3436 code += len;
3437 }
3438
3439 /* Now chain through the pending brackets, and fill in their length
3440 fields (which are holding the chain links pro tem). */
3441
3442 while (bralink != NULL)
3443 {
3444 int oldlinkoffset;
3445 int offset = code - bralink + 1;
3446 uschar *bra = code - offset;
3447 oldlinkoffset = GET(bra, 1);
3448 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3449 *code++ = OP_KET;
3450 PUTINC(code, 0, offset);
3451 PUT(bra, 1, offset);
3452 }
3453 }
3454
3455 /* If the maximum is unlimited, set a repeater in the final copy. We
3456 can't just offset backwards from the current code point, because we
3457 don't know if there's been an options resetting after the ket. The
3458 correct offset was computed above.
3459
3460 Then, when we are doing the actual compile phase, check to see whether
3461 this group is a non-atomic one that could match an empty string. If so,
3462 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3463 that runtime checking can be done. [This check is also applied to
3464 atomic groups at runtime, but in a different way.] */
3465
3466 else
3467 {
3468 uschar *ketcode = code - ketoffset;
3469 uschar *bracode = ketcode - GET(ketcode, 1);
3470 *ketcode = OP_KETRMAX + repeat_type;
3471 if (lengthptr == NULL && *bracode != OP_ONCE)
3472 {
3473 uschar *scode = bracode;
3474 do
3475 {
3476 if (could_be_empty_branch(scode, ketcode, utf8))
3477 {
3478 *bracode += OP_SBRA - OP_BRA;
3479 break;
3480 }
3481 scode += GET(scode, 1);
3482 }
3483 while (*scode == OP_ALT);
3484 }
3485 }
3486 }
3487
3488 /* Else there's some kind of shambles */
3489
3490 else
3491 {
3492 *errorcodeptr = ERR11;
3493 goto FAILED;
3494 }
3495
3496 /* If the character following a repeat is '+', or if certain optimization
3497 tests above succeeded, possessive_quantifier is TRUE. For some of the
3498 simpler opcodes, there is a special alternative opcode for this. For
3499 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3500 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3501 but the special opcodes can optimize it a bit. The repeated item starts at
3502 tempcode, not at previous, which might be the first part of a string whose
3503 (former) last char we repeated.
3504
3505 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3506 an 'upto' may follow. We skip over an 'exact' item, and then test the
3507 length of what remains before proceeding. */
3508
3509 if (possessive_quantifier)
3510 {
3511 int len;
3512 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3513 *tempcode == OP_NOTEXACT)
3514 tempcode += _pcre_OP_lengths[*tempcode];
3515 len = code - tempcode;
3516 if (len > 0) switch (*tempcode)
3517 {
3518 case OP_STAR: *tempcode = OP_POSSTAR; break;
3519 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3520 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3521 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3522
3523 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3524 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3525 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3526 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3527
3528 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3529 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3530 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3531 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3532
3533 default:
3534 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3535 code += 1 + LINK_SIZE;
3536 len += 1 + LINK_SIZE;
3537 tempcode[0] = OP_ONCE;
3538 *code++ = OP_KET;
3539 PUTINC(code, 0, len);
3540 PUT(tempcode, 1, len);
3541 break;
3542 }
3543 }
3544
3545 /* In all cases we no longer have a previous item. We also set the
3546 "follows varying string" flag for subsequently encountered reqbytes if
3547 it isn't already set and we have just passed a varying length item. */
3548
3549 END_REPEAT:
3550 previous = NULL;
3551 cd->req_varyopt |= reqvary;
3552 break;
3553
3554
3555 /* ===================================================================*/
3556 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3557 lookbehind or option setting or condition or all the other extended
3558 parenthesis forms. First deal with the specials; all are introduced by ?,
3559 and the appearance of any of them means that this is not a capturing
3560 group. */
3561
3562 case '(':
3563 newoptions = options;
3564 skipbytes = 0;
3565 bravalue = OP_CBRA;
3566 save_hwm = cd->hwm;
3567
3568 if (*(++ptr) == '?')
3569 {
3570 int i, set, unset, namelen;
3571 int *optset;
3572 const uschar *name;
3573 uschar *slot;
3574
3575 switch (*(++ptr))
3576 {
3577 case '#': /* Comment; skip to ket */
3578 ptr++;
3579 while (*ptr != 0 && *ptr != ')') ptr++;
3580 if (*ptr == 0)
3581 {
3582 *errorcodeptr = ERR18;
3583 goto FAILED;
3584 }
3585 continue;
3586
3587
3588 /* ------------------------------------------------------------ */
3589 case ':': /* Non-capturing bracket */
3590 bravalue = OP_BRA;
3591 ptr++;
3592 break;
3593
3594
3595 /* ------------------------------------------------------------ */
3596 case '(':
3597 bravalue = OP_COND; /* Conditional group */
3598
3599 /* A condition can be an assertion, a number (referring to a numbered
3600 group), a name (referring to a named group), or 'R', referring to
3601 recursion. R<digits> and R&name are also permitted for recursion tests.
3602
3603 There are several syntaxes for testing a named group: (?(name)) is used
3604 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3605
3606 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3607 be the recursive thing or the name 'R' (and similarly for 'R' followed
3608 by digits), and (b) a number could be a name that consists of digits.
3609 In both cases, we look for a name first; if not found, we try the other
3610 cases. */
3611
3612 /* For conditions that are assertions, check the syntax, and then exit
3613 the switch. This will take control down to where bracketed groups,
3614 including assertions, are processed. */
3615
3616 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3617 break;
3618
3619 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3620 below), and all need to skip 3 bytes at the start of the group. */
3621
3622 code[1+LINK_SIZE] = OP_CREF;
3623 skipbytes = 3;
3624
3625 /* Check for a test for recursion in a named group. */
3626
3627 if (ptr[1] == 'R' && ptr[2] == '&')
3628 {
3629 terminator = -1;
3630 ptr += 2;
3631 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3632 }
3633
3634 /* Check for a test for a named group's having been set, using the Perl
3635 syntax (?(<name>) or (?('name') */
3636
3637 else if (ptr[1] == '<')
3638 {
3639 terminator = '>';
3640 ptr++;
3641 }
3642 else if (ptr[1] == '\'')
3643 {
3644 terminator = '\'';
3645 ptr++;
3646 }
3647 else terminator = 0;
3648
3649 /* We now expect to read a name; anything else is an error */
3650
3651 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3652 {
3653 ptr += 1; /* To get the right offset */
3654 *errorcodeptr = ERR28;
3655 goto FAILED;
3656 }
3657
3658 /* Read the name, but also get it as a number if it's all digits */
3659
3660 recno = 0;
3661 name = ++ptr;
3662 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3663 {
3664 if (recno >= 0)
3665 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3666 recno * 10 + *ptr - '0' : -1;
3667 ptr++;
3668 }
3669 namelen = ptr - name;
3670
3671 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3672 {
3673 ptr--; /* Error offset */
3674 *errorcodeptr = ERR26;
3675 goto FAILED;
3676 }
3677
3678 /* Do no further checking in the pre-compile phase. */
3679
3680 if (lengthptr != NULL) break;
3681
3682 /* In the real compile we do the work of looking for the actual
3683 reference. */
3684
3685 slot = cd->name_table;
3686 for (i = 0; i < cd->names_found; i++)
3687 {
3688 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3689 slot += cd->name_entry_size;
3690 }
3691
3692 /* Found a previous named subpattern */
3693
3694 if (i < cd->names_found)
3695 {
3696 recno = GET2(slot, 0);
3697 PUT2(code, 2+LINK_SIZE, recno);
3698 }
3699
3700 /* Search the pattern for a forward reference */
3701
3702 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3703 (options & PCRE_EXTENDED) != 0)) > 0)
3704 {
3705 PUT2(code, 2+LINK_SIZE, i);
3706 }
3707
3708 /* If terminator == 0 it means that the name followed directly after
3709 the opening parenthesis [e.g. (?(abc)...] and in this case there are
3710 some further alternatives to try. For the cases where terminator != 0
3711 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3712 now checked all the possibilities, so give an error. */
3713
3714 else if (terminator != 0)
3715 {
3716 *errorcodeptr = ERR15;
3717 goto FAILED;
3718 }
3719
3720 /* Check for (?(R) for recursion. Allow digits after R to specify a
3721 specific group number. */
3722
3723 else if (*name == 'R')
3724 {
3725 recno = 0;
3726 for (i = 1; i < namelen; i++)
3727 {
3728 if ((digitab[name[i]] & ctype_digit) == 0)
3729 {
3730 *errorcodeptr = ERR15;
3731 goto FAILED;
3732 }
3733 recno = recno * 10 + name[i] - '0';
3734 }
3735 if (recno == 0) recno = RREF_ANY;
3736 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3737 PUT2(code, 2+LINK_SIZE, recno);
3738 }
3739
3740 /* Similarly, check for the (?(DEFINE) "condition", which is always
3741 false. */
3742
3743 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3744 {
3745 code[1+LINK_SIZE] = OP_DEF;
3746 skipbytes = 1;
3747 }
3748
3749 /* Check for the "name" actually being a subpattern number. */
3750
3751 else if (recno > 0)
3752 {
3753 PUT2(code, 2+LINK_SIZE, recno);
3754 }
3755
3756 /* Either an unidentified subpattern, or a reference to (?(0) */
3757
3758 else
3759 {
3760 *errorcodeptr = (recno == 0)? ERR35: ERR15;
3761 goto FAILED;
3762 }
3763 break;
3764
3765
3766 /* ------------------------------------------------------------ */
3767 case '=': /* Positive lookahead */
3768 bravalue = OP_ASSERT;
3769 ptr++;
3770 break;
3771
3772
3773 /* ------------------------------------------------------------ */
3774 case '!': /* Negative lookahead */
3775 bravalue = OP_ASSERT_NOT;
3776 ptr++;
3777 break;
3778
3779
3780 /* ------------------------------------------------------------ */
3781 case '<': /* Lookbehind or named define */
3782 switch (ptr[1])
3783 {
3784 case '=': /* Positive lookbehind */
3785 bravalue = OP_ASSERTBACK;
3786 ptr += 2;
3787 break;
3788
3789 case '!': /* Negative lookbehind */
3790 bravalue = OP_ASSERTBACK_NOT;
3791 ptr += 2;
3792 break;
3793
3794 default: /* Could be name define, else bad */
3795 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3796 ptr++; /* Correct offset for error */
3797 *errorcodeptr = ERR24;
3798 goto FAILED;
3799 }
3800 break;
3801
3802
3803 /* ------------------------------------------------------------ */
3804 case '>': /* One-time brackets */
3805 bravalue = OP_ONCE;
3806 ptr++;
3807 break;
3808
3809
3810 /* ------------------------------------------------------------ */
3811 case 'C': /* Callout - may be followed by digits; */
3812 previous_callout = code; /* Save for later completion */
3813 after_manual_callout = 1; /* Skip one item before completing */
3814 *code++ = OP_CALLOUT;
3815 {
3816 int n = 0;
3817 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3818 n = n * 10 + *ptr - '0';
3819 if (*ptr != ')')
3820 {
3821 *errorcodeptr = ERR39;
3822 goto FAILED;
3823 }
3824 if (n > 255)
3825 {
3826 *errorcodeptr = ERR38;
3827 goto FAILED;
3828 }
3829 *code++ = n;
3830 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3831 PUT(code, LINK_SIZE, 0); /* Default length */
3832 code += 2 * LINK_SIZE;
3833 }
3834 previous = NULL;
3835 continue;
3836
3837
3838 /* ------------------------------------------------------------ */
3839 case 'P': /* Python-style named subpattern handling */
3840 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3841 {
3842 is_recurse = *ptr == '>';
3843 terminator = ')';
3844 goto NAMED_REF_OR_RECURSE;
3845 }
3846 else if (*ptr != '<') /* Test for Python-style definition */
3847 {
3848 *errorcodeptr = ERR41;
3849 goto FAILED;
3850 }
3851 /* Fall through to handle (?P< as (?< is handled */
3852
3853
3854 /* ------------------------------------------------------------ */
3855 DEFINE_NAME: /* Come here from (?< handling */
3856 case '\'':
3857 {
3858 terminator = (*ptr == '<')? '>' : '\'';
3859 name = ++ptr;
3860
3861 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3862 namelen = ptr - name;
3863
3864 /* In the pre-compile phase, just do a syntax check. */
3865
3866 if (lengthptr != NULL)
3867 {
3868 if (*ptr != terminator)
3869 {
3870 *errorcodeptr = ERR42;
3871 goto FAILED;
3872 }
3873 if (cd->names_found >= MAX_NAME_COUNT)
3874 {
3875 *errorcodeptr = ERR49;
3876 goto FAILED;
3877 }
3878 if (namelen + 3 > cd->name_entry_size)
3879 {
3880 cd->name_entry_size = namelen + 3;
3881 if (namelen > MAX_NAME_SIZE)
3882 {
3883 *errorcodeptr = ERR48;
3884 goto FAILED;
3885 }
3886 }
3887 }
3888
3889 /* In the real compile, create the entry in the table */
3890
3891 else
3892 {
3893 slot = cd->name_table;
3894 for (i = 0; i < cd->names_found; i++)
3895 {
3896 int crc = memcmp(name, slot+2, namelen);
3897 if (crc == 0)
3898 {
3899 if (slot[2+namelen] == 0)
3900 {
3901 if ((options & PCRE_DUPNAMES) == 0)
3902 {
3903 *errorcodeptr = ERR43;
3904 goto FAILED;
3905 }
3906 }
3907 else crc = -1; /* Current name is substring */
3908 }
3909 if (crc < 0)
3910 {
3911 memmove(slot + cd->name_entry_size, slot,
3912 (cd->names_found - i) * cd->name_entry_size);
3913 break;
3914 }
3915 slot += cd->name_entry_size;
3916 }
3917
3918 PUT2(slot, 0, cd->bracount + 1);
3919 memcpy(slot + 2, name, namelen);
3920 slot[2+namelen] = 0;
3921 }
3922 }
3923
3924 /* In both cases, count the number of names we've encountered. */
3925
3926 ptr++; /* Move past > or ' */
3927 cd->names_found++;
3928 goto NUMBERED_GROUP;
3929
3930
3931 /* ------------------------------------------------------------ */
3932 case '&': /* Perl recursion/subroutine syntax */
3933 terminator = ')';
3934 is_recurse = TRUE;
3935 /* Fall through */
3936
3937 /* We come here from the Python syntax above that handles both
3938 references (?P=name) and recursion (?P>name), as well as falling
3939 through from the Perl recursion syntax (?&name). */
3940
3941 NAMED_REF_OR_RECURSE:
3942 name = ++ptr;
3943 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3944 namelen = ptr - name;
3945
3946 /* In the pre-compile phase, do a syntax check and set a dummy
3947 reference number. */
3948
3949 if (lengthptr != NULL)
3950 {
3951 if (*ptr != terminator)
3952 {
3953 *errorcodeptr = ERR42;
3954 goto FAILED;
3955 }
3956 if (namelen > MAX_NAME_SIZE)
3957 {
3958 *errorcodeptr = ERR48;
3959 goto FAILED;
3960 }
3961 recno = 0;
3962 }
3963
3964 /* In the real compile, seek the name in the table */
3965
3966 else
3967 {
3968 slot = cd->name_table;
3969 for (i = 0; i < cd->names_found; i++)
3970 {
3971 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3972 slot += cd->name_entry_size;
3973 }
3974
3975 if (i < cd->names_found) /* Back reference */
3976 {
3977 recno = GET2(slot, 0);
3978 }
3979 else if ((recno = /* Forward back reference */
3980 find_parens(ptr, cd->bracount, name, namelen,
3981 (options & PCRE_EXTENDED) != 0)) <= 0)
3982 {
3983 *errorcodeptr = ERR15;
3984 goto FAILED;
3985 }
3986 }
3987
3988 /* In both phases, we can now go to the code that handles numerical
3989 recursion or backreferences. */
3990
3991 if (is_recurse) goto HANDLE_RECURSION;
3992 else goto HANDLE_REFERENCE;
3993
3994
3995 /* ------------------------------------------------------------ */
3996 case 'R': /* Recursion */
3997 ptr++; /* Same as (?0) */
3998 /* Fall through */
3999
4000
4001 /* ------------------------------------------------------------ */
4002 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4003 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4004 {
4005 const uschar *called;
4006 recno = 0;
4007 while((digitab[*ptr] & ctype_digit) != 0)
4008 recno = recno * 10 + *ptr++ - '0';
4009 if (*ptr != ')')
4010 {
4011 *errorcodeptr = ERR29;
4012 goto FAILED;
4013 }
4014
4015 /* Come here from code above that handles a named recursion */
4016
4017 HANDLE_RECURSION:
4018
4019 previous = code;
4020 called = cd->start_code;
4021
4022 /* When we are actually compiling, find the bracket that is being
4023 referenced. Temporarily end the regex in case it doesn't exist before
4024 this point. If we end up with a forward reference, first check that
4025 the bracket does occur later so we can give the error (and position)
4026 now. Then remember this forward reference in the workspace so it can
4027 be filled in at the end. */
4028
4029 if (lengthptr == NULL)
4030 {
4031 *code = OP_END;
4032 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4033
4034 /* Forward reference */
4035
4036 if (called == NULL)
4037 {
4038 if (find_parens(ptr, cd->bracount, NULL, recno,
4039 (options & PCRE_EXTENDED) != 0) < 0)
4040 {
4041 *errorcodeptr = ERR15;
4042 goto FAILED;
4043 }
4044 called = cd->start_code + recno;
4045 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4046 }
4047
4048 /* If not a forward reference, and the subpattern is still open,
4049 this is a recursive call. We check to see if this is a left
4050 recursion that could loop for ever, and diagnose that case. */
4051
4052 else if (GET(called, 1) == 0 &&
4053 could_be_empty(called, code, bcptr, utf8))
4054 {
4055 *errorcodeptr = ERR40;
4056 goto FAILED;
4057 }
4058 }
4059
4060 /* Insert the recursion/subroutine item, automatically wrapped inside
4061 "once" brackets. Set up a "previous group" length so that a
4062 subsequent quantifier will work. */
4063
4064 *code = OP_ONCE;
4065 PUT(code, 1, 2 + 2*LINK_SIZE);
4066 code += 1 + LINK_SIZE;
4067
4068 *code = OP_RECURSE;
4069 PUT(code, 1, called - cd->start_code);
4070 code += 1 + LINK_SIZE;
4071
4072 *code = OP_KET;
4073 PUT(code, 1, 2 + 2*LINK_SIZE);
4074 code += 1 + LINK_SIZE;
4075
4076 length_prevgroup = 3 + 3*LINK_SIZE;
4077 }
4078
4079 /* Can't determine a first byte now */
4080
4081 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4082 continue;
4083
4084
4085 /* ------------------------------------------------------------ */
4086 default: /* Other characters: check option setting */
4087 set = unset = 0;
4088 optset = &set;
4089
4090 while (*ptr != ')' && *ptr != ':')
4091 {
4092 switch (*ptr++)
4093 {
4094 case '-': optset = &unset; break;
4095
4096 case 'J': /* Record that it changed in the external options */
4097 *optset |= PCRE_DUPNAMES;
4098 cd->external_options |= PCRE_JCHANGED;
4099 break;
4100
4101 case 'i': *optset |= PCRE_CASELESS; break;
4102 case 'm': *optset |= PCRE_MULTILINE; break;
4103 case 's': *optset |= PCRE_DOTALL; break;
4104 case 'x': *optset |= PCRE_EXTENDED; break;
4105 case 'U': *optset |= PCRE_UNGREEDY; break;
4106 case 'X': *optset |= PCRE_EXTRA; break;
4107
4108 default: *errorcodeptr = ERR12;
4109 ptr--; /* Correct the offset */
4110 goto FAILED;
4111 }
4112 }
4113
4114 /* Set up the changed option bits, but don't change anything yet. */
4115
4116 newoptions = (options | set) & (~unset);
4117
4118 /* If the options ended with ')' this is not the start of a nested
4119 group with option changes, so the options change at this level. If this
4120 item is right at the start of the pattern, the options can be
4121 abstracted and made external in the pre-compile phase, and ignored in
4122 the compile phase. This can be helpful when matching -- for instance in
4123 caseless checking of required bytes.
4124
4125 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4126 definitely *not* at the start of the pattern because something has been
4127 compiled. In the pre-compile phase, however, the code pointer can have
4128 that value after the start, because it gets reset as code is discarded
4129 during the pre-compile. However, this can happen only at top level - if
4130 we are within parentheses, the starting BRA will still be present. At
4131 any parenthesis level, the length value can be used to test if anything
4132 has been compiled at that level. Thus, a test for both these conditions
4133 is necessary to ensure we correctly detect the start of the pattern in
4134 both phases.
4135
4136 If we are not at the pattern start, compile code to change the ims
4137 options if this setting actually changes any of them. We also pass the
4138 new setting back so that it can be put at the start of any following
4139 branches, and when this group ends (if we are in a group), a resetting
4140 item can be compiled. */
4141
4142 if (*ptr == ')')
4143 {
4144 if (code == cd->start_code + 1 + LINK_SIZE &&
4145 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4146 {
4147 cd->external_options = newoptions;
4148 options = newoptions;
4149 }
4150 else
4151 {
4152 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4153 {
4154 *code++ = OP_OPT;
4155 *code++ = newoptions & PCRE_IMS;
4156 }
4157
4158 /* Change options at this level, and pass them back for use
4159 in subsequent branches. Reset the greedy defaults and the case
4160 value for firstbyte and reqbyte. */
4161
4162 *optionsptr = options = newoptions;
4163 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4164 greedy_non_default = greedy_default ^ 1;
4165 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4166 }
4167
4168 previous = NULL; /* This item can't be repeated */
4169 continue; /* It is complete */
4170 }
4171
4172 /* If the options ended with ':' we are heading into a nested group
4173 with possible change of options. Such groups are non-capturing and are
4174 not assertions of any kind. All we need to do is skip over the ':';
4175 the newoptions value is handled below. */
4176
4177 bravalue = OP_BRA;
4178 ptr++;
4179 } /* End of switch for character following (? */
4180 } /* End of (? handling */
4181
4182 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4183 all unadorned brackets become non-capturing and behave like (?:...)
4184 brackets. */
4185
4186 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4187 {
4188 bravalue = OP_BRA;
4189 }
4190
4191 /* Else we have a capturing group. */
4192
4193 else
4194 {
4195 NUMBERED_GROUP:
4196 cd->bracount += 1;
4197 PUT2(code, 1+LINK_SIZE, cd->bracount);
4198 skipbytes = 2;
4199 }
4200
4201 /* Process nested bracketed regex. Assertions may not be repeated, but
4202 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4203 non-register variable in order to be able to pass its address because some
4204 compilers complain otherwise. Pass in a new setting for the ims options if
4205 they have changed. */
4206
4207 previous = (bravalue >= OP_ONCE)? code : NULL;
4208 *code = bravalue;
4209 tempcode = code;
4210 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4211 length_prevgroup = 0; /* Initialize for pre-compile phase */
4212
4213 if (!compile_regex(
4214 newoptions, /* The complete new option state */
4215 options & PCRE_IMS, /* The previous ims option state */
4216 &tempcode, /* Where to put code (updated) */
4217 &ptr, /* Input pointer (updated) */
4218 errorcodeptr, /* Where to put an error message */
4219 (bravalue == OP_ASSERTBACK ||
4220 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4221 skipbytes, /* Skip over bracket number */
4222 &subfirstbyte, /* For possible first char */
4223 &subreqbyte, /* For possible last char */
4224 bcptr, /* Current branch chain */
4225 cd, /* Tables block */
4226 (lengthptr == NULL)? NULL : /* Actual compile phase */
4227 &length_prevgroup /* Pre-compile phase */
4228 ))
4229 goto FAILED;
4230
4231 /* At the end of compiling, code is still pointing to the start of the
4232 group, while tempcode has been updated to point past the end of the group
4233 and any option resetting that may follow it. The pattern pointer (ptr)
4234 is on the bracket. */
4235
4236 /* If this is a conditional bracket, check that there are no more than
4237 two branches in the group, or just one if it's a DEFINE group. */
4238
4239 if (bravalue == OP_COND)
4240 {
4241 uschar *tc = code;
4242 int condcount = 0;
4243
4244 do {
4245 condcount++;
4246 tc += GET(tc,1);
4247 }
4248 while (*tc != OP_KET);
4249
4250 /* A DEFINE group is never obeyed inline (the "condition" is always
4251 false). It must have only one branch. */
4252
4253 if (code[LINK_SIZE+1] == OP_DEF)
4254 {
4255 if (condcount > 1)
4256 {
4257 *errorcodeptr = ERR54;
4258 goto FAILED;
4259 }
4260 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4261 }
4262
4263 /* A "normal" conditional group. If there is just one branch, we must not
4264 make use of its firstbyte or reqbyte, because this is equivalent to an
4265 empty second branch. */
4266
4267 else
4268 {
4269 if (condcount > 2)
4270 {
4271 *errorcodeptr = ERR27;
4272 goto FAILED;
4273 }
4274 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4275 }
4276 }
4277
4278 /* Error if hit end of pattern */
4279
4280 if (*ptr != ')')
4281 {
4282 *errorcodeptr = ERR14;
4283 goto FAILED;
4284 }
4285
4286 /* In the pre-compile phase, update the length by the length of the nested
4287 group, less the brackets at either end. Then reduce the compiled code to
4288 just the brackets so that it doesn't use much memory if it is duplicated by
4289 a quantifier. */
4290
4291 if (lengthptr != NULL)
4292 {
4293 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4294 code++;
4295 PUTINC(code, 0, 1 + LINK_SIZE);
4296 *code++ = OP_KET;
4297 PUTINC(code, 0, 1 + LINK_SIZE);
4298 }
4299
4300 /* Otherwise update the main code pointer to the end of the group. */
4301
4302 else code = tempcode;
4303
4304 /* For a DEFINE group, required and first character settings are not
4305 relevant. */
4306
4307 if (bravalue == OP_DEF) break;
4308
4309 /* Handle updating of the required and first characters for other types of
4310 group. Update for normal brackets of all kinds, and conditions with two
4311 branches (see code above). If the bracket is followed by a quantifier with
4312 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4313 zerofirstbyte outside the main loop so that they can be accessed for the
4314 back off. */
4315
4316 zeroreqbyte = reqbyte;
4317 zerofirstbyte = firstbyte;
4318 groupsetfirstbyte = FALSE;
4319
4320 if (bravalue >= OP_ONCE)
4321 {
4322 /* If we have not yet set a firstbyte in this branch, take it from the
4323 subpattern, remembering that it was set here so that a repeat of more
4324 than one can replicate it as reqbyte if necessary. If the subpattern has
4325 no firstbyte, set "none" for the whole branch. In both cases, a zero
4326 repeat forces firstbyte to "none". */
4327
4328 if (firstbyte == REQ_UNSET)
4329 {
4330 if (subfirstbyte >= 0)
4331 {
4332 firstbyte = subfirstbyte;
4333 groupsetfirstbyte = TRUE;
4334 }
4335 else firstbyte = REQ_NONE;
4336 zerofirstbyte = REQ_NONE;
4337 }
4338
4339 /* If firstbyte was previously set, convert the subpattern's firstbyte
4340 into reqbyte if there wasn't one, using the vary flag that was in
4341 existence beforehand. */
4342
4343 else if (subfirstbyte >= 0 && subreqbyte < 0)
4344 subreqbyte = subfirstbyte | tempreqvary;
4345
4346 /* If the subpattern set a required byte (or set a first byte that isn't
4347 really the first byte - see above), set it. */
4348
4349 if (subreqbyte >= 0) reqbyte = subreqbyte;
4350 }
4351
4352 /* For a forward assertion, we take the reqbyte, if set. This can be
4353 helpful if the pattern that follows the assertion doesn't set a different
4354 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4355 for an assertion, however because it leads to incorrect effect for patterns
4356 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4357 of a firstbyte. This is overcome by a scan at the end if there's no
4358 firstbyte, looking for an asserted first char. */
4359
4360 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4361 break; /* End of processing '(' */
4362
4363
4364 /* ===================================================================*/
4365 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4366 are arranged to be the negation of the corresponding OP_values. For the
4367 back references, the values are ESC_REF plus the reference number. Only
4368 back references and those types that consume a character may be repeated.
4369 We can test for values between ESC_b and ESC_Z for the latter; this may
4370 have to change if any new ones are ever created. */
4371
4372 case '\\':
4373 tempptr = ptr;
4374 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4375 if (*errorcodeptr != 0) goto FAILED;
4376
4377 if (c < 0)
4378 {
4379 if (-c == ESC_Q) /* Handle start of quoted string */
4380 {
4381 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4382 else inescq = TRUE;
4383 continue;
4384 }
4385
4386 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4387
4388 /* For metasequences that actually match a character, we disable the
4389 setting of a first character if it hasn't already been set. */
4390
4391 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4392 firstbyte = REQ_NONE;
4393
4394 /* Set values to reset to if this is followed by a zero repeat. */
4395
4396 zerofirstbyte = firstbyte;
4397 zeroreqbyte = reqbyte;
4398
4399 /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
4400
4401 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
4402 {
4403 is_recurse = FALSE;
4404 terminator = (*(++ptr) == '<')? '>' : '\'';
4405 goto NAMED_REF_OR_RECURSE;
4406 }
4407
4408 /* Back references are handled specially; must disable firstbyte if
4409 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4410 ':' later. */
4411
4412 if (-c >= ESC_REF)
4413 {
4414 recno = -c - ESC_REF;
4415
4416 HANDLE_REFERENCE: /* Come here from named backref handling */
4417 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4418 previous = code;
4419 *code++ = OP_REF;
4420 PUT2INC(code, 0, recno);
4421 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4422 if (recno > cd->top_backref) cd->top_backref = recno;
4423 }
4424
4425 /* So are Unicode property matches, if supported. */
4426
4427 #ifdef SUPPORT_UCP
4428 else if (-c == ESC_P || -c == ESC_p)
4429 {
4430 BOOL negated;
4431 int pdata;
4432 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4433 if (ptype < 0) goto FAILED;
4434 previous = code;
4435 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4436 *code++ = ptype;
4437 *code++ = pdata;
4438 }
4439 #else
4440
4441 /* If Unicode properties are not supported, \X, \P, and \p are not
4442 allowed. */
4443
4444 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4445 {
4446 *errorcodeptr = ERR45;
4447 goto FAILED;
4448 }
4449 #endif
4450
4451 /* For the rest (including \X when Unicode properties are supported), we
4452 can obtain the OP value by negating the escape value. */
4453
4454 else
4455 {
4456 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4457 *code++ = -c;
4458 }
4459 continue;
4460 }
4461
4462 /* We have a data character whose value is in c. In UTF-8 mode it may have
4463 a value > 127. We set its representation in the length/buffer, and then
4464 handle it as a data character. */
4465
4466 #ifdef SUPPORT_UTF8
4467 if (utf8 && c > 127)
4468 mclength = _pcre_ord2utf8(c, mcbuffer);
4469 else
4470 #endif
4471
4472 {
4473 mcbuffer[0] = c;
4474 mclength = 1;
4475 }
4476 goto ONE_CHAR;
4477
4478
4479 /* ===================================================================*/
4480 /* Handle a literal character. It is guaranteed not to be whitespace or #
4481 when the extended flag is set. If we are in UTF-8 mode, it may be a
4482 multi-byte literal character. */
4483
4484 default:
4485 NORMAL_CHAR:
4486 mclength = 1;
4487 mcbuffer[0] = c;
4488
4489 #ifdef SUPPORT_UTF8
4490 if (utf8 && c >= 0xc0)
4491 {
4492 while ((ptr[1] & 0xc0) == 0x80)
4493 mcbuffer[mclength++] = *(++ptr);
4494 }
4495 #endif
4496
4497 /* At this point we have the character's bytes in mcbuffer, and the length
4498 in mclength. When not in UTF-8 mode, the length is always 1. */
4499
4500 ONE_CHAR:
4501 previous = code;
4502 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4503 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4504
4505 /* Set the first and required bytes appropriately. If no previous first
4506 byte, set it from this character, but revert to none on a zero repeat.
4507 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4508 repeat. */
4509
4510 if (firstbyte == REQ_UNSET)
4511 {
4512 zerofirstbyte = REQ_NONE;
4513 zeroreqbyte = reqbyte;
4514
4515 /* If the character is more than one byte long, we can set firstbyte
4516 only if it is not to be matched caselessly. */
4517
4518 if (mclength == 1 || req_caseopt == 0)
4519 {
4520 firstbyte = mcbuffer[0] | req_caseopt;
4521 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4522 }
4523 else firstbyte = reqbyte = REQ_NONE;
4524 }
4525
4526 /* firstbyte was previously set; we can set reqbyte only if the length is
4527 1 or the matching is caseful. */
4528
4529 else
4530 {
4531 zerofirstbyte = firstbyte;
4532 zeroreqbyte = reqbyte;
4533 if (mclength == 1 || req_caseopt == 0)
4534 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4535 }
4536
4537 break; /* End of literal character handling */
4538 }
4539 } /* end of big loop */
4540
4541
4542 /* Control never reaches here by falling through, only by a goto for all the
4543 error states. Pass back the position in the pattern so that it can be displayed
4544 to the user for diagnosing the error. */
4545
4546 FAILED:
4547 *ptrptr = ptr;
4548 return FALSE;
4549 }
4550
4551
4552
4553
4554 /*************************************************
4555 * Compile sequence of alternatives *
4556 *************************************************/
4557
4558 /* On entry, ptr is pointing past the bracket character, but on return it
4559 points to the closing bracket, or vertical bar, or end of string. The code
4560 variable is pointing at the byte into which the BRA operator has been stored.
4561 If the ims options are changed at the start (for a (?ims: group) or during any
4562 branch, we need to insert an OP_OPT item at the start of every following branch
4563 to ensure they get set correctly at run time, and also pass the new options
4564 into every subsequent branch compile.
4565
4566 This function is used during the pre-compile phase when we are trying to find
4567 out the amount of memory needed, as well as during the real compile phase. The
4568 value of lengthptr distinguishes the two phases.
4569
4570 Argument:
4571 options option bits, including any changes for this subpattern
4572 oldims previous settings of ims option bits
4573 codeptr -> the address of the current code pointer
4574 ptrptr -> the address of the current pattern pointer
4575 errorcodeptr -> pointer to error code variable
4576 lookbehind TRUE if this is a lookbehind assertion
4577 skipbytes skip this many bytes at start (for brackets and OP_COND)
4578 firstbyteptr place to put the first required character, or a negative number
4579 reqbyteptr place to put the last required character, or a negative number
4580 bcptr pointer to the chain of currently open branches
4581 cd points to the data block with tables pointers etc.
4582 lengthptr NULL during the real compile phase
4583 points to length accumulator during pre-compile phase
4584
4585 Returns: TRUE on success
4586 */
4587
4588 static BOOL
4589 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4590 int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4591 int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4592 {
4593 const uschar *ptr = *ptrptr;
4594 uschar *code = *codeptr;
4595 uschar *last_branch = code;
4596 uschar *start_bracket = code;
4597 uschar *reverse_count = NULL;
4598 int firstbyte, reqbyte;
4599 int branchfirstbyte, branchreqbyte;
4600 int length;
4601 branch_chain bc;
4602
4603 bc.outer = bcptr;
4604 bc.current = code;
4605
4606 firstbyte = reqbyte = REQ_UNSET;
4607
4608 /* Accumulate the length for use in the pre-compile phase. Start with the
4609 length of the BRA and KET and any extra bytes that are required at the
4610 beginning. We accumulate in a local variable to save frequent testing of
4611 lenthptr for NULL. We cannot do this by looking at the value of code at the
4612 start and end of each alternative, because compiled items are discarded during
4613 the pre-compile phase so that the work space is not exceeded. */
4614
4615 length = 2 + 2*LINK_SIZE + skipbytes;
4616
4617 /* WARNING: If the above line is changed for any reason, you must also change
4618 the code that abstracts option settings at the start of the pattern and makes
4619 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4620 pre-compile phase to find out whether anything has yet been compiled or not. */
4621
4622 /* Offset is set zero to mark that this bracket is still open */
4623
4624 PUT(code, 1, 0);
4625 code += 1 + LINK_SIZE + skipbytes;
4626
4627 /* Loop for each alternative branch */
4628
4629 for (;;)
4630 {
4631 /* Handle a change of ims options at the start of the branch */
4632
4633 if ((options & PCRE_IMS) != oldims)
4634 {
4635 *code++ = OP_OPT;
4636 *code++ = options & PCRE_IMS;
4637 length += 2;
4638 }
4639
4640 /* Set up dummy OP_REVERSE if lookbehind assertion */
4641
4642 if (lookbehind)
4643 {
4644 *code++ = OP_REVERSE;
4645 reverse_count = code;
4646 PUTINC(code, 0, 0);
4647 length += 1 + LINK_SIZE;
4648 }
4649
4650 /* Now compile the branch; in the pre-compile phase its length gets added
4651 into the length. */
4652
4653 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4654 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4655 {
4656 *ptrptr = ptr;
4657 return FALSE;
4658 }
4659
4660 /* In the real compile phase, there is some post-processing to be done. */
4661
4662 if (lengthptr == NULL)
4663 {
4664 /* If this is the first branch, the firstbyte and reqbyte values for the
4665 branch become the values for the regex. */
4666
4667 if (*last_branch != OP_ALT)
4668 {
4669 firstbyte = branchfirstbyte;
4670 reqbyte = branchreqbyte;
4671 }
4672
4673 /* If this is not the first branch, the first char and reqbyte have to
4674 match the values from all the previous branches, except that if the
4675 previous value for reqbyte didn't have REQ_VARY set, it can still match,
4676 and we set REQ_VARY for the regex. */
4677
4678 else
4679 {
4680 /* If we previously had a firstbyte, but it doesn't match the new branch,
4681 we have to abandon the firstbyte for the regex, but if there was
4682 previously no reqbyte, it takes on the value of the old firstbyte. */
4683
4684 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4685 {
4686 if (reqbyte < 0) reqbyte = firstbyte;
4687 firstbyte = REQ_NONE;
4688 }
4689
4690 /* If we (now or from before) have no firstbyte, a firstbyte from the
4691 branch becomes a reqbyte if there isn't a branch reqbyte. */
4692
4693 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4694 branchreqbyte = branchfirstbyte;
4695
4696 /* Now ensure that the reqbytes match */
4697
4698 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4699 reqbyte = REQ_NONE;
4700 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4701 }
4702
4703 /* If lookbehind, check that this branch matches a fixed-length string, and
4704 put the length into the OP_REVERSE item. Temporarily mark the end of the
4705 branch with OP_END. */
4706
4707 if (lookbehind)
4708 {
4709 int fixed_length;
4710 *code = OP_END;
4711 fixed_length = find_fixedlength(last_branch, options);
4712 DPRINTF(("fixed length = %d\n", fixed_length));
4713 if (fixed_length < 0)
4714 {
4715 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4716 *ptrptr = ptr;
4717 return FALSE;
4718 }
4719 PUT(reverse_count, 0, fixed_length);
4720 }
4721 }
4722
4723 /* Reached end of expression, either ')' or end of pattern. Go back through
4724 the alternative branches and reverse the chain of offsets, with the field in
4725 the BRA item now becoming an offset to the first alternative. If there are
4726 no alternatives, it points to the end of the group. The length in the
4727 terminating ket is always the length of the whole bracketed item. If any of
4728 the ims options were changed inside the group, compile a resetting op-code
4729 following, except at the very end of the pattern. Return leaving the pointer
4730 at the terminating char. */
4731
4732 if (*ptr != '|')
4733 {
4734 int branch_length = code - last_branch;
4735 do
4736 {
4737 int prev_length = GET(last_branch, 1);
4738 PUT(last_branch, 1, branch_length);
4739 branch_length = prev_length;
4740 last_branch -= branch_length;
4741 }
4742 while (branch_length > 0);
4743
4744 /* Fill in the ket */
4745
4746 *code = OP_KET;
4747 PUT(code, 1, code - start_bracket);
4748 code += 1 + LINK_SIZE;
4749
4750 /* Resetting option if needed */
4751
4752 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4753 {
4754 *code++ = OP_OPT;
4755 *code++ = oldims;
4756 length += 2;
4757 }
4758
4759 /* Set values to pass back */
4760
4761 *codeptr = code;
4762 *ptrptr = ptr;
4763 *firstbyteptr = firstbyte;
4764 *reqbyteptr = reqbyte;
4765 if (lengthptr != NULL) *lengthptr += length;
4766 return TRUE;
4767 }
4768
4769 /* Another branch follows; insert an "or" node. Its length field points back
4770 to the previous branch while the bracket remains open. At the end the chain
4771 is reversed. It's done like this so that the start of the bracket has a
4772 zero offset until it is closed, making it possible to detect recursion. */
4773
4774 *code = OP_ALT;
4775 PUT(code, 1, code - last_branch);
4776 bc.current = last_branch = code;
4777 code += 1 + LINK_SIZE;
4778 ptr++;
4779 length += 1 + LINK_SIZE;
4780 }
4781 /* Control never reaches here */
4782 }
4783
4784
4785
4786
4787 /*************************************************
4788 * Check for anchored expression *
4789 *************************************************/
4790
4791 /* Try to find out if this is an anchored regular expression. Consider each
4792 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4793 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4794 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4795 counts, since OP_CIRC can match in the middle.
4796
4797 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4798 This is the code for \G, which means "match at start of match position, taking
4799 into account the match offset".
4800
4801 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4802 because that will try the rest of the pattern at all possible matching points,
4803 so there is no point trying again.... er ....
4804
4805 .... except when the .* appears inside capturing parentheses, and there is a
4806 subsequent back reference to those parentheses. We haven't enough information
4807 to catch that case precisely.
4808
4809 At first, the best we could do was to detect when .* was in capturing brackets
4810 and the highest back reference was greater than or equal to that level.
4811 However, by keeping a bitmap of the first 31 back references, we can catch some
4812 of the more common cases more precisely.
4813
4814 Arguments:
4815 code points to start of expression (the bracket)
4816 options points to the options setting
4817 bracket_map a bitmap of which brackets we are inside while testing; this
4818 handles up to substring 31; after that we just have to take
4819 the less precise approach
4820 backref_map the back reference bitmap
4821
4822 Returns: TRUE or FALSE
4823 */
4824
4825 static BOOL
4826 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4827 unsigned int backref_map)
4828 {
4829 do {
4830 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4831 options, PCRE_MULTILINE, FALSE);
4832 register int op = *scode;
4833
4834 /* Non-capturing brackets */
4835
4836 if (op == OP_BRA)
4837 {
4838 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4839 }
4840
4841 /* Capturing brackets */
4842
4843 else if (op == OP_CBRA)
4844 {
4845 int n = GET2(scode, 1+LINK_SIZE);
4846 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4847 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4848 }
4849
4850 /* Other brackets */
4851
4852 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4853 {
4854 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4855 }
4856
4857 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4858 are or may be referenced. */
4859
4860 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4861 op == OP_TYPEPOSSTAR) &&
4862 (*options & PCRE_DOTALL) != 0)
4863 {
4864 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4865 }
4866
4867 /* Check for explicit anchoring */
4868
4869 else if (op != OP_SOD && op != OP_SOM &&
4870 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4871 return FALSE;
4872 code += GET(code, 1);
4873 }
4874 while (*code == OP_ALT); /* Loop for each alternative */
4875 return TRUE;
4876 }
4877
4878
4879
4880 /*************************************************
4881 * Check for starting with ^ or .* *
4882 *************************************************/
4883
4884 /* This is called to find out if every branch starts with ^ or .* so that
4885 "first char" processing can be done to speed things up in multiline
4886 matching and for non-DOTALL patterns that start with .* (which must start at
4887 the beginning or after \n). As in the case of is_anchored() (see above), we
4888 have to take account of back references to capturing brackets that contain .*
4889 because in that case we can't make the assumption.
4890
4891 Arguments:
4892 code points to start of expression (the bracket)
4893 bracket_map a bitmap of which brackets we are inside while testing; this
4894 handles up to substring 31; after that we just have to take
4895 the less precise approach
4896 backref_map the back reference bitmap
4897
4898 Returns: TRUE or FALSE
4899 */
4900
4901 static BOOL
4902 is_startline(const uschar *code, unsigned int bracket_map,
4903 unsigned int backref_map)
4904 {
4905 do {
4906 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4907 NULL, 0, FALSE);
4908 register int op = *scode;
4909
4910 /* Non-capturing brackets */
4911
4912 if (op == OP_BRA)
4913 {
4914 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
4915 }
4916
4917 /* Capturing brackets */
4918
4919 else if (op == OP_CBRA)
4920 {
4921 int n = GET2(scode, 1+LINK_SIZE);
4922 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4923 if (!is_startline(scode, new_map, backref_map)) return FALSE;
4924 }
4925
4926 /* Other brackets */
4927
4928 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4929 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4930
4931 /* .* means "start at start or after \n" if it isn't in brackets that
4932 may be referenced. */
4933
4934 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
4935 {
4936 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4937 }
4938
4939 /* Check for explicit circumflex */
4940
4941 else if (op != OP_CIRC) return FALSE;
4942
4943 /* Move on to the next alternative */
4944
4945 code += GET(code, 1);
4946 }
4947 while (*code == OP_ALT); /* Loop for each alternative */
4948 return TRUE;
4949 }
4950
4951
4952
4953 /*************************************************
4954 * Check for asserted fixed first char *
4955 *************************************************/
4956
4957 /* During compilation, the "first char" settings from forward assertions are
4958 discarded, because they can cause conflicts with actual literals that follow.
4959 However, if we end up without a first char setting for an unanchored pattern,
4960 it is worth scanning the regex to see if there is an initial asserted first
4961 char. If all branches start with the same asserted char, or with a bracket all
4962 of whose alternatives start with the same asserted char (recurse ad lib), then
4963 we return that char, otherwise -1.
4964
4965 Arguments:
4966 code points to start of expression (the bracket)
4967 options pointer to the options (used to check casing changes)
4968 inassert TRUE if in an assertion
4969
4970 Returns: -1 or the fixed first char
4971 */
4972
4973 static int
4974 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4975 {
4976 register int c = -1;
4977 do {
4978 int d;
4979 const uschar *scode =
4980 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4981 register int op = *scode;
4982
4983 switch(op)
4984 {
4985 default:
4986 return -1;
4987
4988 case OP_BRA:
4989 case OP_CBRA:
4990 case OP_ASSERT:
4991 case OP_ONCE:
4992 case OP_COND:
4993 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4994 return -1;
4995 if (c < 0) c = d; else if (c != d) return -1;
4996 break;
4997
4998 case OP_EXACT: /* Fall through */
4999 scode += 2;
5000
5001 case OP_CHAR:
5002 case OP_CHARNC:
5003 case OP_PLUS:
5004 case OP_MINPLUS:
5005 case OP_POSPLUS:
5006 if (!inassert) return -1;
5007 if (c < 0)
5008 {
5009 c = scode[1];
5010 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5011 }
5012 else if (c != scode[1]) return -1;
5013 break;
5014 }
5015
5016 code += GET(code, 1);
5017 }
5018 while (*code == OP_ALT);
5019 return c;
5020 }
5021
5022
5023
5024 /*************************************************
5025 * Compile a Regular Expression *
5026 *************************************************/
5027
5028 /* This function takes a string and returns a pointer to a block of store
5029 holding a compiled version of the expression. The original API for this
5030 function had no error code return variable; it is retained for backwards
5031 compatibility. The new function is given a new name.
5032
5033 Arguments:
5034 pattern the regular expression
5035 options various option bits
5036 errorcodeptr pointer to error code variable (pcre_compile2() only)
5037 can be NULL if you don't want a code value
5038 errorptr pointer to pointer to error text
5039 erroroffset ptr offset in pattern where error was detected
5040 tables pointer to character tables or NULL
5041
5042 Returns: pointer to compiled data block, or NULL on error,
5043 with errorptr and erroroffset set
5044 */
5045
5046 PCRE_EXP_DEFN pcre *
5047 pcre_compile(const char *pattern, int options, const char **errorptr,
5048 int *erroroffset, const unsigned char *tables)
5049 {
5050 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5051 }
5052
5053
5054 PCRE_EXP_DEFN pcre *
5055 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5056 const char **errorptr, int *erroroffset, const unsigned char *tables)
5057 {
5058 real_pcre *re;
5059 int length = 1; /* For final END opcode */
5060 int firstbyte, reqbyte, newline;
5061 int errorcode = 0;
5062 #ifdef SUPPORT_UTF8
5063 BOOL utf8;
5064 #endif
5065 size_t size;
5066 uschar *code;
5067 const uschar *codestart;
5068 const uschar *ptr;
5069 compile_data compile_block;
5070 compile_data *cd = &compile_block;
5071
5072 /* This space is used for "compiling" into during the first phase, when we are
5073 computing the amount of memory that is needed. Compiled items are thrown away
5074 as soon as possible, so that a fairly large buffer should be sufficient for
5075 this purpose. The same space is used in the second phase for remembering where
5076 to fill in forward references to subpatterns. */
5077
5078 uschar cworkspace[COMPILE_WORK_SIZE];
5079
5080
5081 /* Set this early so that early errors get offset 0. */
5082
5083 ptr = (const uschar *)pattern;
5084
5085 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5086 can do is just return NULL, but we can set a code value if there is a code
5087 pointer. */
5088
5089 if (errorptr == NULL)
5090 {
5091 if (errorcodeptr != NULL) *errorcodeptr = 99;
5092 return NULL;
5093 }
5094
5095 *errorptr = NULL;
5096 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5097
5098 /* However, we can give a message for this error */
5099
5100 if (erroroffset == NULL)
5101 {
5102 errorcode = ERR16;
5103 goto PCRE_EARLY_ERROR_RETURN2;
5104 }
5105
5106 *erroroffset = 0;
5107
5108 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5109
5110 #ifdef SUPPORT_UTF8
5111 utf8 = (options & PCRE_UTF8) != 0;
5112 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5113 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5114 {
5115 errorcode = ERR44;
5116 goto PCRE_EARLY_ERROR_RETURN2;
5117 }
5118 #else
5119 if ((options & PCRE_UTF8) != 0)
5120 {
5121 errorcode = ERR32;
5122 goto PCRE_EARLY_ERROR_RETURN;
5123 }
5124 #endif
5125
5126 if ((options & ~PUBLIC_OPTIONS) != 0)
5127 {
5128 errorcode = ERR17;
5129 goto PCRE_EARLY_ERROR_RETURN;
5130 }
5131
5132 /* Set up pointers to the individual character tables */
5133
5134 if (tables == NULL) tables = _pcre_default_tables;
5135 cd->lcc = tables + lcc_offset;
5136 cd->fcc = tables + fcc_offset;
5137 cd->cbits = tables + cbits_offset;
5138 cd->ctypes = tables + ctypes_offset;
5139
5140 /* Handle different types of newline. The three bits give seven cases. The
5141 current code allows for fixed one- or two-byte sequences, plus "any" and
5142 "anycrlf". */
5143
5144 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5145 {
5146 case 0: newline = NEWLINE; break; /* Compile-time default */
5147 case PCRE_NEWLINE_CR: newline = '\r'; break;
5148 case PCRE_NEWLINE_LF: newline = '\n'; break;
5149 case PCRE_NEWLINE_CR+
5150 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5151 case PCRE_NEWLINE_ANY: newline = -1; break;
5152 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5153 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5154 }
5155
5156 if (newline == -2)
5157 {
5158 cd->nltype = NLTYPE_ANYCRLF;
5159 }
5160 else if (newline < 0)
5161 {
5162 cd->nltype = NLTYPE_ANY;
5163 }
5164 else
5165 {
5166 cd->nltype = NLTYPE_FIXED;
5167 if (newline > 255)
5168 {
5169 cd->nllen = 2;
5170 cd->nl[0] = (newline >> 8) & 255;
5171 cd->nl[1] = newline & 255;
5172 }
5173 else
5174 {
5175 cd->nllen = 1;
5176 cd->nl[0] = newline;
5177 }
5178 }
5179
5180 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5181 references to help in deciding whether (.*) can be treated as anchored or not.
5182 */
5183
5184 cd->top_backref = 0;
5185 cd->backref_map = 0;
5186
5187 /* Reflect pattern for debugging output */
5188
5189 DPRINTF(("------------------------------------------------------------------\n"));
5190 DPRINTF(("%s\n", pattern));
5191
5192 /* Pretend to compile the pattern while actually just accumulating the length
5193 of memory required. This behaviour is triggered by passing a non-NULL final
5194 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5195 to compile parts of the pattern into; the compiled code is discarded when it is
5196 no longer needed, so hopefully this workspace will never overflow, though there
5197 is a test for its doing so. */
5198
5199 cd->bracount = 0;
5200 cd->names_found = 0;
5201 cd->name_entry_size = 0;
5202 cd->name_table = NULL;
5203 cd->start_workspace = cworkspace;
5204 cd->start_code = cworkspace;
5205 cd->hwm = cworkspace;
5206 cd->start_pattern = (const uschar *)pattern;
5207 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5208 cd->req_varyopt = 0;
5209 cd->nopartial = FALSE;
5210 cd->external_options = options;
5211
5212 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5213 don't need to look at the result of the function here. The initial options have
5214 been put into the cd block so that they can be changed if an option setting is
5215 found within the regex right at the beginning. Bringing initial option settings
5216 outside can help speed up starting point checks. */
5217
5218 code = cworkspace;
5219 *code = OP_BRA;
5220 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5221 &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
5222 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5223
5224 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5225 cd->hwm - cworkspace));
5226
5227 if (length > MAX_PATTERN_SIZE)
5228 {
5229 errorcode = ERR20;
5230 goto PCRE_EARLY_ERROR_RETURN;
5231 }
5232
5233 /* Compute the size of data block needed and get it, either from malloc or
5234 externally provided function. Integer overflow should no longer be possible
5235 because nowadays we limit the maximum value of cd->names_found and
5236 cd->name_entry_size. */
5237
5238 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5239 re = (real_pcre *)(pcre_malloc)(size);
5240
5241 if (re == NULL)
5242 {
5243 errorcode = ERR21;
5244 goto PCRE_EARLY_ERROR_RETURN;
5245 }
5246
5247 /* Put in the magic number, and save the sizes, initial options, and character
5248 table pointer. NULL is used for the default character tables. The nullpad field
5249 is at the end; it's there to help in the case when a regex compiled on a system
5250 with 4-byte pointers is run on another with 8-byte pointers. */
5251
5252 re->magic_number = MAGIC_NUMBER;
5253 re->size = size;
5254 re->options = cd->external_options;
5255 re->dummy1 = 0;
5256 re->first_byte = 0;
5257 re->req_byte = 0;
5258 re->name_table_offset = sizeof(real_pcre);
5259 re->name_entry_size = cd->name_entry_size;
5260 re->name_count = cd->names_found;
5261 re->ref_count = 0;
5262 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5263 re->nullpad = NULL;
5264
5265 /* The starting points of the name/number translation table and of the code are
5266 passed around in the compile data block. The start/end pattern and initial
5267 options are already set from the pre-compile phase, as is the name_entry_size
5268 field. Reset the bracket count and the names_found field. Also reset the hwm
5269 field; this time it's used for remembering forward references to subpatterns.
5270 */
5271
5272 cd->bracount = 0;
5273 cd->names_found = 0;
5274 cd->name_table = (uschar *)re + re->name_table_offset;
5275 codestart = cd->name_table + re->name_entry_size * re->name_count;
5276 cd->start_code = codestart;
5277 cd->hwm = cworkspace;
5278 cd->req_varyopt = 0;
5279 cd->nopartial = FALSE;
5280
5281 /* Set up a starting, non-extracting bracket, then compile the expression. On
5282 error, errorcode will be set non-zero, so we don't need to look at the result
5283 of the function here. */
5284
5285 ptr = (const uschar *)pattern;
5286 code = (uschar *)codestart;
5287 *code = OP_BRA;
5288 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5289 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5290 re->top_bracket = cd->bracount;
5291 re->top_backref = cd->top_backref;
5292
5293 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5294
5295 /* If not reached end of pattern on success, there's an excess bracket. */
5296
5297 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5298
5299 /* Fill in the terminating state and check for disastrous overflow, but
5300 if debugging, leave the test till after things are printed out. */
5301
5302 *code++ = OP_END;
5303
5304 #ifndef DEBUG
5305 if (code - codestart > length) errorcode = ERR23;
5306 #endif
5307
5308 /* Fill in any forward references that are required. */
5309
5310 while (errorcode == 0 && cd->hwm > cworkspace)
5311 {
5312 int offset, recno;
5313 const uschar *groupptr;
5314 cd->hwm -= LINK_SIZE;
5315 offset = GET(cd->hwm, 0);
5316 recno = GET(codestart, offset);
5317 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5318 if (groupptr == NULL) errorcode = ERR53;
5319 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5320 }
5321
5322 /* Give an error if there's back reference to a non-existent capturing
5323 subpattern. */
5324
5325 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5326
5327 /* Failed to compile, or error while post-processing */
5328
5329 if (errorcode != 0)
5330 {
5331 (pcre_free)(re);
5332 PCRE_EARLY_ERROR_RETURN:
5333 *erroroffset = ptr - (const uschar *)pattern;
5334 PCRE_EARLY_ERROR_RETURN2:
5335 *errorptr = error_texts[errorcode];
5336 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5337 return NULL;
5338 }
5339
5340 /* If the anchored option was not passed, set the flag if we can determine that
5341 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5342 as starting with .* when DOTALL is set).
5343
5344 Otherwise, if we know what the first byte has to be, save it, because that
5345 speeds up unanchored matches no end. If not, see if we can set the
5346 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5347 start with ^. and also when all branches start with .* for non-DOTALL matches.
5348 */
5349
5350 if ((re->options & PCRE_ANCHORED) == 0)
5351 {
5352 int temp_options = re->options; /* May get changed during these scans */
5353 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5354 re->options |= PCRE_ANCHORED;
5355 else
5356 {
5357 if (firstbyte < 0)
5358 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5359 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5360 {
5361 int ch = firstbyte & 255;
5362 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5363 cd->fcc[ch] == ch)? ch : firstbyte;
5364 re->options |= PCRE_FIRSTSET;
5365 }
5366 else if (is_startline(codestart, 0, cd->backref_map))
5367 re->options |= PCRE_STARTLINE;
5368 }
5369 }
5370
5371 /* For an anchored pattern, we use the "required byte" only if it follows a
5372 variable length item in the regex. Remove the caseless flag for non-caseable
5373 bytes. */
5374
5375 if (reqbyte >= 0 &&
5376 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5377 {
5378 int ch = reqbyte & 255;
5379 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5380 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5381 re->options |= PCRE_REQCHSET;
5382 }
5383
5384 /* Print out the compiled data if debugging is enabled. This is never the
5385 case when building a production library. */
5386
5387 #ifdef DEBUG
5388
5389 printf("Length = %d top_bracket = %d top_backref = %d\n",
5390 length, re->top_bracket, re->top_backref);
5391
5392 if (re->options != 0)
5393 {
5394 printf("%s%s%s%s%s%s%s%s%s\n",
5395 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5396 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5397 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5398 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5399 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5400 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5401 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5402 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5403 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5404 }
5405
5406 if ((re->options & PCRE_FIRSTSET) != 0)
5407 {
5408 int ch = re->first_byte & 255;
5409 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5410 "" : " (caseless)";
5411 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5412 else printf("First char = \\x%02x%s\n", ch, caseless);
5413 }
5414
5415 if ((re->options & PCRE_REQCHSET) != 0)
5416 {
5417 int ch = re->req_byte & 255;
5418 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5419 "" : " (caseless)";
5420 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5421 else printf("Req char = \\x%02x%s\n", ch, caseless);
5422 }
5423
5424 pcre_printint(re, stdout, TRUE);
5425
5426 /* This check is done here in the debugging case so that the code that
5427 was compiled can be seen. */
5428
5429 if (code - codestart > length)
5430 {
5431 (pcre_free)(re);
5432 *errorptr = error_texts[ERR23];
5433 *erroroffset = ptr - (uschar *)pattern;
5434 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5435 return NULL;
5436 }
5437 #endif /* DEBUG */
5438
5439 return (pcre *)re;
5440 }
5441
5442 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12