/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 178 - (show annotations) (download)
Wed Jun 13 08:44:34 2007 UTC (6 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 181949 byte(s)
Add support for \h, \H, \v, \V.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
/* Identifier substitutions used by this module.
NOTE(review): these are presumably consumed by macros in pcre_internal.h,
which would explain why they are defined ahead of its #include - confirm. */

/* Block containing newline information */
#define NLBLOCK cd

/* Field containing processed string start */
#define PSSTART start_pattern

/* Field containing processed string end */
#define PSEND end_pattern
48
49
50 #include "pcre_internal.h"
51
52
53 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54 used by pcretest. DEBUG is not defined when building a production library. */
55
56 #ifdef DEBUG
57 #include "pcre_printint.src"
58 #endif
59
60
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a (bit b lives in byte b/8, at position b%8). Both arguments and the
whole expansion are parenthesized so that expression arguments such as
SETBIT(map, c + 1) index and shift by the full value of the expression rather
than being torn apart by operator precedence. Note that b is still evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
64
65
66 /*************************************************
67 * Code parameters and static tables *
68 *************************************************/
69
/* Size of the on-stack workspace.

During the first (pre-compile) phase, which works out how much memory is
needed, the regex is partly compiled into this workspace. Compiled parts are
thrown away as early as possible, so an overrun is not expected, but the code
checks for one all the same. The largest amount the author has seen used is
218, so the value below is very generous.

During the second (real) compile phase the same workspace remembers forward
references to groups so that they can be filled in at the end. Each entry in
that list takes LINK_SIZE bytes, so there is plenty of room even when
LINK_SIZE is 4. */

#define COMPILE_WORK_SIZE (4096)
83
84
85 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
86 are simple data values; negative values are for special things like \d and so
87 on. Zero means further processing is needed (for things like \x), or the escape
88 is invalid. */
89
90 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
91 static const short int escapes[] = {
92 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
93 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
94 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
95 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
96 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
97 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
98 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
99 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
100 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
101 0, 0, -ESC_z /* x - z */
102 };
103
104 #else /* This is the "abnormal" table for EBCDIC systems */
105 static const short int escapes[] = {
106 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
107 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
108 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
109 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
110 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
111 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
112 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
113 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
114 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
115 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
116 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
117 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
118 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
119 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
120 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
121 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
122 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
123 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
124 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
125 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
126 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
127 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
128 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
129 };
130 #endif
131
132
/* The names of the POSIX character classes, followed (in a separate table) by
the length of each name. The list of lengths ends with a zero entry. The order
matters: alpha, lower, upper must be the first three, because the handling of
case independence assumes it. */

static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
141
142 static const uschar posix_name_lengths[] = {
143 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
144
145 /* Table of class bit maps for each POSIX class. Each class is formed from a
146 base map, with an optional addition or removal of another map. Then, for some
147 classes, there is some additional tweaking: for [:blank:] the vertical space
148 characters are removed, and for [:alpha:] and [:alnum:] the underscore
149 character is removed. The triples in the table consist of the base map offset,
150 second map offset or -1 if no second map, and a non-negative value for map
151 addition or a negative value for map subtraction (if there are two maps). The
152 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
153 remove vertical space characters, 2 => remove underscore. */
154
155 static const int posix_class_maps[] = {
156 cbit_word, cbit_digit, -2, /* alpha */
157 cbit_lower, -1, 0, /* lower */
158 cbit_upper, -1, 0, /* upper */
159 cbit_word, -1, 2, /* alnum - word without underscore */
160 cbit_print, cbit_cntrl, 0, /* ascii */
161 cbit_space, -1, 1, /* blank - a GNU extension */
162 cbit_cntrl, -1, 0, /* cntrl */
163 cbit_digit, -1, 0, /* digit */
164 cbit_graph, -1, 0, /* graph */
165 cbit_print, -1, 0, /* print */
166 cbit_punct, -1, 0, /* punct */
167 cbit_space, -1, 0, /* space */
168 cbit_word, -1, 0, /* word - a Perl extension */
169 cbit_xdigit,-1, 0 /* xdigit */
170 };
171
172
/* Two-level stringification, so that a macro argument is expanded before it
is turned into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Compile-time error messages, indexed by error number. They are "char *"
because they are passed to the outside world. Error numbers are documented, so
a number must never be re-used: always append a new message instead. Entries
marked DEAD are no longer used. */

static const char *error_texts[] = {
  /*  0 */ "no error",
  /*  1 */ "\\ at end of pattern",
  /*  2 */ "\\c at end of pattern",
  /*  3 */ "unrecognized character follows \\",
  /*  4 */ "numbers out of order in {} quantifier",
  /*  5 */ "number too big in {} quantifier",
  /*  6 */ "missing terminating ] for character class",
  /*  7 */ "invalid escape sequence in character class",
  /*  8 */ "range out of order in character class",
  /*  9 */ "nothing to repeat",
  /* 10 */ "operand of unlimited repeat could match the empty string",  /** DEAD **/
  /* 11 */ "internal error: unexpected repeat",
  /* 12 */ "unrecognized character after (?",
  /* 13 */ "POSIX named classes are supported only within a class",
  /* 14 */ "missing )",
  /* 15 */ "reference to non-existent subpattern",
  /* 16 */ "erroffset passed as NULL",
  /* 17 */ "unknown option bit(s) set",
  /* 18 */ "missing ) after comment",
  /* 19 */ "parentheses nested too deeply",  /** DEAD **/
  /* 20 */ "regular expression too large",
  /* 21 */ "failed to get memory",
  /* 22 */ "unmatched parentheses",
  /* 23 */ "internal error: code overflow",
  /* 24 */ "unrecognized character after (?<",
  /* 25 */ "lookbehind assertion is not fixed length",
  /* 26 */ "malformed number or name after (?(",
  /* 27 */ "conditional group contains more than two branches",
  /* 28 */ "assertion expected after (?(",
  /* 29 */ "(?R or (?[+-]digits must be followed by )",
  /* 30 */ "unknown POSIX class name",
  /* 31 */ "POSIX collating elements are not supported",
  /* 32 */ "this version of PCRE is not compiled with PCRE_UTF8 support",
  /* 33 */ "spare error",  /** DEAD **/
  /* 34 */ "character value in \\x{...} sequence is too large",
  /* 35 */ "invalid condition (?(0)",
  /* 36 */ "\\C not allowed in lookbehind assertion",
  /* 37 */ "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  /* 38 */ "number after (?C is > 255",
  /* 39 */ "closing ) for (?C expected",
  /* 40 */ "recursive call could loop indefinitely",
  /* 41 */ "unrecognized character after (?P",
  /* 42 */ "syntax error in subpattern name (missing terminator)",
  /* 43 */ "two named subpatterns have the same name",
  /* 44 */ "invalid UTF-8 string",
  /* 45 */ "support for \\P, \\p, and \\X has not been compiled",
  /* 46 */ "malformed \\P or \\p sequence",
  /* 47 */ "unknown property name after \\P or \\p",
  /* 48 */ "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  /* 49 */ "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */ "repeated subpattern is too long",
  /* 51 */ "octal value is greater than \\377 (not in UTF-8 mode)",
  /* 52 */ "internal error: overran compiling workspace",
  /* 53 */ "internal error: previously-checked referenced subpattern not found",
  /* 54 */ "DEFINE group contains more than one branch",
  /* 55 */ "repeating a DEFINE group is not allowed",
  /* 56 */ "inconsistent NEWLINE options",
  /* 57 */ "\\g is not followed by a braced name or an optionally braced non-zero number",
  /* 58 */ "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
};
253
254
/* Private table that identifies decimal and hexadecimal digits while a
pattern is being compiled. The tables in chartables depend on the locale and
may mark arbitrary characters as digits, whereas the compiling code expects to
handle only 0-9, a-z, and A-Z as digits; hence this locale-independent copy.
It costs 256 bytes, but the author timed it as a lot faster than character
value tests (at least in some simple cases), and some applications want PCRE
to compile efficiently as well as match efficiently.

The bit assignments are the same as in chartables, so ctype_digit and
ctype_xdigit can be used with it:

  0x04  decimal digit
  0x08  hexadecimal digit

In the tables below, the comment at the start of each row is the hexadecimal
code of the row's first entry. */

#ifndef EBCDIC

/* The "normal" case, for ASCII systems. */

static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 08 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0 - 7 */
  /* 38 */ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /* 8, 9 */
  /* 40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* A - F */
  /* 48 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 58 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* a - f */
  /* 68 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 88 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 98 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };

#else

/* The "abnormal" case, for EBCDIC systems. */

static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 08 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 38 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 48 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 58 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 68 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* a - f */
  /* 88 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 98 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* A - F */
  /* C8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0 - 7 */
  /* F8 */ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00    /* 8, 9 */
  };

/* Partial duplicate of the chartable, for EBCDIC systems. */

static const unsigned char ebcdic_chartab[] =
  {
  /* 00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 08 */ 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 38 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 48 */ 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 58 */ 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 68 */ 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
  /* 88 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 98 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
  /* A8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B8 */ 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
  /* C8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  /* D8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
  /* E8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,
  /* F8 */ 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
  };

#endif
377
378
379 /* Definition to allow mutual recursion */
380
381 static BOOL
382 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
383 int *, int *, branch_chain *, compile_data *, int *);
384
385
386
387 /*************************************************
388 * Handle escapes *
389 *************************************************/
390
391 /* This function is called when a \ has been encountered. It either returns a
392 positive value for a simple escape such as \n, or a negative value which
393 encodes one of the more complicated things such as \d. A backreference to group
394 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
395 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
396 ptr is pointing at the \. On exit, it is on the final character of the escape
397 sequence.
398
399 Arguments:
400 ptrptr points to the pattern position pointer
401 errorcodeptr points to the errorcode variable
402 bracount number of previous extracting brackets
403 options the options bits
404 isclass TRUE if inside a character class
405
406 Returns: zero or positive => a data character
407 negative => a special escape sequence
408 on error, errorptr is set
409 */
410
411 static int
412 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
413 int options, BOOL isclass)
414 {
415 BOOL utf8 = (options & PCRE_UTF8) != 0;
416 const uschar *ptr = *ptrptr + 1;
417 int c, i;
418
419 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
420 ptr--; /* Set pointer back to the last byte */
421
422 /* If backslash is at the end of the pattern, it's an error. */
423
424 if (c == 0) *errorcodeptr = ERR1;
425
426 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
427 a table. A non-zero result is something that can be returned immediately.
428 Otherwise further processing may be required. */
429
430 #ifndef EBCDIC /* ASCII coding */
431 else if (c < '0' || c > 'z') {} /* Not alphameric */
432 else if ((i = escapes[c - '0']) != 0) c = i;
433
434 #else /* EBCDIC coding */
435 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
436 else if ((i = escapes[c - 0x48]) != 0) c = i;
437 #endif
438
439 /* Escapes that need further processing, or are illegal. */
440
441 else
442 {
443 const uschar *oldptr;
444 BOOL braced, negated;
445
446 switch (c)
447 {
448 /* A number of Perl escapes are not handled by PCRE. We give an explicit
449 error. */
450
451 case 'l':
452 case 'L':
453 case 'N':
454 case 'u':
455 case 'U':
456 *errorcodeptr = ERR37;
457 break;
458
459 /* \g must be followed by a number, either plain or braced. If positive, it
460 is an absolute backreference. If negative, it is a relative backreference.
461 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
462 reference to a named group. This is part of Perl's movement towards a
463 unified syntax for back references. As this is synonymous with \k{name}, we
464 fudge it up by pretending it really was \k. */
465
466 case 'g':
467 if (ptr[1] == '{')
468 {
469 const uschar *p;
470 for (p = ptr+2; *p != 0 && *p != '}'; p++)
471 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
472 if (*p != 0 && *p != '}')
473 {
474 c = -ESC_k;
475 break;
476 }
477 braced = TRUE;
478 ptr++;
479 }
480 else braced = FALSE;
481
482 if (ptr[1] == '-')
483 {
484 negated = TRUE;
485 ptr++;
486 }
487 else negated = FALSE;
488
489 c = 0;
490 while ((digitab[ptr[1]] & ctype_digit) != 0)
491 c = c * 10 + *(++ptr) - '0';
492
493 if (c == 0 || (braced && *(++ptr) != '}'))
494 {
495 *errorcodeptr = ERR57;
496 return 0;
497 }
498
499 if (negated)
500 {
501 if (c > bracount)
502 {
503 *errorcodeptr = ERR15;
504 return 0;
505 }
506 c = bracount - (c - 1);
507 }
508
509 c = -(ESC_REF + c);
510 break;
511
512 /* The handling of escape sequences consisting of a string of digits
513 starting with one that is not zero is not straightforward. By experiment,
514 the way Perl works seems to be as follows:
515
516 Outside a character class, the digits are read as a decimal number. If the
517 number is less than 10, or if there are that many previous extracting
518 left brackets, then it is a back reference. Otherwise, up to three octal
519 digits are read to form an escaped byte. Thus \123 is likely to be octal
520 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
521 value is greater than 377, the least significant 8 bits are taken. Inside a
522 character class, \ followed by a digit is always an octal number. */
523
524 case '1': case '2': case '3': case '4': case '5':
525 case '6': case '7': case '8': case '9':
526
527 if (!isclass)
528 {
529 oldptr = ptr;
530 c -= '0';
531 while ((digitab[ptr[1]] & ctype_digit) != 0)
532 c = c * 10 + *(++ptr) - '0';
533 if (c < 10 || c <= bracount)
534 {
535 c = -(ESC_REF + c);
536 break;
537 }
538 ptr = oldptr; /* Put the pointer back and fall through */
539 }
540
541 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
542 generates a binary zero byte and treats the digit as a following literal.
543 Thus we have to pull back the pointer by one. */
544
545 if ((c = *ptr) >= '8')
546 {
547 ptr--;
548 c = 0;
549 break;
550 }
551
552 /* \0 always starts an octal number, but we may drop through to here with a
553 larger first octal digit. The original code used just to take the least
554 significant 8 bits of octal numbers (I think this is what early Perls used
555 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
556 than 3 octal digits. */
557
558 case '0':
559 c -= '0';
560 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
561 c = c * 8 + *(++ptr) - '0';
562 if (!utf8 && c > 255) *errorcodeptr = ERR51;
563 break;
564
565 /* \x is complicated. \x{ddd} is a character number which can be greater
566 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
567 treated as a data character. */
568
569 case 'x':
570 if (ptr[1] == '{')
571 {
572 const uschar *pt = ptr + 2;
573 int count = 0;
574
575 c = 0;
576 while ((digitab[*pt] & ctype_xdigit) != 0)
577 {
578 register int cc = *pt++;
579 if (c == 0 && cc == '0') continue; /* Leading zeroes */
580 count++;
581
582 #ifndef EBCDIC /* ASCII coding */
583 if (cc >= 'a') cc -= 32; /* Convert to upper case */
584 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
585 #else /* EBCDIC coding */
586 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
587 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
588 #endif
589 }
590
591 if (*pt == '}')
592 {
593 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
594 ptr = pt;
595 break;
596 }
597
598 /* If the sequence of hex digits does not end with '}', then we don't
599 recognize this construct; fall through to the normal \x handling. */
600 }
601
602 /* Read just a single-byte hex-defined char */
603
604 c = 0;
605 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
606 {
607 int cc; /* Some compilers don't like ++ */
608 cc = *(++ptr); /* in initializers */
609 #ifndef EBCDIC /* ASCII coding */
610 if (cc >= 'a') cc -= 32; /* Convert to upper case */
611 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
612 #else /* EBCDIC coding */
613 if (cc <= 'z') cc += 64; /* Convert to upper case */
614 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
615 #endif
616 }
617 break;
618
619 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
620 This coding is ASCII-specific, but then the whole concept of \cx is
621 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
622
623 case 'c':
624 c = *(++ptr);
625 if (c == 0)
626 {
627 *errorcodeptr = ERR2;
628 return 0;
629 }
630
631 #ifndef EBCDIC /* ASCII coding */
632 if (c >= 'a' && c <= 'z') c -= 32;
633 c ^= 0x40;
634 #else /* EBCDIC coding */
635 if (c >= 'a' && c <= 'z') c += 64;
636 c ^= 0xC0;
637 #endif
638 break;
639
640 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
641 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
642 for Perl compatibility, it is a literal. This code looks a bit odd, but
643 there used to be some cases other than the default, and there may be again
644 in future, so I haven't "optimized" it. */
645
646 default:
647 if ((options & PCRE_EXTRA) != 0) switch(c)
648 {
649 default:
650 *errorcodeptr = ERR3;
651 break;
652 }
653 break;
654 }
655 }
656
657 *ptrptr = ptr;
658 return c;
659 }
660
661
662
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up. This is only
compiled when PCRE has support for Unicode properties. On entry, *ptrptr
addresses the P or p; on exit it addresses the last character of the escape
sequence that was examined.

The name is either a single character, or a string in braces that may start
with ^ to request negation. A braced name must fit in the local buffer (30
characters at most); a longer one, a missing closing brace, or the end of the
pattern gives ERR46. A well-formed name that is not in the property table
gives ERR47.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
                   (it is left untouched if the pattern ends straight after
                   the P or p)
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching _pcre_utt entry, or -1 if the
                   name is malformed or unknown
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, lo, hi;

ch = *(++ptr);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

/* The simple form: exactly one character names the property. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The braced form, with an optional leading ^ for negation. Copy characters
up to the closing brace, leaving room for the terminating zero. */

else
  {
  int len = 0;

  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  do
    {
    ch = *(++ptr);
    if (ch == 0) goto MALFORMED;
    if (ch == '}') break;
    name[len++] = ch;
    }
  while (len < (int)sizeof(name) - 1);

  if (ch != '}') goto MALFORMED;        /* Buffer filled before the brace */
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop through the property table, which is searched by name. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt[mid].name);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* A syntactically valid name that is not in the table */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

/* A sequence that could not be parsed at all */

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
752
753
754
755
756 /*************************************************
757 * Check for counted repeat *
758 *************************************************/
759
760 /* This function is called when a '{' is encountered in a place where it might
761 start a quantifier. It looks ahead to see if it really is a quantifier or not.
762 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
763 where the ddds are digits.
764
765 Arguments:
766 p pointer to the first char after '{'
767
768 Returns: TRUE or FALSE
769 */
770
771 static BOOL
772 is_counted_repeat(const uschar *p)
773 {
774 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
775 while ((digitab[*p] & ctype_digit) != 0) p++;
776 if (*p == '}') return TRUE;
777
778 if (*p++ != ',') return FALSE;
779 if (*p == '}') return TRUE;
780
781 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
782 while ((digitab[*p] & ctype_digit) != 0) p++;
783
784 return (*p == '}');
785 }
786
787
788
789 /*************************************************
790 * Read repeat counts *
791 *************************************************/
792
793 /* Read an item of the form {n,m} and return the values. This is called only
794 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
795 so the syntax is guaranteed to be correct, but we need to check the values.
796
797 Arguments:
798 p pointer to first char after '{'
799 minp pointer to int for min
800 maxp pointer to int for max
801 returned as -1 if no max
802 errorcodeptr points to error code variable
803
804 Returns: pointer to '}' on success;
805 current ptr on error, with errorcodeptr set non-zero
806 */
807
808 static const uschar *
809 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
810 {
811 int min = 0;
812 int max = -1;
813
814 /* Read the minimum value and do a paranoid check: a negative value indicates
815 an integer overflow. */
816
817 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
818 if (min < 0 || min > 65535)
819 {
820 *errorcodeptr = ERR5;
821 return p;
822 }
823
824 /* Read the maximum value if there is one, and again do a paranoid on its size.
825 Also, max must not be less than min. */
826
827 if (*p == '}') max = min; else
828 {
829 if (*(++p) != '}')
830 {
831 max = 0;
832 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
833 if (max < 0 || max > 65535)
834 {
835 *errorcodeptr = ERR5;
836 return p;
837 }
838 if (max < min)
839 {
840 *errorcodeptr = ERR4;
841 return p;
842 }
843 }
844 }
845
846 /* Fill in the required variables, and pass back the pointer to the terminating
847 '}'. */
848
849 *minp = min;
850 *maxp = max;
851 return p;
852 }
853
854
855
856 /*************************************************
857 * Find forward referenced subpattern *
858 *************************************************/
859
860 /* This function scans along a pattern's text looking for capturing
861 subpatterns, and counting them. If it finds a named pattern that matches the
862 name it is given, it returns its number. Alternatively, if the name is NULL, it
863 returns when it reaches a given numbered subpattern. This is used for forward
864 references to subpatterns. We know that if (?P< is encountered, the name will
865 be terminated by '>' because that is checked in the first pass.
866
867 Arguments:
868 ptr current position in the pattern
869 count current count of capturing parens so far encountered
870 name name to seek, or NULL if seeking a numbered subpattern
871 lorn name length, or subpattern number if name is NULL
872 xmode TRUE if we are in /x mode
873
874 Returns: the number of the named subpattern, or -1 if not found
875 */
876
877 static int
878 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
879 BOOL xmode)
880 {
881 const uschar *thisname;
882
883 for (; *ptr != 0; ptr++)
884 {
885 int term;
886
887 /* Skip over backslashed characters and also entire \Q...\E */
888
889 if (*ptr == '\\')
890 {
891 if (*(++ptr) == 0) return -1;
892 if (*ptr == 'Q') for (;;)
893 {
894 while (*(++ptr) != 0 && *ptr != '\\');
895 if (*ptr == 0) return -1;
896 if (*(++ptr) == 'E') break;
897 }
898 continue;
899 }
900
901 /* Skip over character classes */
902
903 if (*ptr == '[')
904 {
905 while (*(++ptr) != ']')
906 {
907 if (*ptr == '\\')
908 {
909 if (*(++ptr) == 0) return -1;
910 if (*ptr == 'Q') for (;;)
911 {
912 while (*(++ptr) != 0 && *ptr != '\\');
913 if (*ptr == 0) return -1;
914 if (*(++ptr) == 'E') break;
915 }
916 continue;
917 }
918 }
919 continue;
920 }
921
922 /* Skip comments in /x mode */
923
924 if (xmode && *ptr == '#')
925 {
926 while (*(++ptr) != 0 && *ptr != '\n');
927 if (*ptr == 0) return -1;
928 continue;
929 }
930
931 /* An opening parens must now be a real metacharacter */
932
933 if (*ptr != '(') continue;
934 if (ptr[1] != '?')
935 {
936 count++;
937 if (name == NULL && count == lorn) return count;
938 continue;
939 }
940
941 ptr += 2;
942 if (*ptr == 'P') ptr++; /* Allow optional P */
943
944 /* We have to disambiguate (?<! and (?<= from (?<name> */
945
946 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
947 *ptr != '\'')
948 continue;
949
950 count++;
951
952 if (name == NULL && count == lorn) return count;
953 term = *ptr++;
954 if (term == '<') term = '>';
955 thisname = ptr;
956 while (*ptr != term) ptr++;
957 if (name != NULL && lorn == ptr - thisname &&
958 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
959 return count;
960 }
961
962 return -1;
963 }
964
965
966
967 /*************************************************
968 * Find first significant op code *
969 *************************************************/
970
971 /* This is called by several functions that scan a compiled expression looking
972 for a fixed first character, or an anchoring op code etc. It skips over things
973 that do not influence this. For some calls, a change of option is important.
974 For some calls, it makes sense to skip negative forward and all backward
975 assertions, and also the \b assertion; for others it does not.
976
977 Arguments:
978 code pointer to the start of the group
979 options pointer to external options
980 optbit the option bit whose changing is significant, or
981 zero if none are
982 skipassert TRUE if certain assertions are to be skipped
983
984 Returns: pointer to the first significant opcode
985 */
986
987 static const uschar*
988 first_significant_code(const uschar *code, int *options, int optbit,
989 BOOL skipassert)
990 {
991 for (;;)
992 {
993 switch ((int)*code)
994 {
995 case OP_OPT:
996 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
997 *options = (int)code[1];
998 code += 2;
999 break;
1000
1001 case OP_ASSERT_NOT:
1002 case OP_ASSERTBACK:
1003 case OP_ASSERTBACK_NOT:
1004 if (!skipassert) return code;
1005 do code += GET(code, 1); while (*code == OP_ALT);
1006 code += _pcre_OP_lengths[*code];
1007 break;
1008
1009 case OP_WORD_BOUNDARY:
1010 case OP_NOT_WORD_BOUNDARY:
1011 if (!skipassert) return code;
1012 /* Fall through */
1013
1014 case OP_CALLOUT:
1015 case OP_CREF:
1016 case OP_RREF:
1017 case OP_DEF:
1018 code += _pcre_OP_lengths[*code];
1019 break;
1020
1021 default:
1022 return code;
1023 }
1024 }
1025 /* Control never reaches here */
1026 }
1027
1028
1029
1030
1031 /*************************************************
1032 * Find the fixed length of a pattern *
1033 *************************************************/
1034
1035 /* Scan a pattern and compute the fixed length of subject that will match it,
1036 if the length is fixed. This is needed for dealing with backward assertions.
1037 In UTF8 mode, the result is in characters rather than bytes.
1038
1039 Arguments:
1040 code points to the start of the pattern (the bracket)
1041 options the compiling options
1042
1043 Returns: the fixed length, or -1 if there is no fixed length,
1044 or -2 if \C was encountered
1045 */
1046
1047 static int
1048 find_fixedlength(uschar *code, int options)
1049 {
1050 int length = -1;
1051
1052 register int branchlength = 0;
1053 register uschar *cc = code + 1 + LINK_SIZE;
1054
1055 /* Scan along the opcodes for this branch. If we get to the end of the
1056 branch, check the length against that of the other branches. */
1057
1058 for (;;)
1059 {
1060 int d;
1061 register int op = *cc;
1062
1063 switch (op)
1064 {
1065 case OP_CBRA:
1066 case OP_BRA:
1067 case OP_ONCE:
1068 case OP_COND:
1069 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1070 if (d < 0) return d;
1071 branchlength += d;
1072 do cc += GET(cc, 1); while (*cc == OP_ALT);
1073 cc += 1 + LINK_SIZE;
1074 break;
1075
1076 /* Reached end of a branch; if it's a ket it is the end of a nested
1077 call. If it's ALT it is an alternation in a nested call. If it is
1078 END it's the end of the outer call. All can be handled by the same code. */
1079
1080 case OP_ALT:
1081 case OP_KET:
1082 case OP_KETRMAX:
1083 case OP_KETRMIN:
1084 case OP_END:
1085 if (length < 0) length = branchlength;
1086 else if (length != branchlength) return -1;
1087 if (*cc != OP_ALT) return length;
1088 cc += 1 + LINK_SIZE;
1089 branchlength = 0;
1090 break;
1091
1092 /* Skip over assertive subpatterns */
1093
1094 case OP_ASSERT:
1095 case OP_ASSERT_NOT:
1096 case OP_ASSERTBACK:
1097 case OP_ASSERTBACK_NOT:
1098 do cc += GET(cc, 1); while (*cc == OP_ALT);
1099 /* Fall through */
1100
1101 /* Skip over things that don't match chars */
1102
1103 case OP_REVERSE:
1104 case OP_CREF:
1105 case OP_RREF:
1106 case OP_DEF:
1107 case OP_OPT:
1108 case OP_CALLOUT:
1109 case OP_SOD:
1110 case OP_SOM:
1111 case OP_EOD:
1112 case OP_EODN:
1113 case OP_CIRC:
1114 case OP_DOLL:
1115 case OP_NOT_WORD_BOUNDARY:
1116 case OP_WORD_BOUNDARY:
1117 cc += _pcre_OP_lengths[*cc];
1118 break;
1119
1120 /* Handle literal characters */
1121
1122 case OP_CHAR:
1123 case OP_CHARNC:
1124 case OP_NOT:
1125 branchlength++;
1126 cc += 2;
1127 #ifdef SUPPORT_UTF8
1128 if ((options & PCRE_UTF8) != 0)
1129 {
1130 while ((*cc & 0xc0) == 0x80) cc++;
1131 }
1132 #endif
1133 break;
1134
1135 /* Handle exact repetitions. The count is already in characters, but we
1136 need to skip over a multibyte character in UTF8 mode. */
1137
1138 case OP_EXACT:
1139 branchlength += GET2(cc,1);
1140 cc += 4;
1141 #ifdef SUPPORT_UTF8
1142 if ((options & PCRE_UTF8) != 0)
1143 {
1144 while((*cc & 0x80) == 0x80) cc++;
1145 }
1146 #endif
1147 break;
1148
1149 case OP_TYPEEXACT:
1150 branchlength += GET2(cc,1);
1151 cc += 4;
1152 break;
1153
1154 /* Handle single-char matchers */
1155
1156 case OP_PROP:
1157 case OP_NOTPROP:
1158 cc += 2;
1159 /* Fall through */
1160
1161 case OP_NOT_DIGIT:
1162 case OP_DIGIT:
1163 case OP_NOT_WHITESPACE:
1164 case OP_WHITESPACE:
1165 case OP_NOT_WORDCHAR:
1166 case OP_WORDCHAR:
1167 case OP_ANY:
1168 branchlength++;
1169 cc++;
1170 break;
1171
1172 /* The single-byte matcher isn't allowed */
1173
1174 case OP_ANYBYTE:
1175 return -2;
1176
1177 /* Check a class for variable quantification */
1178
1179 #ifdef SUPPORT_UTF8
1180 case OP_XCLASS:
1181 cc += GET(cc, 1) - 33;
1182 /* Fall through */
1183 #endif
1184
1185 case OP_CLASS:
1186 case OP_NCLASS:
1187 cc += 33;
1188
1189 switch (*cc)
1190 {
1191 case OP_CRSTAR:
1192 case OP_CRMINSTAR:
1193 case OP_CRQUERY:
1194 case OP_CRMINQUERY:
1195 return -1;
1196
1197 case OP_CRRANGE:
1198 case OP_CRMINRANGE:
1199 if (GET2(cc,1) != GET2(cc,3)) return -1;
1200 branchlength += GET2(cc,1);
1201 cc += 5;
1202 break;
1203
1204 default:
1205 branchlength++;
1206 }
1207 break;
1208
1209 /* Anything else is variable length */
1210
1211 default:
1212 return -1;
1213 }
1214 }
1215 /* Control never gets here */
1216 }
1217
1218
1219
1220
1221 /*************************************************
1222 * Scan compiled regex for numbered bracket *
1223 *************************************************/
1224
1225 /* This little function scans through a compiled pattern until it finds a
1226 capturing bracket with the given number.
1227
1228 Arguments:
1229 code points to start of expression
1230 utf8 TRUE in UTF-8 mode
1231 number the required bracket number
1232
1233 Returns: pointer to the opcode for the bracket, or NULL if not found
1234 */
1235
1236 static const uschar *
1237 find_bracket(const uschar *code, BOOL utf8, int number)
1238 {
1239 for (;;)
1240 {
1241 register int c = *code;
1242 if (c == OP_END) return NULL;
1243
1244 /* XCLASS is used for classes that cannot be represented just by a bit
1245 map. This includes negated single high-valued characters. The length in
1246 the table is zero; the actual length is stored in the compiled code. */
1247
1248 if (c == OP_XCLASS) code += GET(code, 1);
1249
1250 /* Handle capturing bracket */
1251
1252 else if (c == OP_CBRA)
1253 {
1254 int n = GET2(code, 1+LINK_SIZE);
1255 if (n == number) return (uschar *)code;
1256 code += _pcre_OP_lengths[c];
1257 }
1258
1259 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1260 a multi-byte character. The length in the table is a minimum, so we have to
1261 arrange to skip the extra bytes. */
1262
1263 else
1264 {
1265 code += _pcre_OP_lengths[c];
1266 #ifdef SUPPORT_UTF8
1267 if (utf8) switch(c)
1268 {
1269 case OP_CHAR:
1270 case OP_CHARNC:
1271 case OP_EXACT:
1272 case OP_UPTO:
1273 case OP_MINUPTO:
1274 case OP_POSUPTO:
1275 case OP_STAR:
1276 case OP_MINSTAR:
1277 case OP_POSSTAR:
1278 case OP_PLUS:
1279 case OP_MINPLUS:
1280 case OP_POSPLUS:
1281 case OP_QUERY:
1282 case OP_MINQUERY:
1283 case OP_POSQUERY:
1284 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1285 break;
1286 }
1287 #endif
1288 }
1289 }
1290 }
1291
1292
1293
1294 /*************************************************
1295 * Scan compiled regex for recursion reference *
1296 *************************************************/
1297
1298 /* This little function scans through a compiled pattern until it finds an
1299 instance of OP_RECURSE.
1300
1301 Arguments:
1302 code points to start of expression
1303 utf8 TRUE in UTF-8 mode
1304
1305 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1306 */
1307
1308 static const uschar *
1309 find_recurse(const uschar *code, BOOL utf8)
1310 {
1311 for (;;)
1312 {
1313 register int c = *code;
1314 if (c == OP_END) return NULL;
1315 if (c == OP_RECURSE) return code;
1316
1317 /* XCLASS is used for classes that cannot be represented just by a bit
1318 map. This includes negated single high-valued characters. The length in
1319 the table is zero; the actual length is stored in the compiled code. */
1320
1321 if (c == OP_XCLASS) code += GET(code, 1);
1322
1323 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1324 that are followed by a character may be followed by a multi-byte character.
1325 The length in the table is a minimum, so we have to arrange to skip the extra
1326 bytes. */
1327
1328 else
1329 {
1330 code += _pcre_OP_lengths[c];
1331 #ifdef SUPPORT_UTF8
1332 if (utf8) switch(c)
1333 {
1334 case OP_CHAR:
1335 case OP_CHARNC:
1336 case OP_EXACT:
1337 case OP_UPTO:
1338 case OP_MINUPTO:
1339 case OP_POSUPTO:
1340 case OP_STAR:
1341 case OP_MINSTAR:
1342 case OP_POSSTAR:
1343 case OP_PLUS:
1344 case OP_MINPLUS:
1345 case OP_POSPLUS:
1346 case OP_QUERY:
1347 case OP_MINQUERY:
1348 case OP_POSQUERY:
1349 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1350 break;
1351 }
1352 #endif
1353 }
1354 }
1355 }
1356
1357
1358
1359 /*************************************************
1360 * Scan compiled branch for non-emptiness *
1361 *************************************************/
1362
1363 /* This function scans through a branch of a compiled pattern to see whether it
1364 can match the empty string or not. It is called from could_be_empty()
1365 below and from compile_branch() when checking for an unlimited repeat of a
1366 group that can match nothing. Note that first_significant_code() skips over
1367 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1368 struck an inner bracket whose current branch will already have been scanned.
1369
1370 Arguments:
1371 code points to start of search
1372 endcode points to where to stop
1373 utf8 TRUE if in UTF8 mode
1374
1375 Returns: TRUE if what is matched could be empty
1376 */
1377
1378 static BOOL
1379 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1380 {
1381 register int c;
1382 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1383 code < endcode;
1384 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1385 {
1386 const uschar *ccode;
1387
1388 c = *code;
1389
1390 /* Groups with zero repeats can of course be empty; skip them. */
1391
1392 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1393 {
1394 code += _pcre_OP_lengths[c];
1395 do code += GET(code, 1); while (*code == OP_ALT);
1396 c = *code;
1397 continue;
1398 }
1399
1400 /* For other groups, scan the branches. */
1401
1402 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1403 {
1404 BOOL empty_branch;
1405 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1406
1407 /* Scan a closed bracket */
1408
1409 empty_branch = FALSE;
1410 do
1411 {
1412 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1413 empty_branch = TRUE;
1414 code += GET(code, 1);
1415 }
1416 while (*code == OP_ALT);
1417 if (!empty_branch) return FALSE; /* All branches are non-empty */
1418 c = *code;
1419 continue;
1420 }
1421
1422 /* Handle the other opcodes */
1423
1424 switch (c)
1425 {
1426 /* Check for quantifiers after a class */
1427
1428 #ifdef SUPPORT_UTF8
1429 case OP_XCLASS:
1430 ccode = code + GET(code, 1);
1431 goto CHECK_CLASS_REPEAT;
1432 #endif
1433
1434 case OP_CLASS:
1435 case OP_NCLASS:
1436 ccode = code + 33;
1437
1438 #ifdef SUPPORT_UTF8
1439 CHECK_CLASS_REPEAT:
1440 #endif
1441
1442 switch (*ccode)
1443 {
1444 case OP_CRSTAR: /* These could be empty; continue */
1445 case OP_CRMINSTAR:
1446 case OP_CRQUERY:
1447 case OP_CRMINQUERY:
1448 break;
1449
1450 default: /* Non-repeat => class must match */
1451 case OP_CRPLUS: /* These repeats aren't empty */
1452 case OP_CRMINPLUS:
1453 return FALSE;
1454
1455 case OP_CRRANGE:
1456 case OP_CRMINRANGE:
1457 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1458 break;
1459 }
1460 break;
1461
1462 /* Opcodes that must match a character */
1463
1464 case OP_PROP:
1465 case OP_NOTPROP:
1466 case OP_EXTUNI:
1467 case OP_NOT_DIGIT:
1468 case OP_DIGIT:
1469 case OP_NOT_WHITESPACE:
1470 case OP_WHITESPACE:
1471 case OP_NOT_WORDCHAR:
1472 case OP_WORDCHAR:
1473 case OP_ANY:
1474 case OP_ANYBYTE:
1475 case OP_CHAR:
1476 case OP_CHARNC:
1477 case OP_NOT:
1478 case OP_PLUS:
1479 case OP_MINPLUS:
1480 case OP_POSPLUS:
1481 case OP_EXACT:
1482 case OP_NOTPLUS:
1483 case OP_NOTMINPLUS:
1484 case OP_NOTPOSPLUS:
1485 case OP_NOTEXACT:
1486 case OP_TYPEPLUS:
1487 case OP_TYPEMINPLUS:
1488 case OP_TYPEPOSPLUS:
1489 case OP_TYPEEXACT:
1490 return FALSE;
1491
1492 /* End of branch */
1493
1494 case OP_KET:
1495 case OP_KETRMAX:
1496 case OP_KETRMIN:
1497 case OP_ALT:
1498 return TRUE;
1499
1500 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1501 MINUPTO, and POSUPTO may be followed by a multibyte character */
1502
1503 #ifdef SUPPORT_UTF8
1504 case OP_STAR:
1505 case OP_MINSTAR:
1506 case OP_POSSTAR:
1507 case OP_QUERY:
1508 case OP_MINQUERY:
1509 case OP_POSQUERY:
1510 case OP_UPTO:
1511 case OP_MINUPTO:
1512 case OP_POSUPTO:
1513 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1514 break;
1515 #endif
1516 }
1517 }
1518
1519 return TRUE;
1520 }
1521
1522
1523
1524 /*************************************************
1525 * Scan compiled regex for non-emptiness *
1526 *************************************************/
1527
1528 /* This function is called to check for left recursive calls. We want to check
1529 the current branch of the current pattern to see if it could match the empty
1530 string. If it could, we must look outwards for branches at other levels,
1531 stopping when we pass beyond the bracket which is the subject of the recursion.
1532
1533 Arguments:
1534 code points to start of the recursion
1535 endcode points to where to stop (current RECURSE item)
1536 bcptr points to the chain of current (unclosed) branch starts
1537 utf8 TRUE if in UTF-8 mode
1538
1539 Returns: TRUE if what is matched could be empty
1540 */
1541
1542 static BOOL
1543 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1544 BOOL utf8)
1545 {
1546 while (bcptr != NULL && bcptr->current >= code)
1547 {
1548 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1549 bcptr = bcptr->outer;
1550 }
1551 return TRUE;
1552 }
1553
1554
1555
1556 /*************************************************
1557 * Check for POSIX class syntax *
1558 *************************************************/
1559
1560 /* This function is called when the sequence "[:" or "[." or "[=" is
1561 encountered in a character class. It checks whether this is followed by an
1562 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1563 ".]" or "=]".
1564
1565 Argument:
1566 ptr pointer to the initial [
1567 endptr where to return the end pointer
1568 cd pointer to compile data
1569
1570 Returns: TRUE or FALSE
1571 */
1572
1573 static BOOL
1574 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1575 {
1576 int terminator; /* Don't combine these lines; the Solaris cc */
1577 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1578 if (*(++ptr) == '^') ptr++;
1579 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1580 if (*ptr == terminator && ptr[1] == ']')
1581 {
1582 *endptr = ptr;
1583 return TRUE;
1584 }
1585 return FALSE;
1586 }
1587
1588
1589
1590
1591 /*************************************************
1592 * Check POSIX class name *
1593 *************************************************/
1594
1595 /* This function is called to check the name given in a POSIX-style class entry
1596 such as [:alnum:].
1597
1598 Arguments:
1599 ptr points to the first letter
1600 len the length of the name
1601
1602 Returns: a value representing the name, or -1 if unknown
1603 */
1604
1605 static int
1606 check_posix_name(const uschar *ptr, int len)
1607 {
1608 register int yield = 0;
1609 while (posix_name_lengths[yield] != 0)
1610 {
1611 if (len == posix_name_lengths[yield] &&
1612 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1613 yield++;
1614 }
1615 return -1;
1616 }
1617
1618
1619 /*************************************************
1620 * Adjust OP_RECURSE items in repeated group *
1621 *************************************************/
1622
1623 /* OP_RECURSE items contain an offset from the start of the regex to the group
1624 that is referenced. This means that groups can be replicated for fixed
1625 repetition simply by copying (because the recursion is allowed to refer to
1626 earlier groups that are outside the current group). However, when a group is
1627 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1628 it, after it has been compiled. This means that any OP_RECURSE items within it
1629 that refer to the group itself or any contained groups have to have their
1630 offsets adjusted. That is one of the jobs of this function. Before it is called,
1631 the partially compiled regex must be temporarily terminated with OP_END.
1632
1633 This function has been extended with the possibility of forward references for
1634 recursions and subroutine calls. It must also check the list of such references
1635 for the group we are dealing with. If it finds that one of the recursions in
1636 the current group is on this list, it adjusts the offset in the list, not the
1637 value in the reference (which is a group number).
1638
1639 Arguments:
1640 group points to the start of the group
1641 adjust the amount by which the group is to be moved
1642 utf8 TRUE in UTF-8 mode
1643 cd contains pointers to tables etc.
1644 save_hwm the hwm forward reference pointer at the start of the group
1645
1646 Returns: nothing
1647 */
1648
1649 static void
1650 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1651 uschar *save_hwm)
1652 {
1653 uschar *ptr = group;
1654 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1655 {
1656 int offset;
1657 uschar *hc;
1658
1659 /* See if this recursion is on the forward reference list. If so, adjust the
1660 reference. */
1661
1662 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1663 {
1664 offset = GET(hc, 0);
1665 if (cd->start_code + offset == ptr + 1)
1666 {
1667 PUT(hc, 0, offset + adjust);
1668 break;
1669 }
1670 }
1671
1672 /* Otherwise, adjust the recursion offset if it's after the start of this
1673 group. */
1674
1675 if (hc >= cd->hwm)
1676 {
1677 offset = GET(ptr, 1);
1678 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1679 }
1680
1681 ptr += 1 + LINK_SIZE;
1682 }
1683 }
1684
1685
1686
1687 /*************************************************
1688 * Insert an automatic callout point *
1689 *************************************************/
1690
1691 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1692 callout points before each pattern item.
1693
1694 Arguments:
1695 code current code pointer
1696 ptr current pattern pointer
1697 cd pointers to tables etc
1698
1699 Returns: new code pointer
1700 */
1701
1702 static uschar *
1703 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1704 {
1705 *code++ = OP_CALLOUT;
1706 *code++ = 255;
1707 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1708 PUT(code, LINK_SIZE, 0); /* Default length */
1709 return code + 2*LINK_SIZE;
1710 }
1711
1712
1713
1714 /*************************************************
1715 * Complete a callout item *
1716 *************************************************/
1717
1718 /* A callout item contains the length of the next item in the pattern, which
1719 we can't fill in till after we have reached the relevant point. This is used
1720 for both automatic and manual callouts.
1721
1722 Arguments:
1723 previous_callout points to previous callout item
1724 ptr current pattern pointer
1725 cd pointers to tables etc
1726
1727 Returns: nothing
1728 */
1729
1730 static void
1731 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1732 {
1733 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1734 PUT(previous_callout, 2 + LINK_SIZE, length);
1735 }
1736
1737
1738
1739 #ifdef SUPPORT_UCP
1740 /*************************************************
1741 * Get othercase range *
1742 *************************************************/
1743
1744 /* This function is passed the start and end of a class range, in UTF-8 mode
1745 with UCP support. It searches up the characters, looking for internal ranges of
1746 characters in the "other" case. Each call returns the next one, updating the
1747 start address.
1748
1749 Arguments:
1750 cptr points to starting character value; updated
1751 d end value
1752 ocptr where to put start of othercase range
1753 odptr where to put end of othercase range
1754
1755 Yield: TRUE when range returned; FALSE when no more
1756 */
1757
1758 static BOOL
1759 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1760 unsigned int *odptr)
1761 {
1762 unsigned int c, othercase, next;
1763
1764 for (c = *cptr; c <= d; c++)
1765 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1766
1767 if (c > d) return FALSE;
1768
1769 *ocptr = othercase;
1770 next = othercase + 1;
1771
1772 for (++c; c <= d; c++)
1773 {
1774 if (_pcre_ucp_othercase(c) != next) break;
1775 next++;
1776 }
1777
1778 *odptr = next - 1;
1779 *cptr = c;
1780
1781 return TRUE;
1782 }
1783 #endif /* SUPPORT_UCP */
1784
1785
1786
1787 /*************************************************
1788 * Check if auto-possessifying is possible *
1789 *************************************************/
1790
1791 /* This function is called for unlimited repeats of certain items, to see
1792 whether the next thing could possibly match the repeated item. If not, it makes
1793 sense to automatically possessify the repeated item.
1794
1795 Arguments:
1796 op_code the repeated op code
1797 this data for this item, depends on the opcode
1798 utf8 TRUE in UTF-8 mode
1799 utf8_char used for utf8 character bytes, NULL if not relevant
1800 ptr next character in pattern
1801 options options bits
1802 cd contains pointers to tables etc.
1803
1804 Returns: TRUE if possessifying is wanted
1805 */
1806
1807 static BOOL
1808 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1809 const uschar *ptr, int options, compile_data *cd)
1810 {
1811 int next;
1812
1813 /* Skip whitespace and comments in extended mode */
1814
1815 if ((options & PCRE_EXTENDED) != 0)
1816 {
1817 for (;;)
1818 {
1819 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1820 if (*ptr == '#')
1821 {
1822 while (*(++ptr) != 0)
1823 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1824 }
1825 else break;
1826 }
1827 }
1828
1829 /* If the next item is one that we can handle, get its value. A non-negative
1830 value is a character, a negative value is an escape value. */
1831
1832 if (*ptr == '\\')
1833 {
1834 int temperrorcode = 0;
1835 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1836 if (temperrorcode != 0) return FALSE;
1837 ptr++; /* Point after the escape sequence */
1838 }
1839
1840 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1841 {
1842 #ifdef SUPPORT_UTF8
1843 if (utf8) { GETCHARINC(next, ptr); } else
1844 #endif
1845 next = *ptr++;
1846 }
1847
1848 else return FALSE;
1849
1850 /* Skip whitespace and comments in extended mode */
1851
1852 if ((options & PCRE_EXTENDED) != 0)
1853 {
1854 for (;;)
1855 {
1856 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1857 if (*ptr == '#')
1858 {
1859 while (*(++ptr) != 0)
1860 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1861 }
1862 else break;
1863 }
1864 }
1865
1866 /* If the next thing is itself optional, we have to give up. */
1867
1868 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1869 return FALSE;
1870
1871 /* Now compare the next item with the previous opcode. If the previous is a
1872 positive single character match, "item" either contains the character or, if
1873 "item" is greater than 127 in utf8 mode, the character's bytes are in
1874 utf8_char. */
1875
1876
1877 /* Handle cases when the next item is a character. */
1878
1879 if (next >= 0) switch(op_code)
1880 {
1881 case OP_CHAR:
1882 #ifdef SUPPORT_UTF8
1883 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1884 #endif
1885 return item != next;
1886
1887 /* For CHARNC (caseless character) we must check the other case. If we have
1888 Unicode property support, we can use it to test the other case of
1889 high-valued characters. */
1890
1891 case OP_CHARNC:
1892 #ifdef SUPPORT_UTF8
1893 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894 #endif
1895 if (item == next) return FALSE;
1896 #ifdef SUPPORT_UTF8
1897 if (utf8)
1898 {
1899 unsigned int othercase;
1900 if (next < 128) othercase = cd->fcc[next]; else
1901 #ifdef SUPPORT_UCP
1902 othercase = _pcre_ucp_othercase((unsigned int)next);
1903 #else
1904 othercase = NOTACHAR;
1905 #endif
1906 return (unsigned int)item != othercase;
1907 }
1908 else
1909 #endif /* SUPPORT_UTF8 */
1910 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1911
1912 /* For OP_NOT, "item" must be a single-byte character. */
1913
1914 case OP_NOT:
1915 if (next < 0) return FALSE; /* Not a character */
1916 if (item == next) return TRUE;
1917 if ((options & PCRE_CASELESS) == 0) return FALSE;
1918 #ifdef SUPPORT_UTF8
1919 if (utf8)
1920 {
1921 unsigned int othercase;
1922 if (next < 128) othercase = cd->fcc[next]; else
1923 #ifdef SUPPORT_UCP
1924 othercase = _pcre_ucp_othercase(next);
1925 #else
1926 othercase = NOTACHAR;
1927 #endif
1928 return (unsigned int)item == othercase;
1929 }
1930 else
1931 #endif /* SUPPORT_UTF8 */
1932 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1933
1934 case OP_DIGIT:
1935 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1936
1937 case OP_NOT_DIGIT:
1938 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1939
1940 case OP_WHITESPACE:
1941 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1942
1943 case OP_NOT_WHITESPACE:
1944 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1945
1946 case OP_WORDCHAR:
1947 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1948
1949 case OP_NOT_WORDCHAR:
1950 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1951
1952 default:
1953 return FALSE;
1954 }
1955
1956
1957 /* Handle the case when the next item is \d, \s, etc. */
1958
1959 switch(op_code)
1960 {
1961 case OP_CHAR:
1962 case OP_CHARNC:
1963 #ifdef SUPPORT_UTF8
1964 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1965 #endif
1966 switch(-next)
1967 {
1968 case ESC_d:
1969 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1970
1971 case ESC_D:
1972 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1973
1974 case ESC_s:
1975 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1976
1977 case ESC_S:
1978 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1979
1980 case ESC_w:
1981 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1982
1983 case ESC_W:
1984 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1985
1986 default:
1987 return FALSE;
1988 }
1989
1990 case OP_DIGIT:
1991 return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1992
1993 case OP_NOT_DIGIT:
1994 return next == -ESC_d;
1995
1996 case OP_WHITESPACE:
1997 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1998
1999 case OP_NOT_WHITESPACE:
2000 return next == -ESC_s;
2001
2002 case OP_WORDCHAR:
2003 return next == -ESC_W || next == -ESC_s;
2004
2005 case OP_NOT_WORDCHAR:
2006 return next == -ESC_w || next == -ESC_d;
2007
2008 default:
2009 return FALSE;
2010 }
2011
2012 /* Control does not reach here */
2013 }
2014
2015
2016
2017 /*************************************************
2018 * Compile one branch *
2019 *************************************************/
2020
2021 /* Scan the pattern, compiling it into a vector. If the options are
2022 changed during the branch, the pointer is used to change the external options
2023 bits. This function is used during the pre-compile phase when we are trying
2024 to find out the amount of memory needed, as well as during the real compile
2025 phase. The value of lengthptr distinguishes the two phases.
2026
2027 Arguments:
2028 optionsptr pointer to the option bits
2029 codeptr points to the pointer to the current code point
2030 ptrptr points to the current pattern pointer
2031 errorcodeptr points to error code variable
2032 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2033 reqbyteptr set to the last literal character required, else < 0
2034 bcptr points to current branch chain
2035 cd contains pointers to tables etc.
2036 lengthptr NULL during the real compile phase
2037 points to length accumulator during pre-compile phase
2038
2039 Returns: TRUE on success
2040 FALSE, with *errorcodeptr set non-zero on error
2041 */
2042
2043 static BOOL
2044 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2045 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2046 compile_data *cd, int *lengthptr)
2047 {
2048 int repeat_type, op_type;
2049 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2050 int bravalue = 0;
2051 int greedy_default, greedy_non_default;
2052 int firstbyte, reqbyte;
2053 int zeroreqbyte, zerofirstbyte;
2054 int req_caseopt, reqvary, tempreqvary;
2055 int options = *optionsptr;
2056 int after_manual_callout = 0;
2057 int length_prevgroup = 0;
2058 register int c;
2059 register uschar *code = *codeptr;
2060 uschar *last_code = code;
2061 uschar *orig_code = code;
2062 uschar *tempcode;
2063 BOOL inescq = FALSE;
2064 BOOL groupsetfirstbyte = FALSE;
2065 const uschar *ptr = *ptrptr;
2066 const uschar *tempptr;
2067 uschar *previous = NULL;
2068 uschar *previous_callout = NULL;
2069 uschar *save_hwm = NULL;
2070 uschar classbits[32];
2071
2072 #ifdef SUPPORT_UTF8
2073 BOOL class_utf8;
2074 BOOL utf8 = (options & PCRE_UTF8) != 0;
2075 uschar *class_utf8data;
2076 uschar utf8_char[6];
2077 #else
2078 BOOL utf8 = FALSE;
2079 uschar *utf8_char = NULL;
2080 #endif
2081
2082 #ifdef DEBUG
2083 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2084 #endif
2085
2086 /* Set up the default and non-default settings for greediness */
2087
2088 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2089 greedy_non_default = greedy_default ^ 1;
2090
2091 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2092 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2093 matches a non-fixed first char; reqbyte just remains unset if we never
2094 find one.
2095
2096 When we hit a repeat whose minimum is zero, we may have to adjust these values
2097 to take the zero repeat into account. This is implemented by setting them to
2098 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2099 item types that can be repeated set these backoff variables appropriately. */
2100
2101 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2102
2103 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2104 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2105 value > 255. It is added into the firstbyte or reqbyte variables to record the
2106 case status of the value. This is used only for ASCII characters. */
2107
2108 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2109
2110 /* Switch on next character until the end of the branch */
2111
2112 for (;; ptr++)
2113 {
2114 BOOL negate_class;
2115 BOOL possessive_quantifier;
2116 BOOL is_quantifier;
2117 BOOL is_recurse;
2118 BOOL reset_bracount;
2119 int class_charcount;
2120 int class_lastchar;
2121 int newoptions;
2122 int recno;
2123 int refsign;
2124 int skipbytes;
2125 int subreqbyte;
2126 int subfirstbyte;
2127 int terminator;
2128 int mclength;
2129 uschar mcbuffer[8];
2130
2131 /* Get next byte in the pattern */
2132
2133 c = *ptr;
2134
2135 /* If we are in the pre-compile phase, accumulate the length used for the
2136 previous cycle of this loop. */
2137
2138 if (lengthptr != NULL)
2139 {
2140 #ifdef DEBUG
2141 if (code > cd->hwm) cd->hwm = code; /* High water info */
2142 #endif
2143 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2144 {
2145 *errorcodeptr = ERR52;
2146 goto FAILED;
2147 }
2148
2149 /* There is at least one situation where code goes backwards: this is the
2150 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2151 the class is simply eliminated. However, it is created first, so we have to
2152 allow memory for it. Therefore, don't ever reduce the length at this point.
2153 */
2154
2155 if (code < last_code) code = last_code;
2156 *lengthptr += code - last_code;
2157 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2158
2159 /* If "previous" is set and it is not at the start of the work space, move
2160 it back to there, in order to avoid filling up the work space. Otherwise,
2161 if "previous" is NULL, reset the current code pointer to the start. */
2162
2163 if (previous != NULL)
2164 {
2165 if (previous > orig_code)
2166 {
2167 memmove(orig_code, previous, code - previous);
2168 code -= previous - orig_code;
2169 previous = orig_code;
2170 }
2171 }
2172 else code = orig_code;
2173
2174 /* Remember where this code item starts so we can pick up the length
2175 next time round. */
2176
2177 last_code = code;
2178 }
2179
2180 /* In the real compile phase, just check the workspace used by the forward
2181 reference list. */
2182
2183 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2184 {
2185 *errorcodeptr = ERR52;
2186 goto FAILED;
2187 }
2188
2189 /* If in \Q...\E, check for the end; if not, we have a literal */
2190
2191 if (inescq && c != 0)
2192 {
2193 if (c == '\\' && ptr[1] == 'E')
2194 {
2195 inescq = FALSE;
2196 ptr++;
2197 continue;
2198 }
2199 else
2200 {
2201 if (previous_callout != NULL)
2202 {
2203 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2204 complete_callout(previous_callout, ptr, cd);
2205 previous_callout = NULL;
2206 }
2207 if ((options & PCRE_AUTO_CALLOUT) != 0)
2208 {
2209 previous_callout = code;
2210 code = auto_callout(code, ptr, cd);
2211 }
2212 goto NORMAL_CHAR;
2213 }
2214 }
2215
2216 /* Fill in length of a previous callout, except when the next thing is
2217 a quantifier. */
2218
2219 is_quantifier = c == '*' || c == '+' || c == '?' ||
2220 (c == '{' && is_counted_repeat(ptr+1));
2221
2222 if (!is_quantifier && previous_callout != NULL &&
2223 after_manual_callout-- <= 0)
2224 {
2225 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2226 complete_callout(previous_callout, ptr, cd);
2227 previous_callout = NULL;
2228 }
2229
2230 /* In extended mode, skip white space and comments */
2231
2232 if ((options & PCRE_EXTENDED) != 0)
2233 {
2234 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2235 if (c == '#')
2236 {
2237 while (*(++ptr) != 0)
2238 {
2239 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2240 }
2241 if (*ptr != 0) continue;
2242
2243 /* Else fall through to handle end of string */
2244 c = 0;
2245 }
2246 }
2247
2248 /* No auto callout for quantifiers. */
2249
2250 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2251 {
2252 previous_callout = code;
2253 code = auto_callout(code, ptr, cd);
2254 }
2255
2256 switch(c)
2257 {
2258 /* ===================================================================*/
2259 case 0: /* The branch terminates at string end */
2260 case '|': /* or | or ) */
2261 case ')':
2262 *firstbyteptr = firstbyte;
2263 *reqbyteptr = reqbyte;
2264 *codeptr = code;
2265 *ptrptr = ptr;
2266 if (lengthptr != NULL)
2267 {
2268 *lengthptr += code - last_code; /* To include callout length */
2269 DPRINTF((">> end branch\n"));
2270 }
2271 return TRUE;
2272
2273
2274 /* ===================================================================*/
2275 /* Handle single-character metacharacters. In multiline mode, ^ disables
2276 the setting of any following char as a first character. */
2277
2278 case '^':
2279 if ((options & PCRE_MULTILINE) != 0)
2280 {
2281 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2282 }
2283 previous = NULL;
2284 *code++ = OP_CIRC;
2285 break;
2286
2287 case '$':
2288 previous = NULL;
2289 *code++ = OP_DOLL;
2290 break;
2291
2292 /* There can never be a first char if '.' is first, whatever happens about
2293 repeats. The value of reqbyte doesn't change either. */
2294
2295 case '.':
2296 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2297 zerofirstbyte = firstbyte;
2298 zeroreqbyte = reqbyte;
2299 previous = code;
2300 *code++ = OP_ANY;
2301 break;
2302
2303
2304 /* ===================================================================*/
2305 /* Character classes. If the included characters are all < 256, we build a
2306 32-byte bitmap of the permitted characters, except in the special case
2307 where there is only one such character. For negated classes, we build the
2308 map as usual, then invert it at the end. However, we use a different opcode
2309 so that data characters > 255 can be handled correctly.
2310
2311 If the class contains characters outside the 0-255 range, a different
2312 opcode is compiled. It may optionally have a bit map for characters < 256,
2313 but those above are explicitly listed afterwards. A flag byte tells
2314 whether the bitmap is present, and whether this is a negated class or not.
2315 */
2316
2317 case '[':
2318 previous = code;
2319
2320 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2321 they are encountered at the top level, so we'll do that too. */
2322
2323 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2324 check_posix_syntax(ptr, &tempptr, cd))
2325 {
2326 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2327 goto FAILED;
2328 }
2329
2330 /* If the first character is '^', set the negation flag and skip it. */
2331
2332 if ((c = *(++ptr)) == '^')
2333 {
2334 negate_class = TRUE;
2335 c = *(++ptr);
2336 }
2337 else
2338 {
2339 negate_class = FALSE;
2340 }
2341
2342 /* Keep a count of chars with values < 256 so that we can optimize the case
2343 of just a single character (as long as it's < 256). However, for higher
2344 valued UTF-8 characters, we don't yet do any optimization. */
2345
2346 class_charcount = 0;
2347 class_lastchar = -1;
2348
2349 /* Initialize the 32-char bit map to all zeros. We build the map in a
2350 temporary bit of memory, in case the class contains only 1 character (less
2351 than 256), because in that case the compiled code doesn't use the bit map.
2352 */
2353
2354 memset(classbits, 0, 32 * sizeof(uschar));
2355
2356 #ifdef SUPPORT_UTF8
2357 class_utf8 = FALSE; /* No chars >= 256 */
2358 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2359 #endif
2360
2361 /* Process characters until ] is reached. By writing this as a "do" it
2362 means that an initial ] is taken as a data character. At the start of the
2363 loop, c contains the first byte of the character. */
2364
2365 if (c != 0) do
2366 {
2367 const uschar *oldptr;
2368
2369 #ifdef SUPPORT_UTF8
2370 if (utf8 && c > 127)
2371 { /* Braces are required because the */
2372 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2373 }
2374 #endif
2375
2376 /* Inside \Q...\E everything is literal except \E */
2377
2378 if (inescq)
2379 {
2380 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2381 {
2382 inescq = FALSE; /* Reset literal state */
2383 ptr++; /* Skip the 'E' */
2384 continue; /* Carry on with next */
2385 }
2386 goto CHECK_RANGE; /* Could be range if \E follows */
2387 }
2388
2389 /* Handle POSIX class names. Perl allows a negation extension of the
2390 form [:^name:]. A square bracket that doesn't match the syntax is
2391 treated as a literal. We also recognize the POSIX constructions
2392 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2393 5.6 and 5.8 do. */
2394
2395 if (c == '[' &&
2396 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2397 check_posix_syntax(ptr, &tempptr, cd))
2398 {
2399 BOOL local_negate = FALSE;
2400 int posix_class, taboffset, tabopt;
2401 register const uschar *cbits = cd->cbits;
2402 uschar pbits[32];
2403
2404 if (ptr[1] != ':')
2405 {
2406 *errorcodeptr = ERR31;
2407 goto FAILED;
2408 }
2409
2410 ptr += 2;
2411 if (*ptr == '^')
2412 {
2413 local_negate = TRUE;
2414 ptr++;
2415 }
2416
2417 posix_class = check_posix_name(ptr, tempptr - ptr);
2418 if (posix_class < 0)
2419 {
2420 *errorcodeptr = ERR30;
2421 goto FAILED;
2422 }
2423
2424 /* If matching is caseless, upper and lower are converted to
2425 alpha. This relies on the fact that the class table starts with
2426 alpha, lower, upper as the first 3 entries. */
2427
2428 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2429 posix_class = 0;
2430
2431 /* We build the bit map for the POSIX class in a chunk of local store
2432 because we may be adding and subtracting from it, and we don't want to
2433 subtract bits that may be in the main map already. At the end we or the
2434 result into the bit map that is being built. */
2435
2436 posix_class *= 3;
2437
2438 /* Copy in the first table (always present) */
2439
2440 memcpy(pbits, cbits + posix_class_maps[posix_class],
2441 32 * sizeof(uschar));
2442
2443 /* If there is a second table, add or remove it as required. */
2444
2445 taboffset = posix_class_maps[posix_class + 1];
2446 tabopt = posix_class_maps[posix_class + 2];
2447
2448 if (taboffset >= 0)
2449 {
2450 if (tabopt >= 0)
2451 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2452 else
2453 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2454 }
2455
2456 /* Now see if we need to remove any special characters. An option
2457 value of 1 removes vertical space and 2 removes underscore. */
2458
2459 if (tabopt < 0) tabopt = -tabopt;
2460 if (tabopt == 1) pbits[1] &= ~0x3c;
2461 else if (tabopt == 2) pbits[11] &= 0x7f;
2462
2463 /* Add the POSIX table or its complement into the main table that is
2464 being built and we are done. */
2465
2466 if (local_negate)
2467 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2468 else
2469 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2470
2471 ptr = tempptr + 1;
2472 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2473 continue; /* End of POSIX syntax handling */
2474 }
2475
2476 /* Backslash may introduce a single character, or it may introduce one
2477 of the specials, which just set a flag. The sequence \b is a special
2478 case. Inside a class (and only there) it is treated as backspace.
2479 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2480 to or into the one we are building. We assume they have more than one
2481 character in them, so set class_charcount bigger than one. */
2482
2483 if (c == '\\')
2484 {
2485 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2486 if (*errorcodeptr != 0) goto FAILED;
2487
2488 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2489 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2490 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2491 else if (-c == ESC_Q) /* Handle start of quoted string */
2492 {
2493 if (ptr[1] == '\\' && ptr[2] == 'E')
2494 {
2495 ptr += 2; /* avoid empty string */
2496 }
2497 else inescq = TRUE;
2498 continue;
2499 }
2500
2501 if (c < 0)
2502 {
2503 register const uschar *cbits = cd->cbits;
2504 class_charcount += 2; /* Greater than 1 is what matters */
2505
2506 /* Save time by not doing this in the pre-compile phase. */
2507
2508 if (lengthptr == NULL) switch (-c)
2509 {
2510 case ESC_d:
2511 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2512 continue;
2513
2514 case ESC_D:
2515 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2516 continue;
2517
2518 case ESC_w:
2519 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2520 continue;
2521
2522 case ESC_W:
2523 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2524 continue;
2525
2526 case ESC_s:
2527 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2528 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2529 continue;
2530
2531 case ESC_S:
2532 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2533 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2534 continue;
2535
2536 case ESC_E: /* Perl ignores an orphan \E */
2537 continue;
2538
2539 default: /* Not recognized; fall through */
2540 break; /* Need "default" setting to stop compiler warning. */
2541 }
2542
2543 /* In the pre-compile phase, just do the recognition. */
2544
2545 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2546 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2547
2548 /* We need to deal with \H, \h, \V, and \v in both phases because
2549 they use extra memory. */
2550
2551 if (-c == ESC_h)
2552 {
2553 SETBIT(classbits, 0x09); /* VT */
2554 SETBIT(classbits, 0x20); /* SPACE */
2555 SETBIT(classbits, 0xa0); /* NSBP */
2556 #ifdef SUPPORT_UTF8
2557 if (utf8)
2558 {
2559 class_utf8 = TRUE;
2560 *class_utf8data++ = XCL_SINGLE;
2561 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2562 *class_utf8data++ = XCL_SINGLE;
2563 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2564 *class_utf8data++ = XCL_RANGE;
2565 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2566 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2567 *class_utf8data++ = XCL_SINGLE;
2568 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2569 *class_utf8data++ = XCL_SINGLE;
2570 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2571 *class_utf8data++ = XCL_SINGLE;
2572 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2573 }
2574 #endif
2575 continue;
2576 }
2577
2578 if (-c == ESC_H)
2579 {
2580 for (c = 0; c < 32; c++)
2581 {
2582 int x = 0xff;
2583 switch (c)
2584 {
2585 case 0x09/8: x ^= 1 << (0x09%8); break;
2586 case 0x20/8: x ^= 1 << (0x20%8); break;
2587 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2588 default: break;
2589 }
2590 classbits[c] |= x;
2591 }
2592
2593 #ifdef SUPPORT_UTF8
2594 if (utf8)
2595 {
2596 class_utf8 = TRUE;
2597 *class_utf8data++ = XCL_RANGE;
2598 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2599 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2600 *class_utf8data++ = XCL_RANGE;
2601 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2602 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2603 *class_utf8data++ = XCL_RANGE;
2604 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2605 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2606 *class_utf8data++ = XCL_RANGE;
2607 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2608 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2609 *class_utf8data++ = XCL_RANGE;
2610 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2611 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2612 *class_utf8data++ = XCL_RANGE;
2613 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2614 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2615 *class_utf8data++ = XCL_RANGE;
2616 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2617 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2618 }
2619 #endif
2620 continue;
2621 }
2622
2623 if (-c == ESC_v)
2624 {
2625 SETBIT(classbits, 0x0a); /* LF */
2626 SETBIT(classbits, 0x0b); /* VT */
2627 SETBIT(classbits, 0x0c); /* FF */
2628 SETBIT(classbits, 0x0d); /* CR */
2629 SETBIT(classbits, 0x85); /* NEL */
2630 #ifdef SUPPORT_UTF8
2631 if (utf8)
2632 {
2633 class_utf8 = TRUE;
2634 *class_utf8data++ = XCL_RANGE;
2635 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2636 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2637 }
2638 #endif
2639 continue;
2640 }
2641
2642 if (-c == ESC_V)
2643 {
2644 for (c = 0; c < 32; c++)
2645 {
2646 int x = 0xff;
2647 switch (c)
2648 {
2649 case 0x0a/8: x ^= 1 << (0x0a%8);
2650 x ^= 1 << (0x0b%8);
2651 x ^= 1 << (0x0c%8);
2652 x ^= 1 << (0x0d%8);
2653 break;
2654 case 0x85/8: x ^= 1 << (0x85%8); break;
2655 default: break;
2656 }
2657 classbits[c] |= x;
2658 }
2659
2660 #ifdef SUPPORT_UTF8
2661 if (utf8)
2662 {
2663 class_utf8 = TRUE;
2664 *class_utf8data++ = XCL_RANGE;
2665 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2666 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2667 *class_utf8data++ = XCL_RANGE;
2668 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2669 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2670 }
2671 #endif
2672 continue;
2673 }
2674
2675 /* We need to deal with \P and \p in both phases. */
2676
2677 #ifdef SUPPORT_UCP
2678 if (-c == ESC_p || -c == ESC_P)
2679 {
2680 BOOL negated;
2681 int pdata;
2682 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2683 if (ptype < 0) goto FAILED;
2684 class_utf8 = TRUE;
2685 *class_utf8data++ = ((-c == ESC_p) != negated)?
2686 XCL_PROP : XCL_NOTPROP;
2687 *class_utf8data++ = ptype;
2688 *class_utf8data++ = pdata;
2689 class_charcount -= 2; /* Not a < 256 character */
2690 continue;
2691 }
2692 #endif
2693 /* Unrecognized escapes are faulted if PCRE is running in its
2694 strict mode. By default, for compatibility with Perl, they are
2695 treated as literals. */
2696
2697 if ((options & PCRE_EXTRA) != 0)
2698 {
2699 *errorcodeptr = ERR7;
2700 goto FAILED;
2701 }
2702
2703 class_charcount -= 2; /* Undo the default count from above */
2704 c = *ptr; /* Get the final character and fall through */
2705 }
2706
2707 /* Fall through if we have a single character (c >= 0). This may be
2708 greater than 256 in UTF-8 mode. */
2709
2710 } /* End of backslash handling */
2711
2712 /* A single character may be followed by '-' to form a range. However,
2713 Perl does not permit ']' to be the end of the range. A '-' character
2714 at the end is treated as a literal. Perl ignores orphaned \E sequences
2715 entirely. The code for handling \Q and \E is messy. */
2716
2717 CHECK_RANGE:
2718 while (ptr[1] == '\\' && ptr[2] == 'E')
2719 {
2720 inescq = FALSE;
2721 ptr += 2;
2722 }
2723
2724 oldptr = ptr;
2725
2726 if (!inescq && ptr[1] == '-')
2727 {
2728 int d;
2729 ptr += 2;
2730 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2731
2732 /* If we hit \Q (not followed by \E) at this point, go into escaped
2733 mode. */
2734
2735 while (*ptr == '\\' && ptr[1] == 'Q')
2736 {
2737 ptr += 2;
2738 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2739 inescq = TRUE;
2740 break;
2741 }
2742
2743 if (*ptr == 0 || (!inescq && *ptr == ']'))
2744 {
2745 ptr = oldptr;
2746 goto LONE_SINGLE_CHARACTER;
2747 }
2748
2749 #ifdef SUPPORT_UTF8
2750 if (utf8)
2751 { /* Braces are required because the */
2752 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2753 }
2754 else
2755 #endif
2756 d = *ptr; /* Not UTF-8 mode */
2757
2758 /* The second part of a range can be a single-character escape, but
2759 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2760 in such circumstances. */
2761
2762 if (!inescq && d == '\\')
2763 {
2764 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2765 if (*errorcodeptr != 0) goto FAILED;
2766
2767 /* \b is backspace; \X is literal X; \R is literal R; any other
2768 special means the '-' was literal */
2769
2770 if (d < 0)
2771 {
2772 if (d == -ESC_b) d = '\b';
2773 else if (d == -ESC_X) d = 'X';
2774 else if (d == -ESC_R) d = 'R'; else
2775 {
2776 ptr = oldptr;
2777 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2778 }
2779 }
2780 }
2781
2782 /* Check that the two values are in the correct order. Optimize
2783 one-character ranges */
2784
2785 if (d < c)
2786 {
2787 *errorcodeptr = ERR8;
2788 goto FAILED;
2789 }
2790
2791 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2792
2793 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2794 matching, we have to use an XCLASS with extra data items. Caseless
2795 matching for characters > 127 is available only if UCP support is
2796 available. */
2797
2798 #ifdef SUPPORT_UTF8
2799 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2800 {
2801 class_utf8 = TRUE;
2802
2803 /* With UCP support, we can find the other case equivalents of
2804 the relevant characters. There may be several ranges. Optimize how
2805 they fit with the basic range. */
2806
2807 #ifdef SUPPORT_UCP
2808 if ((options & PCRE_CASELESS) != 0)
2809 {
2810 unsigned int occ, ocd;
2811 unsigned int cc = c;
2812 unsigned int origd = d;
2813 while (get_othercase_range(&cc, origd, &occ, &ocd))
2814 {
2815 if (occ >= (unsigned int)c &&
2816 ocd <= (unsigned int)d)
2817 continue; /* Skip embedded ranges */
2818
2819 if (occ < (unsigned int)c &&
2820 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2821 { /* if there is overlap, */
2822 c = occ; /* noting that if occ < c */
2823 continue; /* we can't have ocd > d */
2824 } /* because a subrange is */
2825 if (ocd > (unsigned int)d &&
2826 occ <= (unsigned int)d + 1) /* always shorter than */
2827 { /* the basic range. */
2828 d = ocd;
2829 continue;
2830 }
2831
2832 if (occ == ocd)
2833 {
2834 *class_utf8data++ = XCL_SINGLE;
2835 }
2836 else
2837 {
2838 *class_utf8data++ = XCL_RANGE;
2839 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2840 }
2841 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2842 }
2843 }
2844 #endif /* SUPPORT_UCP */
2845
2846 /* Now record the original range, possibly modified for UCP caseless
2847 overlapping ranges. */
2848
2849 *class_utf8data++ = XCL_RANGE;
2850 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2851 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2852
2853 /* With UCP support, we are done. Without UCP support, there is no
2854 caseless matching for UTF-8 characters > 127; we can use the bit map
2855 for the smaller ones. */
2856
2857 #ifdef SUPPORT_UCP
2858 continue; /* With next character in the class */
2859 #else
2860 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2861
2862 /* Adjust upper limit and fall through to set up the map */
2863
2864 d = 127;
2865
2866 #endif /* SUPPORT_UCP */
2867 }
2868 #endif /* SUPPORT_UTF8 */
2869
2870 /* We use the bit map for all cases when not in UTF-8 mode; else
2871 ranges that lie entirely within 0-127 when there is UCP support; else
2872 for partial ranges without UCP support. */
2873
2874 class_charcount += d - c + 1;
2875 class_lastchar = d;
2876
2877 /* We can save a bit of time by skipping this in the pre-compile. */
2878
2879 if (lengthptr == NULL) for (; c <= d; c++)
2880 {
2881 classbits[c/8] |= (1 << (c&7));
2882 if ((options & PCRE_CASELESS) != 0)
2883 {
2884 int uc = cd->fcc[c]; /* flip case */
2885 classbits[uc/8] |= (1 << (uc&7));
2886 }
2887 }
2888
2889 continue; /* Go get the next char in the class */
2890 }
2891
2892 /* Handle a lone single character - we can get here for a normal
2893 non-escape char, or after \ that introduces a single character or for an
2894 apparent range that isn't. */
2895
2896 LONE_SINGLE_CHARACTER:
2897
2898 /* Handle a character that cannot go in the bit map */
2899
2900 #ifdef SUPPORT_UTF8
2901 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2902 {
2903 class_utf8 = TRUE;
2904 *class_utf8data++ = XCL_SINGLE;
2905 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2906
2907 #ifdef SUPPORT_UCP
2908 if ((options & PCRE_CASELESS) != 0)
2909 {
2910 unsigned int othercase;
2911 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2912 {
2913 *class_utf8data++ = XCL_SINGLE;
2914 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2915 }
2916 }
2917 #endif /* SUPPORT_UCP */
2918
2919 }
2920 else
2921 #endif /* SUPPORT_UTF8 */
2922
2923 /* Handle a single-byte character */
2924 {
2925 classbits[c/8] |= (1 << (c&7));
2926 if ((options & PCRE_CASELESS) != 0)
2927 {
2928 c = cd->fcc[c]; /* flip case */
2929 classbits[c/8] |= (1 << (c&7));
2930 }
2931 class_charcount++;
2932 class_lastchar = c;
2933 }
2934 }
2935
2936 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2937
2938 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2939
2940 if (c == 0) /* Missing terminating ']' */
2941 {
2942 *errorcodeptr = ERR6;
2943 goto FAILED;
2944 }
2945
2946 /* If class_charcount is 1, we saw precisely one character whose value is
2947 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2948 can optimize the negative case only if there were no characters >= 128
2949 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2950 single-bytes only. This is an historical hangover. Maybe one day we can
2951 tidy these opcodes to handle multi-byte characters.
2952
2953 The optimization throws away the bit map. We turn the item into a
2954 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2955 that OP_NOT does not support multibyte characters. In the positive case, it
2956 can cause firstbyte to be set. Otherwise, there can be no first char if
2957 this item is first, whatever repeat count may follow. In the case of
2958 reqbyte, save the previous value for reinstating. */
2959
2960 #ifdef SUPPORT_UTF8
2961 if (class_charcount == 1 &&
2962 (!utf8 ||
2963 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2964
2965 #else
2966 if (class_charcount == 1)
2967 #endif
2968 {
2969 zeroreqbyte = reqbyte;
2970
2971 /* The OP_NOT opcode works on one-byte characters only. */
2972
2973 if (negate_class)
2974 {
2975 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2976 zerofirstbyte = firstbyte;
2977 *code++ = OP_NOT;
2978 *code++ = class_lastchar;
2979 break;
2980 }
2981
2982 /* For a single, positive character, get the value into mcbuffer, and
2983 then we can handle this with the normal one-character code. */
2984
2985 #ifdef SUPPORT_UTF8
2986 if (utf8 && class_lastchar > 127)
2987 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2988 else
2989 #endif
2990 {
2991 mcbuffer[0] = class_lastchar;
2992 mclength = 1;
2993 }
2994 goto ONE_CHAR;
2995 } /* End of 1-char optimization */
2996
2997 /* The general case - not the one-char optimization. If this is the first
2998 thing in the branch, there can be no first char setting, whatever the
2999 repeat count. Any reqbyte setting must remain unchanged after any kind of
3000 repeat. */
3001
3002 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3003 zerofirstbyte = firstbyte;
3004 zeroreqbyte = reqbyte;
3005
3006 /* If there are characters with values > 255, we have to compile an
3007 extended class, with its own opcode. If there are no characters < 256,
3008 we can omit the bitmap in the actual compiled code. */
3009
3010 #ifdef SUPPORT_UTF8
3011 if (class_utf8)
3012 {
3013 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3014 *code++ = OP_XCLASS;
3015 code += LINK_SIZE;
3016 *code = negate_class? XCL_NOT : 0;
3017
3018 /* If the map is required, move up the extra data to make room for it;
3019 otherwise just move the code pointer to the end of the extra data. */
3020
3021 if (class_charcount > 0)
3022 {
3023 *code++ |= XCL_MAP;
3024 memmove(code + 32, code, class_utf8data - code);
3025 memcpy(code, classbits, 32);
3026 code = class_utf8data + 32;
3027 }
3028 else code = class_utf8data;
3029
3030 /* Now fill in the complete length of the item */
3031
3032 PUT(previous, 1, code - previous);
3033 break; /* End of class handling */
3034 }
3035 #endif
3036
3037 /* If there are no characters > 255, negate the 32-byte map if necessary,
3038 and copy it into the code vector. If this is the first thing in the branch,
3039 there can be no first char setting, whatever the repeat count. Any reqbyte
3040 setting must remain unchanged after any kind of repeat. */
3041
3042 if (negate_class)
3043 {
3044 *code++ = OP_NCLASS;
3045 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3046 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3047 }
3048 else
3049 {
3050 *code++ = OP_CLASS;
3051 memcpy(code, classbits, 32);
3052 }
3053 code += 32;
3054 break;
3055
3056
3057 /* ===================================================================*/
3058 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3059 has been tested above. */
3060
3061 case '{':
3062 if (!is_quantifier) goto NORMAL_CHAR;
3063 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3064 if (*errorcodeptr != 0) goto FAILED;
3065 goto REPEAT;
3066
3067 case '*':
3068 repeat_min = 0;
3069 repeat_max = -1;
3070 goto REPEAT;
3071
3072 case '+':
3073 repeat_min = 1;
3074 repeat_max = -1;
3075 goto REPEAT;
3076
3077 case '?':
3078 repeat_min = 0;
3079 repeat_max = 1;
3080
3081 REPEAT:
3082 if (previous == NULL)
3083 {
3084 *errorcodeptr = ERR9;
3085 goto FAILED;
3086 }
3087
3088 if (repeat_min == 0)
3089 {
3090 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3091 reqbyte = zeroreqbyte; /* Ditto */
3092 }
3093
3094 /* Remember whether this is a variable length repeat */
3095
3096 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3097
3098 op_type = 0; /* Default single-char op codes */
3099 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3100
3101 /* Save start of previous item, in case we have to move it up to make space
3102 for an inserted OP_ONCE for the additional '+' extension. */
3103
3104 tempcode = previous;
3105
3106 /* If the next character is '+', we have a possessive quantifier. This
3107 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3108 If the next character is '?' this is a minimizing repeat, by default,
3109 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3110 repeat type to the non-default. */
3111
3112 if (ptr[1] == '+')
3113 {
3114 repeat_type = 0; /* Force greedy */
3115 possessive_quantifier = TRUE;
3116 ptr++;
3117 }
3118 else if (ptr[1] == '?')
3119 {
3120 repeat_type = greedy_non_default;
3121 ptr++;
3122 }
3123 else repeat_type = greedy_default;
3124
3125 /* If previous was a character match, abolish the item and generate a
3126 repeat item instead. If a char item has a minimum of more than one, ensure
3127 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3128 the first thing in a branch because the x will have gone into firstbyte
3129 instead. */
3130
3131 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3132 {
3133 /* Deal with UTF-8 characters that take up more than one byte. It's
3134 easier to write this out separately than try to macrify it. Use c to
3135 hold the length of the character in bytes, plus 0x80 to flag that it's a
3136 length rather than a small character. */
3137
3138 #ifdef SUPPORT_UTF8
3139 if (utf8 && (code[-1] & 0x80) != 0)
3140 {
3141 uschar *lastchar = code - 1;
3142 while((*lastchar & 0xc0) == 0x80) lastchar--;
3143 c = code - lastchar; /* Length of UTF-8 character */
3144 memcpy(utf8_char, lastchar, c); /* Save the char */
3145 c |= 0x80; /* Flag c as a length */
3146 }
3147 else
3148 #endif
3149
3150 /* Handle the case of a single byte - either with no UTF8 support, or
3151 with UTF-8 disabled, or for a UTF-8 character < 128. */
3152
3153 {
3154 c = code[-1];
3155 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3156 }
3157
3158 /* If the repetition is unlimited, it pays to see if the next thing on
3159 the line is something that cannot possibly match this character. If so,
3160 automatically possessifying this item gains some performance in the case
3161 where the match fails. */
3162
3163 if (!possessive_quantifier &&
3164 repeat_max < 0 &&
3165 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3166 options, cd))
3167 {
3168 repeat_type = 0; /* Force greedy */
3169 possessive_quantifier = TRUE;
3170 }
3171
3172 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3173 }
3174
3175 /* If previous was a single negated character ([^a] or similar), we use
3176 one of the special opcodes, replacing it. The code is shared with single-
3177 character repeats by setting op_type to add a suitable offset into
3178 repeat_type. We can also test for auto-possessification. OP_NOT is
3179 currently used only for single-byte chars. */
3180
3181 else if (*previous == OP_NOT)
3182 {
3183 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3184 c = previous[1];
3185 if (!possessive_quantifier &&
3186 repeat_max < 0 &&
3187 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3188 {
3189 repeat_type = 0; /* Force greedy */
3190 possessive_quantifier = TRUE;
3191 }
3192 goto OUTPUT_SINGLE_REPEAT;
3193 }
3194
3195 /* If previous was a character type match (\d or similar), abolish it and
3196 create a suitable repeat item. The code is shared with single-character
3197 repeats by setting op_type to add a suitable offset into repeat_type. Note
3198 that the Unicode property types will be present only when SUPPORT_UCP is
3199 defined, but we don't wrap the little bits of code here because it just
3200 makes it horribly messy. */
3201
3202 else if (*previous < OP_EODN)
3203 {
3204 uschar *oldcode;
3205 int prop_type, prop_value;
3206 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3207 c = *previous;
3208
3209 if (!possessive_quantifier &&
3210 repeat_max < 0 &&
3211 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3212 {
3213 repeat_type = 0; /* Force greedy */
3214 possessive_quantifier = TRUE;
3215 }
3216
3217 OUTPUT_SINGLE_REPEAT:
3218 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3219 {
3220 prop_type = previous[1];
3221 prop_value = previous[2];
3222 }
3223 else prop_type = prop_value = -1;
3224
3225 oldcode = code;
3226 code = previous; /* Usually overwrite previous item */
3227
3228 /* If the maximum is zero then the minimum must also be zero; Perl allows
3229 this case, so we do too - by simply omitting the item altogether. */
3230
3231 if (repeat_max == 0) goto END_REPEAT;
3232
3233 /* All real repeats make it impossible to handle partial matching (maybe
3234 one day we will be able to remove this restriction). */
3235
3236 if (repeat_max != 1) cd->nopartial = TRUE;
3237
3238 /* Combine the op_type with the repeat_type */
3239
3240 repeat_type += op_type;
3241
3242 /* A minimum of zero is handled either as the special case * or ?, or as
3243 an UPTO, with the maximum given. */
3244
3245 if (repeat_min == 0)
3246 {
3247 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3248 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3249 else
3250 {
3251 *code++ = OP_UPTO + repeat_type;
3252 PUT2INC(code, 0, repeat_max);
3253 }
3254 }
3255
3256 /* A repeat minimum of 1 is optimized into some special cases. If the
3257 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3258 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3259 one less than the maximum. */
3260
3261 else if (repeat_min == 1)
3262 {
3263 if (repeat_max == -1)
3264 *code++ = OP_PLUS + repeat_type;
3265 else
3266 {
3267 code = oldcode; /* leave previous item in place */
3268 if (repeat_max == 1) goto END_REPEAT;
3269 *code++ = OP_UPTO + repeat_type;
3270 PUT2INC(code, 0, repeat_max - 1);
3271 }
3272 }
3273
3274 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3275 handled as an EXACT followed by an UPTO. */
3276
3277 else
3278 {
3279 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3280 PUT2INC(code, 0, repeat_min);
3281
3282 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3283 we have to insert the character for the previous code. For a repeated
3284 Unicode property match, there are two extra bytes that define the
3285 required property. In UTF-8 mode, long characters have their length in
3286 c, with the 0x80 bit as a flag. */
3287
3288 if (repeat_max < 0)
3289 {
3290 #ifdef SUPPORT_UTF8
3291 if (utf8 && c >= 128)
3292 {
3293 memcpy(code, utf8_char, c & 7);
3294 code += c & 7;
3295 }
3296 else
3297 #endif
3298 {
3299 *code++ = c;
3300 if (prop_type >= 0)
3301 {
3302 *code++ = prop_type;
3303 *code++ = prop_value;
3304 }
3305 }
3306 *code++ = OP_STAR + repeat_type;
3307 }
3308
3309 /* Else insert an UPTO if the max is greater than the min, again
3310 preceded by the character, for the previously inserted code. If the
3311 UPTO is just for 1 instance, we can use QUERY instead. */
3312
3313 else if (repeat_max != repeat_min)
3314 {
3315 #ifdef SUPPORT_UTF8
3316 if (utf8 && c >= 128)
3317 {
3318 memcpy(code, utf8_char, c & 7);
3319 code += c & 7;
3320 }
3321 else
3322 #endif
3323 *code++ = c;
3324 if (prop_type >= 0)
3325 {
3326 *code++ = prop_type;
3327 *code++ = prop_value;
3328 }
3329 repeat_max -= repeat_min;
3330
3331 if (repeat_max == 1)
3332 {
3333 *code++ = OP_QUERY + repeat_type;
3334 }
3335 else
3336 {
3337 *code++ = OP_UPTO + repeat_type;
3338 PUT2INC(code, 0, repeat_max);
3339 }
3340 }
3341 }
3342
3343 /* The character or character type itself comes last in all cases. */
3344
3345 #ifdef SUPPORT_UTF8
3346 if (utf8 && c >= 128)
3347 {
3348 memcpy(code, utf8_char, c & 7);
3349 code += c & 7;
3350 }
3351 else
3352 #endif
3353 *code++ = c;
3354
3355 /* For a repeated Unicode property match, there are two extra bytes that
3356 define the required property. */
3357
3358 #ifdef SUPPORT_UCP
3359 if (prop_type >= 0)
3360 {
3361 *code++ = prop_type;
3362 *code++ = prop_value;
3363 }
3364 #endif
3365 }
3366
3367 /* If previous was a character class or a back reference, we put the repeat
3368 stuff after it, but just skip the item if the repeat was {0,0}. */
3369
3370 else if (*previous == OP_CLASS ||
3371 *previous == OP_NCLASS ||
3372 #ifdef SUPPORT_UTF8
3373 *previous == OP_XCLASS ||
3374 #endif
3375 *previous == OP_REF)
3376 {
3377 if (repeat_max == 0)
3378 {
3379 code = previous;
3380 goto END_REPEAT;
3381 }
3382
3383 /* All real repeats make it impossible to handle partial matching (maybe
3384 one day we will be able to remove this restriction). */
3385
3386 if (repeat_max != 1) cd->nopartial = TRUE;
3387
3388 if (repeat_min == 0 && repeat_max == -1)
3389 *code++ = OP_CRSTAR + repeat_type;
3390 else if (repeat_min == 1 && repeat_max == -1)
3391 *code++ = OP_CRPLUS + repeat_type;
3392 else if (repeat_min == 0 && repeat_max == 1)
3393 *code++ = OP_CRQUERY + repeat_type;
3394 else
3395 {
3396 *code++ = OP_CRRANGE + repeat_type;
3397 PUT2INC(code, 0, repeat_min);
3398 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3399 PUT2INC(code, 0, repeat_max);
3400 }
3401 }
3402
3403 /* If previous was a bracket group, we may have to replicate it in certain
3404 cases. */
3405
3406 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3407 *previous == OP_ONCE || *previous == OP_COND)
3408 {
3409 register int i;
3410 int ketoffset = 0;
3411 int len = code - previous;
3412 uschar *bralink = NULL;
3413
3414 /* Repeating a DEFINE group is pointless */
3415
3416 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3417 {
3418 *errorcodeptr = ERR55;
3419 goto FAILED;
3420 }
3421
3422 /* This is a paranoid check to stop integer overflow later on */
3423
3424 if (len > MAX_DUPLENGTH)
3425 {
3426 *errorcodeptr = ERR50;
3427 goto FAILED;
3428 }
3429
3430 /* If the maximum repeat count is unlimited, find the end of the bracket
3431 by scanning through from the start, and compute the offset back to it
3432 from the current code pointer. There may be an OP_OPT setting following
3433 the final KET, so we can't find the end just by going back from the code
3434 pointer. */
3435
3436 if (repeat_max == -1)
3437 {
3438 register uschar *ket = previous;
3439 do ket += GET(ket, 1); while (*ket != OP_KET);
3440 ketoffset = code - ket;
3441 }
3442
3443 /* The case of a zero minimum is special because of the need to stick
3444 OP_BRAZERO in front of it, and because the group appears once in the
3445 data, whereas in other cases it appears the minimum number of times. For
3446 this reason, it is simplest to treat this case separately, as otherwise
3447 the code gets far too messy. There are several special subcases when the
3448 minimum is zero. */
3449
3450 if (repeat_min == 0)
3451 {
3452 /* If the maximum is also zero, we just omit the group from the output
3453 altogether. */
3454
3455 if (repeat_max == 0)
3456 {
3457 code = previous;
3458 goto END_REPEAT;
3459 }
3460
3461 /* If the maximum is 1 or unlimited, we just have to stick in the
3462 BRAZERO and do no more at this point. However, we do need to adjust
3463 any OP_RECURSE calls inside the group that refer to the group itself or
3464 any internal or forward referenced group, because the offset is from
3465 the start of the whole regex. Temporarily terminate the pattern while
3466 doing this. */
3467
3468 if (repeat_max <= 1)
3469 {
3470 *code = OP_END;
3471 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3472 memmove(previous+1, previous, len);
3473 code++;
3474 *previous++ = OP_BRAZERO + repeat_type;
3475 }
3476
3477 /* If the maximum is greater than 1 and limited, we have to replicate
3478 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3479 The first one has to be handled carefully because it's the original
3480 copy, which has to be moved up. The remainder can be handled by code
3481 that is common with the non-zero minimum case below. We have to
3482 adjust the value of repeat_max, since one less copy is required. Once
3483 again, we may have to adjust any OP_RECURSE calls inside the group. */
3484
3485 else
3486 {
3487 int offset;
3488 *code = OP_END;
3489 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3490 memmove(previous + 2 + LINK_SIZE, previous, len);
3491 code += 2 + LINK_SIZE;
3492 *previous++ = OP_BRAZERO + repeat_type;
3493 *previous++ = OP_BRA;
3494
3495 /* We chain together the bracket offset fields that have to be
3496 filled in later when the ends of the brackets are reached. */
3497
3498 offset = (bralink == NULL)? 0 : previous - bralink;
3499 bralink = previous;
3500 PUTINC(previous, 0, offset);
3501 }
3502
3503 repeat_max--;
3504 }
3505
3506 /* If the minimum is greater than zero, replicate the group as many
3507 times as necessary, and adjust the maximum to the number of subsequent
3508 copies that we need. If we set a first char from the group, and didn't
3509 set a required char, copy the latter from the former. If there are any
3510 forward reference subroutine calls in the group, there will be entries on
3511 the workspace list; replicate these with an appropriate increment. */
3512
3513 else
3514 {
3515 if (repeat_min > 1)
3516 {
3517 /* In the pre-compile phase, we don't actually do the replication. We
3518 just adjust the length as if we had. */
3519
3520 if (lengthptr != NULL)
3521 *lengthptr += (repeat_min - 1)*length_prevgroup;
3522
3523 /* This is compiling for real */
3524
3525 else
3526 {
3527 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3528 for (i = 1; i < repeat_min; i++)
3529 {
3530 uschar *hc;
3531 uschar *this_hwm = cd->hwm;
3532 memcpy(code, previous, len);
3533 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3534 {
3535 PUT(cd->hwm, 0, GET(hc, 0) + len);
3536 cd->hwm += LINK_SIZE;
3537 }
3538 save_hwm = this_hwm;
3539 code += len;
3540 }
3541 }
3542 }
3543
3544 if (repeat_max > 0) repeat_max -= repeat_min;
3545 }
3546
3547 /* This code is common to both the zero and non-zero minimum cases. If
3548 the maximum is limited, it replicates the group in a nested fashion,
3549 remembering the bracket starts on a stack. In the case of a zero minimum,
3550 the first one was set up above. In all cases the repeat_max now specifies
3551 the number of additional copies needed. Again, we must remember to
3552 replicate entries on the forward reference list. */
3553
3554 if (repeat_max >= 0)
3555 {
3556 /* In the pre-compile phase, we don't actually do the replication. We
3557 just adjust the length as if we had. For each repetition we must add 1
3558 to the length for BRAZERO and for all but the last repetition we must
3559 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3560
3561 if (lengthptr != NULL && repeat_max > 0)
3562 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3563 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3564
3565 /* This is compiling for real */
3566
3567 else for (i = repeat_max - 1; i >= 0; i--)
3568 {
3569 uschar *hc;
3570 uschar *this_hwm = cd->hwm;
3571
3572 *code++ = OP_BRAZERO + repeat_type;
3573
3574 /* All but the final copy start a new nesting, maintaining the
3575 chain of brackets outstanding. */
3576
3577 if (i != 0)
3578 {
3579 int offset;
3580 *code++ = OP_BRA;
3581 offset = (bralink == NULL)? 0 : code - bralink;
3582 bralink = code;
3583 PUTINC(code, 0, offset);
3584 }
3585
3586 memcpy(code, previous, len);
3587 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3588 {
3589 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3590 cd->hwm += LINK_SIZE;
3591 }
3592 save_hwm = this_hwm;
3593 code += len;
3594 }
3595
3596 /* Now chain through the pending brackets, and fill in their length
3597 fields (which are holding the chain links pro tem). */
3598
3599 while (bralink != NULL)
3600 {
3601 int oldlinkoffset;
3602 int offset = code - bralink + 1;
3603 uschar *bra = code - offset;
3604 oldlinkoffset = GET(bra, 1);
3605 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3606 *code++ = OP_KET;
3607 PUTINC(code, 0, offset);
3608 PUT(bra, 1, offset);
3609 }
3610 }
3611
3612 /* If the maximum is unlimited, set a repeater in the final copy. We
3613 can't just offset backwards from the current code point, because we
3614 don't know if there's been an options resetting after the ket. The
3615 correct offset was computed above.
3616
3617 Then, when we are doing the actual compile phase, check to see whether
3618 this group is a non-atomic one that could match an empty string. If so,
3619 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3620 that runtime checking can be done. [This check is also applied to
3621 atomic groups at runtime, but in a different way.] */
3622
3623 else
3624 {
3625 uschar *ketcode = code - ketoffset;
3626 uschar *bracode = ketcode - GET(ketcode, 1);
3627 *ketcode = OP_KETRMAX + repeat_type;
3628 if (lengthptr == NULL && *bracode != OP_ONCE)
3629 {
3630 uschar *scode = bracode;
3631 do
3632 {
3633 if (could_be_empty_branch(scode, ketcode, utf8))
3634 {
3635 *bracode += OP_SBRA - OP_BRA;
3636 break;
3637 }
3638 scode += GET(scode, 1);
3639 }
3640 while (*scode == OP_ALT);
3641 }
3642 }
3643 }
3644
3645 /* Else there's some kind of shambles */
3646
3647 else
3648 {
3649 *errorcodeptr = ERR11;
3650 goto FAILED;
3651 }
3652
3653 /* If the character following a repeat is '+', or if certain optimization
3654 tests above succeeded, possessive_quantifier is TRUE. For some of the
3655 simpler opcodes, there is a special alternative opcode for this. For
3656 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3657 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3658 but the special opcodes can optimize it a bit. The repeated item starts at
3659 tempcode, not at previous, which might be the first part of a string whose
3660 (former) last char we repeated.
3661
3662 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3663 an 'upto' may follow. We skip over an 'exact' item, and then test the
3664 length of what remains before proceeding. */
3665
3666 if (possessive_quantifier)
3667 {
3668 int len;
3669 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3670 *tempcode == OP_NOTEXACT)
3671 tempcode += _pcre_OP_lengths[*tempcode];
3672 len = code - tempcode;
3673 if (len > 0) switch (*tempcode)
3674 {
3675 case OP_STAR: *tempcode = OP_POSSTAR; break;
3676 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3677 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3678 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3679
3680 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3681 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3682 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3683 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3684
3685 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3686 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3687 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3688 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3689
3690 default:
3691 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3692 code += 1 + LINK_SIZE;
3693 len += 1 + LINK_SIZE;
3694 tempcode[0] = OP_ONCE;
3695 *code++ = OP_KET;
3696 PUTINC(code, 0, len);
3697 PUT(tempcode, 1, len);
3698 break;
3699 }
3700 }
3701
3702 /* In all cases we no longer have a previous item. We also set the
3703 "follows varying string" flag for subsequently encountered reqbytes if
3704 it isn't already set and we have just passed a varying length item. */
3705
3706 END_REPEAT:
3707 previous = NULL;
3708 cd->req_varyopt |= reqvary;
3709 break;
3710
3711
3712 /* ===================================================================*/
3713 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3714 lookbehind or option setting or condition or all the other extended
3715 parenthesis forms. First deal with the specials; all are introduced by ?,
3716 and the appearance of any of them means that this is not a capturing
3717 group. */
3718
3719 case '(':
3720 newoptions = options;
3721 skipbytes = 0;
3722 bravalue = OP_CBRA;
3723 save_hwm = cd->hwm;
3724 reset_bracount = FALSE;
3725
3726 if (*(++ptr) == '?')
3727 {
3728 int i, set, unset, namelen;
3729 int *optset;
3730 const uschar *name;
3731 uschar *slot;
3732
3733 switch (*(++ptr))
3734 {
3735 case '#': /* Comment; skip to ket */
3736 ptr++;
3737 while (*ptr != 0 && *ptr != ')') ptr++;
3738 if (*ptr == 0)
3739 {
3740 *errorcodeptr = ERR18;
3741 goto FAILED;
3742 }
3743 continue;
3744
3745
3746 /* ------------------------------------------------------------ */
3747 case '|': /* Reset capture count for each branch */
3748 reset_bracount = TRUE;
3749 /* Fall through */
3750
3751 /* ------------------------------------------------------------ */
3752 case ':': /* Non-capturing bracket */
3753 bravalue = OP_BRA;
3754 ptr++;
3755 break;
3756
3757
3758 /* ------------------------------------------------------------ */
3759 case '(':
3760 bravalue = OP_COND; /* Conditional group */
3761
3762 /* A condition can be an assertion, a number (referring to a numbered
3763 group), a name (referring to a named group), or 'R', referring to
3764 recursion. R<digits> and R&name are also permitted for recursion tests.
3765
3766 There are several syntaxes for testing a named group: (?(name)) is used
3767 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3768
3769 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3770 be the recursive thing or the name 'R' (and similarly for 'R' followed
3771 by digits), and (b) a number could be a name that consists of digits.
3772 In both cases, we look for a name first; if not found, we try the other
3773 cases. */
3774
3775 /* For conditions that are assertions, check the syntax, and then exit
3776 the switch. This will take control down to where bracketed groups,
3777 including assertions, are processed. */
3778
3779 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3780 break;
3781
3782 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3783 below), and all need to skip 3 bytes at the start of the group. */
3784
3785 code[1+LINK_SIZE] = OP_CREF;
3786 skipbytes = 3;
3787 refsign = -1;
3788
3789 /* Check for a test for recursion in a named group. */
3790
3791 if (ptr[1] == 'R' && ptr[2] == '&')
3792 {
3793 terminator = -1;
3794 ptr += 2;
3795 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3796 }
3797
3798 /* Check for a test for a named group's having been set, using the Perl
3799 syntax (?(<name>) or (?('name') */
3800
3801 else if (ptr[1] == '<')
3802 {
3803 terminator = '>';
3804 ptr++;
3805 }
3806 else if (ptr[1] == '\'')
3807 {
3808 terminator = '\'';
3809 ptr++;
3810 }
3811 else
3812 {
3813 terminator = 0;
3814 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3815 }
3816
3817 /* We now expect to read a name; anything else is an error */
3818
3819 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3820 {
3821 ptr += 1; /* To get the right offset */
3822 *errorcodeptr = ERR28;
3823 goto FAILED;
3824 }
3825
3826 /* Read the name, but also get it as a number if it's all digits */
3827
3828 recno = 0;
3829 name = ++ptr;
3830 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3831 {
3832 if (recno >= 0)
3833 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3834 recno * 10 + *ptr - '0' : -1;
3835 ptr++;
3836 }
3837 namelen = ptr - name;
3838
3839 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3840 {
3841 ptr--; /* Error offset */
3842 *errorcodeptr = ERR26;
3843 goto FAILED;
3844 }
3845
3846 /* Do no further checking in the pre-compile phase. */
3847
3848 if (lengthptr != NULL) break;
3849
3850 /* In the real compile we do the work of looking for the actual
3851 reference. If the string started with "+" or "-" we require the rest to
3852 be digits, in which case recno will be set. */
3853
3854 if (refsign > 0)
3855 {
3856 if (recno <= 0)
3857 {
3858 *errorcodeptr = ERR58;
3859 goto FAILED;
3860 }
3861 if (refsign == '-')
3862 {
3863 recno = cd->bracount - recno + 1;
3864 if (recno <= 0)
3865 {
3866 *errorcodeptr = ERR15;
3867 goto FAILED;
3868 }
3869 }
3870 else recno += cd->bracount;
3871 PUT2(code, 2+LINK_SIZE, recno);
3872 break;
3873 }
3874
3875 /* Otherwise (did not start with "+" or "-"), start by looking for the
3876 name. */
3877
3878 slot = cd->name_table;
3879 for (i = 0; i < cd->names_found; i++)
3880 {
3881 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3882 slot += cd->name_entry_size;
3883 }
3884
3885 /* Found a previous named subpattern */
3886
3887 if (i < cd->names_found)
3888 {
3889 recno = GET2(slot, 0);
3890 PUT2(code, 2+LINK_SIZE, recno);
3891 }
3892
3893 /* Search the pattern for a forward reference */
3894
3895 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3896 (options & PCRE_EXTENDED) != 0)) > 0)
3897 {
3898 PUT2(code, 2+LINK_SIZE, i);
3899 }
3900
3901 /* If terminator == 0 it means that the name followed directly after
3902 the opening parenthesis [e.g. (?(abc)...] and in this case there are
3903 some further alternatives to try. For the cases where terminator != 0
3904 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3905 now checked all the possibilities, so give an error. */
3906
3907 else if (terminator != 0)
3908 {
3909 *errorcodeptr = ERR15;
3910 goto FAILED;
3911 }
3912
3913 /* Check for (?(R) for recursion. Allow digits after R to specify a
3914 specific group number. */
3915
3916 else if (*name == 'R')
3917 {
3918 recno = 0;
3919 for (i = 1; i < namelen; i++)
3920 {
3921 if ((digitab[name[i]] & ctype_digit) == 0)
3922 {
3923 *errorcodeptr = ERR15;
3924 goto FAILED;
3925 }
3926 recno = recno * 10 + name[i] - '0';
3927 }
3928 if (recno == 0) recno = RREF_ANY;
3929 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3930 PUT2(code, 2+LINK_SIZE, recno);
3931 }
3932
3933 /* Similarly, check for the (?(DEFINE) "condition", which is always
3934 false. */
3935
3936 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3937 {
3938 code[1+LINK_SIZE] = OP_DEF;
3939 skipbytes = 1;
3940 }
3941
3942 /* Check for the "name" actually being a subpattern number. */
3943
3944 else if (recno > 0)
3945 {
3946 PUT2(code, 2+LINK_SIZE, recno);
3947 }
3948
3949 /* Either an unidentified subpattern, or a reference to (?(0) */
3950
3951 else
3952 {
3953 *errorcodeptr = (recno == 0)? ERR35: ERR15;
3954 goto FAILED;
3955 }
3956 break;
3957
3958
3959 /* ------------------------------------------------------------ */
3960 case '=': /* Positive lookahead */
3961 bravalue = OP_ASSERT;
3962 ptr++;
3963 break;
3964
3965
3966 /* ------------------------------------------------------------ */
3967 case '!': /* Negative lookahead */
3968 bravalue = OP_ASSERT_NOT;
3969 ptr++;
3970 break;
3971
3972
3973 /* ------------------------------------------------------------ */
3974 case '<': /* Lookbehind or named define */
3975 switch (ptr[1])
3976 {
3977 case '=': /* Positive lookbehind */
3978 bravalue = OP_ASSERTBACK;
3979 ptr += 2;
3980 break;
3981
3982 case '!': /* Negative lookbehind */
3983 bravalue = OP_ASSERTBACK_NOT;
3984 ptr += 2;
3985 break;
3986
3987 default: /* Could be name define, else bad */
3988 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3989 ptr++; /* Correct offset for error */
3990 *errorcodeptr = ERR24;
3991 goto FAILED;
3992 }
3993 break;
3994
3995
3996 /* ------------------------------------------------------------ */
3997 case '>': /* One-time brackets */
3998 bravalue = OP_ONCE;
3999 ptr++;
4000 break;
4001
4002
4003 /* ------------------------------------------------------------ */
4004 case 'C': /* Callout - may be followed by digits; */
4005 previous_callout = code; /* Save for later completion */
4006 after_manual_callout = 1; /* Skip one item before completing */
4007 *code++ = OP_CALLOUT;
4008 {
4009 int n = 0;
4010 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4011 n = n * 10 + *ptr - '0';
4012 if (*ptr != ')')
4013 {
4014 *errorcodeptr = ERR39;
4015 goto FAILED;
4016 }
4017 if (n > 255)
4018 {
4019 *errorcodeptr = ERR38;
4020 goto FAILED;
4021 }
4022 *code++ = n;
4023 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4024 PUT(code, LINK_SIZE, 0); /* Default length */
4025 code += 2 * LINK_SIZE;
4026 }
4027 previous = NULL;
4028 continue;
4029
4030
4031 /* ------------------------------------------------------------ */
4032 case 'P': /* Python-style named subpattern handling */
4033 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4034 {
4035 is_recurse = *ptr == '>';
4036 terminator = ')';
4037 goto NAMED_REF_OR_RECURSE;
4038 }
4039 else if (*ptr != '<') /* Test for Python-style definition */
4040 {
4041 *errorcodeptr = ERR41;
4042 goto FAILED;
4043 }
4044 /* Fall through to handle (?P< as (?< is handled */
4045
4046
4047 /* ------------------------------------------------------------ */
4048 DEFINE_NAME: /* Come here from (?< handling */
4049 case '\'':
4050 {
4051 terminator = (*ptr == '<')? '>' : '\'';
4052 name = ++ptr;
4053
4054 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4055 namelen = ptr - name;
4056
4057 /* In the pre-compile phase, just do a syntax check. */
4058
4059 if (lengthptr != NULL)
4060 {
4061 if (*ptr != terminator)
4062 {
4063 *errorcodeptr = ERR42;
4064 goto FAILED;
4065 }
4066 if (cd->names_found >= MAX_NAME_COUNT)
4067 {
4068 *errorcodeptr = ERR49;
4069 goto FAILED;
4070 }
4071 if (namelen + 3 > cd->name_entry_size)
4072 {
4073 cd->name_entry_size = namelen + 3;
4074 if (namelen > MAX_NAME_SIZE)
4075 {
4076 *errorcodeptr = ERR48;
4077 goto FAILED;
4078 }
4079 }
4080 }
4081
4082 /* In the real compile, create the entry in the table */
4083
4084 else
4085 {
4086 slot = cd->name_table;
4087 for (i = 0; i < cd->names_found; i++)
4088 {
4089 int crc = memcmp(name, slot+2, namelen);
4090 if (crc == 0)
4091 {
4092 if (slot[2+namelen] == 0)
4093 {
4094 if ((options & PCRE_DUPNAMES) == 0)
4095 {
4096 *errorcodeptr = ERR43;
4097 goto FAILED;
4098 }
4099 }
4100 else crc = -1; /* Current name is substring */
4101 }
4102 if (crc < 0)
4103 {
4104 memmove(slot + cd->name_entry_size, slot,
4105 (cd->names_found - i) * cd->name_entry_size);
4106 break;
4107 }
4108 slot += cd->name_entry_size;
4109 }
4110
4111 PUT2(slot, 0, cd->bracount + 1);
4112 memcpy(slot + 2, name, namelen);
4113 slot[2+namelen] = 0;
4114 }
4115 }
4116
4117 /* In both cases, count the number of names we've encountered. */
4118
4119 ptr++; /* Move past > or ' */
4120 cd->names_found++;
4121 goto NUMBERED_GROUP;
4122
4123
4124 /* ------------------------------------------------------------ */
4125 case '&': /* Perl recursion/subroutine syntax */
4126 terminator = ')';
4127 is_recurse = TRUE;
4128 /* Fall through */
4129
4130 /* We come here from the Python syntax above that handles both
4131 references (?P=name) and recursion (?P>name), as well as falling
4132 through from the Perl recursion syntax (?&name). */
4133
4134 NAMED_REF_OR_RECURSE:
4135 name = ++ptr;
4136 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4137 namelen = ptr - name;
4138
4139 /* In the pre-compile phase, do a syntax check and set a dummy
4140 reference number. */
4141
4142 if (lengthptr != NULL)
4143 {
4144 if (*ptr != terminator)
4145 {
4146 *errorcodeptr = ERR42;
4147 goto FAILED;
4148 }
4149 if (namelen > MAX_NAME_SIZE)
4150 {
4151 *errorcodeptr = ERR48;
4152 goto FAILED;
4153 }
4154 recno = 0;
4155 }
4156
4157 /* In the real compile, seek the name in the table */
4158
4159 else
4160 {
4161 slot = cd->name_table;
4162 for (i = 0; i < cd->names_found; i++)
4163 {
4164 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4165 slot += cd->name_entry_size;
4166 }
4167
4168 if (i < cd->names_found) /* Back reference */
4169 {
4170 recno = GET2(slot, 0);
4171 }
4172 else if ((recno = /* Forward back reference */
4173 find_parens(ptr, cd->bracount, name, namelen,
4174 (options & PCRE_EXTENDED) != 0)) <= 0)
4175 {
4176 *errorcodeptr = ERR15;
4177 goto FAILED;
4178 }
4179 }
4180
4181 /* In both phases, we can now go to the code that handles numerical
4182 recursion or backreferences. */
4183
4184 if (is_recurse) goto HANDLE_RECURSION;
4185 else goto HANDLE_REFERENCE;
4186
4187
4188 /* ------------------------------------------------------------ */
4189 case 'R': /* Recursion */
4190 ptr++; /* Same as (?0) */
4191 /* Fall through */
4192
4193
4194 /* ------------------------------------------------------------ */
4195 case '-': case '+':
4196 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4197 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4198 {
4199 const uschar *called;
4200
4201 if ((refsign = *ptr) == '+') ptr++;
4202 else if (refsign == '-')
4203 {
4204 if ((digitab[ptr[1]] & ctype_digit) == 0)
4205 goto OTHER_CHAR_AFTER_QUERY;
4206 ptr++;
4207 }
4208
4209 recno = 0;
4210 while((digitab[*ptr] & ctype_digit) != 0)
4211 recno = recno * 10 + *ptr++ - '0';
4212
4213 if (*ptr != ')')
4214 {
4215 *errorcodeptr = ERR29;
4216 goto FAILED;
4217 }
4218
4219 if (refsign == '-')
4220 {
4221 if (recno == 0)
4222 {
4223 *errorcodeptr = ERR58;
4224 goto FAILED;
4225 }
4226 recno = cd->bracount - recno + 1;
4227 if (recno <= 0)
4228 {
4229 *errorcodeptr = ERR15;
4230 goto FAILED;
4231 }
4232 }
4233 else if (refsign == '+')
4234 {
4235 if (recno == 0)
4236 {
4237 *errorcodeptr = ERR58;
4238 goto FAILED;
4239 }
4240 recno += cd->bracount;
4241 }
4242
4243 /* Come here from code above that handles a named recursion */
4244
4245 HANDLE_RECURSION:
4246
4247 previous = code;
4248 called = cd->start_code;
4249
4250 /* When we are actually compiling, find the bracket that is being
4251 referenced. Temporarily end the regex in case it doesn't exist before
4252 this point. If we end up with a forward reference, first check that
4253 the bracket does occur later so we can give the error (and position)
4254 now. Then remember this forward reference in the workspace so it can
4255 be filled in at the end. */
4256
4257 if (lengthptr == NULL)
4258 {
4259 *code = OP_END;
4260 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4261
4262 /* Forward reference */
4263
4264 if (called == NULL)
4265 {
4266 if (find_parens(ptr, cd->bracount, NULL, recno,
4267 (options & PCRE_EXTENDED) != 0) < 0)
4268 {
4269 *errorcodeptr = ERR15;
4270 goto FAILED;
4271 }
4272 called = cd->start_code + recno;
4273 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4274 }
4275
4276 /* If not a forward reference, and the subpattern is still open,
4277 this is a recursive call. We check to see if this is a left
4278 recursion that could loop for ever, and diagnose that case. */
4279
4280 else if (GET(called, 1) == 0 &&
4281 could_be_empty(called, code, bcptr, utf8))
4282 {
4283 *errorcodeptr = ERR40;
4284 goto FAILED;
4285 }
4286 }
4287
4288 /* Insert the recursion/subroutine item, automatically wrapped inside
4289 "once" brackets. Set up a "previous group" length so that a
4290 subsequent quantifier will work. */
4291
4292 *code = OP_ONCE;
4293 PUT(code, 1, 2 + 2*LINK_SIZE);
4294 code += 1 + LINK_SIZE;
4295
4296 *code = OP_RECURSE;
4297 PUT(code, 1, called - cd->start_code);
4298 code += 1 + LINK_SIZE;
4299
4300 *code = OP_KET;
4301 PUT(code, 1, 2 + 2*LINK_SIZE);
4302 code += 1 + LINK_SIZE;
4303
4304 length_prevgroup = 3 + 3*LINK_SIZE;
4305 }
4306
4307 /* Can't determine a first byte now */
4308
4309 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4310 continue;
4311
4312
4313 /* ------------------------------------------------------------ */
4314 default: /* Other characters: check option setting */
4315 OTHER_CHAR_AFTER_QUERY:
4316 set = unset = 0;
4317 optset = &set;
4318
4319 while (*ptr != ')' && *ptr != ':')
4320 {
4321 switch (*ptr++)
4322 {
4323 case '-': optset = &unset; break;
4324
4325 case 'J': /* Record that it changed in the external options */
4326 *optset |= PCRE_DUPNAMES;
4327 cd->external_options |= PCRE_JCHANGED;
4328 break;
4329
4330 case 'i': *optset |= PCRE_CASELESS; break;
4331 case 'm': *optset |= PCRE_MULTILINE; break;
4332 case 's': *optset |= PCRE_DOTALL; break;
4333 case 'x': *optset |= PCRE_EXTENDED; break;
4334 case 'U': *optset |= PCRE_UNGREEDY; break;
4335 case 'X': *optset |= PCRE_EXTRA; break;
4336
4337 default: *errorcodeptr = ERR12;
4338 ptr--; /* Correct the offset */
4339 goto FAILED;
4340 }
4341 }
4342
4343 /* Set up the changed option bits, but don't change anything yet. */
4344
4345 newoptions = (options | set) & (~unset);
4346
4347 /* If the options ended with ')' this is not the start of a nested
4348 group with option changes, so the options change at this level. If this
4349 item is right at the start of the pattern, the options can be
4350 abstracted and made external in the pre-compile phase, and ignored in
4351 the compile phase. This can be helpful when matching -- for instance in
4352 caseless checking of required bytes.
4353
4354 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4355 definitely *not* at the start of the pattern because something has been
4356 compiled. In the pre-compile phase, however, the code pointer can have
4357 that value after the start, because it gets reset as code is discarded
4358 during the pre-compile. However, this can happen only at top level - if
4359 we are within parentheses, the starting BRA will still be present. At
4360 any parenthesis level, the length value can be used to test if anything
4361 has been compiled at that level. Thus, a test for both these conditions
4362 is necessary to ensure we correctly detect the start of the pattern in
4363 both phases.
4364
4365 If we are not at the pattern start, compile code to change the ims
4366 options if this setting actually changes any of them. We also pass the
4367 new setting back so that it can be put at the start of any following
4368 branches, and when this group ends (if we are in a group), a resetting
4369 item can be compiled. */
4370
4371 if (*ptr == ')')
4372 {
4373 if (code == cd->start_code + 1 + LINK_SIZE &&
4374 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4375 {
4376 cd->external_options = newoptions;
4377 options = newoptions;
4378 }
4379 else
4380 {
4381 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4382 {
4383 *code++ = OP_OPT;
4384 *code++ = newoptions & PCRE_IMS;
4385 }
4386
4387 /* Change options at this level, and pass them back for use
4388 in subsequent branches. Reset the greedy defaults and the case
4389 value for firstbyte and reqbyte. */
4390
4391 *optionsptr = options = newoptions;
4392 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4393 greedy_non_default = greedy_default ^ 1;
4394 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4395 }
4396
4397 previous = NULL; /* This item can't be repeated */
4398 continue; /* It is complete */
4399 }
4400
4401 /* If the options ended with ':' we are heading into a nested group
4402 with possible change of options. Such groups are non-capturing and are
4403 not assertions of any kind. All we need to do is skip over the ':';
4404 the newoptions value is handled below. */
4405
4406 bravalue = OP_BRA;
4407 ptr++;
4408 } /* End of switch for character following (? */
4409 } /* End of (? handling */
4410
4411 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4412 all unadorned brackets become non-capturing and behave like (?:...)
4413 brackets. */
4414
4415 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4416 {
4417 bravalue = OP_BRA;
4418 }
4419
4420 /* Else we have a capturing group. */
4421
4422 else
4423 {
4424 NUMBERED_GROUP:
4425 cd->bracount += 1;
4426 PUT2(code, 1+LINK_SIZE, cd->bracount);
4427 skipbytes = 2;
4428 }
4429
4430 /* Process nested bracketed regex. Assertions may not be repeated, but
4431 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4432 non-register variable in order to be able to pass its address because some
4433 compilers complain otherwise. Pass in a new setting for the ims options if
4434 they have changed. */
4435
4436 previous = (bravalue >= OP_ONCE)? code : NULL;
4437 *code = bravalue;
4438 tempcode = code;
4439 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4440 length_prevgroup = 0; /* Initialize for pre-compile phase */
4441
4442 if (!compile_regex(
4443 newoptions, /* The complete new option state */
4444 options & PCRE_IMS, /* The previous ims option state */
4445 &tempcode, /* Where to put code (updated) */
4446 &ptr, /* Input pointer (updated) */
4447 errorcodeptr, /* Where to put an error message */
4448 (bravalue == OP_ASSERTBACK ||
4449 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4450 reset_bracount, /* True if (?| group */
4451 skipbytes, /* Skip over bracket number */
4452 &subfirstbyte, /* For possible first char */
4453 &subreqbyte, /* For possible last char */
4454 bcptr, /* Current branch chain */
4455 cd, /* Tables block */
4456 (lengthptr == NULL)? NULL : /* Actual compile phase */
4457 &length_prevgroup /* Pre-compile phase */
4458 ))
4459 goto FAILED;
4460
4461 /* At the end of compiling, code is still pointing to the start of the
4462 group, while tempcode has been updated to point past the end of the group
4463 and any option resetting that may follow it. The pattern pointer (ptr)
4464 is on the bracket. */
4465
4466 /* If this is a conditional bracket, check that there are no more than
4467 two branches in the group, or just one if it's a DEFINE group. We do this
4468 in the real compile phase, not in the pre-pass, where the whole group may
4469 not be available. */
4470
4471 if (bravalue == OP_COND && lengthptr == NULL)
4472 {
4473 uschar *tc = code;
4474 int condcount = 0;
4475
4476 do {
4477 condcount++;
4478 tc += GET(tc,1);
4479 }
4480 while (*tc != OP_KET);
4481
4482 /* A DEFINE group is never obeyed inline (the "condition" is always
4483 false). It must have only one branch. */
4484
4485 if (code[LINK_SIZE+1] == OP_DEF)
4486 {
4487 if (condcount > 1)
4488 {
4489 *errorcodeptr = ERR54;
4490 goto FAILED;
4491 }
4492 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4493 }
4494
4495 /* A "normal" conditional group. If there is just one branch, we must not
4496 make use of its firstbyte or reqbyte, because this is equivalent to an
4497 empty second branch. */
4498
4499 else
4500 {
4501 if (condcount > 2)
4502 {
4503 *errorcodeptr = ERR27;
4504 goto FAILED;
4505 }
4506 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4507 }
4508 }
4509
4510 /* Error if hit end of pattern */
4511
4512 if (*ptr != ')')
4513 {
4514 *errorcodeptr = ERR14;
4515 goto FAILED;
4516 }
4517
4518 /* In the pre-compile phase, update the length by the length of the nested
4519 group, less the brackets at either end. Then reduce the compiled code to
4520 just the brackets so that it doesn't use much memory if it is duplicated by
4521 a quantifier. */
4522
4523 if (lengthptr != NULL)
4524 {
4525 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4526 code++;
4527 PUTINC(code, 0, 1 + LINK_SIZE);
4528 *code++ = OP_KET;
4529 PUTINC(code, 0, 1 + LINK_SIZE);
4530 }
4531
4532 /* Otherwise update the main code pointer to the end of the group. */
4533
4534 else code = tempcode;
4535
4536 /* For a DEFINE group, required and first character settings are not
4537 relevant. */
4538
4539 if (bravalue == OP_DEF) break;
4540
4541 /* Handle updating of the required and first characters for other types of
4542 group. Update for normal brackets of all kinds, and conditions with two
4543 branches (see code above). If the bracket is followed by a quantifier with
4544 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4545 zerofirstbyte outside the main loop so that they can be accessed for the
4546 back off. */
4547
4548 zeroreqbyte = reqbyte;
4549 zerofirstbyte = firstbyte;
4550 groupsetfirstbyte = FALSE;
4551
4552 if (bravalue >= OP_ONCE)
4553 {
4554 /* If we have not yet set a firstbyte in this branch, take it from the
4555 subpattern, remembering that it was set here so that a repeat of more
4556 than one can replicate it as reqbyte if necessary. If the subpattern has
4557 no firstbyte, set "none" for the whole branch. In both cases, a zero
4558 repeat forces firstbyte to "none". */
4559
4560 if (firstbyte == REQ_UNSET)
4561 {
4562 if (subfirstbyte >= 0)
4563 {
4564 firstbyte = subfirstbyte;
4565 groupsetfirstbyte = TRUE;
4566 }
4567 else firstbyte = REQ_NONE;
4568 zerofirstbyte = REQ_NONE;
4569 }
4570
4571 /* If firstbyte was previously set, convert the subpattern's firstbyte
4572 into reqbyte if there wasn't one, using the vary flag that was in
4573 existence beforehand. */
4574
4575 else if (subfirstbyte >= 0 && subreqbyte < 0)
4576 subreqbyte = subfirstbyte | tempreqvary;
4577
4578 /* If the subpattern set a required byte (or set a first byte that isn't
4579 really the first byte - see above), set it. */
4580
4581 if (subreqbyte >= 0) reqbyte = subreqbyte;
4582 }
4583
4584 /* For a forward assertion, we take the reqbyte, if set. This can be
4585 helpful if the pattern that follows the assertion doesn't set a different
4586 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4587 for an assertion, however because it leads to incorrect effect for patterns
4588 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4589 of a firstbyte. This is overcome by a scan at the end if there's no
4590 firstbyte, looking for an asserted first char. */
4591
4592 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4593 break; /* End of processing '(' */
4594
4595
4596 /* ===================================================================*/
4597 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4598 are arranged to be the negation of the corresponding OP_values. For the
4599 back references, the values are ESC_REF plus the reference number. Only
4600 back references and those types that consume a character may be repeated.
4601 We can test for values between ESC_b and ESC_Z for the latter; this may
4602 have to change if any new ones are ever created. */
4603
4604 case '\\':
4605 tempptr = ptr;
4606 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4607 if (*errorcodeptr != 0) goto FAILED;
4608
4609 if (c < 0)
4610 {
4611 if (-c == ESC_Q) /* Handle start of quoted string */
4612 {
4613 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4614 else inescq = TRUE;
4615 continue;
4616 }
4617
4618 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4619
4620 /* For metasequences that actually match a character, we disable the
4621 setting of a first character if it hasn't already been set. */
4622
4623 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4624 firstbyte = REQ_NONE;
4625
4626 /* Set values to reset to if this is followed by a zero repeat. */
4627
4628 zerofirstbyte = firstbyte;
4629 zeroreqbyte = reqbyte;
4630
4631 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4632 We also support \k{name} (.NET syntax) */
4633
4634 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4635 {
4636 is_recurse = FALSE;
4637 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4638 goto NAMED_REF_OR_RECURSE;
4639 }
4640
4641 /* Back references are handled specially; must disable firstbyte if
4642 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4643 ':' later. */
4644
4645 if (-c >= ESC_REF)
4646 {
4647 recno = -c - ESC_REF;
4648
4649 HANDLE_REFERENCE: /* Come here from named backref handling */
4650 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4651 previous = code;
4652 *code++ = OP_REF;
4653 PUT2INC(code, 0, recno);
4654 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4655 if (recno > cd->top_backref) cd->top_backref = recno;
4656 }
4657
4658 /* So are Unicode property matches, if supported. */
4659
4660 #ifdef SUPPORT_UCP
4661 else if (-c == ESC_P || -c == ESC_p)
4662 {
4663 BOOL negated;
4664 int pdata;
4665 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4666 if (ptype < 0) goto FAILED;
4667 previous = code;
4668 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4669 *code++ = ptype;
4670 *code++ = pdata;
4671 }
4672 #else
4673
4674 /* If Unicode properties are not supported, \X, \P, and \p are not
4675 allowed. */
4676
4677 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4678 {
4679 *errorcodeptr = ERR45;
4680 goto FAILED;
4681 }
4682 #endif
4683
4684 /* For the rest (including \X when Unicode properties are supported), we
4685 can obtain the OP value by negating the escape value. */
4686
4687 else
4688 {
4689 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4690 *code++ = -c;
4691 }
4692 continue;
4693 }
4694
4695 /* We have a data character whose value is in c. In UTF-8 mode it may have
4696 a value > 127. We set its representation in the length/buffer, and then
4697 handle it as a data character. */
4698
4699 #ifdef SUPPORT_UTF8
4700 if (utf8 && c > 127)
4701 mclength = _pcre_ord2utf8(c, mcbuffer);
4702 else
4703 #endif
4704
4705 {
4706 mcbuffer[0] = c;
4707 mclength = 1;
4708 }
4709 goto ONE_CHAR;
4710
4711
4712 /* ===================================================================*/
4713 /* Handle a literal character. It is guaranteed not to be whitespace or #
4714 when the extended flag is set. If we are in UTF-8 mode, it may be a
4715 multi-byte literal character. */
4716
4717 default:
4718 NORMAL_CHAR:
4719 mclength = 1;
4720 mcbuffer[0] = c;
4721
4722 #ifdef SUPPORT_UTF8
4723 if (utf8 && c >= 0xc0)
4724 {
4725 while ((ptr[1] & 0xc0) == 0x80)
4726 mcbuffer[mclength++] = *(++ptr);
4727 }
4728 #endif
4729
4730 /* At this point we have the character's bytes in mcbuffer, and the length
4731 in mclength. When not in UTF-8 mode, the length is always 1. */
4732
4733 ONE_CHAR:
4734 previous = code;
4735 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4736 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4737
4738 /* Set the first and required bytes appropriately. If no previous first
4739 byte, set it from this character, but revert to none on a zero repeat.
4740 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4741 repeat. */
4742
4743 if (firstbyte == REQ_UNSET)
4744 {
4745 zerofirstbyte = REQ_NONE;
4746 zeroreqbyte = reqbyte;
4747
4748 /* If the character is more than one byte long, we can set firstbyte
4749 only if it is not to be matched caselessly. */
4750
4751 if (mclength == 1 || req_caseopt == 0)
4752 {
4753 firstbyte = mcbuffer[0] | req_caseopt;
4754 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4755 }
4756 else firstbyte = reqbyte = REQ_NONE;
4757 }
4758
4759 /* firstbyte was previously set; we can set reqbyte only if the length is
4760 1 or the matching is caseful. */
4761
4762 else
4763 {
4764 zerofirstbyte = firstbyte;
4765 zeroreqbyte = reqbyte;
4766 if (mclength == 1 || req_caseopt == 0)
4767 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4768 }
4769
4770 break; /* End of literal character handling */
4771 }
4772 } /* end of big loop */
4773
4774
4775 /* Control never reaches here by falling through, only by a goto for all the
4776 error states. Pass back the position in the pattern so that it can be displayed
4777 to the user for diagnosing the error. */
4778
4779 FAILED:
4780 *ptrptr = ptr;
4781 return FALSE;
4782 }
4783
4784
4785
4786
4787 /*************************************************
4788 * Compile sequence of alternatives *
4789 *************************************************/
4790
4791 /* On entry, ptr is pointing past the bracket character, but on return it
4792 points to the closing bracket, or vertical bar, or end of string. The code
4793 variable is pointing at the byte into which the BRA operator has been stored.
4794 If the ims options are changed at the start (for a (?ims: group) or during any
4795 branch, we need to insert an OP_OPT item at the start of every following branch
4796 to ensure they get set correctly at run time, and also pass the new options
4797 into every subsequent branch compile.
4798
4799 This function is used during the pre-compile phase when we are trying to find
4800 out the amount of memory needed, as well as during the real compile phase. The
4801 value of lengthptr distinguishes the two phases.
4802
4803 Arguments:
4804 options option bits, including any changes for this subpattern
4805 oldims previous settings of ims option bits
4806 codeptr -> the address of the current code pointer
4807 ptrptr -> the address of the current pattern pointer
4808 errorcodeptr -> pointer to error code variable
4809 lookbehind TRUE if this is a lookbehind assertion
4810 reset_bracount TRUE to reset the count for each branch
4811 skipbytes skip this many bytes at start (for brackets and OP_COND)
4812 firstbyteptr place to put the first required character, or a negative number
4813 reqbyteptr place to put the last required character, or a negative number
4814 bcptr pointer to the chain of currently open branches
4815 cd points to the data block with tables pointers etc.
4816 lengthptr NULL during the real compile phase
4817 points to length accumulator during pre-compile phase
4818
4819 Returns: TRUE on success
4820 */
4821
4822 static BOOL
4823 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4824 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4825 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4826 int *lengthptr)
4827 {
4828 const uschar *ptr = *ptrptr;
4829 uschar *code = *codeptr;
4830 uschar *last_branch = code;
4831 uschar *start_bracket = code;
4832 uschar *reverse_count = NULL;
4833 int firstbyte, reqbyte;
4834 int branchfirstbyte, branchreqbyte;
4835 int length;
4836 int orig_bracount;
4837 int max_bracount;
4838 branch_chain bc;
4839
4840 bc.outer = bcptr;
4841 bc.current = code;
4842
4843 firstbyte = reqbyte = REQ_UNSET;
4844
4845 /* Accumulate the length for use in the pre-compile phase. Start with the
4846 length of the BRA and KET and any extra bytes that are required at the
4847 beginning. We accumulate in a local variable to save frequent testing of
4848 lenthptr for NULL. We cannot do this by looking at the value of code at the
4849 start and end of each alternative, because compiled items are discarded during
4850 the pre-compile phase so that the work space is not exceeded. */
4851
4852 length = 2 + 2*LINK_SIZE + skipbytes;
4853
4854 /* WARNING: If the above line is changed for any reason, you must also change
4855 the code that abstracts option settings at the start of the pattern and makes
4856 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4857 pre-compile phase to find out whether anything has yet been compiled or not. */
4858
4859 /* Offset is set zero to mark that this bracket is still open */
4860
4861 PUT(code, 1, 0);
4862 code += 1 + LINK_SIZE + skipbytes;
4863
4864 /* Loop for each alternative branch */
4865
4866 orig_bracount = max_bracount = cd->bracount;
4867 for (;;)
4868 {
4869 /* For a (?| group, reset the capturing bracket count so that each branch
4870 uses the same numbers. */
4871
4872 if (reset_bracount) cd->bracount = orig_bracount;
4873
4874 /* Handle a change of ims options at the start of the branch */
4875
4876 if ((options & PCRE_IMS) != oldims)
4877 {
4878 *code++ = OP_OPT;
4879 *code++ = options & PCRE_IMS;
4880 length += 2;
4881 }
4882
4883 /* Set up dummy OP_REVERSE if lookbehind assertion */
4884
4885 if (lookbehind)
4886 {
4887 *code++ = OP_REVERSE;
4888 reverse_count = code;
4889 PUTINC(code, 0, 0);
4890 length += 1 + LINK_SIZE;
4891 }
4892
4893 /* Now compile the branch; in the pre-compile phase its length gets added
4894 into the length. */
4895
4896 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4897 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4898 {
4899 *ptrptr = ptr;
4900 return FALSE;
4901 }
4902
4903 /* Keep the highest bracket count in case (?| was used and some branch
4904 has fewer than the rest. */
4905
4906 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
4907
4908 /* In the real compile phase, there is some post-processing to be done. */
4909
4910 if (lengthptr == NULL)
4911 {
4912 /* If this is the first branch, the firstbyte and reqbyte values for the
4913 branch become the values for the regex. */
4914
4915 if (*last_branch != OP_ALT)
4916 {
4917 firstbyte = branchfirstbyte;
4918 reqbyte = branchreqbyte;
4919 }
4920
4921 /* If this is not the first branch, the first char and reqbyte have to
4922 match the values from all the previous branches, except that if the
4923 previous value for reqbyte didn't have REQ_VARY set, it can still match,
4924 and we set REQ_VARY for the regex. */
4925
4926 else
4927 {
4928 /* If we previously had a firstbyte, but it doesn't match the new branch,
4929 we have to abandon the firstbyte for the regex, but if there was
4930 previously no reqbyte, it takes on the value of the old firstbyte. */
4931
4932 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4933 {
4934 if (reqbyte < 0) reqbyte = firstbyte;
4935 firstbyte = REQ_NONE;
4936 }
4937
4938 /* If we (now or from before) have no firstbyte, a firstbyte from the
4939 branch becomes a reqbyte if there isn't a branch reqbyte. */
4940
4941 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4942 branchreqbyte = branchfirstbyte;
4943
4944 /* Now ensure that the reqbytes match */
4945
4946 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4947 reqbyte = REQ_NONE;
4948 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4949 }
4950
4951 /* If lookbehind, check that this branch matches a fixed-length string, and
4952 put the length into the OP_REVERSE item. Temporarily mark the end of the
4953 branch with OP_END. */
4954
4955 if (lookbehind)
4956 {
4957 int fixed_length;
4958 *code = OP_END;
4959 fixed_length = find_fixedlength(last_branch, options);
4960 DPRINTF(("fixed length = %d\n", fixed_length));
4961 if (fixed_length < 0)
4962 {
4963 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4964 *ptrptr = ptr;
4965 return FALSE;
4966 }
4967 PUT(reverse_count, 0, fixed_length);
4968 }
4969 }
4970
4971 /* Reached end of expression, either ')' or end of pattern. In the real
4972 compile phase, go back through the alternative branches and reverse the chain
4973 of offsets, with the field in the BRA item now becoming an offset to the
4974 first alternative. If there are no alternatives, it points to the end of the
4975 group. The length in the terminating ket is always the length of the whole
4976 bracketed item. If any of the ims options were changed inside the group,
4977 compile a resetting op-code following, except at the very end of the pattern.
4978 Return leaving the pointer at the terminating char. */
4979
4980 if (*ptr != '|')
4981 {
4982 if (lengthptr == NULL)
4983 {
4984 int branch_length = code - last_branch;
4985 do
4986 {
4987 int prev_length = GET(last_branch, 1);
4988 PUT(last_branch, 1, branch_length);
4989 branch_length = prev_length;
4990 last_branch -= branch_length;
4991 }
4992 while (branch_length > 0);
4993 }
4994
4995 /* Fill in the ket */
4996
4997 *code = OP_KET;
4998 PUT(code, 1, code - start_bracket);
4999 code += 1 + LINK_SIZE;
5000
5001 /* Resetting option if needed */
5002
5003 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5004 {
5005 *code++ = OP_OPT;
5006 *code++ = oldims;
5007 length += 2;
5008 }
5009
5010 /* Retain the highest bracket number, in case resetting was used. */
5011
5012 cd->bracount = max_bracount;
5013
5014 /* Set values to pass back */
5015
5016 *codeptr = code;
5017 *ptrptr = ptr;
5018 *firstbyteptr = firstbyte;
5019 *reqbyteptr = reqbyte;
5020 if (lengthptr != NULL) *lengthptr += length;
5021 return TRUE;
5022 }
5023
5024 /* Another branch follows. In the pre-compile phase, we can move the code
5025 pointer back to where it was for the start of the first branch. (That is,
5026 pretend that each branch is the only one.)
5027
5028 In the real compile phase, insert an ALT node. Its length field points back
5029 to the previous branch while the bracket remains open. At the end the chain
5030 is reversed. It's done like this so that the start of the bracket has a
5031 zero offset until it is closed, making it possible to detect recursion. */
5032
5033 if (lengthptr != NULL)
5034 {
5035 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5036 length += 1 + LINK_SIZE;
5037 }
5038 else
5039 {
5040 *code = OP_ALT;
5041 PUT(code, 1, code - last_branch);
5042 bc.current = last_branch = code;
5043 code += 1 + LINK_SIZE;
5044 }
5045
5046 ptr++;
5047 }
5048 /* Control never reaches here */
5049 }
5050
5051
5052
5053
5054 /*************************************************
5055 * Check for anchored expression *
5056 *************************************************/
5057
5058 /* Try to find out if this is an anchored regular expression. Consider each
5059 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5060 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5061 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5062 counts, since OP_CIRC can match in the middle.
5063
5064 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5065 This is the code for \G, which means "match at start of match position, taking
5066 into account the match offset".
5067
5068 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5069 because that will try the rest of the pattern at all possible matching points,
5070 so there is no point trying again.... er ....
5071
5072 .... except when the .* appears inside capturing parentheses, and there is a
5073 subsequent back reference to those parentheses. We haven't enough information
5074 to catch that case precisely.
5075
5076 At first, the best we could do was to detect when .* was in capturing brackets
5077 and the highest back reference was greater than or equal to that level.
5078 However, by keeping a bitmap of the first 31 back references, we can catch some
5079 of the more common cases more precisely.
5080
5081 Arguments:
5082 code points to start of expression (the bracket)
5083 options points to the options setting
5084 bracket_map a bitmap of which brackets we are inside while testing; this
5085 handles up to substring 31; after that we just have to take
5086 the less precise approach
5087 backref_map the back reference bitmap
5088
5089 Returns: TRUE or FALSE
5090 */
5091
5092 static BOOL
5093 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5094 unsigned int backref_map)
5095 {
5096 do {
5097 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5098 options, PCRE_MULTILINE, FALSE);
5099 register int op = *scode;
5100
5101 /* Non-capturing brackets */
5102
5103 if (op == OP_BRA)
5104 {
5105 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5106 }
5107
5108 /* Capturing brackets */
5109
5110 else if (op == OP_CBRA)
5111 {
5112 int n = GET2(scode, 1+LINK_SIZE);
5113 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5114 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5115 }
5116
5117 /* Other brackets */
5118
5119 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5120 {
5121 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5122 }
5123
5124 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5125 are or may be referenced. */
5126
5127 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5128 op == OP_TYPEPOSSTAR) &&
5129 (*options & PCRE_DOTALL) != 0)
5130 {
5131 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5132 }
5133
5134 /* Check for explicit anchoring */
5135
5136 else if (op != OP_SOD && op != OP_SOM &&
5137 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5138 return FALSE;
5139 code += GET(code, 1);
5140 }
5141 while (*code == OP_ALT); /* Loop for each alternative */
5142 return TRUE;
5143 }
5144
5145
5146
5147 /*************************************************
5148 * Check for starting with ^ or .* *
5149 *************************************************/
5150
5151 /* This is called to find out if every branch starts with ^ or .* so that
5152 "first char" processing can be done to speed things up in multiline
5153 matching and for non-DOTALL patterns that start with .* (which must start at
5154 the beginning or after \n). As in the case of is_anchored() (see above), we
5155 have to take account of back references to capturing brackets that contain .*
5156 because in that case we can't make the assumption.
5157
5158 Arguments:
5159 code points to start of expression (the bracket)
5160 bracket_map a bitmap of which brackets we are inside while testing; this
5161 handles up to substring 31; after that we just have to take
5162 the less precise approach
5163 backref_map the back reference bitmap
5164
5165 Returns: TRUE or FALSE
5166 */
5167
5168 static BOOL
5169 is_startline(const uschar *code, unsigned int bracket_map,
5170 unsigned int backref_map)
5171 {
5172 do {
5173 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5174 NULL, 0, FALSE);
5175 register int op = *scode;
5176
5177 /* Non-capturing brackets */
5178
5179 if (op == OP_BRA)
5180 {
5181 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5182 }
5183
5184 /* Capturing brackets */
5185
5186 else if (op == OP_CBRA)
5187 {
5188 int n = GET2(scode, 1+LINK_SIZE);
5189 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5190 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5191 }
5192
5193 /* Other brackets */
5194
5195 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5196 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5197
5198 /* .* means "start at start or after \n" if it isn't in brackets that
5199 may be referenced. */
5200
5201 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5202 {
5203 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5204 }
5205
5206 /* Check for explicit circumflex */
5207
5208 else if (op != OP_CIRC) return FALSE;
5209
5210 /* Move on to the next alternative */
5211
5212 code += GET(code, 1);
5213 }
5214 while (*code == OP_ALT); /* Loop for each alternative */
5215 return TRUE;
5216 }
5217
5218
5219
5220 /*************************************************
5221 * Check for asserted fixed first char *
5222 *************************************************/
5223
5224 /* During compilation, the "first char" settings from forward assertions are
5225 discarded, because they can cause conflicts with actual literals that follow.
5226 However, if we end up without a first char setting for an unanchored pattern,
5227 it is worth scanning the regex to see if there is an initial asserted first
5228 char. If all branches start with the same asserted char, or with a bracket all
5229 of whose alternatives start with the same asserted char (recurse ad lib), then
5230 we return that char, otherwise -1.
5231
5232 Arguments:
5233 code points to start of expression (the bracket)
5234 options pointer to the options (used to check casing changes)
5235 inassert TRUE if in an assertion
5236
5237 Returns: -1 or the fixed first char
5238 */
5239
5240 static int
5241 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5242 {
5243 register int c = -1;
5244 do {
5245 int d;
5246 const uschar *scode =
5247 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5248 register int op = *scode;
5249
5250 switch(op)
5251 {
5252 default:
5253 return -1;
5254
5255 case OP_BRA:
5256 case OP_CBRA:
5257 case OP_ASSERT:
5258 case OP_ONCE:
5259 case OP_COND:
5260 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5261 return -1;
5262 if (c < 0) c = d; else if (c != d) return -1;
5263 break;
5264
5265 case OP_EXACT: /* Fall through */
5266 scode += 2;
5267
5268 case OP_CHAR:
5269 case OP_CHARNC:
5270 case OP_PLUS:
5271 case OP_MINPLUS:
5272 case OP_POSPLUS:
5273 if (!inassert) return -1;
5274 if (c < 0)
5275 {
5276 c = scode[1];
5277 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5278 }
5279 else if (c != scode[1]) return -1;
5280 break;
5281 }
5282
5283 code += GET(code, 1);
5284 }
5285 while (*code == OP_ALT);
5286 return c;
5287 }
5288
5289
5290
5291 /*************************************************
5292 * Compile a Regular Expression *
5293 *************************************************/
5294
5295 /* This function takes a string and returns a pointer to a block of store
5296 holding a compiled version of the expression. The original API for this
5297 function had no error code return variable; it is retained for backwards
5298 compatibility. The new function is given a new name.
5299
5300 Arguments:
5301 pattern the regular expression
5302 options various option bits
5303 errorcodeptr pointer to error code variable (pcre_compile2() only)
5304 can be NULL if you don't want a code value
5305 errorptr pointer to pointer to error text
5306 erroroffset ptr offset in pattern where error was detected
5307 tables pointer to character tables or NULL
5308
5309 Returns: pointer to compiled data block, or NULL on error,
5310 with errorptr and erroroffset set
5311 */
5312
5313 PCRE_EXP_DEFN pcre *
5314 pcre_compile(const char *pattern, int options, const char **errorptr,
5315 int *erroroffset, const unsigned char *tables)
5316 {
5317 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5318 }
5319
5320
/* Compile a pattern into a newly allocated real_pcre block. The work is done
in two calls to compile_regex(): the first runs over the pattern using the
on-stack workspace and only accumulates the required length; the second, after
the block has been obtained from pcre_malloc, generates the real code into it.
After that, forward references are filled in and the start-of-match
optimization data (anchoring, first byte, start-of-line, required byte) is
worked out. On any failure NULL is returned, with *errorptr, *erroroffset and
(when errorcodeptr is not NULL) *errorcodeptr set; the one exception is a NULL
errorptr, where only the numeric code can be reported. */
5321 PCRE_EXP_DEFN pcre *
5322 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5323 const char **errorptr, int *erroroffset, const unsigned char *tables)
5324 {
5325 real_pcre *re;
5326 int length = 1; /* For final END opcode */
5327 int firstbyte, reqbyte, newline;
5328 int errorcode = 0;
5329 #ifdef SUPPORT_UTF8
5330 BOOL utf8;
5331 #endif
5332 size_t size;
5333 uschar *code;
5334 const uschar *codestart;
5335 const uschar *ptr;
5336 compile_data compile_block;
5337 compile_data *cd = &compile_block;
5338
5339 /* This space is used for "compiling" into during the first phase, when we are
5340 computing the amount of memory that is needed. Compiled items are thrown away
5341 as soon as possible, so that a fairly large buffer should be sufficient for
5342 this purpose. The same space is used in the second phase for remembering where
5343 to fill in forward references to subpatterns. */
5344
5345 uschar cworkspace[COMPILE_WORK_SIZE];
5346
5347
5348 /* Set this early so that early errors get offset 0. */
5349
5350 ptr = (const uschar *)pattern;
5351
5352 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5353 can do is just return NULL, but we can set a code value if there is a code
5354 pointer. */
5355
5356 if (errorptr == NULL)
5357 {
/* Note: 99 is written as a bare literal here rather than as one of the ERRnn
names used for every other error in this function. */
5358 if (errorcodeptr != NULL) *errorcodeptr = 99;
5359 return NULL;
5360 }
5361
5362 *errorptr = NULL;
5363 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5364
5365 /* However, we can give a message for this error */
5366
5367 if (erroroffset == NULL)
5368 {
5369 errorcode = ERR16;
/* Jump to the second label: the first one stores through erroroffset, which
is NULL on this path. */
5370 goto PCRE_EARLY_ERROR_RETURN2;
5371 }
5372
5373 *erroroffset = 0;
5374
5375 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5376
/* When UTF-8 support is present, the result of _pcre_valid_utf8() is stored
directly in *erroroffset, and a value >= 0 is treated as a failure. That path
also uses the second label, so the stored value is what the caller sees as the
error offset. */
5377 #ifdef SUPPORT_UTF8
5378 utf8 = (options & PCRE_UTF8) != 0;
5379 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5380 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5381 {
5382 errorcode = ERR44;
5383 goto PCRE_EARLY_ERROR_RETURN2;
5384 }
5385 #else
5386 if ((options & PCRE_UTF8) != 0)
5387 {
5388 errorcode = ERR32;
5389 goto PCRE_EARLY_ERROR_RETURN;
5390 }
5391 #endif
5392
/* Reject any option bit that is not in PUBLIC_OPTIONS. */
5393 if ((options & ~PUBLIC_OPTIONS) != 0)
5394 {
5395 errorcode = ERR17;
5396 goto PCRE_EARLY_ERROR_RETURN;
5397 }
5398
5399 /* Set up pointers to the individual character tables */
5400
5401 if (tables == NULL) tables = _pcre_default_tables;
5402 cd->lcc = tables + lcc_offset;
5403 cd->fcc = tables + fcc_offset;
5404 cd->cbits = tables + cbits_offset;
5405 cd->ctypes = tables + ctypes_offset;
5406
5407 /* Handle different types of newline. The three bits give seven cases. The
5408 current code allows for fixed one- or two-byte sequences, plus "any" and
5409 "anycrlf". */
5410
/* In the switch below, a fixed two-byte newline is packed into one int with
the first byte in bits 8-15; the negative values -1 and -2 stand for "any" and
"anycrlf" and are decoded by the if-chain that follows. */
5411 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5412 {
5413 case 0: newline = NEWLINE; break; /* Compile-time default */
5414 case PCRE_NEWLINE_CR: newline = '\r'; break;
5415 case PCRE_NEWLINE_LF: newline = '\n'; break;
5416 case PCRE_NEWLINE_CR+
5417 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5418 case PCRE_NEWLINE_ANY: newline = -1; break;
5419 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5420 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5421 }
5422
5423 if (newline == -2)
5424 {
5425 cd->nltype = NLTYPE_ANYCRLF;
5426 }
5427 else if (newline < 0)
5428 {
5429 cd->nltype = NLTYPE_ANY;
5430 }
5431 else
5432 {
5433 cd->nltype = NLTYPE_FIXED;
5434 if (newline > 255)
5435 {
5436 cd->nllen = 2;
5437 cd->nl[0] = (newline >> 8) & 255;
5438 cd->nl[1] = newline & 255;
5439 }
5440 else
5441 {
5442 cd->nllen = 1;
5443 cd->nl[0] = newline;
5444 }
5445 }
5446
5447 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5448 references to help in deciding whether (.*) can be treated as anchored or not.
5449 */
5450
5451 cd->top_backref = 0;
5452 cd->backref_map = 0;
5453
5454 /* Reflect pattern for debugging output */
5455
5456 DPRINTF(("------------------------------------------------------------------\n"));
5457 DPRINTF(("%s\n", pattern));
5458
5459 /* Pretend to compile the pattern while actually just accumulating the length
5460 of memory required. This behaviour is triggered by passing a non-NULL final
5461 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5462 to compile parts of the pattern into; the compiled code is discarded when it is
5463 no longer needed, so hopefully this workspace will never overflow, though there
5464 is a test for its doing so. */
5465
5466 cd->bracount = 0;
5467 cd->names_found = 0;
5468 cd->name_entry_size = 0;
5469 cd->name_table = NULL;
5470 cd->start_workspace = cworkspace;
5471 cd->start_code = cworkspace;
5472 cd->hwm = cworkspace;
5473 cd->start_pattern = (const uschar *)pattern;
5474 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5475 cd->req_varyopt = 0;
5476 cd->nopartial = FALSE;
5477 cd->external_options = options;
5478
5479 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5480 don't need to look at the result of the function here. The initial options have
5481 been put into the cd block so that they can be changed if an option setting is
5482 found within the regex right at the beginning. Bringing initial option settings
5483 outside can help speed up starting point checks. */
5484
5485 code = cworkspace;
5486 *code = OP_BRA;
5487 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5488 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5489 &length);
5490 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5491
5492 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5493 cd->hwm - cworkspace));
5494
5495 if (length > MAX_PATTERN_SIZE)
5496 {
5497 errorcode = ERR20;
5498 goto PCRE_EARLY_ERROR_RETURN;
5499 }
5500
5501 /* Compute the size of data block needed and get it, either from malloc or
5502 externally provided function. Integer overflow should no longer be possible
5503 because nowadays we limit the maximum value of cd->names_found and
5504 cd->name_entry_size. */
5505
5506 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5507 re = (real_pcre *)(pcre_malloc)(size);
5508
5509 if (re == NULL)
5510 {
5511 errorcode = ERR21;
5512 goto PCRE_EARLY_ERROR_RETURN;
5513 }
5514
5515 /* Put in the magic number, and save the sizes, initial options, and character
5516 table pointer. NULL is used for the default character tables. The nullpad field
5517 is at the end; it's there to help in the case when a regex compiled on a system
5518 with 4-byte pointers is run on another with 8-byte pointers. */
5519
5520 re->magic_number = MAGIC_NUMBER;
5521 re->size = size;
5522 re->options = cd->external_options;
5523 re->dummy1 = 0;
5524 re->first_byte = 0;
5525 re->req_byte = 0;
5526 re->name_table_offset = sizeof(real_pcre);
5527 re->name_entry_size = cd->name_entry_size;
5528 re->name_count = cd->names_found;
5529 re->ref_count = 0;
5530 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5531 re->nullpad = NULL;
5532
5533 /* The starting points of the name/number translation table and of the code are
5534 passed around in the compile data block. The start/end pattern and initial
5535 options are already set from the pre-compile phase, as is the name_entry_size
5536 field. Reset the bracket count and the names_found field. Also reset the hwm
5537 field; this time it's used for remembering forward references to subpatterns.
5538 */
5539
/* Layout of the block, as set up below: the name table starts directly after
the real_pcre header, and the compiled code starts directly after the name
table. */
5540 cd->bracount = 0;
5541 cd->names_found = 0;
5542 cd->name_table = (uschar *)re + re->name_table_offset;
5543 codestart = cd->name_table + re->name_entry_size * re->name_count;
5544 cd->start_code = codestart;
5545 cd->hwm = cworkspace;
5546 cd->req_varyopt = 0;
5547 cd->nopartial = FALSE;
5548
5549 /* Set up a starting, non-extracting bracket, then compile the expression. On
5550 error, errorcode will be set non-zero, so we don't need to look at the result
5551 of the function here. */
5552
/* This second call passes NULL as the final argument (the first pass passed
&length), so this time real code is generated. */
5553 ptr = (const uschar *)pattern;
5554 code = (uschar *)codestart;
5555 *code = OP_BRA;
5556 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5557 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5558 re->top_bracket = cd->bracount;
5559 re->top_backref = cd->top_backref;
5560
5561 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5562
5563 /* If not reached end of pattern on success, there's an excess bracket. */
5564
5565 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5566
5567 /* Fill in the terminating state and check for disastrous overflow, but
5568 if debugging, leave the test till after things are printed out. */
5569
5570 *code++ = OP_END;
5571
5572 #ifndef DEBUG
5573 if (code - codestart > length) errorcode = ERR23;
5574 #endif
5575
5576 /* Fill in any forward references that are required. */
5577
/* Each entry popped from the workspace is an offset into the compiled code;
the group number found at that offset is replaced by the offset of the group
itself, as located by find_bracket(). */
5578 while (errorcode == 0 && cd->hwm > cworkspace)
5579 {
5580 int offset, recno;
5581 const uschar *groupptr;
5582 cd->hwm -= LINK_SIZE;
5583 offset = GET(cd->hwm, 0);
5584 recno = GET(codestart, offset);
5585 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5586 if (groupptr == NULL) errorcode = ERR53;
5587 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5588 }
5589
5590 /* Give an error if there's back reference to a non-existent capturing
5591 subpattern. */
5592
5593 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5594
5595 /* Failed to compile, or error while post-processing */
5596
/* The two labels sit inside this block, after the pcre_free call, so that the
goto statements used before "re" has been allocated skip the free. The second
label additionally skips the store to *erroroffset. */
5597 if (errorcode != 0)
5598 {
5599 (pcre_free)(re);
5600 PCRE_EARLY_ERROR_RETURN:
5601 *erroroffset = ptr - (const uschar *)pattern;
5602 PCRE_EARLY_ERROR_RETURN2:
5603 *errorptr = error_texts[errorcode];
5604 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5605 return NULL;
5606 }
5607
5608 /* If the anchored option was not passed, set the flag if we can determine that
5609 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5610 as starting with .* when DOTALL is set).
5611
5612 Otherwise, if we know what the first byte has to be, save it, because that
5613 speeds up unanchored matches no end. If not, see if we can set the
5614 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5615 start with ^. and also when all branches start with .* for non-DOTALL matches.
5616 */
5617
5618 if ((re->options & PCRE_ANCHORED) == 0)
5619 {
5620 int temp_options = re->options; /* May get changed during these scans */
5621 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5622 re->options |= PCRE_ANCHORED;
5623 else
5624 {
5625 if (firstbyte < 0)
5626 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5627 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5628 {
5629 int ch = firstbyte & 255;
5630 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5631 cd->fcc[ch] == ch)? ch : firstbyte;
5632 re->options |= PCRE_FIRSTSET;
5633 }
5634 else if (is_startline(codestart, 0, cd->backref_map))
5635 re->options |= PCRE_STARTLINE;
5636 }
5637 }
5638
5639 /* For an anchored pattern, we use the "required byte" only if it follows a
5640 variable length item in the regex. Remove the caseless flag for non-caseable
5641 bytes. */
5642
5643 if (reqbyte >= 0 &&
5644 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5645 {
5646 int ch = reqbyte & 255;
5647 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5648 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5649 re->options |= PCRE_REQCHSET;
5650 }
5651
5652 /* Print out the compiled data if debugging is enabled. This is never the
5653 case when building a production library. */
5654
5655 #ifdef DEBUG
5656
5657 printf("Length = %d top_bracket = %d top_backref = %d\n",
5658 length, re->top_bracket, re->top_backref);
5659
5660 if (re->options != 0)
5661 {
5662 printf("%s%s%s%s%s%s%s%s%s\n",
5663 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5664 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5665 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5666 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5667 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5668 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5669 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5670 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5671 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5672 }
5673
5674 if ((re->options & PCRE_FIRSTSET) != 0)
5675 {
5676 int ch = re->first_byte & 255;
5677 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5678 "" : " (caseless)";
5679 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5680 else printf("First char = \\x%02x%s\n", ch, caseless);
5681 }
5682
5683 if ((re->options & PCRE_REQCHSET) != 0)
5684 {
5685 int ch = re->req_byte & 255;
5686 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5687 "" : " (caseless)";
5688 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5689 else printf("Req char = \\x%02x%s\n", ch, caseless);
5690 }
5691
5692 pcre_printint(re, stdout, TRUE);
5693
5694 /* This check is done here in the debugging case so that the code that
5695 was compiled can be seen. */
5696
5697 if (code - codestart > length)
5698 {
5699 (pcre_free)(re);
5700 *errorptr = error_texts[ERR23];
5701 *erroroffset = ptr - (uschar *)pattern;
5702 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5703 return NULL;
5704 }
5705 #endif /* DEBUG */
5706
/* Success: the block is handed back through the opaque public pointer type. */
5707 return (pcre *)re;
5708 }
5709
5710 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12