/[pcre]/code/tags/pcre-7.3/pcre_compile.c
ViewVC logotype

Contents of /code/tags/pcre-7.3/pcre_compile.c

Parent Directory | Revision Log


Revision 229 - (show annotations) (download)
Tue Aug 28 13:42:43 2007 UTC (6 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 190736 byte(s)
Tag for 7.3.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
64 /* Macro for setting individual bits in class bitmaps. */
65
/* SETBIT(a,b) sets bit number b in the byte array a. Both arguments and the
whole expansion are parenthesized so that expression arguments such as
SETBIT(map, c + 1) or SETBIT(base + n, bit) index and shift as intended. The
bit number b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE) */
144
145 typedef struct verbitem {
146 const char *name;
147 int len;
148 int op;
149 } verbitem;
150
151 static verbitem verbs[] = {
152 { "ACCEPT", 6, OP_ACCEPT },
153 { "COMMIT", 6, OP_COMMIT },
154 { "F", 1, OP_FAIL },
155 { "FAIL", 4, OP_FAIL },
156 { "PRUNE", 5, OP_PRUNE },
157 { "SKIP", 4, OP_SKIP },
158 { "THEN", 4, OP_THEN }
159 };
160
161 static int verbcount = sizeof(verbs)/sizeof(verbitem);
162
163
164 /* Tables of names of POSIX character classes and their lengths. The list is
165 terminated by a zero length entry. The first three must be alpha, lower, upper,
166 as this is assumed for handling case independence. */
167
static const char *const posix_names[] = {
  "alpha", "lower", "upper",            /* these three must come first */
  "alnum", "ascii", "blank", "cntrl",
  "digit", "graph", "print", "punct",
  "space", "word",  "xdigit"
};
172
173 static const uschar posix_name_lengths[] = {
174 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175
176 /* Table of class bit maps for each POSIX class. Each class is formed from a
177 base map, with an optional addition or removal of another map. Then, for some
178 classes, there is some additional tweaking: for [:blank:] the vertical space
179 characters are removed, and for [:alpha:] and [:alnum:] the underscore
180 character is removed. The triples in the table consist of the base map offset,
181 second map offset or -1 if no second map, and a non-negative value for map
182 addition or a negative value for map subtraction (if there are two maps). The
183 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184 remove vertical space characters, 2 => remove underscore. */
185
186 static const int posix_class_maps[] = {
187 cbit_word, cbit_digit, -2, /* alpha */
188 cbit_lower, -1, 0, /* lower */
189 cbit_upper, -1, 0, /* upper */
190 cbit_word, -1, 2, /* alnum - word without underscore */
191 cbit_print, cbit_cntrl, 0, /* ascii */
192 cbit_space, -1, 1, /* blank - a GNU extension */
193 cbit_cntrl, -1, 0, /* cntrl */
194 cbit_digit, -1, 0, /* digit */
195 cbit_graph, -1, 0, /* graph */
196 cbit_print, -1, 0, /* print */
197 cbit_punct, -1, 0, /* punct */
198 cbit_space, -1, 0, /* space */
199 cbit_word, -1, 0, /* word - a Perl extension */
200 cbit_xdigit,-1, 0 /* xdigit */
201 };
202
203
/* Two-level stringification, so that a macro argument is expanded before it is
turned into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. The strings are "const char *"
because they are passed to the outside world. The table of pointers is itself
const as well: neither the messages nor the table is ever meant to change at
run time. Do not ever re-use any error number, because they are documented.
Always add a new error instead. Messages marked DEAD below are no longer used.
The index of each message in the table is its error number. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  "(*VERB) with an argument is not supported",
  /* 60 */
  "(*VERB) not recognized",
  "number is too big"
};
288
289
290 /* Table to identify digits and hex digits. This is used when compiling
291 patterns. Note that the tables in chartables are dependent on the locale, and
292 may mark arbitrary characters as digits - but the PCRE compiling code expects
293 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
294 a private table here. It costs 256 bytes, but it is a lot faster than doing
295 character value tests (at least in some simple cases I timed), and in some
296 applications one wants PCRE to compile efficiently as well as match
297 efficiently.
298
299 For convenience, we use the same bit definitions as in chartables:
300
301 0x04 decimal digit
302 0x08 hexadecimal digit
303
304 Then we can use ctype_digit and ctype_xdigit in the code. */
305
#ifndef EBCDIC

/* ASCII variant. One entry per byte value: 0x0c marks a decimal digit (which
is also a hexadecimal digit), 0x08 marks a letter that is a hexadecimal digit,
and 0x00 marks everything else. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  32- '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00    /* 248-255 */
  };

#else

/* EBCDIC variant of the digit table. The hexadecimal value at the right of
every second row is the code of that row's first entry. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00    /*  8 -255    */
  };

/* Partial duplicate of the character table for EBCDIC. check_escape() tests
entries of this table against the mask 0x0E to decide whether the character
after a backslash is alphameric. */

static const unsigned char ebcdic_chartab[] =
  {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,   /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,   /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,   /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,   /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,   /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,   /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00    /*  8 -255 */
  };

#endif
412
413
414 /* Definition to allow mutual recursion */
415
416 static BOOL
417 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 int *, int *, branch_chain *, compile_data *, int *);
419
420
421
422 /*************************************************
423 * Handle escapes *
424 *************************************************/
425
426 /* This function is called when a \ has been encountered. It either returns a
427 positive value for a simple escape such as \n, or a negative value which
428 encodes one of the more complicated things such as \d. A backreference to group
429 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431 ptr is pointing at the \. On exit, it is on the final character of the escape
432 sequence.
433
434 Arguments:
435 ptrptr points to the pattern position pointer
436 errorcodeptr points to the errorcode variable
437 bracount number of previous extracting brackets
438 options the options bits
439 isclass TRUE if inside a character class
440
441 Returns: zero or positive => a data character
442 negative => a special escape sequence
443 on error, errorcodeptr is set
444 */
445
446 static int
447 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448 int options, BOOL isclass)
449 {
450 BOOL utf8 = (options & PCRE_UTF8) != 0;
451 const uschar *ptr = *ptrptr + 1;
452 int c, i;
453
454 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455 ptr--; /* Set pointer back to the last byte */
456
457 /* If backslash is at the end of the pattern, it's an error. */
458
459 if (c == 0) *errorcodeptr = ERR1;
460
461 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462 a table. A non-zero result is something that can be returned immediately.
463 Otherwise further processing may be required. */
464
465 #ifndef EBCDIC /* ASCII coding */
466 else if (c < '0' || c > 'z') {} /* Not alphameric */
467 else if ((i = escapes[c - '0']) != 0) c = i;
468
469 #else /* EBCDIC coding */
470 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471 else if ((i = escapes[c - 0x48]) != 0) c = i;
472 #endif
473
474 /* Escapes that need further processing, or are illegal. */
475
476 else
477 {
478 const uschar *oldptr;
479 BOOL braced, negated;
480
481 switch (c)
482 {
483 /* A number of Perl escapes are not handled by PCRE. We give an explicit
484 error. */
485
486 case 'l':
487 case 'L':
488 case 'N':
489 case 'u':
490 case 'U':
491 *errorcodeptr = ERR37;
492 break;
493
494 /* \g must be followed by a number, either plain or braced. If positive, it
495 is an absolute backreference. If negative, it is a relative backreference.
496 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497 reference to a named group. This is part of Perl's movement towards a
498 unified syntax for back references. As this is synonymous with \k{name}, we
499 fudge it up by pretending it really was \k. */
500
501 case 'g':
502 if (ptr[1] == '{')
503 {
504 const uschar *p;
505 for (p = ptr+2; *p != 0 && *p != '}'; p++)
506 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 if (*p != 0 && *p != '}')
508 {
509 c = -ESC_k;
510 break;
511 }
512 braced = TRUE;
513 ptr++;
514 }
515 else braced = FALSE;
516
517 if (ptr[1] == '-')
518 {
519 negated = TRUE;
520 ptr++;
521 }
522 else negated = FALSE;
523
524 c = 0;
525 while ((digitab[ptr[1]] & ctype_digit) != 0)
526 c = c * 10 + *(++ptr) - '0';
527
528 if (c < 0)
529 {
530 *errorcodeptr = ERR61;
531 break;
532 }
533
534 if (c == 0 || (braced && *(++ptr) != '}'))
535 {
536 *errorcodeptr = ERR57;
537 break;
538 }
539
540 if (negated)
541 {
542 if (c > bracount)
543 {
544 *errorcodeptr = ERR15;
545 break;
546 }
547 c = bracount - (c - 1);
548 }
549
550 c = -(ESC_REF + c);
551 break;
552
553 /* The handling of escape sequences consisting of a string of digits
554 starting with one that is not zero is not straightforward. By experiment,
555 the way Perl works seems to be as follows:
556
557 Outside a character class, the digits are read as a decimal number. If the
558 number is less than 10, or if there are that many previous extracting
559 left brackets, then it is a back reference. Otherwise, up to three octal
560 digits are read to form an escaped byte. Thus \123 is likely to be octal
561 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562 value is greater than 377, the least significant 8 bits are taken. Inside a
563 character class, \ followed by a digit is always an octal number. */
564
565 case '1': case '2': case '3': case '4': case '5':
566 case '6': case '7': case '8': case '9':
567
568 if (!isclass)
569 {
570 oldptr = ptr;
571 c -= '0';
572 while ((digitab[ptr[1]] & ctype_digit) != 0)
573 c = c * 10 + *(++ptr) - '0';
574 if (c < 0)
575 {
576 *errorcodeptr = ERR61;
577 break;
578 }
579 if (c < 10 || c <= bracount)
580 {
581 c = -(ESC_REF + c);
582 break;
583 }
584 ptr = oldptr; /* Put the pointer back and fall through */
585 }
586
587 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588 generates a binary zero byte and treats the digit as a following literal.
589 Thus we have to pull back the pointer by one. */
590
591 if ((c = *ptr) >= '8')
592 {
593 ptr--;
594 c = 0;
595 break;
596 }
597
598 /* \0 always starts an octal number, but we may drop through to here with a
599 larger first octal digit. The original code used just to take the least
600 significant 8 bits of octal numbers (I think this is what early Perls used
601 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602 than 3 octal digits. */
603
604 case '0':
605 c -= '0';
606 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607 c = c * 8 + *(++ptr) - '0';
608 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 break;
610
611 /* \x is complicated. \x{ddd} is a character number which can be greater
612 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613 treated as a data character. */
614
615 case 'x':
616 if (ptr[1] == '{')
617 {
618 const uschar *pt = ptr + 2;
619 int count = 0;
620
621 c = 0;
622 while ((digitab[*pt] & ctype_xdigit) != 0)
623 {
624 register int cc = *pt++;
625 if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 count++;
627
628 #ifndef EBCDIC /* ASCII coding */
629 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 #else /* EBCDIC coding */
632 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 #endif
635 }
636
637 if (*pt == '}')
638 {
639 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 ptr = pt;
641 break;
642 }
643
644 /* If the sequence of hex digits does not end with '}', then we don't
645 recognize this construct; fall through to the normal \x handling. */
646 }
647
648 /* Read just a single-byte hex-defined char */
649
650 c = 0;
651 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652 {
653 int cc; /* Some compilers don't like ++ */
654 cc = *(++ptr); /* in initializers */
655 #ifndef EBCDIC /* ASCII coding */
656 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 #else /* EBCDIC coding */
659 if (cc <= 'z') cc += 64; /* Convert to upper case */
660 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661 #endif
662 }
663 break;
664
665 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666 This coding is ASCII-specific, but then the whole concept of \cx is
667 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668
669 case 'c':
670 c = *(++ptr);
671 if (c == 0)
672 {
673 *errorcodeptr = ERR2;
674 break;
675 }
676
677 #ifndef EBCDIC /* ASCII coding */
678 if (c >= 'a' && c <= 'z') c -= 32;
679 c ^= 0x40;
680 #else /* EBCDIC coding */
681 if (c >= 'a' && c <= 'z') c += 64;
682 c ^= 0xC0;
683 #endif
684 break;
685
686 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688 for Perl compatibility, it is a literal. This code looks a bit odd, but
689 there used to be some cases other than the default, and there may be again
690 in future, so I haven't "optimized" it. */
691
692 default:
693 if ((options & PCRE_EXTRA) != 0) switch(c)
694 {
695 default:
696 *errorcodeptr = ERR3;
697 break;
698 }
699 break;
700 }
701 }
702
703 *ptrptr = ptr;
704 return c;
705 }
706
707
708
709 #ifdef SUPPORT_UCP
710 /*************************************************
711 * Handle \P and \p *
712 *************************************************/
713
714 /* This function is called after \P or \p has been encountered, provided that
715 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
716 pointing at the P or p. On exit, it is pointing at the final character of the
717 escape sequence.
718
719 Argument:
720 ptrptr points to the pattern position pointer
721 negptr points to a boolean that is set TRUE for negation else FALSE
722 dptr points to an int that is set to the detailed property value
723 errorcodeptr points to the error code variable
724
725 Returns: type value from ucp_type_table, or -1 for an invalid type
726 */
727
728 static int
729 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
730 {
731 int c, i, bot, top;
732 const uschar *ptr = *ptrptr;
733 char name[32];
734
735 c = *(++ptr);
736 if (c == 0) goto ERROR_RETURN;
737
738 *negptr = FALSE;
739
740 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
741 negation. */
742
743 if (c == '{')
744 {
745 if (ptr[1] == '^')
746 {
747 *negptr = TRUE;
748 ptr++;
749 }
750 for (i = 0; i < (int)sizeof(name) - 1; i++)
751 {
752 c = *(++ptr);
753 if (c == 0) goto ERROR_RETURN;
754 if (c == '}') break;
755 name[i] = c;
756 }
757 if (c !='}') goto ERROR_RETURN;
758 name[i] = 0;
759 }
760
761 /* Otherwise there is just one following character */
762
763 else
764 {
765 name[0] = c;
766 name[1] = 0;
767 }
768
769 *ptrptr = ptr;
770
771 /* Search for a recognized property name using binary chop */
772
773 bot = 0;
774 top = _pcre_utt_size;
775
776 while (bot < top)
777 {
778 i = (bot + top) >> 1;
779 c = strcmp(name, _pcre_utt[i].name);
780 if (c == 0)
781 {
782 *dptr = _pcre_utt[i].value;
783 return _pcre_utt[i].type;
784 }
785 if (c > 0) bot = i + 1; else top = i;
786 }
787
788 *errorcodeptr = ERR47;
789 *ptrptr = ptr;
790 return -1;
791
792 ERROR_RETURN:
793 *errorcodeptr = ERR46;
794 *ptrptr = ptr;
795 return -1;
796 }
797 #endif
798
799
800
801
802 /*************************************************
803 * Check for counted repeat *
804 *************************************************/
805
806 /* This function is called when a '{' is encountered in a place where it might
807 start a quantifier. It looks ahead to see if it really is a quantifier or not.
808 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809 where the ddds are digits.
810
811 Arguments:
812 p pointer to the first char after '{'
813
814 Returns: TRUE or FALSE
815 */
816
817 static BOOL
818 is_counted_repeat(const uschar *p)
819 {
820 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821 while ((digitab[*p] & ctype_digit) != 0) p++;
822 if (*p == '}') return TRUE;
823
824 if (*p++ != ',') return FALSE;
825 if (*p == '}') return TRUE;
826
827 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828 while ((digitab[*p] & ctype_digit) != 0) p++;
829
830 return (*p == '}');
831 }
832
833
834
835 /*************************************************
836 * Read repeat counts *
837 *************************************************/
838
839 /* Read an item of the form {n,m} and return the values. This is called only
840 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841 so the syntax is guaranteed to be correct, but we need to check the values.
842
843 Arguments:
844 p pointer to first char after '{'
845 minp pointer to int for min
846 maxp pointer to int for max
847 returned as -1 if no max
848 errorcodeptr points to error code variable
849
850 Returns: pointer to '}' on success;
851 current ptr on error, with errorcodeptr set non-zero
852 */
853
854 static const uschar *
855 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856 {
857 int min = 0;
858 int max = -1;
859
860 /* Read the minimum value and do a paranoid check: a negative value indicates
861 an integer overflow. */
862
863 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 if (min < 0 || min > 65535)
865 {
866 *errorcodeptr = ERR5;
867 return p;
868 }
869
870 /* Read the maximum value if there is one, and again do a paranoid on its size.
871 Also, max must not be less than min. */
872
873 if (*p == '}') max = min; else
874 {
875 if (*(++p) != '}')
876 {
877 max = 0;
878 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 if (max < 0 || max > 65535)
880 {
881 *errorcodeptr = ERR5;
882 return p;
883 }
884 if (max < min)
885 {
886 *errorcodeptr = ERR4;
887 return p;
888 }
889 }
890 }
891
892 /* Fill in the required variables, and pass back the pointer to the terminating
893 '}'. */
894
895 *minp = min;
896 *maxp = max;
897 return p;
898 }
899
900
901
902 /*************************************************
903 * Find forward referenced subpattern *
904 *************************************************/
905
906 /* This function scans along a pattern's text looking for capturing
907 subpatterns, and counting them. If it finds a named pattern that matches the
908 name it is given, it returns its number. Alternatively, if the name is NULL, it
909 returns when it reaches a given numbered subpattern. This is used for forward
910 references to subpatterns. We know that if (?P< is encountered, the name will
911 be terminated by '>' because that is checked in the first pass.
912
913 Arguments:
914 ptr current position in the pattern
915 count current count of capturing parens so far encountered
916 name name to seek, or NULL if seeking a numbered subpattern
917 lorn name length, or subpattern number if name is NULL
918 xmode TRUE if we are in /x mode
919
920 Returns: the number of the named subpattern, or -1 if not found
921 */
922
923 static int
924 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925 BOOL xmode)
926 {
927 const uschar *thisname;
928
929 for (; *ptr != 0; ptr++)
930 {
931 int term;
932
933 /* Skip over backslashed characters and also entire \Q...\E */
934
935 if (*ptr == '\\')
936 {
937 if (*(++ptr) == 0) return -1;
938 if (*ptr == 'Q') for (;;)
939 {
940 while (*(++ptr) != 0 && *ptr != '\\');
941 if (*ptr == 0) return -1;
942 if (*(++ptr) == 'E') break;
943 }
944 continue;
945 }
946
947 /* Skip over character classes */
948
949 if (*ptr == '[')
950 {
951 while (*(++ptr) != ']')
952 {
953 if (*ptr == 0) return -1;
954 if (*ptr == '\\')
955 {
956 if (*(++ptr) == 0) return -1;
957 if (*ptr == 'Q') for (;;)
958 {
959 while (*(++ptr) != 0 && *ptr != '\\');
960 if (*ptr == 0) return -1;
961 if (*(++ptr) == 'E') break;
962 }
963 continue;
964 }
965 }
966 continue;
967 }
968
969 /* Skip comments in /x mode */
970
971 if (xmode && *ptr == '#')
972 {
973 while (*(++ptr) != 0 && *ptr != '\n');
974 if (*ptr == 0) return -1;
975 continue;
976 }
977
978 /* An opening parens must now be a real metacharacter */
979
980 if (*ptr != '(') continue;
981 if (ptr[1] != '?' && ptr[1] != '*')
982 {
983 count++;
984 if (name == NULL && count == lorn) return count;
985 continue;
986 }
987
988 ptr += 2;
989 if (*ptr == 'P') ptr++; /* Allow optional P */
990
991 /* We have to disambiguate (?<! and (?<= from (?<name> */
992
993 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994 *ptr != '\'')
995 continue;
996
997 count++;
998
999 if (name == NULL && count == lorn) return count;
1000 term = *ptr++;
1001 if (term == '<') term = '>';
1002 thisname = ptr;
1003 while (*ptr != term) ptr++;
1004 if (name != NULL && lorn == ptr - thisname &&
1005 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006 return count;
1007 }
1008
1009 return -1;
1010 }
1011
1012
1013
1014 /*************************************************
1015 * Find first significant op code *
1016 *************************************************/
1017
1018 /* This is called by several functions that scan a compiled expression looking
1019 for a fixed first character, or an anchoring op code etc. It skips over things
1020 that do not influence this. For some calls, a change of option is important.
1021 For some calls, it makes sense to skip negative forward and all backward
1022 assertions, and also the \b assertion; for others it does not.
1023
1024 Arguments:
1025 code pointer to the start of the group
1026 options pointer to external options
1027 optbit the option bit whose changing is significant, or
1028 zero if none are
1029 skipassert TRUE if certain assertions are to be skipped
1030
1031 Returns: pointer to the first significant opcode
1032 */
1033
1034 static const uschar*
1035 first_significant_code(const uschar *code, int *options, int optbit,
1036 BOOL skipassert)
1037 {
1038 for (;;)
1039 {
1040 switch ((int)*code)
1041 {
1042 case OP_OPT:
1043 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1044 *options = (int)code[1];
1045 code += 2;
1046 break;
1047
1048 case OP_ASSERT_NOT:
1049 case OP_ASSERTBACK:
1050 case OP_ASSERTBACK_NOT:
1051 if (!skipassert) return code;
1052 do code += GET(code, 1); while (*code == OP_ALT);
1053 code += _pcre_OP_lengths[*code];
1054 break;
1055
1056 case OP_WORD_BOUNDARY:
1057 case OP_NOT_WORD_BOUNDARY:
1058 if (!skipassert) return code;
1059 /* Fall through */
1060
1061 case OP_CALLOUT:
1062 case OP_CREF:
1063 case OP_RREF:
1064 case OP_DEF:
1065 code += _pcre_OP_lengths[*code];
1066 break;
1067
1068 default:
1069 return code;
1070 }
1071 }
1072 /* Control never reaches here */
1073 }
1074
1075
1076
1077
1078 /*************************************************
1079 * Find the fixed length of a pattern *
1080 *************************************************/
1081
1082 /* Scan a pattern and compute the fixed length of subject that will match it,
1083 if the length is fixed. This is needed for dealing with backward assertions.
1084 In UTF8 mode, the result is in characters rather than bytes.
1085
1086 Arguments:
1087 code points to the start of the pattern (the bracket)
1088 options the compiling options
1089
1090 Returns: the fixed length, or -1 if there is no fixed length,
1091 or -2 if \C was encountered
1092 */
1093
1094 static int
1095 find_fixedlength(uschar *code, int options)
1096 {
1097 int length = -1;
1098
1099 register int branchlength = 0;
1100 register uschar *cc = code + 1 + LINK_SIZE;
1101
1102 /* Scan along the opcodes for this branch. If we get to the end of the
1103 branch, check the length against that of the other branches. */
1104
1105 for (;;)
1106 {
1107 int d;
1108 register int op = *cc;
1109 switch (op)
1110 {
1111 case OP_CBRA:
1112 case OP_BRA:
1113 case OP_ONCE:
1114 case OP_COND:
1115 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1116 if (d < 0) return d;
1117 branchlength += d;
1118 do cc += GET(cc, 1); while (*cc == OP_ALT);
1119 cc += 1 + LINK_SIZE;
1120 break;
1121
1122 /* Reached end of a branch; if it's a ket it is the end of a nested
1123 call. If it's ALT it is an alternation in a nested call. If it is
1124 END it's the end of the outer call. All can be handled by the same code. */
1125
1126 case OP_ALT:
1127 case OP_KET:
1128 case OP_KETRMAX:
1129 case OP_KETRMIN:
1130 case OP_END:
1131 if (length < 0) length = branchlength;
1132 else if (length != branchlength) return -1;
1133 if (*cc != OP_ALT) return length;
1134 cc += 1 + LINK_SIZE;
1135 branchlength = 0;
1136 break;
1137
1138 /* Skip over assertive subpatterns */
1139
1140 case OP_ASSERT:
1141 case OP_ASSERT_NOT:
1142 case OP_ASSERTBACK:
1143 case OP_ASSERTBACK_NOT:
1144 do cc += GET(cc, 1); while (*cc == OP_ALT);
1145 /* Fall through */
1146
1147 /* Skip over things that don't match chars */
1148
1149 case OP_REVERSE:
1150 case OP_CREF:
1151 case OP_RREF:
1152 case OP_DEF:
1153 case OP_OPT:
1154 case OP_CALLOUT:
1155 case OP_SOD:
1156 case OP_SOM:
1157 case OP_EOD:
1158 case OP_EODN:
1159 case OP_CIRC:
1160 case OP_DOLL:
1161 case OP_NOT_WORD_BOUNDARY:
1162 case OP_WORD_BOUNDARY:
1163 cc += _pcre_OP_lengths[*cc];
1164 break;
1165
1166 /* Handle literal characters */
1167
1168 case OP_CHAR:
1169 case OP_CHARNC:
1170 case OP_NOT:
1171 branchlength++;
1172 cc += 2;
1173 #ifdef SUPPORT_UTF8
1174 if ((options & PCRE_UTF8) != 0)
1175 {
1176 while ((*cc & 0xc0) == 0x80) cc++;
1177 }
1178 #endif
1179 break;
1180
1181 /* Handle exact repetitions. The count is already in characters, but we
1182 need to skip over a multibyte character in UTF8 mode. */
1183
1184 case OP_EXACT:
1185 branchlength += GET2(cc,1);
1186 cc += 4;
1187 #ifdef SUPPORT_UTF8
1188 if ((options & PCRE_UTF8) != 0)
1189 {
1190 while((*cc & 0x80) == 0x80) cc++;
1191 }
1192 #endif
1193 break;
1194
1195 case OP_TYPEEXACT:
1196 branchlength += GET2(cc,1);
1197 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1198 cc += 4;
1199 break;
1200
1201 /* Handle single-char matchers */
1202
1203 case OP_PROP:
1204 case OP_NOTPROP:
1205 cc += 2;
1206 /* Fall through */
1207
1208 case OP_NOT_DIGIT:
1209 case OP_DIGIT:
1210 case OP_NOT_WHITESPACE:
1211 case OP_WHITESPACE:
1212 case OP_NOT_WORDCHAR:
1213 case OP_WORDCHAR:
1214 case OP_ANY:
1215 branchlength++;
1216 cc++;
1217 break;
1218
1219 /* The single-byte matcher isn't allowed */
1220
1221 case OP_ANYBYTE:
1222 return -2;
1223
1224 /* Check a class for variable quantification */
1225
1226 #ifdef SUPPORT_UTF8
1227 case OP_XCLASS:
1228 cc += GET(cc, 1) - 33;
1229 /* Fall through */
1230 #endif
1231
1232 case OP_CLASS:
1233 case OP_NCLASS:
1234 cc += 33;
1235
1236 switch (*cc)
1237 {
1238 case OP_CRSTAR:
1239 case OP_CRMINSTAR:
1240 case OP_CRQUERY:
1241 case OP_CRMINQUERY:
1242 return -1;
1243
1244 case OP_CRRANGE:
1245 case OP_CRMINRANGE:
1246 if (GET2(cc,1) != GET2(cc,3)) return -1;
1247 branchlength += GET2(cc,1);
1248 cc += 5;
1249 break;
1250
1251 default:
1252 branchlength++;
1253 }
1254 break;
1255
1256 /* Anything else is variable length */
1257
1258 default:
1259 return -1;
1260 }
1261 }
1262 /* Control never gets here */
1263 }
1264
1265
1266
1267
1268 /*************************************************
1269 * Scan compiled regex for numbered bracket *
1270 *************************************************/
1271
1272 /* This little function scans through a compiled pattern until it finds a
1273 capturing bracket with the given number.
1274
1275 Arguments:
1276 code points to start of expression
1277 utf8 TRUE in UTF-8 mode
1278 number the required bracket number
1279
1280 Returns: pointer to the opcode for the bracket, or NULL if not found
1281 */
1282
1283 static const uschar *
1284 find_bracket(const uschar *code, BOOL utf8, int number)
1285 {
1286 for (;;)
1287 {
1288 register int c = *code;
1289 if (c == OP_END) return NULL;
1290
1291 /* XCLASS is used for classes that cannot be represented just by a bit
1292 map. This includes negated single high-valued characters. The length in
1293 the table is zero; the actual length is stored in the compiled code. */
1294
1295 if (c == OP_XCLASS) code += GET(code, 1);
1296
1297 /* Handle capturing bracket */
1298
1299 else if (c == OP_CBRA)
1300 {
1301 int n = GET2(code, 1+LINK_SIZE);
1302 if (n == number) return (uschar *)code;
1303 code += _pcre_OP_lengths[c];
1304 }
1305
1306 /* Otherwise, we can get the item's length from the table, except that for
1307 repeated character types, we have to test for \p and \P, which have an extra
1308 two bytes of parameters. */
1309
1310 else
1311 {
1312 switch(c)
1313 {
1314 case OP_TYPESTAR:
1315 case OP_TYPEMINSTAR:
1316 case OP_TYPEPLUS:
1317 case OP_TYPEMINPLUS:
1318 case OP_TYPEQUERY:
1319 case OP_TYPEMINQUERY:
1320 case OP_TYPEPOSSTAR:
1321 case OP_TYPEPOSPLUS:
1322 case OP_TYPEPOSQUERY:
1323 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1324 break;
1325
1326 case OP_TYPEUPTO:
1327 case OP_TYPEMINUPTO:
1328 case OP_TYPEEXACT:
1329 case OP_TYPEPOSUPTO:
1330 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1331 break;
1332 }
1333
1334 /* Add in the fixed length from the table */
1335
1336 code += _pcre_OP_lengths[c];
1337
1338 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1339 a multi-byte character. The length in the table is a minimum, so we have to
1340 arrange to skip the extra bytes. */
1341
1342 #ifdef SUPPORT_UTF8
1343 if (utf8) switch(c)
1344 {
1345 case OP_CHAR:
1346 case OP_CHARNC:
1347 case OP_EXACT:
1348 case OP_UPTO:
1349 case OP_MINUPTO:
1350 case OP_POSUPTO:
1351 case OP_STAR:
1352 case OP_MINSTAR:
1353 case OP_POSSTAR:
1354 case OP_PLUS:
1355 case OP_MINPLUS:
1356 case OP_POSPLUS:
1357 case OP_QUERY:
1358 case OP_MINQUERY:
1359 case OP_POSQUERY:
1360 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1361 break;
1362 }
1363 #endif
1364 }
1365 }
1366 }
1367
1368
1369
1370 /*************************************************
1371 * Scan compiled regex for recursion reference *
1372 *************************************************/
1373
1374 /* This little function scans through a compiled pattern until it finds an
1375 instance of OP_RECURSE.
1376
1377 Arguments:
1378 code points to start of expression
1379 utf8 TRUE in UTF-8 mode
1380
1381 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1382 */
1383
1384 static const uschar *
1385 find_recurse(const uschar *code, BOOL utf8)
1386 {
1387 for (;;)
1388 {
1389 register int c = *code;
1390 if (c == OP_END) return NULL;
1391 if (c == OP_RECURSE) return code;
1392
1393 /* XCLASS is used for classes that cannot be represented just by a bit
1394 map. This includes negated single high-valued characters. The length in
1395 the table is zero; the actual length is stored in the compiled code. */
1396
1397 if (c == OP_XCLASS) code += GET(code, 1);
1398
1399 /* Otherwise, we can get the item's length from the table, except that for
1400 repeated character types, we have to test for \p and \P, which have an extra
1401 two bytes of parameters. */
1402
1403 else
1404 {
1405 switch(c)
1406 {
1407 case OP_TYPESTAR:
1408 case OP_TYPEMINSTAR:
1409 case OP_TYPEPLUS:
1410 case OP_TYPEMINPLUS:
1411 case OP_TYPEQUERY:
1412 case OP_TYPEMINQUERY:
1413 case OP_TYPEPOSSTAR:
1414 case OP_TYPEPOSPLUS:
1415 case OP_TYPEPOSQUERY:
1416 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1417 break;
1418
1419 case OP_TYPEPOSUPTO:
1420 case OP_TYPEUPTO:
1421 case OP_TYPEMINUPTO:
1422 case OP_TYPEEXACT:
1423 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1424 break;
1425 }
1426
1427 /* Add in the fixed length from the table */
1428
1429 code += _pcre_OP_lengths[c];
1430
1431 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1432 by a multi-byte character. The length in the table is a minimum, so we have
1433 to arrange to skip the extra bytes. */
1434
1435 #ifdef SUPPORT_UTF8
1436 if (utf8) switch(c)
1437 {
1438 case OP_CHAR:
1439 case OP_CHARNC:
1440 case OP_EXACT:
1441 case OP_UPTO:
1442 case OP_MINUPTO:
1443 case OP_POSUPTO:
1444 case OP_STAR:
1445 case OP_MINSTAR:
1446 case OP_POSSTAR:
1447 case OP_PLUS:
1448 case OP_MINPLUS:
1449 case OP_POSPLUS:
1450 case OP_QUERY:
1451 case OP_MINQUERY:
1452 case OP_POSQUERY:
1453 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1454 break;
1455 }
1456 #endif
1457 }
1458 }
1459 }
1460
1461
1462
1463 /*************************************************
1464 * Scan compiled branch for non-emptiness *
1465 *************************************************/
1466
1467 /* This function scans through a branch of a compiled pattern to see whether it
1468 can match the empty string or not. It is called from could_be_empty()
1469 below and from compile_branch() when checking for an unlimited repeat of a
1470 group that can match nothing. Note that first_significant_code() skips over
1471 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1472 struck an inner bracket whose current branch will already have been scanned.
1473
1474 Arguments:
1475 code points to start of search
1476 endcode points to where to stop
1477 utf8 TRUE if in UTF8 mode
1478
1479 Returns: TRUE if what is matched could be empty
1480 */
1481
1482 static BOOL
1483 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1484 {
1485 register int c;
1486 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1487 code < endcode;
1488 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1489 {
1490 const uschar *ccode;
1491
1492 c = *code;
1493
1494 /* Groups with zero repeats can of course be empty; skip them. */
1495
1496 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1497 {
1498 code += _pcre_OP_lengths[c];
1499 do code += GET(code, 1); while (*code == OP_ALT);
1500 c = *code;
1501 continue;
1502 }
1503
1504 /* For other groups, scan the branches. */
1505
1506 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1507 {
1508 BOOL empty_branch;
1509 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1510
1511 /* Scan a closed bracket */
1512
1513 empty_branch = FALSE;
1514 do
1515 {
1516 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1517 empty_branch = TRUE;
1518 code += GET(code, 1);
1519 }
1520 while (*code == OP_ALT);
1521 if (!empty_branch) return FALSE; /* All branches are non-empty */
1522 c = *code;
1523 continue;
1524 }
1525
1526 /* Handle the other opcodes */
1527
1528 switch (c)
1529 {
1530 /* Check for quantifiers after a class. XCLASS is used for classes that
1531 cannot be represented just by a bit map. This includes negated single
1532 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1533 actual length is stored in the compiled code, so we must update "code"
1534 here. */
1535
1536 #ifdef SUPPORT_UTF8
1537 case OP_XCLASS:
1538 ccode = code += GET(code, 1);
1539 goto CHECK_CLASS_REPEAT;
1540 #endif
1541
1542 case OP_CLASS:
1543 case OP_NCLASS:
1544 ccode = code + 33;
1545
1546 #ifdef SUPPORT_UTF8
1547 CHECK_CLASS_REPEAT:
1548 #endif
1549
1550 switch (*ccode)
1551 {
1552 case OP_CRSTAR: /* These could be empty; continue */
1553 case OP_CRMINSTAR:
1554 case OP_CRQUERY:
1555 case OP_CRMINQUERY:
1556 break;
1557
1558 default: /* Non-repeat => class must match */
1559 case OP_CRPLUS: /* These repeats aren't empty */
1560 case OP_CRMINPLUS:
1561 return FALSE;
1562
1563 case OP_CRRANGE:
1564 case OP_CRMINRANGE:
1565 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1566 break;
1567 }
1568 break;
1569
1570 /* Opcodes that must match a character */
1571
1572 case OP_PROP:
1573 case OP_NOTPROP:
1574 case OP_EXTUNI:
1575 case OP_NOT_DIGIT:
1576 case OP_DIGIT:
1577 case OP_NOT_WHITESPACE:
1578 case OP_WHITESPACE:
1579 case OP_NOT_WORDCHAR:
1580 case OP_WORDCHAR:
1581 case OP_ANY:
1582 case OP_ANYBYTE:
1583 case OP_CHAR:
1584 case OP_CHARNC:
1585 case OP_NOT:
1586 case OP_PLUS:
1587 case OP_MINPLUS:
1588 case OP_POSPLUS:
1589 case OP_EXACT:
1590 case OP_NOTPLUS:
1591 case OP_NOTMINPLUS:
1592 case OP_NOTPOSPLUS:
1593 case OP_NOTEXACT:
1594 case OP_TYPEPLUS:
1595 case OP_TYPEMINPLUS:
1596 case OP_TYPEPOSPLUS:
1597 case OP_TYPEEXACT:
1598 return FALSE;
1599
1600 /* These are going to continue, as they may be empty, but we have to
1601 fudge the length for the \p and \P cases. */
1602
1603 case OP_TYPESTAR:
1604 case OP_TYPEMINSTAR:
1605 case OP_TYPEPOSSTAR:
1606 case OP_TYPEQUERY:
1607 case OP_TYPEMINQUERY:
1608 case OP_TYPEPOSQUERY:
1609 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1610 break;
1611
1612 /* Same for these */
1613
1614 case OP_TYPEUPTO:
1615 case OP_TYPEMINUPTO:
1616 case OP_TYPEPOSUPTO:
1617 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1618 break;
1619
1620 /* End of branch */
1621
1622 case OP_KET:
1623 case OP_KETRMAX:
1624 case OP_KETRMIN:
1625 case OP_ALT:
1626 return TRUE;
1627
1628 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1629 MINUPTO, and POSUPTO may be followed by a multibyte character */
1630
1631 #ifdef SUPPORT_UTF8
1632 case OP_STAR:
1633 case OP_MINSTAR:
1634 case OP_POSSTAR:
1635 case OP_QUERY:
1636 case OP_MINQUERY:
1637 case OP_POSQUERY:
1638 case OP_UPTO:
1639 case OP_MINUPTO:
1640 case OP_POSUPTO:
1641 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1642 break;
1643 #endif
1644 }
1645 }
1646
1647 return TRUE;
1648 }
1649
1650
1651
1652 /*************************************************
1653 * Scan compiled regex for non-emptiness *
1654 *************************************************/
1655
1656 /* This function is called to check for left recursive calls. We want to check
1657 the current branch of the current pattern to see if it could match the empty
1658 string. If it could, we must look outwards for branches at other levels,
1659 stopping when we pass beyond the bracket which is the subject of the recursion.
1660
1661 Arguments:
1662 code points to start of the recursion
1663 endcode points to where to stop (current RECURSE item)
1664 bcptr points to the chain of current (unclosed) branch starts
1665 utf8 TRUE if in UTF-8 mode
1666
1667 Returns: TRUE if what is matched could be empty
1668 */
1669
1670 static BOOL
1671 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1672 BOOL utf8)
1673 {
1674 while (bcptr != NULL && bcptr->current >= code)
1675 {
1676 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1677 bcptr = bcptr->outer;
1678 }
1679 return TRUE;
1680 }
1681
1682
1683
1684 /*************************************************
1685 * Check for POSIX class syntax *
1686 *************************************************/
1687
1688 /* This function is called when the sequence "[:" or "[." or "[=" is
1689 encountered in a character class. It checks whether this is followed by an
1690 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1691 ".]" or "=]".
1692
1693 Argument:
1694 ptr pointer to the initial [
1695 endptr where to return the end pointer
1696 cd pointer to compile data
1697
1698 Returns: TRUE or FALSE
1699 */
1700
1701 static BOOL
1702 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1703 {
1704 int terminator; /* Don't combine these lines; the Solaris cc */
1705 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1706 if (*(++ptr) == '^') ptr++;
1707 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1708 if (*ptr == terminator && ptr[1] == ']')
1709 {
1710 *endptr = ptr;
1711 return TRUE;
1712 }
1713 return FALSE;
1714 }
1715
1716
1717
1718
1719 /*************************************************
1720 * Check POSIX class name *
1721 *************************************************/
1722
1723 /* This function is called to check the name given in a POSIX-style class entry
1724 such as [:alnum:].
1725
1726 Arguments:
1727 ptr points to the first letter
1728 len the length of the name
1729
1730 Returns: a value representing the name, or -1 if unknown
1731 */
1732
1733 static int
1734 check_posix_name(const uschar *ptr, int len)
1735 {
1736 register int yield = 0;
1737 while (posix_name_lengths[yield] != 0)
1738 {
1739 if (len == posix_name_lengths[yield] &&
1740 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1741 yield++;
1742 }
1743 return -1;
1744 }
1745
1746
1747 /*************************************************
1748 * Adjust OP_RECURSE items in repeated group *
1749 *************************************************/
1750
1751 /* OP_RECURSE items contain an offset from the start of the regex to the group
1752 that is referenced. This means that groups can be replicated for fixed
1753 repetition simply by copying (because the recursion is allowed to refer to
1754 earlier groups that are outside the current group). However, when a group is
1755 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1756 it, after it has been compiled. This means that any OP_RECURSE items within it
1757 that refer to the group itself or any contained groups have to have their
1758 offsets adjusted. That is one of the jobs of this function. Before it is called,
1759 the partially compiled regex must be temporarily terminated with OP_END.
1760
1761 This function has been extended with the possibility of forward references for
1762 recursions and subroutine calls. It must also check the list of such references
1763 for the group we are dealing with. If it finds that one of the recursions in
1764 the current group is on this list, it adjusts the offset in the list, not the
1765 value in the reference (which is a group number).
1766
1767 Arguments:
1768 group points to the start of the group
1769 adjust the amount by which the group is to be moved
1770 utf8 TRUE in UTF-8 mode
1771 cd contains pointers to tables etc.
1772 save_hwm the hwm forward reference pointer at the start of the group
1773
1774 Returns: nothing
1775 */
1776
1777 static void
1778 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1779 uschar *save_hwm)
1780 {
1781 uschar *ptr = group;
1782
1783 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1784 {
1785 int offset;
1786 uschar *hc;
1787
1788 /* See if this recursion is on the forward reference list. If so, adjust the
1789 reference. */
1790
1791 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1792 {
1793 offset = GET(hc, 0);
1794 if (cd->start_code + offset == ptr + 1)
1795 {
1796 PUT(hc, 0, offset + adjust);
1797 break;
1798 }
1799 }
1800
1801 /* Otherwise, adjust the recursion offset if it's after the start of this
1802 group. */
1803
1804 if (hc >= cd->hwm)
1805 {
1806 offset = GET(ptr, 1);
1807 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1808 }
1809
1810 ptr += 1 + LINK_SIZE;
1811 }
1812 }
1813
1814
1815
1816 /*************************************************
1817 * Insert an automatic callout point *
1818 *************************************************/
1819
1820 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1821 callout points before each pattern item.
1822
1823 Arguments:
1824 code current code pointer
1825 ptr current pattern pointer
1826 cd pointers to tables etc
1827
1828 Returns: new code pointer
1829 */
1830
1831 static uschar *
1832 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1833 {
1834 *code++ = OP_CALLOUT;
1835 *code++ = 255;
1836 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1837 PUT(code, LINK_SIZE, 0); /* Default length */
1838 return code + 2*LINK_SIZE;
1839 }
1840
1841
1842
1843 /*************************************************
1844 * Complete a callout item *
1845 *************************************************/
1846
1847 /* A callout item contains the length of the next item in the pattern, which
1848 we can't fill in till after we have reached the relevant point. This is used
1849 for both automatic and manual callouts.
1850
1851 Arguments:
1852 previous_callout points to previous callout item
1853 ptr current pattern pointer
1854 cd pointers to tables etc
1855
1856 Returns: nothing
1857 */
1858
1859 static void
1860 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1861 {
1862 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1863 PUT(previous_callout, 2 + LINK_SIZE, length);
1864 }
1865
1866
1867
1868 #ifdef SUPPORT_UCP
1869 /*************************************************
1870 * Get othercase range *
1871 *************************************************/
1872
1873 /* This function is passed the start and end of a class range, in UTF-8 mode
1874 with UCP support. It searches up the characters, looking for internal ranges of
1875 characters in the "other" case. Each call returns the next one, updating the
1876 start address.
1877
1878 Arguments:
1879 cptr points to starting character value; updated
1880 d end value
1881 ocptr where to put start of othercase range
1882 odptr where to put end of othercase range
1883
1884 Yield: TRUE when range returned; FALSE when no more
1885 */
1886
1887 static BOOL
1888 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1889 unsigned int *odptr)
1890 {
1891 unsigned int c, othercase, next;
1892
1893 for (c = *cptr; c <= d; c++)
1894 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1895
1896 if (c > d) return FALSE;
1897
1898 *ocptr = othercase;
1899 next = othercase + 1;
1900
1901 for (++c; c <= d; c++)
1902 {
1903 if (_pcre_ucp_othercase(c) != next) break;
1904 next++;
1905 }
1906
1907 *odptr = next - 1;
1908 *cptr = c;
1909
1910 return TRUE;
1911 }
1912 #endif /* SUPPORT_UCP */
1913
1914
1915
1916 /*************************************************
1917 * Check if auto-possessifying is possible *
1918 *************************************************/
1919
1920 /* This function is called for unlimited repeats of certain items, to see
1921 whether the next thing could possibly match the repeated item. If not, it makes
1922 sense to automatically possessify the repeated item.
1923
1924 Arguments:
1925 op_code the repeated op code
1926 this data for this item, depends on the opcode
1927 utf8 TRUE in UTF-8 mode
1928 utf8_char used for utf8 character bytes, NULL if not relevant
1929 ptr next character in pattern
1930 options options bits
1931 cd contains pointers to tables etc.
1932
1933 Returns: TRUE if possessifying is wanted
1934 */
1935
1936 static BOOL
1937 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1938 const uschar *ptr, int options, compile_data *cd)
1939 {
1940 int next;
1941
1942 /* Skip whitespace and comments in extended mode */
1943
1944 if ((options & PCRE_EXTENDED) != 0)
1945 {
1946 for (;;)
1947 {
1948 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1949 if (*ptr == '#')
1950 {
1951 while (*(++ptr) != 0)
1952 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1953 }
1954 else break;
1955 }
1956 }
1957
1958 /* If the next item is one that we can handle, get its value. A non-negative
1959 value is a character, a negative value is an escape value. */
1960
1961 if (*ptr == '\\')
1962 {
1963 int temperrorcode = 0;
1964 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1965 if (temperrorcode != 0) return FALSE;
1966 ptr++; /* Point after the escape sequence */
1967 }
1968
1969 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1970 {
1971 #ifdef SUPPORT_UTF8
1972 if (utf8) { GETCHARINC(next, ptr); } else
1973 #endif
1974 next = *ptr++;
1975 }
1976
1977 else return FALSE;
1978
1979 /* Skip whitespace and comments in extended mode */
1980
1981 if ((options & PCRE_EXTENDED) != 0)
1982 {
1983 for (;;)
1984 {
1985 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1986 if (*ptr == '#')
1987 {
1988 while (*(++ptr) != 0)
1989 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1990 }
1991 else break;
1992 }
1993 }
1994
1995 /* If the next thing is itself optional, we have to give up. */
1996
1997 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1998 return FALSE;
1999
2000 /* Now compare the next item with the previous opcode. If the previous is a
2001 positive single character match, "item" either contains the character or, if
2002 "item" is greater than 127 in utf8 mode, the character's bytes are in
2003 utf8_char. */
2004
2005
2006 /* Handle cases when the next item is a character. */
2007
2008 if (next >= 0) switch(op_code)
2009 {
2010 case OP_CHAR:
2011 #ifdef SUPPORT_UTF8
2012 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2013 #endif
2014 return item != next;
2015
2016 /* For CHARNC (caseless character) we must check the other case. If we have
2017 Unicode property support, we can use it to test the other case of
2018 high-valued characters. */
2019
2020 case OP_CHARNC:
2021 #ifdef SUPPORT_UTF8
2022 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2023 #endif
2024 if (item == next) return FALSE;
2025 #ifdef SUPPORT_UTF8
2026 if (utf8)
2027 {
2028 unsigned int othercase;
2029 if (next < 128) othercase = cd->fcc[next]; else
2030 #ifdef SUPPORT_UCP
2031 othercase = _pcre_ucp_othercase((unsigned int)next);
2032 #else
2033 othercase = NOTACHAR;
2034 #endif
2035 return (unsigned int)item != othercase;
2036 }
2037 else
2038 #endif /* SUPPORT_UTF8 */
2039 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2040
2041 /* For OP_NOT, "item" must be a single-byte character. */
2042
2043 case OP_NOT:
2044 if (next < 0) return FALSE; /* Not a character */
2045 if (item == next) return TRUE;
2046 if ((options & PCRE_CASELESS) == 0) return FALSE;
2047 #ifdef SUPPORT_UTF8
2048 if (utf8)
2049 {
2050 unsigned int othercase;
2051 if (next < 128) othercase = cd->fcc[next]; else
2052 #ifdef SUPPORT_UCP
2053 othercase = _pcre_ucp_othercase(next);
2054 #else
2055 othercase = NOTACHAR;
2056 #endif
2057 return (unsigned int)item == othercase;
2058 }
2059 else
2060 #endif /* SUPPORT_UTF8 */
2061 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2062
2063 case OP_DIGIT:
2064 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2065
2066 case OP_NOT_DIGIT:
2067 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2068
2069 case OP_WHITESPACE:
2070 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2071
2072 case OP_NOT_WHITESPACE:
2073 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2074
2075 case OP_WORDCHAR:
2076 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2077
2078 case OP_NOT_WORDCHAR:
2079 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2080
2081 case OP_HSPACE:
2082 case OP_NOT_HSPACE:
2083 switch(next)
2084 {
2085 case 0x09:
2086 case 0x20:
2087 case 0xa0:
2088 case 0x1680:
2089 case 0x180e:
2090 case 0x2000:
2091 case 0x2001:
2092 case 0x2002:
2093 case 0x2003:
2094 case 0x2004:
2095 case 0x2005:
2096 case 0x2006:
2097 case 0x2007:
2098 case 0x2008:
2099 case 0x2009:
2100 case 0x200A:
2101 case 0x202f:
2102 case 0x205f:
2103 case 0x3000:
2104 return op_code != OP_HSPACE;
2105 default:
2106 return op_code == OP_HSPACE;
2107 }
2108
2109 case OP_VSPACE:
2110 case OP_NOT_VSPACE:
2111 switch(next)
2112 {
2113 case 0x0a:
2114 case 0x0b:
2115 case 0x0c:
2116 case 0x0d:
2117 case 0x85:
2118 case 0x2028:
2119 case 0x2029:
2120 return op_code != OP_VSPACE;
2121 default:
2122 return op_code == OP_VSPACE;
2123 }
2124
2125 default:
2126 return FALSE;
2127 }
2128
2129
2130 /* Handle the case when the next item is \d, \s, etc. */
2131
2132 switch(op_code)
2133 {
2134 case OP_CHAR:
2135 case OP_CHARNC:
2136 #ifdef SUPPORT_UTF8
2137 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2138 #endif
2139 switch(-next)
2140 {
2141 case ESC_d:
2142 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2143
2144 case ESC_D:
2145 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2146
2147 case ESC_s:
2148 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2149
2150 case ESC_S:
2151 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2152
2153 case ESC_w:
2154 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2155
2156 case ESC_W:
2157 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2158
2159 case ESC_h:
2160 case ESC_H:
2161 switch(item)
2162 {
2163 case 0x09:
2164 case 0x20:
2165 case 0xa0:
2166 case 0x1680:
2167 case 0x180e:
2168 case 0x2000:
2169 case 0x2001:
2170 case 0x2002:
2171 case 0x2003:
2172 case 0x2004:
2173 case 0x2005:
2174 case 0x2006:
2175 case 0x2007:
2176 case 0x2008:
2177 case 0x2009:
2178 case 0x200A:
2179 case 0x202f:
2180 case 0x205f:
2181 case 0x3000:
2182 return -next != ESC_h;
2183 default:
2184 return -next == ESC_h;
2185 }
2186
2187 case ESC_v:
2188 case ESC_V:
2189 switch(item)
2190 {
2191 case 0x0a:
2192 case 0x0b:
2193 case 0x0c:
2194 case 0x0d:
2195 case 0x85:
2196 case 0x2028:
2197 case 0x2029:
2198 return -next != ESC_v;
2199 default:
2200 return -next == ESC_v;
2201 }
2202
2203 default:
2204 return FALSE;
2205 }
2206
2207 case OP_DIGIT:
2208 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2209 next == -ESC_h || next == -ESC_v;
2210
2211 case OP_NOT_DIGIT:
2212 return next == -ESC_d;
2213
2214 case OP_WHITESPACE:
2215 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2216
2217 case OP_NOT_WHITESPACE:
2218 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2219
2220 case OP_HSPACE:
2221 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2222
2223 case OP_NOT_HSPACE:
2224 return next == -ESC_h;
2225
2226 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2227 case OP_VSPACE:
2228 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2229
2230 case OP_NOT_VSPACE:
2231 return next == -ESC_v;
2232
2233 case OP_WORDCHAR:
2234 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2235
2236 case OP_NOT_WORDCHAR:
2237 return next == -ESC_w || next == -ESC_d;
2238
2239 default:
2240 return FALSE;
2241 }
2242
2243 /* Control does not reach here */
2244 }
2245
2246
2247
2248 /*************************************************
2249 * Compile one branch *
2250 *************************************************/
2251
2252 /* Scan the pattern, compiling it into a vector. If the options are
2253 changed during the branch, the pointer is used to change the external options
2254 bits. This function is used during the pre-compile phase when we are trying
2255 to find out the amount of memory needed, as well as during the real compile
2256 phase. The value of lengthptr distinguishes the two phases.
2257
2258 Arguments:
2259 optionsptr pointer to the option bits
2260 codeptr points to the pointer to the current code point
2261 ptrptr points to the current pattern pointer
2262 errorcodeptr points to error code variable
2263 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2264 reqbyteptr set to the last literal character required, else < 0
2265 bcptr points to current branch chain
2266 cd contains pointers to tables etc.
2267 lengthptr NULL during the real compile phase
2268 points to length accumulator during pre-compile phase
2269
2270 Returns: TRUE on success
2271 FALSE, with *errorcodeptr set non-zero on error
2272 */
2273
2274 static BOOL
2275 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2276 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2277 compile_data *cd, int *lengthptr)
2278 {
2279 int repeat_type, op_type;
2280 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2281 int bravalue = 0;
2282 int greedy_default, greedy_non_default;
2283 int firstbyte, reqbyte;
2284 int zeroreqbyte, zerofirstbyte;
2285 int req_caseopt, reqvary, tempreqvary;
2286 int options = *optionsptr;
2287 int after_manual_callout = 0;
2288 int length_prevgroup = 0;
2289 register int c;
2290 register uschar *code = *codeptr;
2291 uschar *last_code = code;
2292 uschar *orig_code = code;
2293 uschar *tempcode;
2294 BOOL inescq = FALSE;
2295 BOOL groupsetfirstbyte = FALSE;
2296 const uschar *ptr = *ptrptr;
2297 const uschar *tempptr;
2298 uschar *previous = NULL;
2299 uschar *previous_callout = NULL;
2300 uschar *save_hwm = NULL;
2301 uschar classbits[32];
2302
2303 #ifdef SUPPORT_UTF8
2304 BOOL class_utf8;
2305 BOOL utf8 = (options & PCRE_UTF8) != 0;
2306 uschar *class_utf8data;
2307 uschar utf8_char[6];
2308 #else
2309 BOOL utf8 = FALSE;
2310 uschar *utf8_char = NULL;
2311 #endif
2312
2313 #ifdef DEBUG
2314 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2315 #endif
2316
2317 /* Set up the default and non-default settings for greediness */
2318
2319 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2320 greedy_non_default = greedy_default ^ 1;
2321
2322 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2323 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2324 matches a non-fixed char first char; reqbyte just remains unset if we never
2325 find one.
2326
2327 When we hit a repeat whose minimum is zero, we may have to adjust these values
2328 to take the zero repeat into account. This is implemented by setting them to
2329 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2330 item types that can be repeated set these backoff variables appropriately. */
2331
2332 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2333
2334 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2335 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2336 value > 255. It is added into the firstbyte or reqbyte variables to record the
2337 case status of the value. This is used only for ASCII characters. */
2338
2339 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2340
2341 /* Switch on next character until the end of the branch */
2342
2343 for (;; ptr++)
2344 {
2345 BOOL negate_class;
2346 BOOL possessive_quantifier;
2347 BOOL is_quantifier;
2348 BOOL is_recurse;
2349 BOOL reset_bracount;
2350 int class_charcount;
2351 int class_lastchar;
2352 int newoptions;
2353 int recno;
2354 int refsign;
2355 int skipbytes;
2356 int subreqbyte;
2357 int subfirstbyte;
2358 int terminator;
2359 int mclength;
2360 uschar mcbuffer[8];
2361
2362 /* Get next byte in the pattern */
2363
2364 c = *ptr;
2365
2366 /* If we are in the pre-compile phase, accumulate the length used for the
2367 previous cycle of this loop. */
2368
2369 if (lengthptr != NULL)
2370 {
2371 #ifdef DEBUG
2372 if (code > cd->hwm) cd->hwm = code; /* High water info */
2373 #endif
2374 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2375 {
2376 *errorcodeptr = ERR52;
2377 goto FAILED;
2378 }
2379
2380 /* There is at least one situation where code goes backwards: this is the
2381 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2382 the class is simply eliminated. However, it is created first, so we have to
2383 allow memory for it. Therefore, don't ever reduce the length at this point.
2384 */
2385
2386 if (code < last_code) code = last_code;
2387
2388 /* Paranoid check for integer overflow */
2389
2390 if (OFLOW_MAX - *lengthptr < code - last_code)
2391 {
2392 *errorcodeptr = ERR20;
2393 goto FAILED;
2394 }
2395
2396 *lengthptr += code - last_code;
2397 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2398
2399 /* If "previous" is set and it is not at the start of the work space, move
2400 it back to there, in order to avoid filling up the work space. Otherwise,
2401 if "previous" is NULL, reset the current code pointer to the start. */
2402
2403 if (previous != NULL)
2404 {
2405 if (previous > orig_code)
2406 {
2407 memmove(orig_code, previous, code - previous);
2408 code -= previous - orig_code;
2409 previous = orig_code;
2410 }
2411 }
2412 else code = orig_code;
2413
2414 /* Remember where this code item starts so we can pick up the length
2415 next time round. */
2416
2417 last_code = code;
2418 }
2419
2420 /* In the real compile phase, just check the workspace used by the forward
2421 reference list. */
2422
2423 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2424 {
2425 *errorcodeptr = ERR52;
2426 goto FAILED;
2427 }
2428
2429 /* If in \Q...\E, check for the end; if not, we have a literal */
2430
2431 if (inescq && c != 0)
2432 {
2433 if (c == '\\' && ptr[1] == 'E')
2434 {
2435 inescq = FALSE;
2436 ptr++;
2437 continue;
2438 }
2439 else
2440 {
2441 if (previous_callout != NULL)
2442 {
2443 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2444 complete_callout(previous_callout, ptr, cd);
2445 previous_callout = NULL;
2446 }
2447 if ((options & PCRE_AUTO_CALLOUT) != 0)
2448 {
2449 previous_callout = code;
2450 code = auto_callout(code, ptr, cd);
2451 }
2452 goto NORMAL_CHAR;
2453 }
2454 }
2455
2456 /* Fill in length of a previous callout, except when the next thing is
2457 a quantifier. */
2458
2459 is_quantifier = c == '*' || c == '+' || c == '?' ||
2460 (c == '{' && is_counted_repeat(ptr+1));
2461
2462 if (!is_quantifier && previous_callout != NULL &&
2463 after_manual_callout-- <= 0)
2464 {
2465 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2466 complete_callout(previous_callout, ptr, cd);
2467 previous_callout = NULL;
2468 }
2469
2470 /* In extended mode, skip white space and comments */
2471
2472 if ((options & PCRE_EXTENDED) != 0)
2473 {
2474 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2475 if (c == '#')
2476 {
2477 while (*(++ptr) != 0)
2478 {
2479 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2480 }
2481 if (*ptr != 0) continue;
2482
2483 /* Else fall through to handle end of string */
2484 c = 0;
2485 }
2486 }
2487
2488 /* No auto callout for quantifiers. */
2489
2490 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2491 {
2492 previous_callout = code;
2493 code = auto_callout(code, ptr, cd);
2494 }
2495
2496 switch(c)
2497 {
2498 /* ===================================================================*/
2499 case 0: /* The branch terminates at string end */
2500 case '|': /* or | or ) */
2501 case ')':
2502 *firstbyteptr = firstbyte;
2503 *reqbyteptr = reqbyte;
2504 *codeptr = code;
2505 *ptrptr = ptr;
2506 if (lengthptr != NULL)
2507 {
2508 if (OFLOW_MAX - *lengthptr < code - last_code)
2509 {
2510 *errorcodeptr = ERR20;
2511 goto FAILED;
2512 }
2513 *lengthptr += code - last_code; /* To include callout length */
2514 DPRINTF((">> end branch\n"));
2515 }
2516 return TRUE;
2517
2518
2519 /* ===================================================================*/
2520 /* Handle single-character metacharacters. In multiline mode, ^ disables
2521 the setting of any following char as a first character. */
2522
2523 case '^':
2524 if ((options & PCRE_MULTILINE) != 0)
2525 {
2526 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2527 }
2528 previous = NULL;
2529 *code++ = OP_CIRC;
2530 break;
2531
2532 case '$':
2533 previous = NULL;
2534 *code++ = OP_DOLL;
2535 break;
2536
2537 /* There can never be a first char if '.' is first, whatever happens about
2538 repeats. The value of reqbyte doesn't change either. */
2539
2540 case '.':
2541 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2542 zerofirstbyte = firstbyte;
2543 zeroreqbyte = reqbyte;
2544 previous = code;
2545 *code++ = OP_ANY;
2546 break;
2547
2548
2549 /* ===================================================================*/
2550 /* Character classes. If the included characters are all < 256, we build a
2551 32-byte bitmap of the permitted characters, except in the special case
2552 where there is only one such character. For negated classes, we build the
2553 map as usual, then invert it at the end. However, we use a different opcode
2554 so that data characters > 255 can be handled correctly.
2555
2556 If the class contains characters outside the 0-255 range, a different
2557 opcode is compiled. It may optionally have a bit map for characters < 256,
2558 but those above are explicitly listed afterwards. A flag byte tells
2559 whether the bitmap is present, and whether this is a negated class or not.
2560 */
2561
2562 case '[':
2563 previous = code;
2564
2565 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2566 they are encountered at the top level, so we'll do that too. */
2567
2568 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2569 check_posix_syntax(ptr, &tempptr, cd))
2570 {
2571 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2572 goto FAILED;
2573 }
2574
2575 /* If the first character is '^', set the negation flag and skip it. Also,
2576 if the first few characters (either before or after ^) are \Q\E or \E we
2577 skip them too. This makes for compatibility with Perl. */
2578
2579 negate_class = FALSE;
2580 for (;;)
2581 {
2582 c = *(++ptr);
2583 if (c == '\\')
2584 {
2585 if (ptr[1] == 'E') ptr++;
2586 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2587 else break;
2588 }
2589 else if (!negate_class && c == '^')
2590 negate_class = TRUE;
2591 else break;
2592 }
2593
2594 /* Keep a count of chars with values < 256 so that we can optimize the case
2595 of just a single character (as long as it's < 256). However, for higher
2596 valued UTF-8 characters, we don't yet do any optimization. */
2597
2598 class_charcount = 0;
2599 class_lastchar = -1;
2600
2601 /* Initialize the 32-char bit map to all zeros. We build the map in a
2602 temporary bit of memory, in case the class contains only 1 character (less
2603 than 256), because in that case the compiled code doesn't use the bit map.
2604 */
2605
2606 memset(classbits, 0, 32 * sizeof(uschar));
2607
2608 #ifdef SUPPORT_UTF8
2609 class_utf8 = FALSE; /* No chars >= 256 */
2610 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2611 #endif
2612
2613 /* Process characters until ] is reached. By writing this as a "do" it
2614 means that an initial ] is taken as a data character. At the start of the
2615 loop, c contains the first byte of the character. */
2616
2617 if (c != 0) do
2618 {
2619 const uschar *oldptr;
2620
2621 #ifdef SUPPORT_UTF8
2622 if (utf8 && c > 127)
2623 { /* Braces are required because the */
2624 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2625 }
2626 #endif
2627
2628 /* Inside \Q...\E everything is literal except \E */
2629
2630 if (inescq)
2631 {
2632 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2633 {
2634 inescq = FALSE; /* Reset literal state */
2635 ptr++; /* Skip the 'E' */
2636 continue; /* Carry on with next */
2637 }
2638 goto CHECK_RANGE; /* Could be range if \E follows */
2639 }
2640
2641 /* Handle POSIX class names. Perl allows a negation extension of the
2642 form [:^name:]. A square bracket that doesn't match the syntax is
2643 treated as a literal. We also recognize the POSIX constructions
2644 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2645 5.6 and 5.8 do. */
2646
2647 if (c == '[' &&
2648 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2649 check_posix_syntax(ptr, &tempptr, cd))
2650 {
2651 BOOL local_negate = FALSE;
2652 int posix_class, taboffset, tabopt;
2653 register const uschar *cbits = cd->cbits;
2654 uschar pbits[32];
2655
2656 if (ptr[1] != ':')
2657 {
2658 *errorcodeptr = ERR31;
2659 goto FAILED;
2660 }
2661
2662 ptr += 2;
2663 if (*ptr == '^')
2664 {
2665 local_negate = TRUE;
2666 ptr++;
2667 }
2668
2669 posix_class = check_posix_name(ptr, tempptr - ptr);
2670 if (posix_class < 0)
2671 {
2672 *errorcodeptr = ERR30;
2673 goto FAILED;
2674 }
2675
2676 /* If matching is caseless, upper and lower are converted to
2677 alpha. This relies on the fact that the class table starts with
2678 alpha, lower, upper as the first 3 entries. */
2679
2680 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2681 posix_class = 0;
2682
2683 /* We build the bit map for the POSIX class in a chunk of local store
2684 because we may be adding and subtracting from it, and we don't want to
2685 subtract bits that may be in the main map already. At the end we or the
2686 result into the bit map that is being built. */
2687
2688 posix_class *= 3;
2689
2690 /* Copy in the first table (always present) */
2691
2692 memcpy(pbits, cbits + posix_class_maps[posix_class],
2693 32 * sizeof(uschar));
2694
2695 /* If there is a second table, add or remove it as required. */
2696
2697 taboffset = posix_class_maps[posix_class + 1];
2698 tabopt = posix_class_maps[posix_class + 2];
2699
2700 if (taboffset >= 0)
2701 {
2702 if (tabopt >= 0)
2703 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2704 else
2705 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2706 }
2707
2708 /* Now see if we need to remove any special characters. An option
2709 value of 1 removes vertical space and 2 removes underscore. */
2710
2711 if (tabopt < 0) tabopt = -tabopt;
2712 if (tabopt == 1) pbits[1] &= ~0x3c;
2713 else if (tabopt == 2) pbits[11] &= 0x7f;
2714
2715 /* Add the POSIX table or its complement into the main table that is
2716 being built and we are done. */
2717
2718 if (local_negate)
2719 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2720 else
2721 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2722
2723 ptr = tempptr + 1;
2724 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2725 continue; /* End of POSIX syntax handling */
2726 }
2727
2728 /* Backslash may introduce a single character, or it may introduce one
2729 of the specials, which just set a flag. The sequence \b is a special
2730 case. Inside a class (and only there) it is treated as backspace.
2731 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2732 to 'or' into the one we are building. We assume they have more than one
2733 character in them, so set class_charcount bigger than one. */
2734
2735 if (c == '\\')
2736 {
2737 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2738 if (*errorcodeptr != 0) goto FAILED;
2739
2740 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2741 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2742 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2743 else if (-c == ESC_Q) /* Handle start of quoted string */
2744 {
2745 if (ptr[1] == '\\' && ptr[2] == 'E')
2746 {
2747 ptr += 2; /* avoid empty string */
2748 }
2749 else inescq = TRUE;
2750 continue;
2751 }
2752 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2753
2754 if (c < 0)
2755 {
2756 register const uschar *cbits = cd->cbits;
2757 class_charcount += 2; /* Greater than 1 is what matters */
2758
2759 /* Save time by not doing this in the pre-compile phase. */
2760
2761 if (lengthptr == NULL) switch (-c)
2762 {
2763 case ESC_d:
2764 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2765 continue;
2766
2767 case ESC_D:
2768 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2769 continue;
2770
2771 case ESC_w:
2772 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2773 continue;
2774
2775 case ESC_W:
2776 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2777 continue;
2778
2779 case ESC_s:
2780 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2781 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2782 continue;
2783
2784 case ESC_S:
2785 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2786 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2787 continue;
2788
2789 case ESC_E: /* Perl ignores an orphan \E */
2790 continue;
2791
2792 default: /* Not recognized; fall through */
2793 break; /* Need "default" setting to stop compiler warning. */
2794 }
2795
2796 /* In the pre-compile phase, just do the recognition. */
2797
2798 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2799 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2800
2801 /* We need to deal with \H, \h, \V, and \v in both phases because
2802 they use extra memory. */
2803
2804 if (-c == ESC_h)
2805 {
2806 SETBIT(classbits, 0x09); /* VT */
2807 SETBIT(classbits, 0x20); /* SPACE */
2808 SETBIT(classbits, 0xa0); /* NSBP */
2809 #ifdef SUPPORT_UTF8
2810 if (utf8)
2811 {
2812 class_utf8 = TRUE;
2813 *class_utf8data++ = XCL_SINGLE;
2814 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2815 *class_utf8data++ = XCL_SINGLE;
2816 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2817 *class_utf8data++ = XCL_RANGE;
2818 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2819 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2820 *class_utf8data++ = XCL_SINGLE;
2821 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2822 *class_utf8data++ = XCL_SINGLE;
2823 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2824 *class_utf8data++ = XCL_SINGLE;
2825 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2826 }
2827 #endif
2828 continue;
2829 }
2830
2831 if (-c == ESC_H)
2832 {
2833 for (c = 0; c < 32; c++)
2834 {
2835 int x = 0xff;
2836 switch (c)
2837 {
2838 case 0x09/8: x ^= 1 << (0x09%8); break;
2839 case 0x20/8: x ^= 1 << (0x20%8); break;
2840 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2841 default: break;
2842 }
2843 classbits[c] |= x;
2844 }
2845
2846 #ifdef SUPPORT_UTF8
2847 if (utf8)
2848 {
2849 class_utf8 = TRUE;
2850 *class_utf8data++ = XCL_RANGE;
2851 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2852 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2853 *class_utf8data++ = XCL_RANGE;
2854 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2855 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2856 *class_utf8data++ = XCL_RANGE;
2857 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2858 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2859 *class_utf8data++ = XCL_RANGE;
2860 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2861 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2862 *class_utf8data++ = XCL_RANGE;
2863 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2864 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2865 *class_utf8data++ = XCL_RANGE;
2866 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2867 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2868 *class_utf8data++ = XCL_RANGE;
2869 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2870 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2871 }
2872 #endif
2873 continue;
2874 }
2875
2876 if (-c == ESC_v)
2877 {
2878 SETBIT(classbits, 0x0a); /* LF */
2879 SETBIT(classbits, 0x0b); /* VT */
2880 SETBIT(classbits, 0x0c); /* FF */
2881 SETBIT(classbits, 0x0d); /* CR */
2882 SETBIT(classbits, 0x85); /* NEL */
2883 #ifdef SUPPORT_UTF8
2884 if (utf8)
2885 {
2886 class_utf8 = TRUE;
2887 *class_utf8data++ = XCL_RANGE;
2888 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2889 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2890 }
2891 #endif
2892 continue;
2893 }
2894
2895 if (-c == ESC_V)
2896 {
2897 for (c = 0; c < 32; c++)
2898 {
2899 int x = 0xff;
2900 switch (c)
2901 {
2902 case 0x0a/8: x ^= 1 << (0x0a%8);
2903 x ^= 1 << (0x0b%8);
2904 x ^= 1 << (0x0c%8);
2905 x ^= 1 << (0x0d%8);
2906 break;
2907 case 0x85/8: x ^= 1 << (0x85%8); break;
2908 default: break;
2909 }
2910 classbits[c] |= x;
2911 }
2912
2913 #ifdef SUPPORT_UTF8
2914 if (utf8)
2915 {
2916 class_utf8 = TRUE;
2917 *class_utf8data++ = XCL_RANGE;
2918 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2919 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2920 *class_utf8data++ = XCL_RANGE;
2921 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2922 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2923 }
2924 #endif
2925 continue;
2926 }
2927
2928 /* We need to deal with \P and \p in both phases. */
2929
2930 #ifdef SUPPORT_UCP
2931 if (-c == ESC_p || -c == ESC_P)
2932 {
2933 BOOL negated;
2934 int pdata;
2935 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2936 if (ptype < 0) goto FAILED;
2937 class_utf8 = TRUE;
2938 *class_utf8data++ = ((-c == ESC_p) != negated)?
2939 XCL_PROP : XCL_NOTPROP;
2940 *class_utf8data++ = ptype;
2941 *class_utf8data++ = pdata;
2942 class_charcount -= 2; /* Not a < 256 character */
2943 continue;
2944 }
2945 #endif
2946 /* Unrecognized escapes are faulted if PCRE is running in its
2947 strict mode. By default, for compatibility with Perl, they are
2948 treated as literals. */
2949
2950 if ((options & PCRE_EXTRA) != 0)
2951 {
2952 *errorcodeptr = ERR7;
2953 goto FAILED;
2954 }
2955
2956 class_charcount -= 2; /* Undo the default count from above */
2957 c = *ptr; /* Get the final character and fall through */
2958 }
2959
2960 /* Fall through if we have a single character (c >= 0). This may be
2961 greater than 256 in UTF-8 mode. */
2962
2963 } /* End of backslash handling */
2964
2965 /* A single character may be followed by '-' to form a range. However,
2966 Perl does not permit ']' to be the end of the range. A '-' character
2967 at the end is treated as a literal. Perl ignores orphaned \E sequences
2968 entirely. The code for handling \Q and \E is messy. */
2969
2970 CHECK_RANGE:
2971 while (ptr[1] == '\\' && ptr[2] == 'E')
2972 {
2973 inescq = FALSE;
2974 ptr += 2;
2975 }
2976
2977 oldptr = ptr;
2978
2979 if (!inescq && ptr[1] == '-')
2980 {
2981 int d;
2982 ptr += 2;
2983 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2984
2985 /* If we hit \Q (not followed by \E) at this point, go into escaped
2986 mode. */
2987
2988 while (*ptr == '\\' && ptr[1] == 'Q')
2989 {
2990 ptr += 2;
2991 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2992 inescq = TRUE;
2993 break;
2994 }
2995
2996 if (*ptr == 0 || (!inescq && *ptr == ']'))
2997 {
2998 ptr = oldptr;
2999 goto LONE_SINGLE_CHARACTER;
3000 }
3001
3002 #ifdef SUPPORT_UTF8
3003 if (utf8)
3004 { /* Braces are required because the */
3005 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3006 }
3007 else
3008 #endif
3009 d = *ptr; /* Not UTF-8 mode */
3010
3011 /* The second part of a range can be a single-character escape, but
3012 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3013 in such circumstances. */
3014
3015 if (!inescq && d == '\\')
3016 {
3017 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3018 if (*errorcodeptr != 0) goto FAILED;
3019
3020 /* \b is backspace; \X is literal X; \R is literal R; any other
3021 special means the '-' was literal */
3022
3023 if (d < 0)
3024 {
3025 if (d == -ESC_b) d = '\b';
3026 else if (d == -ESC_X) d = 'X';
3027 else if (d == -ESC_R) d = 'R'; else
3028 {
3029 ptr = oldptr;
3030 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3031 }
3032 }
3033 }
3034
3035 /* Check that the two values are in the correct order. Optimize
3036 one-character ranges */
3037
3038 if (d < c)
3039 {
3040 *errorcodeptr = ERR8;
3041 goto FAILED;
3042 }
3043
3044 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3045
3046 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3047 matching, we have to use an XCLASS with extra data items. Caseless
3048 matching for characters > 127 is available only if UCP support is
3049 available. */
3050
3051 #ifdef SUPPORT_UTF8
3052 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3053 {
3054 class_utf8 = TRUE;
3055
3056 /* With UCP support, we can find the other case equivalents of
3057 the relevant characters. There may be several ranges. Optimize how
3058 they fit with the basic range. */
3059
3060 #ifdef SUPPORT_UCP
3061 if ((options & PCRE_CASELESS) != 0)
3062 {
3063 unsigned int occ, ocd;
3064 unsigned int cc = c;
3065 unsigned int origd = d;
3066 while (get_othercase_range(&cc, origd, &occ, &ocd))
3067 {
3068 if (occ >= (unsigned int)c &&
3069 ocd <= (unsigned int)d)
3070 continue; /* Skip embedded ranges */
3071
3072 if (occ < (unsigned int)c &&
3073 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3074 { /* if there is overlap, */
3075 c = occ; /* noting that if occ < c */
3076 continue; /* we can't have ocd > d */
3077 } /* because a subrange is */
3078 if (ocd > (unsigned int)d &&
3079 occ <= (unsigned int)d + 1) /* always shorter than */
3080 { /* the basic range. */
3081 d = ocd;
3082 continue;
3083 }
3084
3085 if (occ == ocd)
3086 {
3087 *class_utf8data++ = XCL_SINGLE;
3088 }
3089 else
3090 {
3091 *class_utf8data++ = XCL_RANGE;
3092 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3093 }
3094 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3095 }
3096 }
3097 #endif /* SUPPORT_UCP */
3098
3099 /* Now record the original range, possibly modified for UCP caseless
3100 overlapping ranges. */
3101
3102 *class_utf8data++ = XCL_RANGE;
3103 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3104 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3105
3106 /* With UCP support, we are done. Without UCP support, there is no
3107 caseless matching for UTF-8 characters > 127; we can use the bit map
3108 for the smaller ones. */
3109
3110 #ifdef SUPPORT_UCP
3111 continue; /* With next character in the class */
3112 #else
3113 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3114
3115 /* Adjust upper limit and fall through to set up the map */
3116
3117 d = 127;
3118
3119 #endif /* SUPPORT_UCP */
3120 }
3121 #endif /* SUPPORT_UTF8 */
3122
3123 /* We use the bit map for all cases when not in UTF-8 mode; else
3124 ranges that lie entirely within 0-127 when there is UCP support; else
3125 for partial ranges without UCP support. */
3126
3127 class_charcount += d - c + 1;
3128 class_lastchar = d;
3129
3130 /* We can save a bit of time by skipping this in the pre-compile. */
3131
3132 if (lengthptr == NULL) for (; c <= d; c++)
3133 {
3134 classbits[c/8] |= (1 << (c&7));
3135 if ((options & PCRE_CASELESS) != 0)
3136 {
3137 int uc = cd->fcc[c]; /* flip case */
3138 classbits[uc/8] |= (1 << (uc&7));
3139 }
3140 }
3141
3142 continue; /* Go get the next char in the class */
3143 }
3144
3145 /* Handle a lone single character - we can get here for a normal
3146 non-escape char, or after \ that introduces a single character or for an
3147 apparent range that isn't. */
3148
3149 LONE_SINGLE_CHARACTER:
3150
3151 /* Handle a character that cannot go in the bit map */
3152
3153 #ifdef SUPPORT_UTF8
3154 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3155 {
3156 class_utf8 = TRUE;
3157 *class_utf8data++ = XCL_SINGLE;
3158 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3159
3160 #ifdef SUPPORT_UCP
3161 if ((options & PCRE_CASELESS) != 0)
3162 {
3163 unsigned int othercase;
3164 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3165 {
3166 *class_utf8data++ = XCL_SINGLE;
3167 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3168 }
3169 }
3170 #endif /* SUPPORT_UCP */
3171
3172 }
3173 else
3174 #endif /* SUPPORT_UTF8 */
3175
3176 /* Handle a single-byte character */
3177 {
3178 classbits[c/8] |= (1 << (c&7));
3179 if ((options & PCRE_CASELESS) != 0)
3180 {
3181 c = cd->fcc[c]; /* flip case */
3182 classbits[c/8] |= (1 << (c&7));
3183 }
3184 class_charcount++;
3185 class_lastchar = c;
3186 }
3187 }
3188
3189 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3190
3191 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3192
3193 if (c == 0) /* Missing terminating ']' */
3194 {
3195 *errorcodeptr = ERR6;
3196 goto FAILED;
3197 }
3198
3199 /* Remember whether \r or \n are in this class */
3200
3201 if (negate_class)
3202 {
3203 if ((classbits[1] & 0x24) != 0x24) cd->external_options |= PCRE_HASCRORLF;
3204 }
3205 else
3206 {
3207 if ((classbits[1] & 0x24) != 0) cd->external_options |= PCRE_HASCRORLF;
3208 }
3209
3210 /* If class_charcount is 1, we saw precisely one character whose value is
3211 less than 256. As long as there were no characters >= 128 and there was no
3212 use of \p or \P, in other words, no use of any XCLASS features, we can
3213 optimize.
3214
3215 In UTF-8 mode, we can optimize the negative case only if there were no
3216 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3217 operate on single-bytes only. This is an historical hangover. Maybe one day
3218 we can tidy these opcodes to handle multi-byte characters.
3219
3220 The optimization throws away the bit map. We turn the item into a
3221 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3222 that OP_NOT does not support multibyte characters. In the positive case, it
3223 can cause firstbyte to be set. Otherwise, there can be no first char if
3224 this item is first, whatever repeat count may follow. In the case of
3225 reqbyte, save the previous value for reinstating. */
3226
3227 #ifdef SUPPORT_UTF8
3228 if (class_charcount == 1 && !class_utf8 &&
3229 (!utf8 || !negate_class || class_lastchar < 128))
3230 #else
3231 if (class_charcount == 1)
3232 #endif
3233 {
3234 zeroreqbyte = reqbyte;
3235
3236 /* The OP_NOT opcode works on one-byte characters only. */
3237
3238 if (negate_class)
3239 {
3240 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3241 zerofirstbyte = firstbyte;
3242 *code++ = OP_NOT;
3243 *code++ = class_lastchar;
3244 break;
3245 }
3246
3247 /* For a single, positive character, get the value into mcbuffer, and
3248 then we can handle this with the normal one-character code. */
3249
3250 #ifdef SUPPORT_UTF8
3251 if (utf8 && class_lastchar > 127)
3252 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3253 else
3254 #endif
3255 {
3256 mcbuffer[0] = class_lastchar;
3257 mclength = 1;
3258 }
3259 goto ONE_CHAR;
3260 } /* End of 1-char optimization */
3261
3262 /* The general case - not the one-char optimization. If this is the first
3263 thing in the branch, there can be no first char setting, whatever the
3264 repeat count. Any reqbyte setting must remain unchanged after any kind of
3265 repeat. */
3266
3267 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3268 zerofirstbyte = firstbyte;
3269 zeroreqbyte = reqbyte;
3270
3271 /* If there are characters with values > 255, we have to compile an
3272 extended class, with its own opcode. If there are no characters < 256,
3273 we can omit the bitmap in the actual compiled code. */
3274
3275 #ifdef SUPPORT_UTF8
3276 if (class_utf8)
3277 {
3278 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3279 *code++ = OP_XCLASS;
3280 code += LINK_SIZE;
3281 *code = negate_class? XCL_NOT : 0;
3282
3283 /* If the map is required, move up the extra data to make room for it;
3284 otherwise just move the code pointer to the end of the extra data. */
3285
3286 if (class_charcount > 0)
3287 {
3288 *code++ |= XCL_MAP;
3289 memmove(code + 32, code, class_utf8data - code);
3290 memcpy(code, classbits, 32);
3291 code = class_utf8data + 32;
3292 }
3293 else code = class_utf8data;
3294
3295 /* Now fill in the complete length of the item */
3296
3297 PUT(previous, 1, code - previous);
3298 break; /* End of class handling */
3299 }
3300 #endif
3301
3302 /* If there are no characters > 255, negate the 32-byte map if necessary,
3303 and copy it into the code vector. If this is the first thing in the branch,
3304 there can be no first char setting, whatever the repeat count. Any reqbyte
3305 setting must remain unchanged after any kind of repeat. */
3306
3307 if (negate_class)
3308 {
3309 *code++ = OP_NCLASS;
3310 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3311 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3312 }
3313 else
3314 {
3315 *code++ = OP_CLASS;
3316 memcpy(code, classbits, 32);
3317 }
3318 code += 32;
3319 break;
3320
3321
3322 /* ===================================================================*/
3323 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3324 has been tested above. */
3325
3326 case '{':
3327 if (!is_quantifier) goto NORMAL_CHAR;
3328 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3329 if (*errorcodeptr != 0) goto FAILED;
3330 goto REPEAT;
3331
3332 case '*':
3333 repeat_min = 0;
3334 repeat_max = -1;
3335 goto REPEAT;
3336
3337 case '+':
3338 repeat_min = 1;
3339 repeat_max = -1;
3340 goto REPEAT;
3341
3342 case '?':
3343 repeat_min = 0;
3344 repeat_max = 1;
3345
3346 REPEAT:
3347 if (previous == NULL)
3348 {
3349 *errorcodeptr = ERR9;
3350 goto FAILED;
3351 }
3352
3353 if (repeat_min == 0)
3354 {
3355 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3356 reqbyte = zeroreqbyte; /* Ditto */
3357 }
3358
3359 /* Remember whether this is a variable length repeat */
3360
3361 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3362
3363 op_type = 0; /* Default single-char op codes */
3364 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3365
3366 /* Save start of previous item, in case we have to move it up to make space
3367 for an inserted OP_ONCE for the additional '+' extension. */
3368
3369 tempcode = previous;
3370
3371 /* If the next character is '+', we have a possessive quantifier. This
3372 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3373 If the next character is '?' this is a minimizing repeat, by default,
3374 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3375 repeat type to the non-default. */
3376
3377 if (ptr[1] == '+')
3378 {
3379 repeat_type = 0; /* Force greedy */
3380 possessive_quantifier = TRUE;
3381 ptr++;
3382 }
3383 else if (ptr[1] == '?')
3384 {
3385 repeat_type = greedy_non_default;
3386 ptr++;
3387 }
3388 else repeat_type = greedy_default;
3389
3390 /* If previous was a character match, abolish the item and generate a
3391 repeat item instead. If a char item has a minimum of more than one, ensure
3392 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3393 the first thing in a branch because the x will have gone into firstbyte
3394 instead. */
3395
3396 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3397 {
3398 /* Deal with UTF-8 characters that take up more than one byte. It's
3399 easier to write this out separately than try to macrify it. Use c to
3400 hold the length of the character in bytes, plus 0x80 to flag that it's a
3401 length rather than a small character. */
3402
3403 #ifdef SUPPORT_UTF8
3404 if (utf8 && (code[-1] & 0x80) != 0)
3405 {
3406 uschar *lastchar = code - 1;
3407 while((*lastchar & 0xc0) == 0x80) lastchar--;
3408 c = code - lastchar; /* Length of UTF-8 character */
3409 memcpy(utf8_char, lastchar, c); /* Save the char */
3410 c |= 0x80; /* Flag c as a length */
3411 }
3412 else
3413 #endif
3414
3415 /* Handle the case of a single byte - either with no UTF8 support, or
3416 with UTF-8 disabled, or for a UTF-8 character < 128. */
3417
3418 {
3419 c = code[-1];
3420 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3421 }
3422
3423 /* If the repetition is unlimited, it pays to see if the next thing on
3424 the line is something that cannot possibly match this character. If so,
3425 automatically possessifying this item gains some performance in the case
3426 where the match fails. */
3427
3428 if (!possessive_quantifier &&
3429 repeat_max < 0 &&
3430 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3431 options, cd))
3432 {
3433 repeat_type = 0; /* Force greedy */
3434 possessive_quantifier = TRUE;
3435 }
3436
3437 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3438 }
3439
3440 /* If previous was a single negated character ([^a] or similar), we use
3441 one of the special opcodes, replacing it. The code is shared with single-
3442 character repeats by setting op_type to add a suitable offset into
3443 repeat_type. We can also test for auto-possessification. OP_NOT is
3444 currently used only for single-byte chars. */
3445
3446 else if (*previous == OP_NOT)
3447 {
3448 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3449 c = previous[1];
3450 if (!possessive_quantifier &&
3451 repeat_max < 0 &&
3452 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3453 {
3454 repeat_type = 0; /* Force greedy */
3455 possessive_quantifier = TRUE;
3456 }
3457 goto OUTPUT_SINGLE_REPEAT;
3458 }
3459
3460 /* If previous was a character type match (\d or similar), abolish it and
3461 create a suitable repeat item. The code is shared with single-character
3462 repeats by setting op_type to add a suitable offset into repeat_type. Note
3463 that the Unicode property types will be present only when SUPPORT_UCP is
3464 defined, but we don't wrap the little bits of code here because it just
3465 makes it horribly messy. */
3466
3467 else if (*previous < OP_EODN)
3468 {
3469 uschar *oldcode;
3470 int prop_type, prop_value;
3471 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3472 c = *previous;
3473
3474 if (!possessive_quantifier &&
3475 repeat_max < 0 &&
3476 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3477 {
3478 repeat_type = 0; /* Force greedy */
3479 possessive_quantifier = TRUE;
3480 }
3481
3482 OUTPUT_SINGLE_REPEAT:
3483 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3484 {
3485 prop_type = previous[1];
3486 prop_value = previous[2];
3487 }
3488 else prop_type = prop_value = -1;
3489
3490 oldcode = code;
3491 code = previous; /* Usually overwrite previous item */
3492
3493 /* If the maximum is zero then the minimum must also be zero; Perl allows
3494 this case, so we do too - by simply omitting the item altogether. */
3495
3496 if (repeat_max == 0) goto END_REPEAT;
3497
3498 /* All real repeats make it impossible to handle partial matching (maybe
3499 one day we will be able to remove this restriction). */
3500
3501 if (repeat_max != 1) cd->nopartial = TRUE;
3502
3503 /* Combine the op_type with the repeat_type */
3504
3505 repeat_type += op_type;
3506
3507 /* A minimum of zero is handled either as the special case * or ?, or as
3508 an UPTO, with the maximum given. */
3509
3510 if (repeat_min == 0)
3511 {
3512 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3513 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3514 else
3515 {
3516 *code++ = OP_UPTO + repeat_type;
3517 PUT2INC(code, 0, repeat_max);
3518 }
3519 }
3520
3521 /* A repeat minimum of 1 is optimized into some special cases. If the
3522 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3523 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3524 one less than the maximum. */
3525
3526 else if (repeat_min == 1)
3527 {
3528 if (repeat_max == -1)
3529 *code++ = OP_PLUS + repeat_type;
3530 else
3531 {
3532 code = oldcode; /* leave previous item in place */
3533 if (repeat_max == 1) goto END_REPEAT;
3534 *code++ = OP_UPTO + repeat_type;
3535 PUT2INC(code, 0, repeat_max - 1);
3536 }
3537 }
3538
3539 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3540 handled as an EXACT followed by an UPTO. */
3541
3542 else
3543 {
3544 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3545 PUT2INC(code, 0, repeat_min);
3546
3547 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3548 we have to insert the character for the previous code. For a repeated
3549 Unicode property match, there are two extra bytes that define the
3550 required property. In UTF-8 mode, long characters have their length in
3551 c, with the 0x80 bit as a flag. */
3552
3553 if (repeat_max < 0)
3554 {
3555 #ifdef SUPPORT_UTF8
3556 if (utf8 && c >= 128)
3557 {
3558 memcpy(code, utf8_char, c & 7);
3559 code += c & 7;
3560 }
3561 else
3562 #endif
3563 {
3564 *code++ = c;
3565 if (prop_type >= 0)
3566 {
3567 *code++ = prop_type;
3568 *code++ = prop_value;
3569 }
3570 }
3571 *code++ = OP_STAR + repeat_type;
3572 }
3573
3574 /* Else insert an UPTO if the max is greater than the min, again
3575 preceded by the character, for the previously inserted code. If the
3576 UPTO is just for 1 instance, we can use QUERY instead. */
3577
3578 else if (repeat_max != repeat_min)
3579 {
3580 #ifdef SUPPORT_UTF8
3581 if (utf8 && c >= 128)
3582 {
3583 memcpy(code, utf8_char, c & 7);
3584 code += c & 7;
3585 }
3586 else
3587 #endif
3588 *code++ = c;
3589 if (prop_type >= 0)
3590 {
3591 *code++ = prop_type;
3592 *code++ = prop_value;
3593 }
3594 repeat_max -= repeat_min;
3595
3596 if (repeat_max == 1)
3597 {
3598 *code++ = OP_QUERY + repeat_type;
3599 }
3600 else
3601 {
3602 *code++ = OP_UPTO + repeat_type;
3603 PUT2INC(code, 0, repeat_max);
3604 }
3605 }
3606 }
3607
3608 /* The character or character type itself comes last in all cases. */
3609
3610 #ifdef SUPPORT_UTF8
3611 if (utf8 && c >= 128)
3612 {
3613 memcpy(code, utf8_char, c & 7);
3614 code += c & 7;
3615 }
3616 else
3617 #endif
3618 *code++ = c;
3619
3620 /* For a repeated Unicode property match, there are two extra bytes that
3621 define the required property. */
3622
3623 #ifdef SUPPORT_UCP
3624 if (prop_type >= 0)
3625 {
3626 *code++ = prop_type;
3627 *code++ = prop_value;
3628 }
3629 #endif
3630 }
3631
3632 /* If previous was a character class or a back reference, we put the repeat
3633 stuff after it, but just skip the item if the repeat was {0,0}. */
3634
3635 else if (*previous == OP_CLASS ||
3636 *previous == OP_NCLASS ||
3637 #ifdef SUPPORT_UTF8
3638 *previous == OP_XCLASS ||
3639 #endif
3640 *previous == OP_REF)
3641 {
3642 if (repeat_max == 0)
3643 {
3644 code = previous;
3645 goto END_REPEAT;
3646 }
3647
3648 /* All real repeats make it impossible to handle partial matching (maybe
3649 one day we will be able to remove this restriction). */
3650
3651 if (repeat_max != 1) cd->nopartial = TRUE;
3652
3653 if (repeat_min == 0 && repeat_max == -1)
3654 *code++ = OP_CRSTAR + repeat_type;
3655 else if (repeat_min == 1 && repeat_max == -1)
3656 *code++ = OP_CRPLUS + repeat_type;
3657 else if (repeat_min == 0 && repeat_max == 1)
3658 *code++ = OP_CRQUERY + repeat_type;
3659 else
3660 {
3661 *code++ = OP_CRRANGE + repeat_type;
3662 PUT2INC(code, 0, repeat_min);
3663 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3664 PUT2INC(code, 0, repeat_max);
3665 }
3666 }
3667
3668 /* If previous was a bracket group, we may have to replicate it in certain
3669 cases. */
3670
3671 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3672 *previous == OP_ONCE || *previous == OP_COND)
3673 {
3674 register int i;
3675 int ketoffset = 0;
3676 int len = code - previous;
3677 uschar *bralink = NULL;
3678
3679 /* Repeating a DEFINE group is pointless */
3680
3681 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3682 {
3683 *errorcodeptr = ERR55;
3684 goto FAILED;
3685 }
3686
3687 /* If the maximum repeat count is unlimited, find the end of the bracket
3688 by scanning through from the start, and compute the offset back to it
3689 from the current code pointer. There may be an OP_OPT setting following
3690 the final KET, so we can't find the end just by going back from the code
3691 pointer. */
3692
3693 if (repeat_max == -1)
3694 {
3695 register uschar *ket = previous;
3696 do ket += GET(ket, 1); while (*ket != OP_KET);
3697 ketoffset = code - ket;
3698 }
3699
3700 /* The case of a zero minimum is special because of the need to stick
3701 OP_BRAZERO in front of it, and because the group appears once in the
3702 data, whereas in other cases it appears the minimum number of times. For
3703 this reason, it is simplest to treat this case separately, as otherwise
3704 the code gets far too messy. There are several special subcases when the
3705 minimum is zero. */
3706
3707 if (repeat_min == 0)
3708 {
3709 /* If the maximum is also zero, we just omit the group from the output
3710 altogether. */
3711
3712 if (repeat_max == 0)
3713 {
3714 code = previous;
3715 goto END_REPEAT;
3716 }
3717
3718 /* If the maximum is 1 or unlimited, we just have to stick in the
3719 BRAZERO and do no more at this point. However, we do need to adjust
3720 any OP_RECURSE calls inside the group that refer to the group itself or
3721 any internal or forward referenced group, because the offset is from
3722 the start of the whole regex. Temporarily terminate the pattern while
3723 doing this. */
3724
3725 if (repeat_max <= 1)
3726 {
3727 *code = OP_END;
3728 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3729 memmove(previous+1, previous, len);
3730 code++;
3731 *previous++ = OP_BRAZERO + repeat_type;
3732 }
3733
3734 /* If the maximum is greater than 1 and limited, we have to replicate
3735 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3736 The first one has to be handled carefully because it's the original
3737 copy, which has to be moved up. The remainder can be handled by code
3738 that is common with the non-zero minimum case below. We have to
3739 adjust the value of repeat_max, since one less copy is required. Once
3740 again, we may have to adjust any OP_RECURSE calls inside the group. */
3741
3742 else
3743 {
3744 int offset;
3745 *code = OP_END;
3746 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3747 memmove(previous + 2 + LINK_SIZE, previous, len);
3748 code += 2 + LINK_SIZE;
3749 *previous++ = OP_BRAZERO + repeat_type;
3750 *previous++ = OP_BRA;
3751
3752 /* We chain together the bracket offset fields that have to be
3753 filled in later when the ends of the brackets are reached. */
3754
3755 offset = (bralink == NULL)? 0 : previous - bralink;
3756 bralink = previous;
3757 PUTINC(previous, 0, offset);
3758 }
3759
3760 repeat_max--;
3761 }
3762
3763 /* If the minimum is greater than zero, replicate the group as many
3764 times as necessary, and adjust the maximum to the number of subsequent
3765 copies that we need. If we set a first char from the group, and didn't
3766 set a required char, copy the latter from the former. If there are any
3767 forward reference subroutine calls in the group, there will be entries on
3768 the workspace list; replicate these with an appropriate increment. */
3769
3770 else
3771 {
3772 if (repeat_min > 1)
3773 {
3774 /* In the pre-compile phase, we don't actually do the replication. We
3775 just adjust the length as if we had. Do some paranoid checks for
3776 potential integer overflow. */
3777
3778 if (lengthptr != NULL)
3779 {
3780 int delta = (repeat_min - 1)*length_prevgroup;
3781 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3782 (double)INT_MAX ||
3783 OFLOW_MAX - *lengthptr < delta)
3784 {
3785 *errorcodeptr = ERR20;
3786 goto FAILED;
3787 }
3788 *lengthptr += delta;
3789 }
3790
3791 /* This is compiling for real */
3792
3793 else
3794 {
3795 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3796 for (i = 1; i < repeat_min; i++)
3797 {
3798 uschar *hc;
3799 uschar *this_hwm = cd->hwm;
3800 memcpy(code, previous, len);
3801 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3802 {
3803 PUT(cd->hwm, 0, GET(hc, 0) + len);
3804 cd->hwm += LINK_SIZE;
3805 }
3806 save_hwm = this_hwm;
3807 code += len;
3808 }
3809 }
3810 }
3811
3812 if (repeat_max > 0) repeat_max -= repeat_min;
3813 }
3814
3815 /* This code is common to both the zero and non-zero minimum cases. If
3816 the maximum is limited, it replicates the group in a nested fashion,
3817 remembering the bracket starts on a stack. In the case of a zero minimum,
3818 the first one was set up above. In all cases the repeat_max now specifies
3819 the number of additional copies needed. Again, we must remember to
3820 replicate entries on the forward reference list. */
3821
3822 if (repeat_max >= 0)
3823 {
3824 /* In the pre-compile phase, we don't actually do the replication. We
3825 just adjust the length as if we had. For each repetition we must add 1
3826 to the length for BRAZERO and for all but the last repetition we must
3827 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3828 paranoid checks to avoid integer overflow. */
3829
3830 if (lengthptr != NULL && repeat_max > 0)
3831 {
3832 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3833 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3834 if ((double)repeat_max *
3835 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3836 > (double)INT_MAX ||
3837 OFLOW_MAX - *lengthptr < delta)
3838 {
3839 *errorcodeptr = ERR20;
3840 goto FAILED;
3841 }
3842 *lengthptr += delta;
3843 }
3844
3845 /* This is compiling for real */
3846
3847 else for (i = repeat_max - 1; i >= 0; i--)
3848 {
3849 uschar *hc;
3850 uschar *this_hwm = cd->hwm;
3851
3852 *code++ = OP_BRAZERO + repeat_type;
3853
3854 /* All but the final copy start a new nesting, maintaining the
3855 chain of brackets outstanding. */
3856
3857 if (i != 0)
3858 {
3859 int offset;
3860 *code++ = OP_BRA;
3861 offset = (bralink == NULL)? 0 : code - bralink;
3862 bralink = code;
3863 PUTINC(code, 0, offset);
3864 }
3865
3866 memcpy(code, previous, len);
3867 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3868 {
3869 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3870 cd->hwm += LINK_SIZE;
3871 }
3872 save_hwm = this_hwm;
3873 code += len;
3874 }
3875
3876 /* Now chain through the pending brackets, and fill in their length
3877 fields (which are holding the chain links pro tem). */
3878
3879 while (bralink != NULL)
3880 {
3881 int oldlinkoffset;
3882 int offset = code - bralink + 1;
3883 uschar *bra = code - offset;
3884 oldlinkoffset = GET(bra, 1);
3885 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3886 *code++ = OP_KET;
3887 PUTINC(code, 0, offset);
3888 PUT(bra, 1, offset);
3889 }
3890 }
3891
3892 /* If the maximum is unlimited, set a repeater in the final copy. We
3893 can't just offset backwards from the current code point, because we
3894 don't know if there's been an options resetting after the ket. The
3895 correct offset was computed above.
3896
3897 Then, when we are doing the actual compile phase, check to see whether
3898 this group is a non-atomic one that could match an empty string. If so,
3899 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3900 that runtime checking can be done. [This check is also applied to
3901 atomic groups at runtime, but in a different way.] */
3902
3903 else
3904 {
3905 uschar *ketcode = code - ketoffset;
3906 uschar *bracode = ketcode - GET(ketcode, 1);
3907 *ketcode = OP_KETRMAX + repeat_type;
3908 if (lengthptr == NULL && *bracode != OP_ONCE)
3909 {
3910 uschar *scode = bracode;
3911 do
3912 {
3913 if (could_be_empty_branch(scode, ketcode, utf8))
3914 {
3915 *bracode += OP_SBRA - OP_BRA;
3916 break;
3917 }
3918 scode += GET(scode, 1);
3919 }
3920 while (*scode == OP_ALT);
3921 }
3922 }
3923 }
3924
3925 /* Else there's some kind of shambles */
3926
3927 else
3928 {
3929 *errorcodeptr = ERR11;
3930 goto FAILED;
3931 }
3932
3933 /* If the character following a repeat is '+', or if certain optimization
3934 tests above succeeded, possessive_quantifier is TRUE. For some of the
3935 simpler opcodes, there is a special alternative opcode for this. For
3936 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3937 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3938 but the special opcodes can optimize it a bit. The repeated item starts at
3939 tempcode, not at previous, which might be the first part of a string whose
3940 (former) last char we repeated.
3941
3942 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3943 an 'upto' may follow. We skip over an 'exact' item, and then test the
3944 length of what remains before proceeding. */
3945
3946 if (possessive_quantifier)
3947 {
3948 int len;
3949 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3950 *tempcode == OP_NOTEXACT)
3951 tempcode += _pcre_OP_lengths[*tempcode];
3952 len = code - tempcode;
3953 if (len > 0) switch (*tempcode)
3954 {
3955 case OP_STAR: *tempcode = OP_POSSTAR; break;
3956 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3957 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3958 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3959
3960 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3961 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3962 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3963 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3964
3965 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3966 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3967 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3968 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3969
3970 default:
3971 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3972 code += 1 + LINK_SIZE;
3973 len += 1 + LINK_SIZE;
3974 tempcode[0] = OP_ONCE;
3975 *code++ = OP_KET;
3976 PUTINC(code, 0, len);
3977 PUT(tempcode, 1, len);
3978 break;
3979 }
3980 }
3981
3982 /* In all cases we no longer have a previous item. We also set the
3983 "follows varying string" flag for subsequently encountered reqbytes if
3984 it isn't already set and we have just passed a varying length item. */
3985
3986 END_REPEAT:
3987 previous = NULL;
3988 cd->req_varyopt |= reqvary;
3989 break;
3990
3991
3992 /* ===================================================================*/
3993 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3994 lookbehind or option setting or condition or all the other extended
3995 parenthesis forms. */
3996
3997 case '(':
3998 newoptions = options;
3999 skipbytes = 0;
4000 bravalue = OP_CBRA;
4001 save_hwm = cd->hwm;
4002 reset_bracount = FALSE;
4003
4004 /* First deal with various "verbs" that can be introduced by '*'. */
4005
4006 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4007 {
4008 int i, namelen;
4009 const uschar *name = ++ptr;
4010 previous = NULL;
4011 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4012 if (*ptr == ':')
4013 {
4014 *errorcodeptr = ERR59; /* Not supported */
4015 goto FAILED;
4016 }
4017 if (*ptr != ')')
4018 {
4019 *errorcodeptr = ERR60;
4020 goto FAILED;
4021 }
4022 namelen = ptr - name;
4023 for (i = 0; i < verbcount; i++)
4024 {
4025 if (namelen == verbs[i].len &&
4026 strncmp((char *)name, verbs[i].name, namelen) == 0)
4027 {
4028 *code = verbs[i].op;
4029 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4030 break;
4031 }
4032 }
4033 if (i < verbcount) continue;
4034 *errorcodeptr = ERR60;
4035 goto FAILED;
4036 }
4037
4038 /* Deal with the extended parentheses; all are introduced by '?', and the
4039 appearance of any of them means that this is not a capturing group. */
4040
4041 else if (*ptr == '?')
4042 {
4043 int i, set, unset, namelen;
4044 int *optset;
4045 const uschar *name;
4046 uschar *slot;
4047
4048 switch (*(++ptr))
4049 {
4050 case '#': /* Comment; skip to ket */
4051 ptr++;
4052 while (*ptr != 0 && *ptr != ')') ptr++;
4053 if (*ptr == 0)
4054 {
4055 *errorcodeptr = ERR18;
4056 goto FAILED;
4057 }
4058 continue;
4059
4060
4061 /* ------------------------------------------------------------ */
4062 case '|': /* Reset capture count for each branch */
4063 reset_bracount = TRUE;
4064 /* Fall through */
4065
4066 /* ------------------------------------------------------------ */
4067 case ':': /* Non-capturing bracket */
4068 bravalue = OP_BRA;
4069 ptr++;
4070 break;
4071
4072
4073 /* ------------------------------------------------------------ */
4074 case '(':
4075 bravalue = OP_COND; /* Conditional group */
4076
4077 /* A condition can be an assertion, a number (referring to a numbered
4078 group), a name (referring to a named group), or 'R', referring to
4079 recursion. R<digits> and R&name are also permitted for recursion tests.
4080
4081 There are several syntaxes for testing a named group: (?(name)) is used
4082 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4083
4084 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4085 be the recursive thing or the name 'R' (and similarly for 'R' followed
4086 by digits), and (b) a number could be a name that consists of digits.
4087 In both cases, we look for a name first; if not found, we try the other
4088 cases. */
4089
4090 /* For conditions that are assertions, check the syntax, and then exit
4091 the switch. This will take control down to where bracketed groups,
4092 including assertions, are processed. */
4093
4094 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4095 break;
4096
4097 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4098 below), and all need to skip 3 bytes at the start of the group. */
4099
4100 code[1+LINK_SIZE] = OP_CREF;
4101 skipbytes = 3;
4102 refsign = -1;
4103
4104 /* Check for a test for recursion in a named group. */
4105
4106 if (ptr[1] == 'R' && ptr[2] == '&')
4107 {
4108 terminator = -1;
4109 ptr += 2;
4110 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4111 }
4112
4113 /* Check for a test for a named group's having been set, using the Perl
4114 syntax (?(<name>) or (?('name') */
4115
4116 else if (ptr[1] == '<')
4117 {
4118 terminator = '>';
4119 ptr++;
4120 }
4121 else if (ptr[1] == '\'')
4122 {
4123 terminator = '\'';
4124 ptr++;
4125 }
4126 else
4127 {
4128 terminator = 0;
4129 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4130 }
4131
4132 /* We now expect to read a name; anything else is an error */
4133
4134 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4135 {
4136 ptr += 1; /* To get the right offset */
4137 *errorcodeptr = ERR28;
4138 goto FAILED;
4139 }
4140
4141 /* Read the name, but also get it as a number if it's all digits */
4142
4143 recno = 0;
4144 name = ++ptr;
4145 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4146 {
4147 if (recno >= 0)
4148 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4149 recno * 10 + *ptr - '0' : -1;
4150 ptr++;
4151 }
4152 namelen = ptr - name;
4153
4154 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4155 {
4156 ptr--; /* Error offset */
4157 *errorcodeptr = ERR26;
4158 goto FAILED;
4159 }
4160
4161 /* Do no further checking in the pre-compile phase. */
4162
4163 if (lengthptr != NULL) break;
4164
4165 /* In the real compile we do the work of looking for the actual
4166 reference. If the string started with "+" or "-" we require the rest to
4167 be digits, in which case recno will be set. */
4168
4169 if (refsign > 0)
4170 {
4171 if (recno <= 0)
4172 {
4173 *errorcodeptr = ERR58;
4174 goto FAILED;
4175 }
4176 if (refsign == '-')
4177 {
4178 recno = cd->bracount - recno + 1;
4179 if (recno <= 0)
4180 {
4181 *errorcodeptr = ERR15;
4182 goto FAILED;
4183 }
4184 }
4185 else recno += cd->bracount;
4186 PUT2(code, 2+LINK_SIZE, recno);
4187 break;
4188 }
4189
4190 /* Otherwise (did not start with "+" or "-"), start by looking for the
4191 name. */
4192
4193 slot = cd->name_table;
4194 for (i = 0; i < cd->names_found; i++)
4195 {
4196 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4197 slot += cd->name_entry_size;
4198 }
4199
4200 /* Found a previous named subpattern */
4201
4202 if (i < cd->names_found)
4203 {
4204 recno = GET2(slot, 0);
4205 PUT2(code, 2+LINK_SIZE, recno);
4206 }
4207
4208 /* Search the pattern for a forward reference */
4209
4210 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4211 (options & PCRE_EXTENDED) != 0)) > 0)
4212 {
4213 PUT2(code, 2+LINK_SIZE, i);
4214 }
4215
4216 /* If terminator == 0 it means that the name followed directly after
4217 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4218 some further alternatives to try. For the cases where terminator != 0
4219 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4220 now checked all the possibilities, so give an error. */
4221
4222 else if (terminator != 0)
4223 {
4224 *errorcodeptr = ERR15;
4225 goto FAILED;
4226 }
4227
4228 /* Check for (?(R) for recursion. Allow digits after R to specify a
4229 specific group number. */
4230
4231 else if (*name == 'R')
4232 {
4233 recno = 0;
4234 for (i = 1; i < namelen; i++)
4235 {
4236 if ((digitab[name[i]] & ctype_digit) == 0)
4237 {
4238 *errorcodeptr = ERR15;
4239 goto FAILED;
4240 }
4241 recno = recno * 10 + name[i] - '0';
4242 }
4243 if (recno == 0) recno = RREF_ANY;
4244 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4245 PUT2(code, 2+LINK_SIZE, recno);
4246 }
4247
4248 /* Similarly, check for the (?(DEFINE) "condition", which is always
4249 false. */
4250
4251 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4252 {
4253 code[1+LINK_SIZE] = OP_DEF;
4254 skipbytes = 1;
4255 }
4256
4257 /* Check for the "name" actually being a subpattern number. */
4258
4259 else if (recno > 0)
4260 {
4261 PUT2(code, 2+LINK_SIZE, recno);
4262 }
4263
4264 /* Either an unidentified subpattern, or a reference to (?(0) */
4265
4266 else
4267 {
4268 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4269 goto FAILED;
4270 }
4271 break;
4272
4273
4274 /* ------------------------------------------------------------ */
4275 case '=': /* Positive lookahead */
4276 bravalue = OP_ASSERT;
4277 ptr++;
4278 break;
4279
4280
4281 /* ------------------------------------------------------------ */
4282 case '!': /* Negative lookahead */
4283 ptr++;
4284 if (*ptr == ')') /* Optimize (?!) */
4285 {
4286 *code++ = OP_FAIL;
4287 previous = NULL;
4288 continue;
4289 }
4290 bravalue = OP_ASSERT_NOT;
4291 break;
4292
4293
4294 /* ------------------------------------------------------------ */
4295 case '<': /* Lookbehind or named define */
4296 switch (ptr[1])
4297 {
4298 case '=': /* Positive lookbehind */
4299 bravalue = OP_ASSERTBACK;
4300 ptr += 2;
4301 break;
4302
4303 case '!': /* Negative lookbehind */
4304 bravalue = OP_ASSERTBACK_NOT;
4305 ptr += 2;
4306 break;
4307
4308 default: /* Could be name define, else bad */
4309 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4310 ptr++; /* Correct offset for error */
4311 *errorcodeptr = ERR24;
4312 goto FAILED;
4313 }
4314 break;
4315
4316
4317 /* ------------------------------------------------------------ */
4318 case '>': /* One-time brackets */
4319 bravalue = OP_ONCE;
4320 ptr++;
4321 break;
4322
4323
4324 /* ------------------------------------------------------------ */
4325 case 'C': /* Callout - may be followed by digits; */
4326 previous_callout = code; /* Save for later completion */
4327 after_manual_callout = 1; /* Skip one item before completing */
4328 *code++ = OP_CALLOUT;
4329 {
4330 int n = 0;
4331 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4332 n = n * 10 + *ptr - '0';
4333 if (*ptr != ')')
4334 {
4335 *errorcodeptr = ERR39;
4336 goto FAILED;
4337 }
4338 if (n > 255)
4339 {
4340 *errorcodeptr = ERR38;
4341 goto FAILED;
4342 }
4343 *code++ = n;
4344 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4345 PUT(code, LINK_SIZE, 0); /* Default length */
4346 code += 2 * LINK_SIZE;
4347 }
4348 previous = NULL;
4349 continue;
4350
4351
4352 /* ------------------------------------------------------------ */
4353 case 'P': /* Python-style named subpattern handling */
4354 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4355 {
4356 is_recurse = *ptr == '>';
4357 terminator = ')';
4358 goto NAMED_REF_OR_RECURSE;
4359 }
4360 else if (*ptr != '<') /* Test for Python-style definition */
4361 {
4362 *errorcodeptr = ERR41;
4363 goto FAILED;
4364 }
4365 /* Fall through to handle (?P< as (?< is handled */
4366
4367
4368 /* ------------------------------------------------------------ */
4369 DEFINE_NAME: /* Come here from (?< handling */
4370 case '\'':
4371 {
4372 terminator = (*ptr == '<')? '>' : '\'';
4373 name = ++ptr;
4374
4375 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4376 namelen = ptr - name;
4377
4378 /* In the pre-compile phase, just do a syntax check. */
4379
4380 if (lengthptr != NULL)
4381 {
4382 if (*ptr != terminator)
4383 {
4384 *errorcodeptr = ERR42;
4385 goto FAILED;
4386 }
4387 if (cd->names_found >= MAX_NAME_COUNT)
4388 {
4389 *errorcodeptr = ERR49;
4390 goto FAILED;
4391 }
4392 if (namelen + 3 > cd->name_entry_size)
4393 {
4394 cd->name_entry_size = namelen + 3;
4395 if (namelen > MAX_NAME_SIZE)
4396 {
4397 *errorcodeptr = ERR48;
4398 goto FAILED;
4399 }
4400 }
4401 }
4402
4403 /* In the real compile, create the entry in the table */
4404
4405 else
4406 {
4407 slot = cd->name_table;
4408 for (i = 0; i < cd->names_found; i++)
4409 {
4410 int crc = memcmp(name, slot+2, namelen);
4411 if (crc == 0)
4412 {
4413 if (slot[2+namelen] == 0)
4414 {
4415 if ((options & PCRE_DUPNAMES) == 0)
4416 {
4417 *errorcodeptr = ERR43;
4418 goto FAILED;
4419 }
4420 }
4421 else crc = -1; /* Current name is substring */
4422 }
4423 if (crc < 0)
4424 {
4425 memmove(slot + cd->name_entry_size, slot,
4426 (cd->names_found - i) * cd->name_entry_size);
4427 break;
4428 }
4429 slot += cd->name_entry_size;
4430 }
4431
4432 PUT2(slot, 0, cd->bracount + 1);
4433 memcpy(slot + 2, name, namelen);
4434 slot[2+namelen] = 0;
4435 }
4436 }
4437
4438 /* In both cases, count the number of names we've encountered. */
4439
4440 ptr++; /* Move past > or ' */
4441 cd->names_found++;
4442 goto NUMBERED_GROUP;
4443
4444
4445 /* ------------------------------------------------------------ */
4446 case '&': /* Perl recursion/subroutine syntax */
4447 terminator = ')';
4448 is_recurse = TRUE;
4449 /* Fall through */
4450
4451 /* We come here from the Python syntax above that handles both
4452 references (?P=name) and recursion (?P>name), as well as falling
4453 through from the Perl recursion syntax (?&name). */
4454
4455 NAMED_REF_OR_RECURSE:
4456 name = ++ptr;
4457 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4458 namelen = ptr - name;
4459
4460 /* In the pre-compile phase, do a syntax check and set a dummy
4461 reference number. */
4462
4463 if (lengthptr != NULL)
4464 {
4465 if (*ptr != terminator)
4466 {
4467 *errorcodeptr = ERR42;
4468 goto FAILED;
4469 }
4470 if (namelen > MAX_NAME_SIZE)
4471 {
4472 *errorcodeptr = ERR48;
4473 goto FAILED;
4474 }
4475 recno = 0;
4476 }
4477
4478 /* In the real compile, seek the name in the table */
4479
4480 else
4481 {
4482 slot = cd->name_table;
4483 for (i = 0; i < cd->names_found; i++)
4484 {
4485 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4486 slot += cd->name_entry_size;
4487 }
4488
4489 if (i < cd->names_found) /* Back reference */
4490 {
4491 recno = GET2(slot, 0);
4492 }
4493 else if ((recno = /* Forward back reference */
4494 find_parens(ptr, cd->bracount, name, namelen,
4495 (options & PCRE_EXTENDED) != 0)) <= 0)
4496 {
4497 *errorcodeptr = ERR15;
4498 goto FAILED;
4499 }
4500 }
4501
4502 /* In both phases, we can now go to the code that handles numerical
4503 recursion or backreferences. */
4504
4505 if (is_recurse) goto HANDLE_RECURSION;
4506 else goto HANDLE_REFERENCE;
4507
4508
4509 /* ------------------------------------------------------------ */
4510 case 'R': /* Recursion */
4511 ptr++; /* Same as (?0) */
4512 /* Fall through */
4513
4514
4515 /* ------------------------------------------------------------ */
4516 case '-': case '+':
4517 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4518 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4519 {
4520 const uschar *called;
4521
4522 if ((refsign = *ptr) == '+') ptr++;
4523 else if (refsign == '-')
4524 {
4525 if ((digitab[ptr[1]] & ctype_digit) == 0)
4526 goto OTHER_CHAR_AFTER_QUERY;
4527 ptr++;
4528 }
4529
4530 recno = 0;
4531 while((digitab[*ptr] & ctype_digit) != 0)
4532 recno = recno * 10 + *ptr++ - '0';
4533
4534 if (*ptr != ')')
4535 {
4536 *errorcodeptr = ERR29;
4537 goto FAILED;
4538 }
4539
4540 if (refsign == '-')
4541 {
4542 if (recno == 0)
4543 {
4544 *errorcodeptr = ERR58;
4545 goto FAILED;
4546 }
4547 recno = cd->bracount - recno + 1;
4548 if (recno <= 0)
4549 {
4550 *errorcodeptr = ERR15;
4551 goto FAILED;
4552 }
4553 }
4554 else if (refsign == '+')
4555 {
4556 if (recno == 0)
4557 {
4558 *errorcodeptr = ERR58;
4559 goto FAILED;
4560 }
4561 recno += cd->bracount;
4562 }
4563
4564 /* Come here from code above that handles a named recursion */
4565
4566 HANDLE_RECURSION:
4567
4568 previous = code;
4569 called = cd->start_code;
4570
4571 /* When we are actually compiling, find the bracket that is being
4572 referenced. Temporarily end the regex in case it doesn't exist before
4573 this point. If we end up with a forward reference, first check that
4574 the bracket does occur later so we can give the error (and position)
4575 now. Then remember this forward reference in the workspace so it can
4576 be filled in at the end. */
4577
4578 if (lengthptr == NULL)
4579 {
4580 *code = OP_END;
4581 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4582
4583 /* Forward reference */
4584
4585 if (called == NULL)
4586 {
4587 if (find_parens(ptr, cd->bracount, NULL, recno,
4588 (options & PCRE_EXTENDED) != 0) < 0)
4589 {
4590 *errorcodeptr = ERR15;
4591 goto FAILED;
4592 }
4593 called = cd->start_code + recno;
4594 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4595 }
4596
4597 /* If not a forward reference, and the subpattern is still open,
4598 this is a recursive call. We check to see if this is a left
4599 recursion that could loop for ever, and diagnose that case. */
4600
4601 else if (GET(called, 1) == 0 &&
4602 could_be_empty(called, code, bcptr, utf8))
4603 {
4604 *errorcodeptr = ERR40;
4605 goto FAILED;
4606 }
4607 }
4608
4609 /* Insert the recursion/subroutine item, automatically wrapped inside
4610 "once" brackets. Set up a "previous group" length so that a
4611 subsequent quantifier will work. */
4612
4613 *code = OP_ONCE;
4614 PUT(code, 1, 2 + 2*LINK_SIZE);
4615 code += 1 + LINK_SIZE;
4616
4617 *code = OP_RECURSE;
4618 PUT(code, 1, called - cd->start_code);
4619 code += 1 + LINK_SIZE;
4620
4621 *code = OP_KET;
4622 PUT(code, 1, 2 + 2*LINK_SIZE);
4623 code += 1 + LINK_SIZE;
4624
4625 length_prevgroup = 3 + 3*LINK_SIZE;
4626 }
4627
4628 /* Can't determine a first byte now */
4629
4630 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4631 continue;
4632
4633
4634 /* ------------------------------------------------------------ */
4635 default: /* Other characters: check option setting */
4636 OTHER_CHAR_AFTER_QUERY:
4637 set = unset = 0;
4638 optset = &set;
4639
4640 while (*ptr != ')' && *ptr != ':')
4641 {
4642 switch (*ptr++)
4643 {
4644 case '-': optset = &unset; break;
4645
4646 case 'J': /* Record that it changed in the external options */
4647 *optset |= PCRE_DUPNAMES;
4648 cd->external_options |= PCRE_JCHANGED;
4649 break;
4650
4651 case 'i': *optset |= PCRE_CASELESS; break;
4652 case 'm': *optset |= PCRE_MULTILINE; break;
4653 case 's': *optset |= PCRE_DOTALL; break;
4654 case 'x': *optset |= PCRE_EXTENDED; break;
4655 case 'U': *optset |= PCRE_UNGREEDY; break;
4656 case 'X': *optset |= PCRE_EXTRA; break;
4657
4658 default: *errorcodeptr = ERR12;
4659 ptr--; /* Correct the offset */
4660 goto FAILED;
4661 }
4662 }
4663
4664 /* Set up the changed option bits, but don't change anything yet. */
4665
4666 newoptions = (options | set) & (~unset);
4667
4668 /* If the options ended with ')' this is not the start of a nested
4669 group with option changes, so the options change at this level. If this
4670 item is right at the start of the pattern, the options can be
4671 abstracted and made external in the pre-compile phase, and ignored in
4672 the compile phase. This can be helpful when matching -- for instance in
4673 caseless checking of required bytes.
4674
4675 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4676 definitely *not* at the start of the pattern because something has been
4677 compiled. In the pre-compile phase, however, the code pointer can have
4678 that value after the start, because it gets reset as code is discarded
4679 during the pre-compile. However, this can happen only at top level - if
4680 we are within parentheses, the starting BRA will still be present. At
4681 any parenthesis level, the length value can be used to test if anything
4682 has been compiled at that level. Thus, a test for both these conditions
4683 is necessary to ensure we correctly detect the start of the pattern in
4684 both phases.
4685
4686 If we are not at the pattern start, compile code to change the ims
4687 options if this setting actually changes any of them. We also pass the
4688 new setting back so that it can be put at the start of any following
4689 branches, and when this group ends (if we are in a group), a resetting
4690 item can be compiled. */
4691
4692 if (*ptr == ')')
4693 {
4694 if (code == cd->start_code + 1 + LINK_SIZE &&
4695 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4696 {
4697 cd->external_options = newoptions;
4698 options = newoptions;
4699 }
4700 else
4701 {
4702 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4703 {
4704 *code++ = OP_OPT;
4705 *code++ = newoptions & PCRE_IMS;
4706 }
4707
4708 /* Change options at this level, and pass them back for use
4709 in subsequent branches. Reset the greedy defaults and the case
4710 value for firstbyte and reqbyte. */
4711
4712 *optionsptr = options = newoptions;
4713 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4714 greedy_non_default = greedy_default ^ 1;
4715 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4716 }
4717
4718 previous = NULL; /* This item can't be repeated */
4719 continue; /* It is complete */
4720 }
4721
4722 /* If the options ended with ':' we are heading into a nested group
4723 with possible change of options. Such groups are non-capturing and are
4724 not assertions of any kind. All we need to do is skip over the ':';
4725 the newoptions value is handled below. */
4726
4727 bravalue = OP_BRA;
4728 ptr++;
4729 } /* End of switch for character following (? */
4730 } /* End of (? handling */
4731
4732 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4733 all unadorned brackets become non-capturing and behave like (?:...)
4734 brackets. */
4735
4736 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4737 {
4738 bravalue = OP_BRA;
4739 }
4740
4741 /* Else we have a capturing group. */
4742
4743 else
4744 {
4745 NUMBERED_GROUP:
4746 cd->bracount += 1;
4747 PUT2(code, 1+LINK_SIZE, cd->bracount);
4748 skipbytes = 2;
4749 }
4750
4751 /* Process nested bracketed regex. Assertions may not be repeated, but
4752 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4753 non-register variable in order to be able to pass its address because some
4754 compilers complain otherwise. Pass in a new setting for the ims options if
4755 they have changed. */
4756
4757 previous = (bravalue >= OP_ONCE)? code : NULL;
4758 *code = bravalue;
4759 tempcode = code;
4760 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4761 length_prevgroup = 0; /* Initialize for pre-compile phase */
4762
4763 if (!compile_regex(
4764 newoptions, /* The complete new option state */
4765 options & PCRE_IMS, /* The previous ims option state */
4766 &tempcode, /* Where to put code (updated) */
4767 &ptr, /* Input pointer (updated) */
4768 errorcodeptr, /* Where to put an error message */
4769 (bravalue == OP_ASSERTBACK ||
4770 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4771 reset_bracount, /* True if (?| group */
4772 skipbytes, /* Skip over bracket number */
4773 &subfirstbyte, /* For possible first char */
4774 &subreqbyte, /* For possible last char */
4775 bcptr, /* Current branch chain */
4776 cd, /* Tables block */
4777 (lengthptr == NULL)? NULL : /* Actual compile phase */
4778 &length_prevgroup /* Pre-compile phase */
4779 ))
4780 goto FAILED;
4781
4782 /* At the end of compiling, code is still pointing to the start of the
4783 group, while tempcode has been updated to point past the end of the group
4784 and any option resetting that may follow it. The pattern pointer (ptr)
4785 is on the bracket. */
4786
4787 /* If this is a conditional bracket, check that there are no more than
4788 two branches in the group, or just one if it's a DEFINE group. We do this
4789 in the real compile phase, not in the pre-pass, where the whole group may
4790 not be available. */
4791
4792 if (bravalue == OP_COND && lengthptr == NULL)
4793 {
4794 uschar *tc = code;
4795 int condcount = 0;
4796
4797 do {
4798 condcount++;
4799 tc += GET(tc,1);
4800 }
4801 while (*tc != OP_KET);
4802
4803 /* A DEFINE group is never obeyed inline (the "condition" is always
4804 false). It must have only one branch. */
4805
4806 if (code[LINK_SIZE+1] == OP_DEF)
4807 {
4808 if (condcount > 1)
4809 {
4810 *errorcodeptr = ERR54;
4811 goto FAILED;
4812 }
4813 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4814 }
4815
4816 /* A "normal" conditional group. If there is just one branch, we must not
4817 make use of its firstbyte or reqbyte, because this is equivalent to an
4818 empty second branch. */
4819
4820 else
4821 {
4822 if (condcount > 2)
4823 {
4824 *errorcodeptr = ERR27;
4825 goto FAILED;
4826 }
4827 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4828 }
4829 }
4830
4831 /* Error if hit end of pattern */
4832
4833 if (*ptr != ')')
4834 {
4835 *errorcodeptr = ERR14;
4836 goto FAILED;
4837 }
4838
4839 /* In the pre-compile phase, update the length by the length of the group,
4840 less the brackets at either end. Then reduce the compiled code to just a
4841 set of non-capturing brackets so that it doesn't use much memory if it is
4842 duplicated by a quantifier.*/
4843
4844 if (lengthptr != NULL)
4845 {
4846 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4847 {
4848 *errorcodeptr = ERR20;
4849 goto FAILED;
4850 }
4851 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4852 *code++ = OP_BRA;
4853 PUTINC(code, 0, 1 + LINK_SIZE);
4854 *code++ = OP_KET;
4855 PUTINC(code, 0, 1 + LINK_SIZE);
4856 break; /* No need to waste time with special character handling */
4857 }
4858
4859 /* Otherwise update the main code pointer to the end of the group. */
4860
4861 code = tempcode;
4862
4863 /* For a DEFINE group, required and first character settings are not
4864 relevant. */
4865
4866 if (bravalue == OP_DEF) break;
4867
4868 /* Handle updating of the required and first characters for other types of
4869 group. Update for normal brackets of all kinds, and conditions with two
4870 branches (see code above). If the bracket is followed by a quantifier with
4871 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4872 zerofirstbyte outside the main loop so that they can be accessed for the
4873 back off. */
4874
4875 zeroreqbyte = reqbyte;
4876 zerofirstbyte = firstbyte;
4877 groupsetfirstbyte = FALSE;
4878
4879 if (bravalue >= OP_ONCE)
4880 {
4881 /* If we have not yet set a firstbyte in this branch, take it from the
4882 subpattern, remembering that it was set here so that a repeat of more
4883 than one can replicate it as reqbyte if necessary. If the subpattern has
4884 no firstbyte, set "none" for the whole branch. In both cases, a zero
4885 repeat forces firstbyte to "none". */
4886
4887 if (firstbyte == REQ_UNSET)
4888 {
4889 if (subfirstbyte >= 0)
4890 {
4891 firstbyte = subfirstbyte;
4892 groupsetfirstbyte = TRUE;
4893 }
4894 else firstbyte = REQ_NONE;
4895 zerofirstbyte = REQ_NONE;
4896 }
4897
4898 /* If firstbyte was previously set, convert the subpattern's firstbyte
4899 into reqbyte if there wasn't one, using the vary flag that was in
4900 existence beforehand. */
4901
4902 else if (subfirstbyte >= 0 && subreqbyte < 0)
4903 subreqbyte = subfirstbyte | tempreqvary;
4904
4905 /* If the subpattern set a required byte (or set a first byte that isn't
4906 really the first byte - see above), set it. */
4907
4908 if (subreqbyte >= 0) reqbyte = subreqbyte;
4909 }
4910
4911 /* For a forward assertion, we take the reqbyte, if set. This can be
4912 helpful if the pattern that follows the assertion doesn't set a different
4913 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4914 for an assertion, however, because it leads to an incorrect effect for patterns
4915 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4916 of a firstbyte. This is overcome by a scan at the end if there's no
4917 firstbyte, looking for an asserted first char. */
4918
4919 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4920 break; /* End of processing '(' */
4921
4922
4923 /* ===================================================================*/
4924 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4925 are arranged to be the negation of the corresponding OP_values. For the
4926 back references, the values are ESC_REF plus the reference number. Only
4927 back references and those types that consume a character may be repeated.
4928 We can test for values between ESC_b and ESC_Z for the latter; this may
4929 have to change if any new ones are ever created. */
4930
4931 case '\\':
4932 tempptr = ptr;
4933 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4934 if (*errorcodeptr != 0) goto FAILED;
4935
4936 if (c < 0)
4937 {
4938 if (-c == ESC_Q) /* Handle start of quoted string */
4939 {
4940 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4941 else inescq = TRUE;
4942 continue;
4943 }
4944
4945 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4946
4947 /* For metasequences that actually match a character, we disable the
4948 setting of a first character if it hasn't already been set. */
4949
4950 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4951 firstbyte = REQ_NONE;
4952
4953 /* Set values to reset to if this is followed by a zero repeat. */
4954
4955 zerofirstbyte = firstbyte;
4956 zeroreqbyte = reqbyte;
4957
4958 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4959 We also support \k{name} (.NET syntax) */
4960
4961 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4962 {
4963 is_recurse = FALSE;
4964 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4965 goto NAMED_REF_OR_RECURSE;
4966 }
4967
4968 /* Back references are handled specially; must disable firstbyte if
4969 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4970 ':' later. */
4971
4972 if (-c >= ESC_REF)
4973 {
4974 recno = -c - ESC_REF;
4975
4976 HANDLE_REFERENCE: /* Come here from named backref handling */
4977 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4978 previous = code;
4979 *code++ = OP_REF;
4980 PUT2INC(code, 0, recno);
4981 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4982 if (recno > cd->top_backref) cd->top_backref = recno;
4983 }
4984
4985 /* So are Unicode property matches, if supported. */
4986
4987 #ifdef SUPPORT_UCP
4988 else if (-c == ESC_P || -c == ESC_p)
4989 {
4990 BOOL negated;
4991 int pdata;
4992 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4993 if (ptype < 0) goto FAILED;
4994 previous = code;
4995 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4996 *code++ = ptype;
4997 *code++ = pdata;
4998 }
4999 #else
5000
5001 /* If Unicode properties are not supported, \X, \P, and \p are not
5002 allowed. */
5003
5004 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5005 {
5006 *errorcodeptr = ERR45;
5007 goto FAILED;
5008 }
5009 #endif
5010
5011 /* For the rest (including \X when Unicode properties are supported), we
5012 can obtain the OP value by negating the escape value. */
5013
5014 else
5015 {
5016 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5017 *code++ = -c;
5018 }
5019 continue;
5020 }
5021
5022 /* We have a data character whose value is in c. In UTF-8 mode it may have
5023 a value > 127. We set its representation in the length/buffer, and then
5024 handle it as a data character. */
5025
5026 #ifdef SUPPORT_UTF8
5027 if (utf8 && c > 127)
5028 mclength = _pcre_ord2utf8(c, mcbuffer);
5029 else
5030 #endif
5031
5032 {
5033 mcbuffer[0] = c;
5034 mclength = 1;
5035 }
5036 goto ONE_CHAR;
5037
5038
5039 /* ===================================================================*/
5040 /* Handle a literal character. It is guaranteed not to be whitespace or #
5041 when the extended flag is set. If we are in UTF-8 mode, it may be a
5042 multi-byte literal character. */
5043
5044 default:
5045 NORMAL_CHAR:
5046 mclength = 1;
5047 mcbuffer[0] = c;
5048
5049 #ifdef SUPPORT_UTF8
5050 if (utf8 && c >= 0xc0)
5051 {
5052 while ((ptr[1] & 0xc0) == 0x80)
5053 mcbuffer[mclength++] = *(++ptr);
5054 }
5055 #endif
5056
5057 /* At this point we have the character's bytes in mcbuffer, and the length
5058 in mclength. When not in UTF-8 mode, the length is always 1. */
5059
5060 ONE_CHAR:
5061 previous = code;
5062 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5063 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5064
5065 /* Remember if \r or \n were seen */
5066
5067 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5068 cd->external_options |= PCRE_HASCRORLF;
5069
5070 /* Set the first and required bytes appropriately. If no previous first
5071 byte, set it from this character, but revert to none on a zero repeat.
5072 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5073 repeat. */
5074
5075 if (firstbyte == REQ_UNSET)
5076 {
5077 zerofirstbyte = REQ_NONE;
5078 zeroreqbyte = reqbyte;
5079
5080 /* If the character is more than one byte long, we can set firstbyte
5081 only if it is not to be matched caselessly. */
5082
5083 if (mclength == 1 || req_caseopt == 0)
5084 {
5085 firstbyte = mcbuffer[0] | req_caseopt;
5086 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5087 }
5088 else firstbyte = reqbyte = REQ_NONE;
5089 }
5090
5091 /* firstbyte was previously set; we can set reqbyte only if the length is
5092 1 or the matching is caseful. */
5093
5094 else
5095 {
5096 zerofirstbyte = firstbyte;
5097 zeroreqbyte = reqbyte;
5098 if (mclength == 1 || req_caseopt == 0)
5099 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5100 }
5101
5102 break; /* End of literal character handling */
5103 }
5104 } /* end of big loop */
5105
5106
5107 /* Control never reaches here by falling through, only by a goto for all the
5108 error states. Pass back the position in the pattern so that it can be displayed
5109 to the user for diagnosing the error. */
5110
5111 FAILED:
5112 *ptrptr = ptr;
5113 return FALSE;
5114 }
5115
5116
5117
5118
5119 /*************************************************
5120 * Compile sequence of alternatives *
5121 *************************************************/
5122
5123 /* On entry, ptr is pointing past the bracket character, but on return it
5124 points to the closing bracket, or vertical bar, or end of string. The code
5125 variable is pointing at the byte into which the BRA operator has been stored.
5126 If the ims options are changed at the start (for a (?ims: group) or during any
5127 branch, we need to insert an OP_OPT item at the start of every following branch
5128 to ensure they get set correctly at run time, and also pass the new options
5129 into every subsequent branch compile.
5130
5131 This function is used during the pre-compile phase when we are trying to find
5132 out the amount of memory needed, as well as during the real compile phase. The
5133 value of lengthptr distinguishes the two phases.
5134
5135 Arguments:
5136 options option bits, including any changes for this subpattern
5137 oldims previous settings of ims option bits
5138 codeptr -> the address of the current code pointer
5139 ptrptr -> the address of the current pattern pointer
5140 errorcodeptr -> pointer to error code variable
5141 lookbehind TRUE if this is a lookbehind assertion
5142 reset_bracount TRUE to reset the count for each branch
5143 skipbytes skip this many bytes at start (for brackets and OP_COND)
5144 firstbyteptr place to put the first required character, or a negative number
5145 reqbyteptr place to put the last required character, or a negative number
5146 bcptr pointer to the chain of currently open branches
5147 cd points to the data block with tables pointers etc.
5148 lengthptr NULL during the real compile phase
5149 points to length accumulator during pre-compile phase
5150
5151 Returns: TRUE on success
5152 */
5153
5154 static BOOL
5155 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5156 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5157 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5158 int *lengthptr)
5159 {
5160 const uschar *ptr = *ptrptr;
5161 uschar *code = *codeptr;
5162 uschar *last_branch = code;
5163 uschar *start_bracket = code;
5164 uschar *reverse_count = NULL;
5165 int firstbyte, reqbyte;
5166 int branchfirstbyte, branchreqbyte;
5167 int length;
5168 int orig_bracount;
5169 int max_bracount;
5170 branch_chain bc;
5171
5172 bc.outer = bcptr;
5173 bc.current = code;
5174
5175 firstbyte = reqbyte = REQ_UNSET;
5176
5177 /* Accumulate the length for use in the pre-compile phase. Start with the
5178 length of the BRA and KET and any extra bytes that are required at the
5179 beginning. We accumulate in a local variable to save frequent testing of
5180 lengthptr for NULL. We cannot do this by looking at the value of code at the
5181 start and end of each alternative, because compiled items are discarded during
5182 the pre-compile phase so that the work space is not exceeded. */
5183
5184 length = 2 + 2*LINK_SIZE + skipbytes;
5185
5186 /* WARNING: If the above line is changed for any reason, you must also change
5187 the code that abstracts option settings at the start of the pattern and makes
5188 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5189 pre-compile phase to find out whether anything has yet been compiled or not. */
5190
5191 /* Offset is set zero to mark that this bracket is still open */
5192
5193 PUT(code, 1, 0);
5194 code += 1 + LINK_SIZE + skipbytes;
5195
5196 /* Loop for each alternative branch */
5197
5198 orig_bracount = max_bracount = cd->bracount;
5199 for (;;)
5200 {
5201 /* For a (?| group, reset the capturing bracket count so that each branch
5202 uses the same numbers. */
5203
5204 if (reset_bracount) cd->bracount = orig_bracount;
5205
5206 /* Handle a change of ims options at the start of the branch */
5207
5208 if ((options & PCRE_IMS) != oldims)
5209 {
5210 *code++ = OP_OPT;
5211 *code++ = options & PCRE_IMS;
5212 length += 2;
5213 }
5214
5215 /* Set up dummy OP_REVERSE if lookbehind assertion */
5216
5217 if (lookbehind)
5218 {
5219 *code++ = OP_REVERSE;
5220 reverse_count = code;
5221 PUTINC(code, 0, 0);
5222 length += 1 + LINK_SIZE;
5223 }
5224
5225 /* Now compile the branch; in the pre-compile phase its length gets added
5226 into the length. */
5227
5228 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5229 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5230 {
5231 *ptrptr = ptr;
5232 return FALSE;
5233 }
5234
5235 /* Keep the highest bracket count in case (?| was used and some branch
5236 has fewer than the rest. */
5237
5238 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5239
5240 /* In the real compile phase, there is some post-processing to be done. */
5241
5242 if (lengthptr == NULL)
5243 {
5244 /* If this is the first branch, the firstbyte and reqbyte values for the
5245 branch become the values for the regex. */
5246
5247 if (*last_branch != OP_ALT)
5248 {
5249 firstbyte = branchfirstbyte;
5250 reqbyte = branchreqbyte;
5251 }
5252
5253 /* If this is not the first branch, the first char and reqbyte have to
5254 match the values from all the previous branches, except that if the
5255 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5256 and we set REQ_VARY for the regex. */
5257
5258 else
5259 {
5260 /* If we previously had a firstbyte, but it doesn't match the new branch,
5261 we have to abandon the firstbyte for the regex, but if there was
5262 previously no reqbyte, it takes on the value of the old firstbyte. */
5263
5264 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5265 {
5266 if (reqbyte < 0) reqbyte = firstbyte;
5267 firstbyte = REQ_NONE;
5268 }
5269
5270 /* If we (now or from before) have no firstbyte, a firstbyte from the
5271 branch becomes a reqbyte if there isn't a branch reqbyte. */
5272
5273 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5274 branchreqbyte = branchfirstbyte;
5275
5276 /* Now ensure that the reqbytes match */
5277
5278 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5279 reqbyte = REQ_NONE;
5280 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5281 }
5282
5283 /* If lookbehind, check that this branch matches a fixed-length string, and
5284 put the length into the OP_REVERSE item. Temporarily mark the end of the
5285 branch with OP_END. */
5286
5287 if (lookbehind)
5288 {
5289 int fixed_length;
5290 *code = OP_END;
5291 fixed_length = find_fixedlength(last_branch, options);
5292 DPRINTF(("fixed length = %d\n", fixed_length));
5293 if (fixed_length < 0)
5294 {
5295 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5296 *ptrptr = ptr;
5297 return FALSE;
5298 }
5299 PUT(reverse_count, 0, fixed_length);
5300 }
5301 }
5302
5303 /* Reached end of expression, either ')' or end of pattern. In the real
5304 compile phase, go back through the alternative branches and reverse the chain
5305 of offsets, with the field in the BRA item now becoming an offset to the
5306 first alternative. If there are no alternatives, it points to the end of the
5307 group. The length in the terminating ket is always the length of the whole
5308 bracketed item. If any of the ims options were changed inside the group,
5309 compile a resetting op-code following, except at the very end of the pattern.
5310 Return leaving the pointer at the terminating char. */
5311
5312 if (*ptr != '|')
5313 {
5314 if (lengthptr == NULL)
5315 {
5316 int branch_length = code - last_branch;
5317 do
5318 {
5319 int prev_length = GET(last_branch, 1);
5320 PUT(last_branch, 1, branch_length);
5321 branch_length = prev_length;
5322 last_branch -= branch_length;
5323 }
5324 while (branch_length > 0);
5325 }
5326
5327 /* Fill in the ket */
5328
5329 *code = OP_KET;
5330 PUT(code, 1, code - start_bracket);
5331 code += 1 + LINK_SIZE;
5332
5333 /* Resetting option if needed */
5334
5335 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5336 {
5337 *code++ = OP_OPT;
5338 *code++ = oldims;
5339 length += 2;
5340 }
5341
5342 /* Retain the highest bracket number, in case resetting was used. */
5343
5344 cd->bracount = max_bracount;
5345
5346 /* Set values to pass back */
5347
5348 *codeptr = code;
5349 *ptrptr = ptr;
5350 *firstbyteptr = firstbyte;
5351 *reqbyteptr = reqbyte;
5352 if (lengthptr != NULL)
5353 {
5354 if (OFLOW_MAX - *lengthptr < length)
5355 {
5356 *errorcodeptr = ERR20;
5357 return FALSE;
5358 }
5359 *lengthptr += length;
5360 }
5361 return TRUE;
5362 }
5363
5364 /* Another branch follows. In the pre-compile phase, we can move the code
5365 pointer back to where it was for the start of the first branch. (That is,
5366 pretend that each branch is the only one.)
5367
5368 In the real compile phase, insert an ALT node. Its length field points back
5369 to the previous branch while the bracket remains open. At the end the chain
5370 is reversed. It's done like this so that the start of the bracket has a
5371 zero offset until it is closed, making it possible to detect recursion. */
5372
5373 if (lengthptr != NULL)
5374 {
5375 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5376 length += 1 + LINK_SIZE;
5377 }
5378 else
5379 {
5380 *code = OP_ALT;
5381 PUT(code, 1, code - last_branch);
5382 bc.current = last_branch = code;
5383 code += 1 + LINK_SIZE;
5384 }
5385
5386 ptr++;
5387 }
5388 /* Control never reaches here */
5389 }
5390
5391
5392
5393
5394 /*************************************************
5395 * Check for anchored expression *
5396 *************************************************/
5397
5398 /* Try to find out if this is an anchored regular expression. Consider each
5399 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5400 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5401 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5402 counts, since OP_CIRC can match in the middle.
5403
5404 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5405 This is the code for \G, which means "match at start of match position, taking
5406 into account the match offset".
5407
5408 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5409 because that will try the rest of the pattern at all possible matching points,
5410 so there is no point trying again.... er ....
5411
5412 .... except when the .* appears inside capturing parentheses, and there is a
5413 subsequent back reference to those parentheses. We haven't enough information
5414 to catch that case precisely.
5415
5416 At first, the best we could do was to detect when .* was in capturing brackets
5417 and the highest back reference was greater than or equal to that level.
5418 However, by keeping a bitmap of the first 31 back references, we can catch some
5419 of the more common cases more precisely.
5420
5421 Arguments:
5422 code points to start of expression (the bracket)
5423 options points to the options setting
5424 bracket_map a bitmap of which brackets we are inside while testing; this
5425 handles up to substring 31; after that we just have to take
5426 the less precise approach
5427 backref_map the back reference bitmap
5428
5429 Returns: TRUE or FALSE
5430 */
5431
5432 static BOOL
5433 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5434 unsigned int backref_map)
5435 {
5436 do {
5437 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5438 options, PCRE_MULTILINE, FALSE);
5439 register int op = *scode;
5440
5441 /* Non-capturing brackets */
5442
5443 if (op == OP_BRA)
5444 {
5445 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5446 }
5447
5448 /* Capturing brackets */
5449
5450 else if (op == OP_CBRA)
5451 {
5452 int n = GET2(scode, 1+LINK_SIZE);
5453 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5454 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5455 }
5456
5457 /* Other brackets */
5458
5459 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5460 {
5461 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5462 }
5463
5464 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5465 are or may be referenced. */
5466
5467 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5468 op == OP_TYPEPOSSTAR) &&
5469 (*options & PCRE_DOTALL) != 0)
5470 {
5471 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5472 }
5473
5474 /* Check for explicit anchoring */
5475
5476 else if (op != OP_SOD && op != OP_SOM &&
5477 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5478 return FALSE;
5479 code += GET(code, 1);
5480 }
5481 while (*code == OP_ALT); /* Loop for each alternative */
5482 return TRUE;
5483 }
5484
5485
5486
5487 /*************************************************
5488 * Check for starting with ^ or .* *
5489 *************************************************/
5490
5491 /* This is called to find out if every branch starts with ^ or .* so that
5492 "first char" processing can be done to speed things up in multiline
5493 matching and for non-DOTALL patterns that start with .* (which must start at
5494 the beginning or after \n). As in the case of is_anchored() (see above), we
5495 have to take account of back references to capturing brackets that contain .*
5496 because in that case we can't make the assumption.
5497
5498 Arguments:
5499 code points to start of expression (the bracket)
5500 bracket_map a bitmap of which brackets we are inside while testing; this
5501 handles up to substring 31; after that we just have to take
5502 the less precise approach
5503 backref_map the back reference bitmap
5504
5505 Returns: TRUE or FALSE
5506 */
5507
5508 static BOOL
5509 is_startline(const uschar *code, unsigned int bracket_map,
5510 unsigned int backref_map)
5511 {
5512 do {
5513 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5514 NULL, 0, FALSE);
5515 register int op = *scode;
5516
5517 /* Non-capturing brackets */
5518
5519 if (op == OP_BRA)
5520 {
5521 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5522 }
5523
5524 /* Capturing brackets */
5525
5526 else if (op == OP_CBRA)
5527 {
5528 int n = GET2(scode, 1+LINK_SIZE);
5529 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5530 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5531 }
5532
5533 /* Other brackets */
5534
5535 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5536 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5537
5538 /* .* means "start at start or after \n" if it isn't in brackets that
5539 may be referenced. */
5540
5541 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5542 {
5543 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5544 }
5545
5546 /* Check for explicit circumflex */
5547
5548 else if (op != OP_CIRC) return FALSE;
5549
5550 /* Move on to the next alternative */
5551
5552 code += GET(code, 1);
5553 }
5554 while (*code == OP_ALT); /* Loop for each alternative */
5555 return TRUE;
5556 }
5557
5558
5559
5560 /*************************************************
5561 * Check for asserted fixed first char *
5562 *************************************************/
5563
5564 /* During compilation, the "first char" settings from forward assertions are
5565 discarded, because they can cause conflicts with actual literals that follow.
5566 However, if we end up without a first char setting for an unanchored pattern,
5567 it is worth scanning the regex to see if there is an initial asserted first
5568 char. If all branches start with the same asserted char, or with a bracket all
5569 of whose alternatives start with the same asserted char (recurse ad lib), then
5570 we return that char, otherwise -1.
5571
5572 Arguments:
5573 code points to start of expression (the bracket)
5574 options pointer to the options (used to check casing changes)
5575 inassert TRUE if in an assertion
5576
5577 Returns: -1 or the fixed first char
5578 */
5579
5580 static int
5581 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5582 {
5583 register int c = -1;
5584 do {
5585 int d;
5586 const uschar *scode =
5587 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5588 register int op = *scode;
5589
5590 switch(op)
5591 {
5592 default:
5593 return -1;
5594
5595 case OP_BRA:
5596 case OP_CBRA:
5597 case OP_ASSERT:
5598 case OP_ONCE:
5599 case OP_COND:
5600 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5601 return -1;
5602 if (c < 0) c = d; else if (c != d) return -1;
5603 break;
5604
5605 case OP_EXACT: /* Fall through */
5606 scode += 2;
5607
5608 case OP_CHAR:
5609 case OP_CHARNC:
5610 case OP_PLUS:
5611 case OP_MINPLUS:
5612 case OP_POSPLUS:
5613 if (!inassert) return -1;
5614 if (c < 0)
5615 {
5616 c = scode[1];
5617 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5618 }
5619 else if (c != scode[1]) return -1;
5620 break;
5621 }
5622
5623 code += GET(code, 1);
5624 }
5625 while (*code == OP_ALT);
5626 return c;
5627 }
5628
5629
5630
5631 /*************************************************
5632 * Compile a Regular Expression *
5633 *************************************************/
5634
5635 /* This function takes a string and returns a pointer to a block of store
5636 holding a compiled version of the expression. The original API for this
5637 function had no error code return variable; it is retained for backwards
5638 compatibility. The new function is given a new name.
5639
5640 Arguments:
5641 pattern the regular expression
5642 options various option bits
5643 errorcodeptr pointer to error code variable (pcre_compile2() only)
5644 can be NULL if you don't want a code value
5645 errorptr pointer to pointer to error text
5646 erroroffset ptr offset in pattern where error was detected
5647 tables pointer to character tables or NULL
5648
5649 Returns: pointer to compiled data block, or NULL on error,
5650 with errorptr and erroroffset set
5651 */
5652
5653 PCRE_EXP_DEFN pcre *
5654 pcre_compile(const char *pattern, int options, const char **errorptr,
5655 int *erroroffset, const unsigned char *tables)
5656 {
5657 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5658 }
5659
5660
5661 PCRE_EXP_DEFN pcre *
5662 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5663 const char **errorptr, int *erroroffset, const unsigned char *tables)
5664 {
5665 real_pcre *re;
5666 int length = 1; /* For final END opcode */
5667 int firstbyte, reqbyte, newline;
5668 int errorcode = 0;
5669 int skipatstart = 0;
5670 #ifdef SUPPORT_UTF8
5671 BOOL utf8;
5672 #endif
5673 size_t size;
5674 uschar *code;
5675 const uschar *codestart;
5676 const uschar *ptr;
5677 compile_data compile_block;
5678 compile_data *cd = &compile_block;
5679
5680 /* This space is used for "compiling" into during the first phase, when we are
5681 computing the amount of memory that is needed. Compiled items are thrown away
5682 as soon as possible, so that a fairly large buffer should be sufficient for
5683 this purpose. The same space is used in the second phase for remembering where
5684 to fill in forward references to subpatterns. */
5685
5686 uschar cworkspace[COMPILE_WORK_SIZE];
5687
5688
5689 /* Set this early so that early errors get offset 0. */
5690
5691 ptr = (const uschar *)pattern;
5692
5693 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5694 can do is just return NULL, but we can set a code value if there is a code
5695 pointer. */
5696
5697 if (errorptr == NULL)
5698 {
5699 if (errorcodeptr != NULL) *errorcodeptr = 99;
5700 return NULL;
5701 }
5702
5703 *errorptr = NULL;
5704 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5705
5706 /* However, we can give a message for this error */
5707
5708 if (erroroffset == NULL)
5709 {
5710 errorcode = ERR16;
5711 goto PCRE_EARLY_ERROR_RETURN2;
5712 }
5713
5714 *erroroffset = 0;
5715
5716 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5717
5718 #ifdef SUPPORT_UTF8
5719 utf8 = (options & PCRE_UTF8) != 0;
5720 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5721 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5722 {
5723 errorcode = ERR44;
5724 goto PCRE_EARLY_ERROR_RETURN2;
5725 }
5726 #else
5727 if ((options & PCRE_UTF8) != 0)
5728 {
5729 errorcode = ERR32;
5730 goto PCRE_EARLY_ERROR_RETURN;
5731 }
5732 #endif
5733
5734 if ((options & ~PUBLIC_OPTIONS) != 0)
5735 {
5736 errorcode = ERR17;
5737 goto PCRE_EARLY_ERROR_RETURN;
5738 }
5739
5740 /* Set up pointers to the individual character tables */
5741
5742 if (tables == NULL) tables = _pcre_default_tables;
5743 cd->lcc = tables + lcc_offset;
5744 cd->fcc = tables + fcc_offset;
5745 cd->cbits = tables + cbits_offset;
5746 cd->ctypes = tables + ctypes_offset;
5747
5748 /* Check for newline settings at the start of the pattern, and remember the
5749 offset for later. */
5750
5751 if (ptr[0] == '(' && ptr[1] == '*')
5752 {
5753 int newnl = 0;
5754 if (strncmp((char *)(ptr+2), "CR)", 3) == 0)
5755 { skipatstart = 5; newnl = PCRE_NEWLINE_CR; }
5756 else if (strncmp((char *)(ptr+2), "LF)", 3) == 0)
5757 { skipatstart = 5; newnl = PCRE_NEWLINE_LF; }
5758 else if (strncmp((char *)(ptr+2), "CRLF)", 5) == 0)
5759 { skipatstart = 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5760 else if (strncmp((char *)(ptr+2), "ANY)", 4) == 0)
5761 { skipatstart = 6; newnl = PCRE_NEWLINE_ANY; }
5762 else if (strncmp((char *)(ptr+2), "ANYCRLF)", 8) == 0)
5763 { skipatstart = 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5764 if (skipatstart > 0)
5765 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5766 }
5767
5768 /* Handle different types of newline. The three bits give seven cases. The
5769 current code allows for fixed one- or two-byte sequences, plus "any" and
5770 "anycrlf". */
5771
5772 switch (options & PCRE_NEWLINE_BITS)
5773 {
5774 case 0: newline = NEWLINE; break; /* Build-time default */
5775 case PCRE_NEWLINE_CR: newline = '\r'; break;
5776 case PCRE_NEWLINE_LF: newline = '\n'; break;
5777 case PCRE_NEWLINE_CR+
5778 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5779 case PCRE_NEWLINE_ANY: newline = -1; break;
5780 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5781 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5782 }
5783
5784 if (newline == -2)
5785 {
5786 cd->nltype = NLTYPE_ANYCRLF;
5787 }
5788 else if (newline < 0)
5789 {
5790 cd->nltype = NLTYPE_ANY;
5791 }
5792 else
5793 {
5794 cd->nltype = NLTYPE_FIXED;
5795 if (newline > 255)
5796 {
5797 cd->nllen = 2;
5798 cd->nl[0] = (newline >> 8) & 255;
5799 cd->nl[1] = newline & 255;
5800 }
5801 else
5802 {
5803 cd->nllen = 1;
5804 cd->nl[0] = newline;
5805 }
5806 }
5807
5808 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5809 references to help in deciding whether (.*) can be treated as anchored or not.
5810 */
5811
5812 cd->top_backref = 0;
5813 cd->backref_map = 0;
5814
5815 /* Reflect pattern for debugging output */
5816
5817 DPRINTF(("------------------------------------------------------------------\n"));
5818 DPRINTF(("%s\n", pattern));
5819
5820 /* Pretend to compile the pattern while actually just accumulating the length
5821 of memory required. This behaviour is triggered by passing a non-NULL final
5822 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5823 to compile parts of the pattern into; the compiled code is discarded when it is
5824 no longer needed, so hopefully this workspace will never overflow, though there
5825 is a test for its doing so. */
5826
5827 cd->bracount = 0;
5828 cd->names_found = 0;
5829 cd->name_entry_size = 0;
5830 cd->name_table = NULL;
5831 cd->start_workspace = cworkspace;
5832 cd->start_code = cworkspace;
5833 cd->hwm = cworkspace;
5834 cd->start_pattern = (const uschar *)pattern;
5835 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5836 cd->req_varyopt = 0;
5837 cd->nopartial = FALSE;
5838 cd->external_options = options;
5839
5840 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5841 don't need to look at the result of the function here. The initial options have
5842 been put into the cd block so that they can be changed if an option setting is
5843 found within the regex right at the beginning. Bringing initial option settings
5844 outside can help speed up starting point checks. */
5845
5846 ptr += skipatstart;
5847 code = cworkspace;
5848 *code = OP_BRA;
5849 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5850 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5851 &length);
5852 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5853
5854 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5855 cd->hwm - cworkspace));
5856
5857 if (length > MAX_PATTERN_SIZE)
5858 {
5859 errorcode = ERR20;
5860 goto PCRE_EARLY_ERROR_RETURN;
5861 }
5862
5863 /* Compute the size of data block needed and get it, either from malloc or
5864 externally provided function. Integer overflow should no longer be possible
5865 because nowadays we limit the maximum value of cd->names_found and
5866 cd->name_entry_size. */
5867
5868 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5869 re = (real_pcre *)(pcre_malloc)(size);
5870
5871 if (re == NULL)
5872 {
5873 errorcode = ERR21;
5874 goto PCRE_EARLY_ERROR_RETURN;
5875 }
5876
5877 /* Put in the magic number, and save the sizes, initial options, and character
5878 table pointer. NULL is used for the default character tables. The nullpad field
5879 is at the end; it's there to help in the case when a regex compiled on a system
5880 with 4-byte pointers is run on another with 8-byte pointers. */
5881
5882 re->magic_number = MAGIC_NUMBER;
5883 re->size = size;
5884 re->options = cd->external_options;
5885 re->dummy1 = 0;
5886 re->first_byte = 0;
5887 re->req_byte = 0;
5888 re->name_table_offset = sizeof(real_pcre);
5889 re->name_entry_size = cd->name_entry_size;
5890 re->name_count = cd->names_found;
5891 re->ref_count = 0;
5892 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5893 re->nullpad = NULL;
5894
5895 /* The starting points of the name/number translation table and of the code are
5896 passed around in the compile data block. The start/end pattern and initial
5897 options are already set from the pre-compile phase, as is the name_entry_size
5898 field. Reset the bracket count and the names_found field. Also reset the hwm
5899 field; this time it's used for remembering forward references to subpatterns.
5900 */
5901
5902 cd->bracount = 0;
5903 cd->names_found = 0;
5904 cd->name_table = (uschar *)re + re->name_table_offset;
5905 codestart = cd->name_table + re->name_entry_size * re->name_count;
5906 cd->start_code = codestart;
5907 cd->hwm = cworkspace;
5908 cd->req_varyopt = 0;
5909 cd->nopartial = FALSE;
5910 cd->had_accept = FALSE;
5911
5912 /* Set up a starting, non-extracting bracket, then compile the expression. On
5913 error, errorcode will be set non-zero, so we don't need to look at the result
5914 of the function here. */
5915
5916 ptr = (const uschar *)pattern + skipatstart;
5917 code = (uschar *)codestart;
5918 *code = OP_BRA;
5919 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5920 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5921 re->top_bracket = cd->bracount;
5922 re->top_backref = cd->top_backref;
5923
5924 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5925 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
5926
5927 /* If not reached end of pattern on success, there's an excess bracket. */
5928
5929 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5930
5931 /* Fill in the terminating state and check for disastrous overflow, but
5932 if debugging, leave the test till after things are printed out. */
5933
5934 *code++ = OP_END;
5935
5936 #ifndef DEBUG
5937 if (code - codestart > length) errorcode = ERR23;
5938 #endif
5939
5940 /* Fill in any forward references that are required. */
5941
5942 while (errorcode == 0 && cd->hwm > cworkspace)
5943 {
5944 int offset, recno;
5945 const uschar *groupptr;
5946 cd->hwm -= LINK_SIZE;
5947 offset = GET(cd->hwm, 0);
5948 recno = GET(codestart, offset);
5949 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5950 if (groupptr == NULL) errorcode = ERR53;
5951 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5952 }
5953
5954 /* Give an error if there's back reference to a non-existent capturing
5955 subpattern. */
5956
5957 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5958
5959 /* Failed to compile, or error while post-processing */
5960
5961 if (errorcode != 0)
5962 {
5963 (pcre_free)(re);
5964 PCRE_EARLY_ERROR_RETURN:
5965 *erroroffset = ptr - (const uschar *)pattern;
5966 PCRE_EARLY_ERROR_RETURN2:
5967 *errorptr = error_texts[errorcode];
5968 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5969 return NULL;
5970 }
5971
5972 /* If the anchored option was not passed, set the flag if we can determine that
5973 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5974 as starting with .* when DOTALL is set).
5975
5976 Otherwise, if we know what the first byte has to be, save it, because that
5977 speeds up unanchored matches no end. If not, see if we can set the
5978 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5979 start with ^. and also when all branches start with .* for non-DOTALL matches.
5980 */
5981
5982 if ((re->options & PCRE_ANCHORED) == 0)
5983 {
5984 int temp_options = re->options; /* May get changed during these scans */
5985 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5986 re->options |= PCRE_ANCHORED;
5987 else
5988 {
5989 if (firstbyte < 0)
5990 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5991 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5992 {
5993 int ch = firstbyte & 255;
5994 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5995 cd->fcc[ch] == ch)? ch : firstbyte;
5996 re->options |= PCRE_FIRSTSET;
5997 }
5998 else if (is_startline(codestart, 0, cd->backref_map))
5999 re->options |= PCRE_STARTLINE;
6000 }
6001 }
6002
6003 /* For an anchored pattern, we use the "required byte" only if it follows a
6004 variable length item in the regex. Remove the caseless flag for non-caseable
6005 bytes. */
6006
6007 if (reqbyte >= 0 &&
6008 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6009 {
6010 int ch = reqbyte & 255;
6011 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6012 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6013 re->options |= PCRE_REQCHSET;
6014 }
6015
6016 /* Print out the compiled data if debugging is enabled. This is never the
6017 case when building a production library. */
6018
6019 #ifdef DEBUG
6020
6021 printf("Length = %d top_bracket = %d top_backref = %d\n",
6022 length, re->top_bracket, re->top_backref);
6023
6024 printf("Options=%08x\n", re->options);
6025
6026 if ((re->options & PCRE_FIRSTSET) != 0)
6027 {
6028 int ch = re->first_byte & 255;
6029 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6030 "" : " (caseless)";
6031 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6032 else printf("First char = \\x%02x%s\n", ch, caseless);
6033 }
6034
6035 if ((re->options & PCRE_REQCHSET) != 0)
6036 {
6037 int ch = re->req_byte & 255;
6038 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6039 "" : " (caseless)";
6040 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6041 else printf("Req char = \\x%02x%s\n", ch, caseless);
6042 }
6043
6044 pcre_printint(re, stdout, TRUE);
6045
6046 /* This check is done here in the debugging case so that the code that
6047 was compiled can be seen. */
6048
6049 if (code - codestart > length)
6050 {
6051 (pcre_free)(re);
6052 *errorptr = error_texts[ERR23];
6053 *erroroffset = ptr - (uschar *)pattern;
6054 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6055 return NULL;
6056 }
6057 #endif /* DEBUG */
6058
6059 return (pcre *)re;
6060 }
6061
6062 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12