/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 73 - (show annotations) (download)
Sat Feb 24 21:40:30 2007 UTC (7 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 255554 byte(s)
Load pcre-4.5 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2003 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35
36 /* Define DEBUG to get debugging output on stdout. */
37 /* #define DEBUG */
38
39 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
40 inline, and there are *still* stupid compilers about that don't like indented
41 pre-processor statements. I suppose it's only been 10 years... */
42
43 #ifdef DEBUG
44 #define DPRINTF(p) printf p /* p must carry its own parentheses: DPRINTF((fmt, arg)) */
45 #else
46 #define DPRINTF(p) /*nothing*/
47 #endif /* DEBUG */
48
49 /* Include the internals header, which itself includes "config.h", the Standard
50 C headers, and the external pcre header. */
51
52 #include "internal.h"
53
54
55 /* Allow compilation as C++ source code, should anybody want to do that. */
56
57 #ifdef __cplusplus
58 #define class pcre_class
59 #endif
60
61
62 /* Maximum number of items on the nested bracket stacks at compile time. This
63 applies to the nesting of all kinds of parentheses. It does not limit
64 un-nested, non-capturing parentheses. This number can be made bigger if
65 necessary - it is used to dimension one int and one unsigned char vector at
66 compile time. */
67
68 #define BRASTACK_SIZE 200
69
70
71 /* Maximum number of ints of offset to save on the stack for recursive calls.
72 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
73 because the offset vector is always a multiple of 3 long. */
74
75 #define REC_STACK_SAVE_MAX 30
76
77
78 /* The number of bytes in a literal character string above which we can't add
79 any more is set at 250 in order to allow for UTF-8 characters. (In theory it
80 could be 255 when UTF-8 support is excluded, but that means that some of the
81 test output would be different, which just complicates things.) */
82
83 #define MAXLIT 250
84
85
86 /* The maximum remaining length of subject we are prepared to search for a
87 req_byte match. */
88
89 #define REQ_BYTE_MAX 1000
90
91
92 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93 the definition is next to the definition of the opcodes in internal.h. */
94
95 static const uschar OP_lengths[] = { OP_LENGTHS };
96
97 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
98
99 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101
102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape
105 is invalid. */
106
107 #if !EBCDIC /* This is the "normal" table for ASCII systems */
108 static const short int escapes[] = {
109 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
110 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
111 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
112 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
113 0, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
114 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
115 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
116 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
117 0, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
118 0, 0, -ESC_z /* x - z */
119 };
120
121 #else /* This is the "abnormal" table for EBCDIC systems */
122 static const short int escapes[] = {
123 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
124 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
125 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
126 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
127 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
128 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
129 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
130 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
131 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
132 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, 0,
133 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
134 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
135 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
136 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
137 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
138 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
139 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
140 /* D0 */ '}', 0, 0, 0, 0, 0, 0, 0,
141 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
142 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, 0,
143 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
144 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
145 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
146 };
147 #endif
148
149
150 /* Tables of names of POSIX character classes and their lengths. The list is
151 terminated by a zero length entry. The first three must be alpha, lower, upper,
152 as this is assumed for handling case independence. */
153
154 static const char *const posix_names[] = { /* Same order as the rows of posix_class_maps */
155 "alpha", "lower", "upper", /* The three case-related classes come first */
156 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
157 "print", "punct", "space", "word", "xdigit" };
158
159 static const uschar posix_name_lengths[] = { /* Length of each name above, then the 0 terminator */
160 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
161
162 /* Table of class bit maps for each POSIX class; up to three may be combined
163 to form the class. The table for [:blank:] is dynamically modified to remove
164 the vertical space characters. */
165
166 static const int posix_class_maps[] = {
167 cbit_lower, cbit_upper, -1, /* alpha */
168 cbit_lower, -1, -1, /* lower */
169 cbit_upper, -1, -1, /* upper */
170 cbit_digit, cbit_lower, cbit_upper, /* alnum */
171 cbit_print, cbit_cntrl, -1, /* ascii */
172 cbit_space, -1, -1, /* blank - a GNU extension */
173 cbit_cntrl, -1, -1, /* cntrl */
174 cbit_digit, -1, -1, /* digit */
175 cbit_graph, -1, -1, /* graph */
176 cbit_print, -1, -1, /* print */
177 cbit_punct, -1, -1, /* punct */
178 cbit_space, -1, -1, /* space */
179 cbit_word, -1, -1, /* word - a Perl extension */
180 cbit_xdigit,-1, -1 /* xdigit */
181 };
182
183 /* Table to identify digits and hex digits. This is used when compiling
184 patterns. Note that the tables in chartables are dependent on the locale, and
185 may mark arbitrary characters as digits - but the PCRE compiling code expects
186 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
187 a private table here. It costs 256 bytes, but it is a lot faster than doing
188 character value tests (at least in some simple cases I timed), and in some
189 applications one wants PCRE to compile efficiently as well as match
190 efficiently.
191
192 For convenience, we use the same bit definitions as in chartables:
193
194 0x04 decimal digit
195 0x08 hexadecimal digit
196
197 Then we can use ctype_digit and ctype_xdigit in the code. */
198
199 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
200 static const unsigned char digitab[] =
201 {
202 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
203 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
204 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
205 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
206 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
207 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
208 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
209 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
210 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
211 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
212 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
213 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
214 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
215 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
216 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
217 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
218 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
219 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
220 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
221 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
222 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
223 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
224 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
225 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
226 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
227 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
234
235 #else /* This is the "abnormal" case, for EBCDIC systems */
236 static const unsigned char digitab[] =
237 {
238 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
239 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
240 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
254 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
256 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
260 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
261 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
262 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
268 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
269 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
270
271 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
272 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
273 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
274 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
276 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
280 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
281 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
283 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
285 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
288 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
289 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
290 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
291 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
292 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
293 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
294 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
295 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
296 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
297 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
298 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
299 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
300 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
301 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
302 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
303 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
304 #endif
305
306
307 /* Definition to allow mutual recursion */
308
309 static BOOL
310 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
311 BOOL, int, int *, int *, branch_chain *, compile_data *);
312
313 /* Structure for building a chain of data that actually lives on the
314 stack, for holding the values of the subject pointer at the start of each
315 subpattern, so as to detect when an empty string has been matched by a
316 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
317 are on the heap, not on the stack. */
318
319 typedef struct eptrblock {
320 struct eptrblock *epb_prev; /* Link to the previous block in the chain */
321 const uschar *epb_saved_eptr; /* Subject pointer saved at the start of a subpattern */
322 } eptrblock;
323
324 /* Flag bits for the match() function */
325
326 #define match_condassert 0x01 /* Called to check a condition assertion */
327 #define match_isgroup 0x02 /* Set if start of bracketed group */
328
329 /* Non-error returns from the match() function. Error returns are externally
330 defined PCRE_ERROR_xxx codes, which are all negative. */
331
332 #define MATCH_MATCH 1
333 #define MATCH_NOMATCH 0
334
335
336
337 /*************************************************
338 * Global variables *
339 *************************************************/
340
341 /* PCRE is thread-clean and doesn't use any global variables in the normal
342 sense. However, it calls memory allocation and free functions via the four
343 indirections below, and it can optionally do callouts. These values can be
344 changed by the caller, but are shared between all threads. However, when
345 compiling for Virtual Pascal, things are done differently (see pcre.in). */
346
347 #ifndef VPCOMPAT
348 #ifdef __cplusplus /* Same five definitions as the C branch, with C linkage added */
349 extern "C" void *(*pcre_malloc)(size_t) = malloc;
350 extern "C" void (*pcre_free)(void *) = free;
351 extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
352 extern "C" void (*pcre_stack_free)(void *) = free;
353 extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
354 #else /* Plain C */
355 void *(*pcre_malloc)(size_t) = malloc; /* The allocation hooks default to the C library's malloc() */
356 void (*pcre_free)(void *) = free; /* The release hooks default to the C library's free() */
357 void *(*pcre_stack_malloc)(size_t) = malloc;
358 void (*pcre_stack_free)(void *) = free;
359 int (*pcre_callout)(pcre_callout_block *) = NULL; /* No callout function is installed by default */
360 #endif /* __cplusplus */
361 #endif /* !VPCOMPAT */
362
363
364 /*************************************************
365 * Macros and tables for character handling *
366 *************************************************/
367
368 /* When UTF-8 encoding is being used, a character is no longer just a single
369 byte. The macros for character handling generate simple sequences when used in
370 byte-mode, and more complicated ones for UTF-8 characters. */
371
372 #ifndef SUPPORT_UTF8 /* Byte-only build: each macro reads exactly one byte */
373 #define GETCHAR(c, eptr) c = *eptr;
374 #define GETCHARINC(c, eptr) c = *eptr++;
375 #define GETCHARINCTEST(c, eptr) c = *eptr++;
376 #define GETCHARLEN(c, eptr, len) c = *eptr;
377 #define BACKCHAR(eptr)
378
379 #else /* SUPPORT_UTF8 */
380 /* NOTE: the macros below expand to statements (not expressions), mention eptr more than once, and use utf8_table3 and utf8_table4. */
381 /* Get the next UTF-8 character, not advancing the pointer. This is called when
382 we know we are in UTF-8 mode. */
383
384 #define GETCHAR(c, eptr) \
385 c = *eptr; \
386 if ((c & 0xc0) == 0xc0) /* Top two bits set: first byte of a multi-byte character */ \
387 { \
388 int gcii; \
389 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
390 int gcss = 6*gcaa; \
391 c = (c & utf8_table3[gcaa]) << gcss; /* Data bits of the first byte */ \
392 for (gcii = 1; gcii <= gcaa; gcii++) \
393 { \
394 gcss -= 6; \
395 c |= (eptr[gcii] & 0x3f) << gcss; /* Low 6 bits of each following byte */ \
396 } \
397 }
398
399 /* Get the next UTF-8 character, advancing the pointer. This is called when we
400 know we are in UTF-8 mode. */
401
402 #define GETCHARINC(c, eptr) \
403 c = *eptr++; \
404 if ((c & 0xc0) == 0xc0) \
405 { \
406 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
407 int gcss = 6*gcaa; \
408 c = (c & utf8_table3[gcaa]) << gcss; \
409 while (gcaa-- > 0) \
410 { \
411 gcss -= 6; \
412 c |= (*eptr++ & 0x3f) << gcss; \
413 } \
414 }
415
416 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
417
418 #define GETCHARINCTEST(c, eptr) \
419 c = *eptr++; \
420 if (md->utf8 && (c & 0xc0) == 0xc0) /* Needs a variable named md in scope where the macro is used */ \
421 { \
422 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
423 int gcss = 6*gcaa; \
424 c = (c & utf8_table3[gcaa]) << gcss; \
425 while (gcaa-- > 0) \
426 { \
427 gcss -= 6; \
428 c |= (*eptr++ & 0x3f) << gcss; \
429 } \
430 }
431
432 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
433 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
434
435 #define GETCHARLEN(c, eptr, len) \
436 c = *eptr; \
437 if ((c & 0xc0) == 0xc0) \
438 { \
439 int gcii; \
440 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
441 int gcss = 6*gcaa; \
442 c = (c & utf8_table3[gcaa]) << gcss; \
443 for (gcii = 1; gcii <= gcaa; gcii++) \
444 { \
445 gcss -= 6; \
446 c |= (eptr[gcii] & 0x3f) << gcss; \
447 } \
448 len += gcaa; /* Add the additional bytes to the caller's length */ \
449 }
450
451 /* If the pointer is not at the start of a character, move it back until
452 it is. Called only in UTF-8 mode. */
453
454 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--; /* Steps back over bytes of the form 10xxxxxx */
455
456 #endif /* SUPPORT_UTF8 */
457
458
459
460 /*************************************************
461 * Default character tables *
462 *************************************************/
463
464 /* A default set of character tables is included in the PCRE binary. Its source
465 is built by the maketables auxiliary program, which uses the default C ctypes
466 functions, and put in the file chartables.c. These tables are used by PCRE
467 whenever the caller of pcre_compile() does not provide an alternate set of
468 tables. */
469
470 #include "chartables.c"
471
472
473
474 #ifdef SUPPORT_UTF8
475 /*************************************************
476 * Tables for UTF-8 support *
477 *************************************************/
478
479 /* These are the breakpoints for different numbers of bytes in a UTF-8
480 character. */
481
482 static const int utf8_table1[] =
483 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
484
485 /* These are the indicator bits and the mask for the data bits to set in the
486 first byte of a character, indexed by the number of additional bytes. */
487
488 static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
489 static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
490
491 /* Table of the number of extra bytes, indexed by the first byte
492 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
493 0x3d. */
494
495 static const uschar utf8_table4[] = {
496 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
497 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
498 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
499 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
500
501
502 /*************************************************
503 * Convert character value to UTF-8 *
504 *************************************************/
505
506 /* This function takes an integer value in the range 0 - 0x7fffffff
507 and encodes it as a UTF-8 character in 1 to 6 bytes.
508
509 Arguments:
510 cvalue the character value
511 buffer pointer to buffer for result - at least 6 bytes long
512
513 Returns: number of bytes placed in the buffer
514 */
515
516 static int
517 ord2utf8(int cvalue, uschar *buffer)
518 {
519 register int i, j; /* i = number of additional bytes; j counts them down */
520 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) /* Find the first breakpoint that can hold cvalue */
521 if (cvalue <= utf8_table1[i]) break;
522 buffer += i; /* Start at the last byte; the sequence is written backwards */
523 for (j = i; j > 0; j--)
524 {
525 *buffer-- = 0x80 | (cvalue & 0x3f); /* Following byte: 10xxxxxx carrying the low 6 bits */
526 cvalue >>= 6;
527 }
528 *buffer = utf8_table2[i] | cvalue; /* First byte: indicator bits plus the remaining high bits */
529 return i + 1; /* The additional bytes plus the first byte */
530 }
531 #endif
532
533
534
535 /*************************************************
536 * Print compiled regex *
537 *************************************************/
538
539 /* The code for doing this is held in a separate file that is also included in
540 pcretest.c. It defines a function called print_internals(). */
541
542 #ifdef DEBUG
543 #include "printint.c"
544 #endif
545
546
547
548 /*************************************************
549 * Return version string *
550 *************************************************/
551
552 #define STRING(a) # a /* Turn the argument, exactly as written, into a string literal */
553 #define XSTRING(s) STRING(s) /* Macro-expand s first, then turn the result into a string literal */
554 /* Return a constant string of the form: major.minor date */
555 EXPORT const char *
556 pcre_version(void)
557 {
558 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE); /* Adjacent literals are joined at compile time */
559 }
560
561
562
563
564 /*************************************************
565 * (Obsolete) Return info about compiled pattern *
566 *************************************************/
567
568 /* This is the original "info" function. It picks potentially useful data out
569 of the private structure, but its interface was too rigid. It remains for
570 backwards compatibility. The public options are passed back in an int - though
571 the re->options field has been expanded to a long int, all the public options
572 are at the low end of it, and so even on 16-bit systems this will still be OK.
573 Therefore, I haven't changed the API for pcre_info().
574
575 Arguments:
576 external_re points to compiled code
577 optptr where to pass back the options
578 first_byte where to pass back the first character,
579 or -1 if multiline and all branches start ^,
580 or -2 otherwise
581
582 Returns: number of capturing subpatterns
583 or negative values on error
584 */
585
586 EXPORT int
587 pcre_info(const pcre *external_re, int *optptr, int *first_byte)
588 { /* Either output pointer may be NULL, in which case that value is not passed back */
589 const real_pcre *re = (const real_pcre *)external_re; /* View the opaque handle as the internal structure */
590 if (re == NULL) return PCRE_ERROR_NULL;
591 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; /* Not a block with the expected magic number */
592 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS); /* Only the public option bits are passed back */
593 if (first_byte != NULL) /* First byte if PCRE_FIRSTSET; else -1 if PCRE_STARTLINE; else -2 */
594 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
595 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
596 return re->top_bracket; /* The function result is the top_bracket field */
597 }
598
599
600
601 /*************************************************
602 * Return info about compiled pattern *
603 *************************************************/
604
605 /* This is a newer "info" function which has an extensible interface so
606 that additional items can be added compatibly.
607
608 Arguments:
609 external_re points to compiled code
610 extra_data points to extra data, or NULL
611 what what information is required
612 where where to put the information
613
614 Returns: 0 if data returned, negative on error
615 */
616
617 EXPORT int
618 pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
619 void *where)
620 {
621 const real_pcre *re = (const real_pcre *)external_re;
622 const pcre_study_data *study = NULL; /* Stays NULL unless study data is supplied below */
623
624 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
625 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
626 /* Study data is used only when the extra block flags it as present */
627 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
628 study = (const pcre_study_data *)extra_data->study_data;
629 /* Each case stores through where using a different type; the caller must pass a pointer to that type */
630 switch (what)
631 {
632 case PCRE_INFO_OPTIONS: /* where -> unsigned long int; public option bits only */
633 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
634 break;
635
636 case PCRE_INFO_SIZE: /* where -> size_t */
637 *((size_t *)where) = re->size;
638 break;
639
640 case PCRE_INFO_STUDYSIZE: /* where -> size_t; 0 when there is no study data */
641 *((size_t *)where) = (study == NULL)? 0 : study->size;
642 break;
643
644 case PCRE_INFO_CAPTURECOUNT: /* where -> int */
645 *((int *)where) = re->top_bracket;
646 break;
647
648 case PCRE_INFO_BACKREFMAX: /* where -> int */
649 *((int *)where) = re->top_backref;
650 break;
651
652 case PCRE_INFO_FIRSTBYTE: /* where -> int; first byte, or -1 if PCRE_STARTLINE, or -2 */
653 *((int *)where) =
654 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
655 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
656 break;
657
658 case PCRE_INFO_FIRSTTABLE: /* where -> const uschar *; NULL unless study data has PCRE_STUDY_MAPPED */
659 *((const uschar **)where) =
660 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
661 study->start_bits : NULL;
662 break;
663
664 case PCRE_INFO_LASTLITERAL: /* where -> int; -1 when PCRE_REQCHSET is not set */
665 *((int *)where) =
666 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
667 break;
668
669 case PCRE_INFO_NAMEENTRYSIZE: /* where -> int */
670 *((int *)where) = re->name_entry_size;
671 break;
672
673 case PCRE_INFO_NAMECOUNT: /* where -> int */
674 *((int *)where) = re->name_count;
675 break;
676
677 case PCRE_INFO_NAMETABLE: /* where -> const uschar *; address just past the real_pcre header */
678 *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
679 break;
680
681 default: return PCRE_ERROR_BADOPTION; /* Unrecognised request code */
682 }
683
684 return 0;
685 }
686
687
688
689 /*************************************************
690 * Return info about what features are configured *
691 *************************************************/
692
693 /* This is a function which has an extensible interface so that additional items
694 can be added compatibly.
695
696 Arguments:
697 what what information is required
698 where where to put the information
699
700 Returns: 0 if data returned, negative on error
701 */
702
703 EXPORT int
704 pcre_config(int what, void *where)
705 {
/* Every recognised request stores through where, so reject a NULL pointer up
front instead of dereferencing it. pcre_fullinfo() makes the same check. */
if (where == NULL) return PCRE_ERROR_NULL;
706 switch (what)
707 {
708 case PCRE_CONFIG_UTF8: /* where -> int; 1 if built with SUPPORT_UTF8, else 0 */
709 #ifdef SUPPORT_UTF8
710 *((int *)where) = 1;
711 #else
712 *((int *)where) = 0;
713 #endif
714 break;
715
716 case PCRE_CONFIG_NEWLINE: /* where -> int */
717 *((int *)where) = NEWLINE;
718 break;
719
720 case PCRE_CONFIG_LINK_SIZE: /* where -> int */
721 *((int *)where) = LINK_SIZE;
722 break;
723
724 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD: /* where -> int */
725 *((int *)where) = POSIX_MALLOC_THRESHOLD;
726 break;
727
728 case PCRE_CONFIG_MATCH_LIMIT: /* where -> unsigned int, unlike the other cases */
729 *((unsigned int *)where) = MATCH_LIMIT;
730 break;
731
732 case PCRE_CONFIG_STACKRECURSE: /* where -> int; 0 if built with NO_RECURSE, else 1 */
733 #ifdef NO_RECURSE
734 *((int *)where) = 0;
735 #else
736 *((int *)where) = 1;
737 #endif
738 break;
739
740 default: return PCRE_ERROR_BADOPTION; /* Unrecognised request code */
741 }
742
743 return 0;
744 }
745
746
747
748 #ifdef DEBUG
749 /*************************************************
750 * Debugging function to print chars *
751 *************************************************/
752
753 /* Print a sequence of chars in printable format, stopping at the end of the
754 subject if requested.
755
756 Arguments:
757 p points to characters
758 length number to print
759 is_subject TRUE if printing from within md->start_subject
760 md pointer to matching data block, if is_subject is TRUE
761
762 Returns: nothing
763 */
764
765 static void
766 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
767 {
768 int c;
769 if (is_subject && length > md->end_subject - p) length = md->end_subject - p; /* Clamp to the end of the subject; md is read only when is_subject is set */
770 while (length-- > 0)
771 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); /* Printable bytes as themselves, others as a two-digit hex escape */
772 }
773 #endif
774
775
776
777
778 /*************************************************
779 * Handle escapes *
780 *************************************************/
781
782 /* This function is called when a \ has been encountered. It either returns a
783 positive value for a simple escape such as \n, or a negative value which
784 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
785 a positive value greater than 255 may be returned. On entry, ptr is pointing at
786 the \. On exit, it is on the final character of the escape sequence.
787
788 Arguments:
789 ptrptr points to the pattern position pointer
790 errorptr points to the pointer to the error message
791 bracount number of previous extracting brackets
792 options the options bits
793 isclass TRUE if inside a character class
794
795 Returns: zero or positive => a data character
796 negative => a special escape sequence
797 on error, errorptr is set
798 */
799
800 static int
801 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
802 int options, BOOL isclass)
803 {
804 const uschar *ptr = *ptrptr;
805 int c, i;
806
807 /* If backslash is at the end of the pattern, it's an error. */
808
809 c = *(++ptr);
810 if (c == 0) *errorptr = ERR1;
811
812 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
813 a table. A non-zero result is something that can be returned immediately.
814 Otherwise further processing may be required. */
815
816 #if !EBCDIC /* ASCII coding */
817 else if (c < '0' || c > 'z') {} /* Not alphameric */
818 else if ((i = escapes[c - '0']) != 0) c = i;
819
820 #else /* EBCDIC coding */
821 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
822 else if ((i = escapes[c - 0x48]) != 0) c = i;
823 #endif
824
825 /* Escapes that need further processing, or are illegal. */
826
827 else
828 {
829 const uschar *oldptr;
830 switch (c)
831 {
832 /* A number of Perl escapes are not handled by PCRE. We give an explicit
833 error. */
834
835 case 'l':
836 case 'L':
837 case 'N':
838 case 'p':
839 case 'P':
840 case 'u':
841 case 'U':
842 case 'X':
843 *errorptr = ERR37;
844 break;
845
846 /* The handling of escape sequences consisting of a string of digits
847 starting with one that is not zero is not straightforward. By experiment,
848 the way Perl works seems to be as follows:
849
850 Outside a character class, the digits are read as a decimal number. If the
851 number is less than 10, or if there are that many previous extracting
852 left brackets, then it is a back reference. Otherwise, up to three octal
853 digits are read to form an escaped byte. Thus \123 is likely to be octal
854 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
855 value is greater than 377, the least significant 8 bits are taken. Inside a
856 character class, \ followed by a digit is always an octal number. */
857
858 case '1': case '2': case '3': case '4': case '5':
859 case '6': case '7': case '8': case '9':
860
861 if (!isclass)
862 {
863 oldptr = ptr;
864 c -= '0';
865 while ((digitab[ptr[1]] & ctype_digit) != 0)
866 c = c * 10 + *(++ptr) - '0';
867 if (c < 10 || c <= bracount)
868 {
869 c = -(ESC_REF + c);
870 break;
871 }
872 ptr = oldptr; /* Put the pointer back and fall through */
873 }
874
875 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
876 generates a binary zero byte and treats the digit as a following literal.
877 Thus we have to pull back the pointer by one. */
878
879 if ((c = *ptr) >= '8')
880 {
881 ptr--;
882 c = 0;
883 break;
884 }
885
886 /* \0 always starts an octal number, but we may drop through to here with a
887 larger first octal digit. */
888
889 case '0':
890 c -= '0';
891 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
892 c = c * 8 + *(++ptr) - '0';
893 c &= 255; /* Take least significant 8 bits */
894 break;
895
896 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
897 which can be greater than 0xff, but only if the ddd are hex digits. */
898
899 case 'x':
900 #ifdef SUPPORT_UTF8
901 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
902 {
903 const uschar *pt = ptr + 2;
904 register int count = 0;
905 c = 0;
906 while ((digitab[*pt] & ctype_xdigit) != 0)
907 {
908 int cc = *pt++;
909 count++;
910 #if !EBCDIC /* ASCII coding */
911 if (cc >= 'a') cc -= 32; /* Convert to upper case */
912 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
913 #else /* EBCDIC coding */
914 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
915 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
916 #endif
917 }
918 if (*pt == '}')
919 {
920 if (c < 0 || count > 8) *errorptr = ERR34;
921 ptr = pt;
922 break;
923 }
924 /* If the sequence of hex digits does not end with '}', then we don't
925 recognize this construct; fall through to the normal \x handling. */
926 }
927 #endif
928
929 /* Read just a single hex char */
930
931 c = 0;
932 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
933 {
934 int cc; /* Some compilers don't like ++ */
935 cc = *(++ptr); /* in initializers */
936 #if !EBCDIC /* ASCII coding */
937 if (cc >= 'a') cc -= 32; /* Convert to upper case */
938 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
939 #else /* EBCDIC coding */
940 if (cc <= 'z') cc += 64; /* Convert to upper case */
941 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
942 #endif
943 }
944 break;
945
946 /* Other special escapes not starting with a digit are straightforward */
947
948 case 'c':
949 c = *(++ptr);
950 if (c == 0)
951 {
952 *errorptr = ERR2;
953 return 0;
954 }
955
956 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
957 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
958 (However, an EBCDIC equivalent has now been added.) */
959
960 #if !EBCDIC /* ASCII coding */
961 if (c >= 'a' && c <= 'z') c -= 32;
962 c ^= 0x40;
963 #else /* EBCDIC coding */
964 if (c >= 'a' && c <= 'z') c += 64;
965 c ^= 0xC0;
966 #endif
967 break;
968
969 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
970 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
971 for Perl compatibility, it is a literal. This code looks a bit odd, but
972 there used to be some cases other than the default, and there may be again
973 in future, so I haven't "optimized" it. */
974
975 default:
976 if ((options & PCRE_EXTRA) != 0) switch(c)
977 {
978 default:
979 *errorptr = ERR3;
980 break;
981 }
982 break;
983 }
984 }
985
986 *ptrptr = ptr;
987 return c;
988 }
989
990
991
992 /*************************************************
993 * Check for counted repeat *
994 *************************************************/
995
996 /* This function is called when a '{' is encountered in a place where it might
997 start a quantifier. It looks ahead to see if it really is a quantifier or not.
998 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
999 where the ddds are digits.
1000
1001 Arguments:
1002 p pointer to the first char after '{'
1003
1004 Returns: TRUE or FALSE
1005 */
1006
1007 static BOOL
1008 is_counted_repeat(const uschar *p)
1009 {
1010 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1011 while ((digitab[*p] & ctype_digit) != 0) p++;
1012 if (*p == '}') return TRUE;
1013
1014 if (*p++ != ',') return FALSE;
1015 if (*p == '}') return TRUE;
1016
1017 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1018 while ((digitab[*p] & ctype_digit) != 0) p++;
1019
1020 return (*p == '}');
1021 }
1022
1023
1024
1025 /*************************************************
1026 * Read repeat counts *
1027 *************************************************/
1028
1029 /* Read an item of the form {n,m} and return the values. This is called only
1030 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1031 so the syntax is guaranteed to be correct, but we need to check the values.
1032
1033 Arguments:
1034 p pointer to first char after '{'
1035 minp pointer to int for min
1036 maxp pointer to int for max
1037 returned as -1 if no max
1038 errorptr points to pointer to error message
1039
1040 Returns: pointer to '}' on success;
1041 current ptr on error, with errorptr set
1042 */
1043
1044 static const uschar *
1045 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1046 {
1047 int min = 0;
1048 int max = -1;
1049
1050 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1051
1052 if (*p == '}') max = min; else
1053 {
1054 if (*(++p) != '}')
1055 {
1056 max = 0;
1057 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1058 if (max < min)
1059 {
1060 *errorptr = ERR4;
1061 return p;
1062 }
1063 }
1064 }
1065
1066 /* Do paranoid checks, then fill in the required variables, and pass back the
1067 pointer to the terminating '}'. */
1068
1069 if (min > 65535 || max > 65535)
1070 *errorptr = ERR5;
1071 else
1072 {
1073 *minp = min;
1074 *maxp = max;
1075 }
1076 return p;
1077 }
1078
1079
1080
1081 /*************************************************
1082 * Find first significant op code *
1083 *************************************************/
1084
1085 /* This is called by several functions that scan a compiled expression looking
1086 for a fixed first character, or an anchoring op code etc. It skips over things
1087 that do not influence this. For some calls, a change of option is important.
1088
1089 Arguments:
1090 code pointer to the start of the group
1091 options pointer to external options
1092 optbit the option bit whose changing is significant, or
1093 zero if none are
1094
1095 Returns: pointer to the first significant opcode
1096 */
1097
1098 static const uschar*
1099 first_significant_code(const uschar *code, int *options, int optbit)
1100 {
1101 for (;;)
1102 {
1103 switch ((int)*code)
1104 {
1105 case OP_OPT:
1106 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1107 *options = (int)code[1];
1108 code += 2;
1109 break;
1110
1111 case OP_ASSERT_NOT:
1112 case OP_ASSERTBACK:
1113 case OP_ASSERTBACK_NOT:
1114 do code += GET(code, 1); while (*code == OP_ALT);
1115 /* Fall through */
1116
1117 case OP_CALLOUT:
1118 case OP_CREF:
1119 case OP_BRANUMBER:
1120 case OP_WORD_BOUNDARY:
1121 case OP_NOT_WORD_BOUNDARY:
1122 code += OP_lengths[*code];
1123 break;
1124
1125 default:
1126 return code;
1127 }
1128 }
1129 /* Control never reaches here */
1130 }
1131
1132
1133
1134
1135 /*************************************************
1136 * Find the fixed length of a pattern *
1137 *************************************************/
1138
1139 /* Scan a pattern and compute the fixed length of subject that will match it,
1140 if the length is fixed. This is needed for dealing with backward assertions.
1141 In UTF8 mode, the result is in characters rather than bytes.
1142
1143 Arguments:
1144 code points to the start of the pattern (the bracket)
1145 options the compiling options
1146
1147 Returns: the fixed length, or -1 if there is no fixed length,
1148 or -2 if \C was encountered
1149 */
1150
1151 static int
1152 find_fixedlength(uschar *code, int options)
1153 {
1154 int length = -1;
1155
1156 register int branchlength = 0;
1157 register uschar *cc = code + 1 + LINK_SIZE;
1158
1159 /* Scan along the opcodes for this branch. If we get to the end of the
1160 branch, check the length against that of the other branches. */
1161
1162 for (;;)
1163 {
1164 int d;
1165 register int op = *cc;
1166 if (op >= OP_BRA) op = OP_BRA;
1167
1168 switch (op)
1169 {
1170 case OP_BRA:
1171 case OP_ONCE:
1172 case OP_COND:
1173 d = find_fixedlength(cc, options);
1174 if (d < 0) return d;
1175 branchlength += d;
1176 do cc += GET(cc, 1); while (*cc == OP_ALT);
1177 cc += 1 + LINK_SIZE;
1178 break;
1179
1180 /* Reached end of a branch; if it's a ket it is the end of a nested
1181 call. If it's ALT it is an alternation in a nested call. If it is
1182 END it's the end of the outer call. All can be handled by the same code. */
1183
1184 case OP_ALT:
1185 case OP_KET:
1186 case OP_KETRMAX:
1187 case OP_KETRMIN:
1188 case OP_END:
1189 if (length < 0) length = branchlength;
1190 else if (length != branchlength) return -1;
1191 if (*cc != OP_ALT) return length;
1192 cc += 1 + LINK_SIZE;
1193 branchlength = 0;
1194 break;
1195
1196 /* Skip over assertive subpatterns */
1197
1198 case OP_ASSERT:
1199 case OP_ASSERT_NOT:
1200 case OP_ASSERTBACK:
1201 case OP_ASSERTBACK_NOT:
1202 do cc += GET(cc, 1); while (*cc == OP_ALT);
1203 /* Fall through */
1204
1205 /* Skip over things that don't match chars */
1206
1207 case OP_REVERSE:
1208 case OP_BRANUMBER:
1209 case OP_CREF:
1210 case OP_OPT:
1211 case OP_CALLOUT:
1212 case OP_SOD:
1213 case OP_SOM:
1214 case OP_EOD:
1215 case OP_EODN:
1216 case OP_CIRC:
1217 case OP_DOLL:
1218 case OP_NOT_WORD_BOUNDARY:
1219 case OP_WORD_BOUNDARY:
1220 cc += OP_lengths[*cc];
1221 break;
1222
1223 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1224 This requires a scan of the string, unfortunately. We assume valid UTF-8
1225 strings, so all we do is reduce the length by one for every byte whose bits
1226 are 10xxxxxx. */
1227
1228 case OP_CHARS:
1229 branchlength += *(++cc);
1230 #ifdef SUPPORT_UTF8
1231 if ((options & PCRE_UTF8) != 0)
1232 for (d = 1; d <= *cc; d++)
1233 if ((cc[d] & 0xc0) == 0x80) branchlength--;
1234 #endif
1235 cc += *cc + 1;
1236 break;
1237
1238 /* Handle exact repetitions. The count is already in characters, but we
1239 need to skip over a multibyte character in UTF8 mode. */
1240
1241 case OP_EXACT:
1242 branchlength += GET2(cc,1);
1243 cc += 4;
1244 #ifdef SUPPORT_UTF8
1245 if ((options & PCRE_UTF8) != 0)
1246 {
1247 while((*cc & 0x80) == 0x80) cc++;
1248 }
1249 #endif
1250 break;
1251
1252 case OP_TYPEEXACT:
1253 branchlength += GET2(cc,1);
1254 cc += 4;
1255 break;
1256
1257 /* Handle single-char matchers */
1258
1259 case OP_NOT_DIGIT:
1260 case OP_DIGIT:
1261 case OP_NOT_WHITESPACE:
1262 case OP_WHITESPACE:
1263 case OP_NOT_WORDCHAR:
1264 case OP_WORDCHAR:
1265 case OP_ANY:
1266 branchlength++;
1267 cc++;
1268 break;
1269
1270 /* The single-byte matcher isn't allowed */
1271
1272 case OP_ANYBYTE:
1273 return -2;
1274
1275 /* Check a class for variable quantification */
1276
1277 #ifdef SUPPORT_UTF8
1278 case OP_XCLASS:
1279 cc += GET(cc, 1) - 33;
1280 /* Fall through */
1281 #endif
1282
1283 case OP_CLASS:
1284 case OP_NCLASS:
1285 cc += 33;
1286
1287 switch (*cc)
1288 {
1289 case OP_CRSTAR:
1290 case OP_CRMINSTAR:
1291 case OP_CRQUERY:
1292 case OP_CRMINQUERY:
1293 return -1;
1294
1295 case OP_CRRANGE:
1296 case OP_CRMINRANGE:
1297 if (GET2(cc,1) != GET2(cc,3)) return -1;
1298 branchlength += GET2(cc,1);
1299 cc += 5;
1300 break;
1301
1302 default:
1303 branchlength++;
1304 }
1305 break;
1306
1307 /* Anything else is variable length */
1308
1309 default:
1310 return -1;
1311 }
1312 }
1313 /* Control never gets here */
1314 }
1315
1316
1317
1318
1319 /*************************************************
1320 * Scan compiled regex for numbered bracket *
1321 *************************************************/
1322
1323 /* This little function scans through a compiled pattern until it finds a
1324 capturing bracket with the given number.
1325
1326 Arguments:
1327 code points to start of expression
1328 utf8 TRUE in UTF-8 mode
1329 number the required bracket number
1330
1331 Returns: pointer to the opcode for the bracket, or NULL if not found
1332 */
1333
1334 static const uschar *
1335 find_bracket(const uschar *code, BOOL utf8, int number)
1336 {
1337 #ifndef SUPPORT_UTF8
1338 utf8 = utf8; /* Stop pedantic compilers complaining */
1339 #endif
1340
1341 for (;;)
1342 {
1343 register int c = *code;
1344 if (c == OP_END) return NULL;
1345 else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1346 else if (c > OP_BRA)
1347 {
1348 int n = c - OP_BRA;
1349 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1350 if (n == number) return (uschar *)code;
1351 code += OP_lengths[OP_BRA];
1352 }
1353 else
1354 {
1355 code += OP_lengths[c];
1356
1357 #ifdef SUPPORT_UTF8
1358
1359 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1360 by a multi-byte character. The length in the table is a minimum, so we have
1361 to scan along to skip the extra characters. All opcodes are less than 128,
1362 so we can use relatively efficient code. */
1363
1364 if (utf8) switch(c)
1365 {
1366 case OP_EXACT:
1367 case OP_UPTO:
1368 case OP_MINUPTO:
1369 case OP_STAR:
1370 case OP_MINSTAR:
1371 case OP_PLUS:
1372 case OP_MINPLUS:
1373 case OP_QUERY:
1374 case OP_MINQUERY:
1375 while ((*code & 0xc0) == 0x80) code++;
1376 break;
1377
1378 /* XCLASS is used for classes that cannot be represented just by a bit
1379 map. This includes negated single high-valued characters. The length in
1380 the table is zero; the actual length is stored in the compled code. */
1381
1382 case OP_XCLASS:
1383 code += GET(code, 1) + 1;
1384 break;
1385 }
1386 #endif
1387 }
1388 }
1389 }
1390
1391
1392
1393 /*************************************************
1394 * Scan compiled regex for recursion reference *
1395 *************************************************/
1396
1397 /* This little function scans through a compiled pattern until it finds an
1398 instance of OP_RECURSE.
1399
1400 Arguments:
1401 code points to start of expression
1402 utf8 TRUE in UTF-8 mode
1403
1404 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1405 */
1406
1407 static const uschar *
1408 find_recurse(const uschar *code, BOOL utf8)
1409 {
1410 #ifndef SUPPORT_UTF8
1411 utf8 = utf8; /* Stop pedantic compilers complaining */
1412 #endif
1413
1414 for (;;)
1415 {
1416 register int c = *code;
1417 if (c == OP_END) return NULL;
1418 else if (c == OP_RECURSE) return code;
1419 else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1420 else if (c > OP_BRA)
1421 {
1422 code += OP_lengths[OP_BRA];
1423 }
1424 else
1425 {
1426 code += OP_lengths[c];
1427
1428 #ifdef SUPPORT_UTF8
1429
1430 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1431 by a multi-byte character. The length in the table is a minimum, so we have
1432 to scan along to skip the extra characters. All opcodes are less than 128,
1433 so we can use relatively efficient code. */
1434
1435 if (utf8) switch(c)
1436 {
1437 case OP_EXACT:
1438 case OP_UPTO:
1439 case OP_MINUPTO:
1440 case OP_STAR:
1441 case OP_MINSTAR:
1442 case OP_PLUS:
1443 case OP_MINPLUS:
1444 case OP_QUERY:
1445 case OP_MINQUERY:
1446 while ((*code & 0xc0) == 0x80) code++;
1447 break;
1448
1449 /* XCLASS is used for classes that cannot be represented just by a bit
1450 map. This includes negated single high-valued characters. The length in
1451 the table is zero; the actual length is stored in the compled code. */
1452
1453 case OP_XCLASS:
1454 code += GET(code, 1) + 1;
1455 break;
1456 }
1457 #endif
1458 }
1459 }
1460 }
1461
1462
1463
1464 /*************************************************
1465 * Scan compiled branch for non-emptiness *
1466 *************************************************/
1467
1468 /* This function scans through a branch of a compiled pattern to see whether it
1469 can match the empty string or not. It is called only from could_be_empty()
1470 below. Note that first_significant_code() skips over assertions. If we hit an
1471 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1472 whose current branch will already have been scanned.
1473
1474 Arguments:
1475 code points to start of search
1476 endcode points to where to stop
1477 utf8 TRUE if in UTF8 mode
1478
1479 Returns: TRUE if what is matched could be empty
1480 */
1481
1482 static BOOL
1483 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1484 {
1485 register int c;
1486 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1487 code < endcode;
1488 code = first_significant_code(code + OP_lengths[c], NULL, 0))
1489 {
1490 const uschar *ccode;
1491
1492 c = *code;
1493
1494 if (c >= OP_BRA)
1495 {
1496 BOOL empty_branch;
1497 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1498
1499 /* Scan a closed bracket */
1500
1501 empty_branch = FALSE;
1502 do
1503 {
1504 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1505 empty_branch = TRUE;
1506 code += GET(code, 1);
1507 }
1508 while (*code == OP_ALT);
1509 if (!empty_branch) return FALSE; /* All branches are non-empty */
1510 code += 1 + LINK_SIZE;
1511 c = *code;
1512 }
1513
1514 else switch (c)
1515 {
1516 /* Check for quantifiers after a class */
1517
1518 #ifdef SUPPORT_UTF8
1519 case OP_XCLASS:
1520 ccode = code + GET(code, 1);
1521 goto CHECK_CLASS_REPEAT;
1522 #endif
1523
1524 case OP_CLASS:
1525 case OP_NCLASS:
1526 ccode = code + 33;
1527
1528 #ifdef SUPPORT_UTF8
1529 CHECK_CLASS_REPEAT:
1530 #endif
1531
1532 switch (*ccode)
1533 {
1534 case OP_CRSTAR: /* These could be empty; continue */
1535 case OP_CRMINSTAR:
1536 case OP_CRQUERY:
1537 case OP_CRMINQUERY:
1538 break;
1539
1540 default: /* Non-repeat => class must match */
1541 case OP_CRPLUS: /* These repeats aren't empty */
1542 case OP_CRMINPLUS:
1543 return FALSE;
1544
1545 case OP_CRRANGE:
1546 case OP_CRMINRANGE:
1547 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1548 break;
1549 }
1550 break;
1551
1552 /* Opcodes that must match a character */
1553
1554 case OP_NOT_DIGIT:
1555 case OP_DIGIT:
1556 case OP_NOT_WHITESPACE:
1557 case OP_WHITESPACE:
1558 case OP_NOT_WORDCHAR:
1559 case OP_WORDCHAR:
1560 case OP_ANY:
1561 case OP_ANYBYTE:
1562 case OP_CHARS:
1563 case OP_NOT:
1564 case OP_PLUS:
1565 case OP_MINPLUS:
1566 case OP_EXACT:
1567 case OP_NOTPLUS:
1568 case OP_NOTMINPLUS:
1569 case OP_NOTEXACT:
1570 case OP_TYPEPLUS:
1571 case OP_TYPEMINPLUS:
1572 case OP_TYPEEXACT:
1573 return FALSE;
1574
1575 /* End of branch */
1576
1577 case OP_KET:
1578 case OP_KETRMAX:
1579 case OP_KETRMIN:
1580 case OP_ALT:
1581 return TRUE;
1582
1583 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1584 followed by a multibyte character */
1585
1586 #ifdef SUPPORT_UTF8
1587 case OP_STAR:
1588 case OP_MINSTAR:
1589 case OP_QUERY:
1590 case OP_MINQUERY:
1591 case OP_UPTO:
1592 case OP_MINUPTO:
1593 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1594 break;
1595 #endif
1596 }
1597 }
1598
1599 return TRUE;
1600 }
1601
1602
1603
1604 /*************************************************
1605 * Scan compiled regex for non-emptiness *
1606 *************************************************/
1607
1608 /* This function is called to check for left recursive calls. We want to check
1609 the current branch of the current pattern to see if it could match the empty
1610 string. If it could, we must look outwards for branches at other levels,
1611 stopping when we pass beyond the bracket which is the subject of the recursion.
1612
1613 Arguments:
1614 code points to start of the recursion
1615 endcode points to where to stop (current RECURSE item)
1616 bcptr points to the chain of current (unclosed) branch starts
1617 utf8 TRUE if in UTF-8 mode
1618
1619 Returns: TRUE if what is matched could be empty
1620 */
1621
1622 static BOOL
1623 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1624 BOOL utf8)
1625 {
1626 while (bcptr != NULL && bcptr->current >= code)
1627 {
1628 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1629 bcptr = bcptr->outer;
1630 }
1631 return TRUE;
1632 }
1633
1634
1635
1636 /*************************************************
1637 * Check for POSIX class syntax *
1638 *************************************************/
1639
1640 /* This function is called when the sequence "[:" or "[." or "[=" is
1641 encountered in a character class. It checks whether this is followed by an
1642 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1643 ".]" or "=]".
1644
1645 Argument:
1646 ptr pointer to the initial [
1647 endptr where to return the end pointer
1648 cd pointer to compile data
1649
1650 Returns: TRUE or FALSE
1651 */
1652
1653 static BOOL
1654 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1655 {
1656 int terminator; /* Don't combine these lines; the Solaris cc */
1657 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1658 if (*(++ptr) == '^') ptr++;
1659 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1660 if (*ptr == terminator && ptr[1] == ']')
1661 {
1662 *endptr = ptr;
1663 return TRUE;
1664 }
1665 return FALSE;
1666 }
1667
1668
1669
1670
1671 /*************************************************
1672 * Check POSIX class name *
1673 *************************************************/
1674
1675 /* This function is called to check the name given in a POSIX-style class entry
1676 such as [:alnum:].
1677
1678 Arguments:
1679 ptr points to the first letter
1680 len the length of the name
1681
1682 Returns: a value representing the name, or -1 if unknown
1683 */
1684
1685 static int
1686 check_posix_name(const uschar *ptr, int len)
1687 {
1688 register int yield = 0;
1689 while (posix_name_lengths[yield] != 0)
1690 {
1691 if (len == posix_name_lengths[yield] &&
1692 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1693 yield++;
1694 }
1695 return -1;
1696 }
1697
1698
1699 /*************************************************
1700 * Adjust OP_RECURSE items in repeated group *
1701 *************************************************/
1702
1703 /* OP_RECURSE items contain an offset from the start of the regex to the group
1704 that is referenced. This means that groups can be replicated for fixed
1705 repetition simply by copying (because the recursion is allowed to refer to
1706 earlier groups that are outside the current group). However, when a group is
1707 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1708 it, after it has been compiled. This means that any OP_RECURSE items within it
1709 that refer to the group itself or any contained groups have to have their
1710 offsets adjusted. That is the job of this function. Before it is called, the
1711 partially compiled regex must be temporarily terminated with OP_END.
1712
1713 Arguments:
1714 group points to the start of the group
1715 adjust the amount by which the group is to be moved
1716 utf8 TRUE in UTF-8 mode
1717 cd contains pointers to tables etc.
1718
1719 Returns: nothing
1720 */
1721
1722 static void
1723 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1724 {
1725 uschar *ptr = group;
1726 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1727 {
1728 int offset = GET(ptr, 1);
1729 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1730 ptr += 1 + LINK_SIZE;
1731 }
1732 }
1733
1734
1735
1736 /*************************************************
1737 * Compile one branch *
1738 *************************************************/
1739
1740 /* Scan the pattern, compiling it into the code vector. If the options are
1741 changed during the branch, the pointer is used to change the external options
1742 bits.
1743
1744 Arguments:
1745 optionsptr pointer to the option bits
1746 brackets points to number of extracting brackets used
1747 code points to the pointer to the current code point
1748 ptrptr points to the current pattern pointer
1749 errorptr points to pointer to error message
1750 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1751 reqbyteptr set to the last literal character required, else < 0
1752 bcptr points to current branch chain
1753 cd contains pointers to tables etc.
1754
1755 Returns: TRUE on success
1756 FALSE, with *errorptr set on error
1757 */
1758
1759 static BOOL
1760 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1761 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1762 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1763 {
1764 int repeat_type, op_type;
1765 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1766 int bravalue = 0;
1767 int length;
1768 int greedy_default, greedy_non_default;
1769 int firstbyte, reqbyte;
1770 int zeroreqbyte, zerofirstbyte;
1771 int req_caseopt, reqvary, tempreqvary;
1772 int condcount = 0;
1773 int options = *optionsptr;
1774 register int c;
1775 register uschar *code = *codeptr;
1776 uschar *tempcode;
1777 BOOL inescq = FALSE;
1778 BOOL groupsetfirstbyte = FALSE;
1779 const uschar *ptr = *ptrptr;
1780 const uschar *tempptr;
1781 uschar *previous = NULL;
1782 uschar class[32];
1783
1784 #ifdef SUPPORT_UTF8
1785 BOOL class_utf8;
1786 BOOL utf8 = (options & PCRE_UTF8) != 0;
1787 uschar *class_utf8data;
1788 uschar utf8_char[6];
1789 #else
1790 BOOL utf8 = FALSE;
1791 #endif
1792
1793 /* Set up the default and non-default settings for greediness */
1794
1795 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1796 greedy_non_default = greedy_default ^ 1;
1797
1798 /* Initialize no first char, no required char. REQ_UNSET means "no char
1799 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1800 matches a non-fixed char first char; reqbyte just remains unset if we never
1801 find one.
1802
1803 When we hit a repeat whose minimum is zero, we may have to adjust these values
1804 to take the zero repeat into account. This is implemented by setting them to
1805 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1806 item types that can be repeated set these backoff variables appropriately. */
1807
1808 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1809
1810 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1811 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1812 value > 255. It is added into the firstbyte or reqbyte variables to record the
1813 case status of the value. */
1814
1815 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1816
1817 /* Switch on next character until the end of the branch */
1818
1819 for (;; ptr++)
1820 {
1821 BOOL negate_class;
1822 BOOL possessive_quantifier;
1823 int class_charcount;
1824 int class_lastchar;
1825 int newoptions;
1826 int recno;
1827 int skipbytes;
1828 int subreqbyte;
1829 int subfirstbyte;
1830
1831 c = *ptr;
1832 if (inescq && c != 0) goto NORMAL_CHAR;
1833
1834 if ((options & PCRE_EXTENDED) != 0)
1835 {
1836 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1837 if (c == '#')
1838 {
1839 /* The space before the ; is to avoid a warning on a silly compiler
1840 on the Macintosh. */
1841 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1842 if (c != 0) continue; /* Else fall through to handle end of string */
1843 }
1844 }
1845
1846 switch(c)
1847 {
1848 /* The branch terminates at end of string, |, or ). */
1849
1850 case 0:
1851 case '|':
1852 case ')':
1853 *firstbyteptr = firstbyte;
1854 *reqbyteptr = reqbyte;
1855 *codeptr = code;
1856 *ptrptr = ptr;
1857 return TRUE;
1858
1859 /* Handle single-character metacharacters. In multiline mode, ^ disables
1860 the setting of any following char as a first character. */
1861
1862 case '^':
1863 if ((options & PCRE_MULTILINE) != 0)
1864 {
1865 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1866 }
1867 previous = NULL;
1868 *code++ = OP_CIRC;
1869 break;
1870
1871 case '$':
1872 previous = NULL;
1873 *code++ = OP_DOLL;
1874 break;
1875
1876 /* There can never be a first char if '.' is first, whatever happens about
1877 repeats. The value of reqbyte doesn't change either. */
1878
1879 case '.':
1880 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1881 zerofirstbyte = firstbyte;
1882 zeroreqbyte = reqbyte;
1883 previous = code;
1884 *code++ = OP_ANY;
1885 break;
1886
1887 /* Character classes. If the included characters are all < 255 in value, we
1888 build a 32-byte bitmap of the permitted characters, except in the special
1889 case where there is only one such character. For negated classes, we build
1890 the map as usual, then invert it at the end. However, we use a different
1891 opcode so that data characters > 255 can be handled correctly.
1892
1893 If the class contains characters outside the 0-255 range, a different
1894 opcode is compiled. It may optionally have a bit map for characters < 256,
1895 but those above are explicitly listed afterwards. A flag byte tells
1896 whether the bitmap is present, and whether this is a negated class or not.
1897 */
1898
1899 case '[':
1900 previous = code;
1901
1902 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1903 they are encountered at the top level, so we'll do that too. */
1904
1905 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1906 check_posix_syntax(ptr, &tempptr, cd))
1907 {
1908 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1909 goto FAILED;
1910 }
1911
1912 /* If the first character is '^', set the negation flag and skip it. */
1913
1914 if ((c = *(++ptr)) == '^')
1915 {
1916 negate_class = TRUE;
1917 c = *(++ptr);
1918 }
1919 else
1920 {
1921 negate_class = FALSE;
1922 }
1923
1924 /* Keep a count of chars with values < 256 so that we can optimize the case
1925 of just a single character (as long as it's < 256). For higher valued UTF-8
1926 characters, we don't yet do any optimization. */
1927
1928 class_charcount = 0;
1929 class_lastchar = -1;
1930
1931 #ifdef SUPPORT_UTF8
1932 class_utf8 = FALSE; /* No chars >= 256 */
1933 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1934 #endif
1935
1936 /* Initialize the 32-char bit map to all zeros. We have to build the
1937 map in a temporary bit of store, in case the class contains only 1
1938 character (< 256), because in that case the compiled code doesn't use the
1939 bit map. */
1940
1941 memset(class, 0, 32 * sizeof(uschar));
1942
1943 /* Process characters until ] is reached. By writing this as a "do" it
1944 means that an initial ] is taken as a data character. The first pass
1945 through the regex checked the overall syntax, so we don't need to be very
1946 strict here. At the start of the loop, c contains the first byte of the
1947 character. */
1948
1949 do
1950 {
1951 #ifdef SUPPORT_UTF8
1952 if (utf8 && c > 127)
1953 { /* Braces are required because the */
1954 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1955 }
1956 #endif
1957
1958 /* Inside \Q...\E everything is literal except \E */
1959
1960 if (inescq)
1961 {
1962 if (c == '\\' && ptr[1] == 'E')
1963 {
1964 inescq = FALSE;
1965 ptr++;
1966 continue;
1967 }
1968 else goto LONE_SINGLE_CHARACTER;
1969 }
1970
1971 /* Handle POSIX class names. Perl allows a negation extension of the
1972 form [:^name:]. A square bracket that doesn't match the syntax is
1973 treated as a literal. We also recognize the POSIX constructions
1974 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1975 5.6 and 5.8 do. */
1976
1977 if (c == '[' &&
1978 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1979 check_posix_syntax(ptr, &tempptr, cd))
1980 {
1981 BOOL local_negate = FALSE;
1982 int posix_class, i;
1983 register const uschar *cbits = cd->cbits;
1984
1985 if (ptr[1] != ':')
1986 {
1987 *errorptr = ERR31;
1988 goto FAILED;
1989 }
1990
1991 ptr += 2;
1992 if (*ptr == '^')
1993 {
1994 local_negate = TRUE;
1995 ptr++;
1996 }
1997
1998 posix_class = check_posix_name(ptr, tempptr - ptr);
1999 if (posix_class < 0)
2000 {
2001 *errorptr = ERR30;
2002 goto FAILED;
2003 }
2004
2005 /* If matching is caseless, upper and lower are converted to
2006 alpha. This relies on the fact that the class table starts with
2007 alpha, lower, upper as the first 3 entries. */
2008
2009 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2010 posix_class = 0;
2011
2012 /* Or into the map we are building up to 3 of the static class
2013 tables, or their negations. The [:blank:] class sets up the same
2014 chars as the [:space:] class (all white space). We remove the vertical
2015 white space chars afterwards. */
2016
2017 posix_class *= 3;
2018 for (i = 0; i < 3; i++)
2019 {
2020 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2021 int taboffset = posix_class_maps[posix_class + i];
2022 if (taboffset < 0) break;
2023 if (local_negate)
2024 {
2025 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
2026 if (blankclass) class[1] |= 0x3c;
2027 }
2028 else
2029 {
2030 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
2031 if (blankclass) class[1] &= ~0x3c;
2032 }
2033 }
2034
2035 ptr = tempptr + 1;
2036 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2037 continue; /* End of POSIX syntax handling */
2038 }
2039
2040 /* Backslash may introduce a single character, or it may introduce one
2041 of the specials, which just set a flag. Escaped items are checked for
2042 validity in the pre-compiling pass. The sequence \b is a special case.
2043 Inside a class (and only there) it is treated as backspace. Elsewhere
2044 it marks a word boundary. Other escapes have preset maps ready to
2045 or into the one we are building. We assume they have more than one
2046 character in them, so set class_charcount bigger than one. */
2047
2048 if (c == '\\')
2049 {
2050 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2051 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2052
2053 if (-c == ESC_Q) /* Handle start of quoted string */
2054 {
2055 if (ptr[1] == '\\' && ptr[2] == 'E')
2056 {
2057 ptr += 2; /* avoid empty string */
2058 }
2059 else inescq = TRUE;
2060 continue;
2061 }
2062
2063 else if (c < 0)
2064 {
2065 register const uschar *cbits = cd->cbits;
2066 class_charcount = 10; /* Greater than 1 is what matters */
2067 switch (-c)
2068 {
2069 case ESC_d:
2070 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
2071 continue;
2072
2073 case ESC_D:
2074 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
2075 continue;
2076
2077 case ESC_w:
2078 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
2079 continue;
2080
2081 case ESC_W:
2082 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
2083 continue;
2084
2085 case ESC_s:
2086 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
2087 class[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2088 continue;
2089
2090 case ESC_S:
2091 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
2092 class[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2093 continue;
2094
2095 /* Unrecognized escapes are faulted if PCRE is running in its
2096 strict mode. By default, for compatibility with Perl, they are
2097 treated as literals. */
2098
2099 default:
2100 if ((options & PCRE_EXTRA) != 0)
2101 {
2102 *errorptr = ERR7;
2103 goto FAILED;
2104 }
2105 c = *ptr; /* The final character */
2106 }
2107 }
2108
2109 /* Fall through if we have a single character (c >= 0). This may be
2110 > 255 in UTF-8 mode. */
2111
2112 } /* End of backslash handling */
2113
2114 /* A single character may be followed by '-' to form a range. However,
2115 Perl does not permit ']' to be the end of the range. A '-' character
2116 here is treated as a literal. */
2117
2118 if (ptr[1] == '-' && ptr[2] != ']')
2119 {
2120 int d;
2121 ptr += 2;
2122
2123 #ifdef SUPPORT_UTF8
2124 if (utf8)
2125 { /* Braces are required because the */
2126 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2127 }
2128 else
2129 #endif
2130 d = *ptr;
2131
2132 /* The second part of a range can be a single-character escape, but
2133 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2134 in such circumstances. */
2135
2136 if (d == '\\')
2137 {
2138 const uschar *oldptr = ptr;
2139 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2140
2141 /* \b is backspace; any other special means the '-' was literal */
2142
2143 if (d < 0)
2144 {
2145 if (d == -ESC_b) d = '\b'; else
2146 {
2147 ptr = oldptr - 2;
2148 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2149 }
2150 }
2151 }
2152
2153 /* Check that the two values are in the correct order */
2154
2155 if (d < c)
2156 {
2157 *errorptr = ERR8;
2158 goto FAILED;
2159 }
2160
2161 /* If d is greater than 255, we can't just use the bit map, so set up
2162 for the UTF-8 supporting class type. If we are not caseless, we can
2163 just set up a single range. If we are caseless, the characters < 256
2164 are handled with a bitmap, in order to get the case-insensitive
2165 handling. */
2166
2167 #ifdef SUPPORT_UTF8
2168 if (d > 255)
2169 {
2170 class_utf8 = TRUE;
2171 *class_utf8data++ = XCL_RANGE;
2172 if ((options & PCRE_CASELESS) == 0)
2173 {
2174 class_utf8data += ord2utf8(c, class_utf8data);
2175 class_utf8data += ord2utf8(d, class_utf8data);
2176 continue; /* Go get the next char in the class */
2177 }
2178 class_utf8data += ord2utf8(256, class_utf8data);
2179 class_utf8data += ord2utf8(d, class_utf8data);
2180 d = 255;
2181 /* Fall through */
2182 }
2183 #endif
2184 /* We use the bit map if the range is entirely < 256, or if part of it
2185 is < 256 and matching is caseless. */
2186
2187 for (; c <= d; c++)
2188 {
2189 class[c/8] |= (1 << (c&7));
2190 if ((options & PCRE_CASELESS) != 0)
2191 {
2192 int uc = cd->fcc[c]; /* flip case */
2193 class[uc/8] |= (1 << (uc&7));
2194 }
2195 class_charcount++; /* in case a one-char range */
2196 class_lastchar = c;
2197 }
2198
2199 continue; /* Go get the next char in the class */
2200 }
2201
2202 /* Handle a lone single character - we can get here for a normal
2203 non-escape char, or after \ that introduces a single character. */
2204
2205 LONE_SINGLE_CHARACTER:
2206
2207 /* Handle a multibyte character */
2208
2209 #ifdef SUPPORT_UTF8
2210 if (utf8 && c > 255)
2211 {
2212 class_utf8 = TRUE;
2213 *class_utf8data++ = XCL_SINGLE;
2214 class_utf8data += ord2utf8(c, class_utf8data);
2215 }
2216 else
2217 #endif
2218 /* Handle a single-byte character */
2219 {
2220 class [c/8] |= (1 << (c&7));
2221 if ((options & PCRE_CASELESS) != 0)
2222 {
2223 c = cd->fcc[c]; /* flip case */
2224 class[c/8] |= (1 << (c&7));
2225 }
2226 class_charcount++;
2227 class_lastchar = c;
2228 }
2229 }
2230
2231 /* Loop until ']' reached; the check for end of string happens inside the
2232 loop. This "while" is the end of the "do" above. */
2233
2234 while ((c = *(++ptr)) != ']' || inescq);
2235
2236 /* If class_charcount is 1, we saw precisely one character with a value <
2237 256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
2238 the one character is < 128. In non-UTF-8 mode we can always optimize.
2239
2240 The optimization throws away the bit map. We turn the item into a
2241 1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
2242 that OP_NOT does not support multibyte characters. In the positive case, it
2243 can cause firstbyte to be set. Otherwise, there can be no first char if
2244 this item is first, whatever repeat count may follow. In the case of
2245 reqbyte, save the previous value for reinstating. */
2246
2247 #ifdef SUPPORT_UTF8
2248 if (class_charcount == 1 &&
2249 (!utf8 ||
2250 (!class_utf8 && class_lastchar < 128)))
2251 #else
2252 if (class_charcount == 1)
2253 #endif
2254 {
2255 zeroreqbyte = reqbyte;
2256 if (negate_class)
2257 {
2258 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2259 zerofirstbyte = firstbyte;
2260 *code++ = OP_NOT;
2261 }
2262 else
2263 {
2264 if (firstbyte == REQ_UNSET)
2265 {
2266 zerofirstbyte = REQ_NONE;
2267 firstbyte = class_lastchar | req_caseopt;
2268 }
2269 else
2270 {
2271 zerofirstbyte = firstbyte;
2272 reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
2273 }
2274 *code++ = OP_CHARS;
2275 *code++ = 1;
2276 }
2277 *code++ = class_lastchar;
2278 break; /* End of class handling */
2279 } /* End of 1-byte optimization */
2280
2281 /* Otherwise, if this is the first thing in the branch, there can be no
2282 first char setting, whatever the repeat count. Any reqbyte setting must
2283 remain unchanged after any kind of repeat. */
2284
2285 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2286 zerofirstbyte = firstbyte;
2287 zeroreqbyte = reqbyte;
2288
2289 /* If there are characters with values > 255, we have to compile an
2290 extended class, with its own opcode. If there are no characters < 256,
2291 we can omit the bitmap. */
2292
2293 #ifdef SUPPORT_UTF8
2294 if (class_utf8)
2295 {
2296 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2297 *code++ = OP_XCLASS;
2298 code += LINK_SIZE;
2299 *code = negate_class? XCL_NOT : 0;
2300
2301 /* If the map is required, install it, and move on to the end of
2302 the extra data */
2303
2304 if (class_charcount > 0)
2305 {
2306 *code++ |= XCL_MAP;
2307 memcpy(code, class, 32);
2308 code = class_utf8data;
2309 }
2310
2311 /* If the map is not required, slide down the extra data. */
2312
2313 else
2314 {
2315 int len = class_utf8data - (code + 33);
2316 memmove(code + 1, code + 33, len);
2317 code += len + 1;
2318 }
2319
2320 /* Now fill in the complete length of the item */
2321
2322 PUT(previous, 1, code - previous);
2323 break; /* End of class handling */
2324 }
2325 #endif
2326
2327 /* If there are no characters > 255, negate the 32-byte map if necessary,
2328 and copy it into the code vector. If this is the first thing in the branch,
2329 there can be no first char setting, whatever the repeat count. Any reqbyte
2330 setting must remain unchanged after any kind of repeat. */
2331
2332 if (negate_class)
2333 {
2334 *code++ = OP_NCLASS;
2335 for (c = 0; c < 32; c++) code[c] = ~class[c];
2336 }
2337 else
2338 {
2339 *code++ = OP_CLASS;
2340 memcpy(code, class, 32);
2341 }
2342 code += 32;
2343 break;
2344
2345 /* Various kinds of repeat */
2346
2347 case '{':
2348 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2349 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2350 if (*errorptr != NULL) goto FAILED;
2351 goto REPEAT;
2352
2353 case '*':
2354 repeat_min = 0;
2355 repeat_max = -1;
2356 goto REPEAT;
2357
2358 case '+':
2359 repeat_min = 1;
2360 repeat_max = -1;
2361 goto REPEAT;
2362
2363 case '?':
2364 repeat_min = 0;
2365 repeat_max = 1;
2366
2367 REPEAT:
2368 if (previous == NULL)
2369 {
2370 *errorptr = ERR9;
2371 goto FAILED;
2372 }
2373
2374 if (repeat_min == 0)
2375 {
2376 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2377 reqbyte = zeroreqbyte; /* Ditto */
2378 }
2379
2380 /* Remember whether this is a variable length repeat */
2381
2382 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2383
2384 op_type = 0; /* Default single-char op codes */
2385 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2386
2387 /* Save start of previous item, in case we have to move it up to make space
2388 for an inserted OP_ONCE for the additional '+' extension. */
2389
2390 tempcode = previous;
2391
2392 /* If the next character is '+', we have a possessive quantifier. This
2393 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2394 If the next character is '?' this is a minimizing repeat, by default,
2395 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2396 repeat type to the non-default. */
2397
2398 if (ptr[1] == '+')
2399 {
2400 repeat_type = 0; /* Force greedy */
2401 possessive_quantifier = TRUE;
2402 ptr++;
2403 }
2404 else if (ptr[1] == '?')
2405 {
2406 repeat_type = greedy_non_default;
2407 ptr++;
2408 }
2409 else repeat_type = greedy_default;
2410
2411 /* If previous was a recursion, we need to wrap it inside brackets so that
2412 it can be replicated if necessary. */
2413
2414 if (*previous == OP_RECURSE)
2415 {
2416 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2417 code += 1 + LINK_SIZE;
2418 *previous = OP_BRA;
2419 PUT(previous, 1, code - previous);
2420 *code = OP_KET;
2421 PUT(code, 1, code - previous);
2422 code += 1 + LINK_SIZE;
2423 }
2424
2425 /* If previous was a string of characters, chop off the last one and use it
2426 as the subject of the repeat. If there was only one character, we can
2427 abolish the previous item altogether. If a one-char item has a minimum of
2428 more than one, ensure that it is set in reqbyte - it might not be if a
2429 sequence such as x{3} is the first thing in a branch because the x will
2430 have gone into firstbyte instead. */
2431
2432 if (*previous == OP_CHARS)
2433 {
2434 /* Deal with UTF-8 characters that take up more than one byte. It's
2435 easier to write this out separately than try to macrify it. Use c to
2436 hold the length of the character in bytes, plus 0x80 to flag that it's a
2437 length rather than a small character. */
2438
2439 #ifdef SUPPORT_UTF8
2440 if (utf8 && (code[-1] & 0x80) != 0)
2441 {
2442 uschar *lastchar = code - 1;
2443 while((*lastchar & 0xc0) == 0x80) lastchar--;
2444 c = code - lastchar; /* Length of UTF-8 character */
2445 memcpy(utf8_char, lastchar, c); /* Save the char */
2446 if (lastchar == previous + 2) /* There was only one character */
2447 {
2448 code = previous; /* Abolish the previous item */
2449 }
2450 else
2451 {
2452 previous[1] -= c; /* Adjust length of previous */
2453 code = lastchar; /* Lost char off the end */
2454 tempcode = code; /* Adjust position to be moved for '+' */
2455 }
2456 c |= 0x80; /* Flag c as a length */
2457 }
2458 else
2459 #endif
2460
2461 /* Handle the case of a single byte - either with no UTF8 support, or
2462 with UTF-8 disabled, or for a UTF-8 character < 128. */
2463
2464 {
2465 c = *(--code);
2466 if (code == previous + 2) /* There was only one character */
2467 {
2468 code = previous; /* Abolish the previous item */
2469 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2470 }
2471 else
2472 {
2473 previous[1]--; /* adjust length */
2474 tempcode = code; /* Adjust position to be moved for '+' */
2475 }
2476 }
2477
2478 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2479 }
2480
2481 /* If previous was a single negated character ([^a] or similar), we use
2482 one of the special opcodes, replacing it. The code is shared with single-
2483 character repeats by setting op_type to add a suitable offset into
2484 repeat_type. OP_NOT is currently used only for single-byte chars. */
2485
2486 else if (*previous == OP_NOT)
2487 {
2488 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2489 c = previous[1];
2490 code = previous;
2491 goto OUTPUT_SINGLE_REPEAT;
2492 }
2493
2494 /* If previous was a character type match (\d or similar), abolish it and
2495 create a suitable repeat item. The code is shared with single-character
2496 repeats by setting op_type to add a suitable offset into repeat_type. */
2497
2498 else if (*previous < OP_EODN)
2499 {
2500 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2501 c = *previous;
2502 code = previous;
2503
2504 OUTPUT_SINGLE_REPEAT:
2505
2506 /* If the maximum is zero then the minimum must also be zero; Perl allows
2507 this case, so we do too - by simply omitting the item altogether. */
2508
2509 if (repeat_max == 0) goto END_REPEAT;
2510
2511 /* Combine the op_type with the repeat_type */
2512
2513 repeat_type += op_type;
2514
2515 /* A minimum of zero is handled either as the special case * or ?, or as
2516 an UPTO, with the maximum given. */
2517
2518 if (repeat_min == 0)
2519 {
2520 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2521 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2522 else
2523 {
2524 *code++ = OP_UPTO + repeat_type;
2525 PUT2INC(code, 0, repeat_max);
2526 }
2527 }
2528
2529 /* The case {1,} is handled as the special case + */
2530
2531 else if (repeat_min == 1 && repeat_max == -1)
2532 *code++ = OP_PLUS + repeat_type;
2533
2534 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2535 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
2536
2537 else
2538 {
2539 if (repeat_min != 1)
2540 {
2541 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2542 PUT2INC(code, 0, repeat_min);
2543 }
2544
2545 /* If the minimum is 1 and the previous item was a character string,
2546 we either have to put back the item that got cancelled if the string
2547 length was 1, or add the character back onto the end of a longer
2548 string. For a character type nothing need be done; it will just get
2549 put back naturally. Note that the final character is always going to
2550 get added below, so we leave code ready for its insertion. */
2551
2552 else if (*previous == OP_CHARS)
2553 {
2554 if (code == previous) code += 2; else
2555
2556 /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2557 bit set as a flag. The length will always be between 2 and 6. */
2558
2559 #ifdef SUPPORT_UTF8
2560 if (utf8 && c >= 128) previous[1] += c & 7; else
2561 #endif
2562 previous[1]++;
2563 }
2564
2565 /* For a single negated character we also have to put back the
2566 item that got cancelled. At present this applies only to single byte
2567 characters in any mode. */
2568
2569 else if (*previous == OP_NOT) code++;
2570
2571 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2572 we have to insert the character for the previous code. In UTF-8 mode,
2573 long characters have their length in c, with the 0x80 bit as a flag. */
2574
2575 if (repeat_max < 0)
2576 {
2577 #ifdef SUPPORT_UTF8
2578 if (utf8 && c >= 128)
2579 {
2580 memcpy(code, utf8_char, c & 7);
2581 code += c & 7;
2582 }
2583 else
2584 #endif
2585 *code++ = c;
2586 *code++ = OP_STAR + repeat_type;
2587 }
2588
2589 /* Else insert an UPTO if the max is greater than the min, again
2590 preceded by the character, for the previously inserted code. */
2591
2592 else if (repeat_max != repeat_min)
2593 {
2594 #ifdef SUPPORT_UTF8
2595 if (utf8 && c >= 128)
2596 {
2597 memcpy(code, utf8_char, c & 7);
2598 code += c & 7;
2599 }
2600 else
2601 #endif
2602 *code++ = c;
2603 repeat_max -= repeat_min;
2604 *code++ = OP_UPTO + repeat_type;
2605 PUT2INC(code, 0, repeat_max);
2606 }
2607 }
2608
2609 /* The character or character type itself comes last in all cases. */
2610
2611 #ifdef SUPPORT_UTF8
2612 if (utf8 && c >= 128)
2613 {
2614 memcpy(code, utf8_char, c & 7);
2615 code += c & 7;
2616 }
2617 else
2618 #endif
2619
2620 *code++ = c;
2621 }
2622
2623 /* If previous was a character class or a back reference, we put the repeat
2624 stuff after it, but just skip the item if the repeat was {0,0}. */
2625
2626 else if (*previous == OP_CLASS ||
2627 *previous == OP_NCLASS ||
2628 #ifdef SUPPORT_UTF8
2629 *previous == OP_XCLASS ||
2630 #endif
2631 *previous == OP_REF)
2632 {
2633 if (repeat_max == 0)
2634 {
2635 code = previous;
2636 goto END_REPEAT;
2637 }
2638 if (repeat_min == 0 && repeat_max == -1)
2639 *code++ = OP_CRSTAR + repeat_type;
2640 else if (repeat_min == 1 && repeat_max == -1)
2641 *code++ = OP_CRPLUS + repeat_type;
2642 else if (repeat_min == 0 && repeat_max == 1)
2643 *code++ = OP_CRQUERY + repeat_type;
2644 else
2645 {
2646 *code++ = OP_CRRANGE + repeat_type;
2647 PUT2INC(code, 0, repeat_min);
2648 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2649 PUT2INC(code, 0, repeat_max);
2650 }
2651 }
2652
2653 /* If previous was a bracket group, we may have to replicate it in certain
2654 cases. */
2655
2656 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2657 *previous == OP_COND)
2658 {
2659 register int i;
2660 int ketoffset = 0;
2661 int len = code - previous;
2662 uschar *bralink = NULL;
2663
2664 /* If the maximum repeat count is unlimited, find the end of the bracket
2665 by scanning through from the start, and compute the offset back to it
2666 from the current code pointer. There may be an OP_OPT setting following
2667 the final KET, so we can't find the end just by going back from the code
2668 pointer. */
2669
2670 if (repeat_max == -1)
2671 {
2672 register uschar *ket = previous;
2673 do ket += GET(ket, 1); while (*ket != OP_KET);
2674 ketoffset = code - ket;
2675 }
2676
2677 /* The case of a zero minimum is special because of the need to stick
2678 OP_BRAZERO in front of it, and because the group appears once in the
2679 data, whereas in other cases it appears the minimum number of times. For
2680 this reason, it is simplest to treat this case separately, as otherwise
2681 the code gets far too messy. There are several special subcases when the
2682 minimum is zero. */
2683
2684 if (repeat_min == 0)
2685 {
2686 /* If the maximum is also zero, we just omit the group from the output
2687 altogether. */
2688
2689 if (repeat_max == 0)
2690 {
2691 code = previous;
2692 goto END_REPEAT;
2693 }
2694
2695 /* If the maximum is 1 or unlimited, we just have to stick in the
2696 BRAZERO and do no more at this point. However, we do need to adjust
2697 any OP_RECURSE calls inside the group that refer to the group itself or
2698 any internal group, because the offset is from the start of the whole
2699 regex. Temporarily terminate the pattern while doing this. */
2700
2701 if (repeat_max <= 1)
2702 {
2703 *code = OP_END;
2704 adjust_recurse(previous, 1, utf8, cd);
2705 memmove(previous+1, previous, len);
2706 code++;
2707 *previous++ = OP_BRAZERO + repeat_type;
2708 }
2709
2710 /* If the maximum is greater than 1 and limited, we have to replicate
2711 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2712 The first one has to be handled carefully because it's the original
2713 copy, which has to be moved up. The remainder can be handled by code
2714 that is common with the non-zero minimum case below. We have to
2715 adjust the value of repeat_max, since one less copy is required. Once
2716 again, we may have to adjust any OP_RECURSE calls inside the group. */
2717
2718 else
2719 {
2720 int offset;
2721 *code = OP_END;
2722 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2723 memmove(previous + 2 + LINK_SIZE, previous, len);
2724 code += 2 + LINK_SIZE;
2725 *previous++ = OP_BRAZERO + repeat_type;
2726 *previous++ = OP_BRA;
2727
2728 /* We chain together the bracket offset fields that have to be
2729 filled in later when the ends of the brackets are reached. */
2730
2731 offset = (bralink == NULL)? 0 : previous - bralink;
2732 bralink = previous;
2733 PUTINC(previous, 0, offset);
2734 }
2735
2736 repeat_max--;
2737 }
2738
2739 /* If the minimum is greater than zero, replicate the group as many
2740 times as necessary, and adjust the maximum to the number of subsequent
2741 copies that we need. If we set a first char from the group, and didn't
2742 set a required char, copy the latter from the former. */
2743
2744 else
2745 {
2746 if (repeat_min > 1)
2747 {
2748 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2749 for (i = 1; i < repeat_min; i++)
2750 {
2751 memcpy(code, previous, len);
2752 code += len;
2753 }
2754 }
2755 if (repeat_max > 0) repeat_max -= repeat_min;
2756 }
2757
2758 /* This code is common to both the zero and non-zero minimum cases. If
2759 the maximum is limited, it replicates the group in a nested fashion,
2760 remembering the bracket starts on a stack. In the case of a zero minimum,
2761 the first one was set up above. In all cases the repeat_max now specifies
2762 the number of additional copies needed. */
2763
2764 if (repeat_max >= 0)
2765 {
2766 for (i = repeat_max - 1; i >= 0; i--)
2767 {
2768 *code++ = OP_BRAZERO + repeat_type;
2769
2770 /* All but the final copy start a new nesting, maintaining the
2771 chain of brackets outstanding. */
2772
2773 if (i != 0)
2774 {
2775 int offset;
2776 *code++ = OP_BRA;
2777 offset = (bralink == NULL)? 0 : code - bralink;
2778 bralink = code;
2779 PUTINC(code, 0, offset);
2780 }
2781
2782 memcpy(code, previous, len);
2783 code += len;
2784 }
2785
2786 /* Now chain through the pending brackets, and fill in their length
2787 fields (which are holding the chain links pro tem). */
2788
2789 while (bralink != NULL)
2790 {
2791 int oldlinkoffset;
2792 int offset = code - bralink + 1;
2793 uschar *bra = code - offset;
2794 oldlinkoffset = GET(bra, 1);
2795 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2796 *code++ = OP_KET;
2797 PUTINC(code, 0, offset);
2798 PUT(bra, 1, offset);
2799 }
2800 }
2801
2802 /* If the maximum is unlimited, set a repeater in the final copy. We
2803 can't just offset backwards from the current code point, because we
2804 don't know if there's been an options resetting after the ket. The
2805 correct offset was computed above. */
2806
2807 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2808 }
2809
2810 /* Else there's some kind of shambles */
2811
2812 else
2813 {
2814 *errorptr = ERR11;
2815 goto FAILED;
2816 }
2817
2818 /* If the character following a repeat is '+', we wrap the entire repeated
2819 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2820 Sun's Java package. The repeated item starts at tempcode, not at previous,
2821 which might be the first part of a string whose (former) last char we
2822 repeated. However, we don't support '+' after a greediness '?'. */
2823
2824 if (possessive_quantifier)
2825 {
2826 int len = code - tempcode;
2827 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2828 code += 1 + LINK_SIZE;
2829 len += 1 + LINK_SIZE;
2830 tempcode[0] = OP_ONCE;
2831 *code++ = OP_KET;
2832 PUTINC(code, 0, len);
2833 PUT(tempcode, 1, len);
2834 }
2835
2836 /* In all cases we no longer have a previous item. We also set the
2837 "follows varying string" flag for subsequently encountered reqbytes if
2838 it isn't already set and we have just passed a varying length item. */
2839
2840 END_REPEAT:
2841 previous = NULL;
2842 cd->req_varyopt |= reqvary;
2843 break;
2844
2845
2846 /* Start of nested bracket sub-expression, or comment or lookahead or
2847 lookbehind or option setting or condition. First deal with special things
2848 that can come after a bracket; all are introduced by ?, and the appearance
2849 of any of them means that this is not a referencing group. They were
2850 checked for validity in the first pass over the string, so we don't have to
2851 check for syntax errors here. */
2852
2853 case '(':
2854 newoptions = options;
2855 skipbytes = 0;
2856
2857 if (*(++ptr) == '?')
2858 {
2859 int set, unset;
2860 int *optset;
2861
2862 switch (*(++ptr))
2863 {
2864 case '#': /* Comment; skip to ket */
2865 ptr++;
2866 while (*ptr != ')') ptr++;
2867 continue;
2868
2869 case ':': /* Non-extracting bracket */
2870 bravalue = OP_BRA;
2871 ptr++;
2872 break;
2873
2874 case '(':
2875 bravalue = OP_COND; /* Conditional group */
2876
2877 /* Condition to test for recursion */
2878
2879 if (ptr[1] == 'R')
2880 {
2881 code[1+LINK_SIZE] = OP_CREF;
2882 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2883 skipbytes = 3;
2884 ptr += 3;
2885 }
2886
2887 /* Condition to test for a numbered subpattern match. We know that
2888 if a digit follows ( then there will just be digits until ) because
2889 the syntax was checked in the first pass. */
2890
2891 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2892 {
2893 int condref; /* Don't amalgamate; some compilers */
2894 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2895 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2896 if (condref == 0)
2897 {
2898 *errorptr = ERR35;
2899 goto FAILED;
2900 }
2901 ptr++;
2902 code[1+LINK_SIZE] = OP_CREF;
2903 PUT2(code, 2+LINK_SIZE, condref);
2904 skipbytes = 3;
2905 }
2906 /* For conditions that are assertions, we just fall through, having
2907 set bravalue above. */
2908 break;
2909
2910 case '=': /* Positive lookahead */
2911 bravalue = OP_ASSERT;
2912 ptr++;
2913 break;
2914
2915 case '!': /* Negative lookahead */
2916 bravalue = OP_ASSERT_NOT;
2917 ptr++;
2918 break;
2919
2920 case '<': /* Lookbehinds */
2921 switch (*(++ptr))
2922 {
2923 case '=': /* Positive lookbehind */
2924 bravalue = OP_ASSERTBACK;
2925 ptr++;
2926 break;
2927
2928 case '!': /* Negative lookbehind */
2929 bravalue = OP_ASSERTBACK_NOT;
2930 ptr++;
2931 break;
2932 }
2933 break;
2934
2935 case '>': /* One-time brackets */
2936 bravalue = OP_ONCE;
2937 ptr++;
2938 break;
2939
2940 case 'C': /* Callout - may be followed by digits */
2941 *code++ = OP_CALLOUT;
2942 {
2943 int n = 0;
2944 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2945 n = n * 10 + *ptr - '0';
2946 if (n > 255)
2947 {
2948 *errorptr = ERR38;
2949 goto FAILED;
2950 }
2951 *code++ = n;
2952 }
2953 previous = NULL;
2954 continue;
2955
2956 case 'P': /* Named subpattern handling */
2957 if (*(++ptr) == '<') /* Definition */
2958 {
2959 int i, namelen;
2960 uschar *slot = cd->name_table;
2961 const uschar *name; /* Don't amalgamate; some compilers */
2962 name = ++ptr; /* grumble at autoincrement in declaration */
2963
2964 while (*ptr++ != '>');
2965 namelen = ptr - name - 1;
2966
2967 for (i = 0; i < cd->names_found; i++)
2968 {
2969 int crc = memcmp(name, slot+2, namelen);
2970 if (crc == 0)
2971 {
2972 if (slot[2+namelen] == 0)
2973 {
2974 *errorptr = ERR43;
2975 goto FAILED;
2976 }
2977 crc = -1; /* Current name is substring */
2978 }
2979 if (crc < 0)
2980 {
2981 memmove(slot + cd->name_entry_size, slot,
2982 (cd->names_found - i) * cd->name_entry_size);
2983 break;
2984 }
2985 slot += cd->name_entry_size;
2986 }
2987
2988 PUT2(slot, 0, *brackets + 1);
2989 memcpy(slot + 2, name, namelen);
2990 slot[2+namelen] = 0;
2991 cd->names_found++;
2992 goto NUMBERED_GROUP;
2993 }
2994
2995 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2996 {
2997 int i, namelen;
2998 int type = *ptr++;
2999 const uschar *name = ptr;
3000 uschar *slot = cd->name_table;
3001
3002 while (*ptr != ')') ptr++;
3003 namelen = ptr - name;
3004
3005 for (i = 0; i < cd->names_found; i++)
3006 {
3007 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3008 slot += cd->name_entry_size;
3009 }
3010 if (i >= cd->names_found)
3011 {
3012 *errorptr = ERR15;
3013 goto FAILED;
3014 }
3015
3016 recno = GET2(slot, 0);
3017
3018 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3019
3020 /* Back reference */
3021
3022 previous = code;
3023 *code++ = OP_REF;
3024 PUT2INC(code, 0, recno);
3025 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3026 if (recno > cd->top_backref) cd->top_backref = recno;
3027 continue;
3028 }
3029
3030 /* Should never happen */
3031 break;
3032
3033 case 'R': /* Pattern recursion */
3034 ptr++; /* Same as (?0) */
3035 /* Fall through */
3036
3037 /* Recursion or "subroutine" call */
3038
3039 case '0': case '1': case '2': case '3': case '4':
3040 case '5': case '6': case '7': case '8': case '9':
3041 {
3042 const uschar *called;
3043 recno = 0;
3044 while((digitab[*ptr] & ctype_digit) != 0)
3045 recno = recno * 10 + *ptr++ - '0';
3046
3047 /* Come here from code above that handles a named recursion */
3048
3049 HANDLE_RECURSION:
3050
3051 previous = code;
3052
3053 /* Find the bracket that is being referenced. Temporarily end the
3054 regex in case it doesn't exist. */
3055
3056 *code = OP_END;
3057 called = (recno == 0)?
3058 cd->start_code : find_bracket(cd->start_code, utf8, recno);
3059
3060 if (called == NULL)
3061 {
3062 *errorptr = ERR15;
3063 goto FAILED;
3064 }
3065
3066 /* If the subpattern is still open, this is a recursive call. We
3067 check to see if this is a left recursion that could loop for ever,
3068 and diagnose that case. */
3069
3070 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3071 {
3072 *errorptr = ERR40;
3073 goto FAILED;
3074 }
3075
3076 /* Insert the recursion/subroutine item */
3077
3078 *code = OP_RECURSE;
3079 PUT(code, 1, called - cd->start_code);
3080 code += 1 + LINK_SIZE;
3081 }
3082 continue;
3083
3084 /* Character after (? not specially recognized */
3085
3086 default: /* Option setting */
3087 set = unset = 0;
3088 optset = &set;
3089
3090 while (*ptr != ')' && *ptr != ':')
3091 {
3092 switch (*ptr++)
3093 {
3094 case '-': optset = &unset; break;
3095
3096 case 'i': *optset |= PCRE_CASELESS; break;
3097 case 'm': *optset |= PCRE_MULTILINE; break;
3098 case 's': *optset |= PCRE_DOTALL; break;
3099 case 'x': *optset |= PCRE_EXTENDED; break;
3100 case 'U': *optset |= PCRE_UNGREEDY; break;
3101 case 'X': *optset |= PCRE_EXTRA; break;
3102 }
3103 }
3104
3105 /* Set up the changed option bits, but don't change anything yet. */
3106
3107 newoptions = (options | set) & (~unset);
3108
3109 /* If the options ended with ')' this is not the start of a nested
3110 group with option changes, so the options change at this level. Compile
3111 code to change the ims options if this setting actually changes any of
3112 them. We also pass the new setting back so that it can be put at the
3113 start of any following branches, and when this group ends (if we are in
3114 a group), a resetting item can be compiled.
3115
3116 Note that if this item is right at the start of the pattern, the
3117 options will have been abstracted and made global, so there will be no
3118 change to compile. */
3119
3120 if (*ptr == ')')
3121 {
3122 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3123 {
3124 *code++ = OP_OPT;
3125 *code++ = newoptions & PCRE_IMS;
3126 }
3127
3128 /* Change options at this level, and pass them back for use
3129 in subsequent branches. Reset the greedy defaults and the case
3130 value for firstbyte and reqbyte. */
3131
3132 *optionsptr = options = newoptions;
3133 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3134 greedy_non_default = greedy_default ^ 1;
3135 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3136
3137 previous = NULL; /* This item can't be repeated */
3138 continue; /* It is complete */
3139 }
3140
3141 /* If the options ended with ':' we are heading into a nested group
3142 with possible change of options. Such groups are non-capturing and are
3143 not assertions of any kind. All we need to do is skip over the ':';
3144 the newoptions value is handled below. */
3145
3146 bravalue = OP_BRA;
3147 ptr++;
3148 }
3149 }
3150
3151 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3152 non-capturing and behave like (?:...) brackets */
3153
3154 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3155 {
3156 bravalue = OP_BRA;
3157 }
3158
3159 /* Else we have a referencing group; adjust the opcode. If the bracket
3160 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3161 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3162
3163 else
3164 {
3165 NUMBERED_GROUP:
3166 if (++(*brackets) > EXTRACT_BASIC_MAX)
3167 {
3168 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3169 code[1+LINK_SIZE] = OP_BRANUMBER;
3170 PUT2(code, 2+LINK_SIZE, *brackets);
3171 skipbytes = 3;
3172 }
3173 else bravalue = OP_BRA + *brackets;
3174 }
3175
3176 /* Process nested bracketed re. Assertions may not be repeated, but other
3177 kinds can be. We copy code into a non-register variable in order to be able
3178 to pass its address because some compilers complain otherwise. Pass in a
3179 new setting for the ims options if they have changed. */
3180
3181 previous = (bravalue >= OP_ONCE)? code : NULL;
3182 *code = bravalue;
3183 tempcode = code;
3184 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3185
3186 if (!compile_regex(
3187 newoptions, /* The complete new option state */
3188 options & PCRE_IMS, /* The previous ims option state */
3189 brackets, /* Extracting bracket count */
3190 &tempcode, /* Where to put code (updated) */
3191 &ptr, /* Input pointer (updated) */
3192 errorptr, /* Where to put an error message */
3193 (bravalue == OP_ASSERTBACK ||
3194 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3195 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3196 &subfirstbyte, /* For possible first char */
3197 &subreqbyte, /* For possible last char */
3198 bcptr, /* Current branch chain */
3199 cd)) /* Tables block */
3200 goto FAILED;
3201
3202 /* At the end of compiling, code is still pointing to the start of the
3203 group, while tempcode has been updated to point past the end of the group
3204 and any option resetting that may follow it. The pattern pointer (ptr)
3205 is on the bracket. */
3206
3207 /* If this is a conditional bracket, check that there are no more than
3208 two branches in the group. */
3209
3210 else if (bravalue == OP_COND)
3211 {
3212 uschar *tc = code;
3213 condcount = 0;
3214
3215 do {
3216 condcount++;
3217 tc += GET(tc,1);
3218 }
3219 while (*tc != OP_KET);
3220
3221 if (condcount > 2)
3222 {
3223 *errorptr = ERR27;
3224 goto FAILED;
3225 }
3226
3227 /* If there is just one branch, we must not make use of its firstbyte or
3228 reqbyte, because this is equivalent to an empty second branch. */
3229
3230 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3231 }
3232
3233 /* Handle updating of the required and first characters. Update for normal
3234 brackets of all kinds, and conditions with two branches (see code above).
3235 If the bracket is followed by a quantifier with zero repeat, we have to
3236 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3237 main loop so that they can be accessed for the back off. */
3238
3239 zeroreqbyte = reqbyte;
3240 zerofirstbyte = firstbyte;
3241 groupsetfirstbyte = FALSE;
3242
3243 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3244 {
3245 /* If we have not yet set a firstbyte in this branch, take it from the
3246 subpattern, remembering that it was set here so that a repeat of more
3247 than one can replicate it as reqbyte if necessary. If the subpattern has
3248 no firstbyte, set "none" for the whole branch. In both cases, a zero
3249 repeat forces firstbyte to "none". */
3250
3251 if (firstbyte == REQ_UNSET)
3252 {
3253 if (subfirstbyte >= 0)
3254 {
3255 firstbyte = subfirstbyte;
3256 groupsetfirstbyte = TRUE;
3257 }
3258 else firstbyte = REQ_NONE;
3259 zerofirstbyte = REQ_NONE;
3260 }
3261
3262 /* If firstbyte was previously set, convert the subpattern's firstbyte
3263 into reqbyte if there wasn't one, using the vary flag that was in
3264 existence beforehand. */
3265
3266 else if (subfirstbyte >= 0 && subreqbyte < 0)
3267 subreqbyte = subfirstbyte | tempreqvary;
3268
3269 /* If the subpattern set a required byte (or set a first byte that isn't
3270 really the first byte - see above), set it. */
3271
3272 if (subreqbyte >= 0) reqbyte = subreqbyte;
3273 }
3274
3275 /* For a forward assertion, we take the reqbyte, if set. This can be
3276 helpful if the pattern that follows the assertion doesn't set a different
3277 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3278 for an assertion, however because it leads to incorrect effect for patterns
3279 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3280 of a firstbyte. This is overcome by a scan at the end if there's no
3281 firstbyte, looking for an asserted first char. */
3282
3283 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3284
3285 /* Now update the main code pointer to the end of the group. */
3286
3287 code = tempcode;
3288
3289 /* Error if hit end of pattern */
3290
3291 if (*ptr != ')')
3292 {
3293 *errorptr = ERR14;
3294 goto FAILED;
3295 }
3296 break;
3297
3298 /* Check \ for being a real metacharacter; if not, fall through and handle
3299 it as a data character at the start of a string. Escape items are checked
3300 for validity in the pre-compiling pass. */
3301
3302 case '\\':
3303 tempptr = ptr;
3304 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3305
3306 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3307 are arranged to be the negation of the corresponding OP_values. For the
3308 back references, the values are ESC_REF plus the reference number. Only
3309 back references and those types that consume a character may be repeated.
3310 We can test for values between ESC_b and ESC_Z for the latter; this may
3311 have to change if any new ones are ever created. */
3312
3313 if (c < 0)
3314 {
3315 if (-c == ESC_Q) /* Handle start of quoted string */
3316 {
3317 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3318 else inescq = TRUE;
3319 continue;
3320 }
3321
3322 /* For metasequences that actually match a character, we disable the
3323 setting of a first character if it hasn't already been set. */
3324
3325 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3326 firstbyte = REQ_NONE;
3327
3328 /* Set values to reset to if this is followed by a zero repeat. */
3329
3330 zerofirstbyte = firstbyte;
3331 zeroreqbyte = reqbyte;
3332
3333 /* Back references are handled specially */
3334
3335 if (-c >= ESC_REF)
3336 {
3337 int number = -c - ESC_REF;
3338 previous = code;
3339 *code++ = OP_REF;
3340 PUT2INC(code, 0, number);
3341 }
3342 else
3343 {
3344 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3345 *code++ = -c;
3346 }
3347 continue;
3348 }
3349
3350 /* Data character: reset and fall through */
3351
3352 ptr = tempptr;
3353 c = '\\';
3354
3355 /* Handle a run of data characters until a metacharacter is encountered.
3356 The first character is guaranteed not to be whitespace or # when the
3357 extended flag is set. */
3358
3359 NORMAL_CHAR:
3360 default:
3361 previous = code;
3362 *code = OP_CHARS;
3363 code += 2;
3364 length = 0;
3365
3366 do
3367 {
3368 /* If in \Q...\E, check for the end; if not, we always have a literal */
3369
3370 if (inescq)
3371 {
3372 if (c == '\\' && ptr[1] == 'E')
3373 {
3374 inescq = FALSE;
3375 ptr++;
3376 }
3377 else
3378 {
3379 *code++ = c;
3380 length++;
3381 }
3382 continue;
3383 }
3384
3385 /* Skip white space and comments for /x patterns */
3386
3387 if ((options & PCRE_EXTENDED) != 0)
3388 {
3389 if ((cd->ctypes[c] & ctype_space) != 0) continue;
3390 if (c == '#')
3391 {
3392 /* The space before the ; is to avoid a warning on a silly compiler
3393 on the Macintosh. */
3394 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3395 if (c == 0) break;
3396 continue;
3397 }
3398 }
3399
3400 /* Backslash may introduce a data char or a metacharacter. Escaped items
3401 are checked for validity in the pre-compiling pass. Stop the string
3402 before a metaitem. */
3403
3404 if (c == '\\')
3405 {
3406 tempptr = ptr;
3407 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3408 if (c < 0) { ptr = tempptr; break; }
3409
3410 /* If a character is > 127 in UTF-8 mode, we have to turn it into
3411 two or more bytes in the UTF-8 encoding. */
3412
3413 #ifdef SUPPORT_UTF8
3414 if (utf8 && c > 127)
3415 {
3416 uschar buffer[8];
3417 int len = ord2utf8(c, buffer);
3418 for (c = 0; c < len; c++) *code++ = buffer[c];
3419 length += len;
3420 continue;
3421 }
3422 #endif
3423 }
3424
3425 /* Ordinary character or single-char escape */
3426
3427 *code++ = c;
3428 length++;
3429 }
3430
3431 /* This "while" is the end of the "do" above. */
3432
3433 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3434
3435 /* Update the first and last requirements. These are always bytes, even in
3436 UTF-8 mode. However, there is a special case to be considered when there
3437 are only one or two characters. Because this gets messy in UTF-8 mode, the
3438 code is kept separate. When we get here "length" contains the number of
3439 bytes. */
3440
3441 #ifdef SUPPORT_UTF8
3442 if (utf8 && length > 1)
3443 {
3444 uschar *t = previous + 3; /* After this code, t */
3445 while (t < code && (*t & 0xc0) == 0x80) t++; /* follows the 1st char */
3446
3447 /* Handle the case when there is only one multibyte character. It must
3448 have at least two bytes because of the "length > 1" test above. */
3449
3450 if (t == code)
3451 {
3452 /* If no previous first byte, set it from this character, but revert to
3453 none on a zero repeat. */
3454
3455 if (firstbyte == REQ_UNSET)
3456 {
3457 zerofirstbyte = REQ_NONE;
3458 firstbyte = previous[2];
3459 }
3460
3461 /* Otherwise, leave the first byte value alone, and don't change it on
3462 a zero repeat */
3463
3464 else zerofirstbyte = firstbyte;
3465
3466 /* In both cases, a zero repeat resets the previous required byte */
3467
3468 zeroreqbyte = reqbyte;
3469 }
3470
3471 /* Handle the case when there is more than one character. These may be
3472 single-byte or multibyte characters */
3473
3474 else
3475 {
3476 t = code - 1; /* After this code, t is at the */
3477 while ((*t & 0xc0) == 0x80) t--; /* start of the last character */
3478
3479 /* If no previous first byte, set it from the first character, and
3480 retain it on a zero repeat (of the last character). The required byte
3481 is reset on a zero repeat, either to the byte before the last
3482 character, unless this is the first byte of the string. In that case,
3483 it reverts to its previous value. */
3484
3485 if (firstbyte == REQ_UNSET)
3486 {
3487 zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3488 zeroreqbyte = (t - 1 == previous + 2)?
3489 reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
3490 }
3491
3492 /* If there was a previous first byte, leave it alone, and don't change
3493 it on a zero repeat. The required byte is reset on a zero repeat to the
3494 byte before the last character. */
3495
3496 else
3497 {
3498 zerofirstbyte = firstbyte;
3499 zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
3500 }
3501 }
3502
3503 /* In all cases (we know length > 1), the new required byte is the last
3504 byte of the string. */
3505
3506 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3507 }
3508
3509 else /* End of UTF-8 coding */
3510 #endif
3511
3512 /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3513 or when UTF-8 is not enabled. */
3514
3515 {
3516 /* firstbyte was not previously set; take it from this string */
3517
3518 if (firstbyte == REQ_UNSET)
3519 {
3520 if (length == 1)
3521 {
3522 zerofirstbyte = REQ_NONE;
3523 firstbyte = previous[2] | req_caseopt;
3524 zeroreqbyte = reqbyte;
3525 }
3526 else
3527 {
3528 zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3529 zeroreqbyte = (length > 2)?
3530 (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
3531 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3532 }
3533 }
3534
3535 /* firstbyte was previously set */
3536
3537 else
3538 {
3539 zerofirstbyte = firstbyte;
3540 zeroreqbyte = (length == 1)? reqbyte :
3541 code[-2] | req_caseopt | cd->req_varyopt;
3542 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3543 }
3544 }
3545
3546 /* Set the length in the data vector, and advance to the next state. */
3547
3548 previous[1] = length;
3549 if (length < MAXLIT) ptr--;
3550 break;
3551 }
3552 } /* end of big loop */
3553
3554 /* Control never reaches here by falling through, only by a goto for all the
3555 error states. Pass back the position in the pattern so that it can be displayed
3556 to the user for diagnosing the error. */
3557
3558 FAILED:
3559 *ptrptr = ptr;
3560 return FALSE;
3561 }
3562
3563
3564
3565
3566 /*************************************************
3567 * Compile sequence of alternatives *
3568 *************************************************/
3569
3570 /* On entry, ptr is pointing past the bracket character, but on return
3571 it points to the closing bracket, or vertical bar, or end of string.
3572 The code variable is pointing at the byte into which the BRA operator has been
3573 stored. If the ims options are changed at the start (for a (?ims: group) or
3574 during any branch, we need to insert an OP_OPT item at the start of every
3575 following branch to ensure they get set correctly at run time, and also pass
3576 the new options into every subsequent branch compile.
3577
3578 Argument:
3579 options option bits, including any changes for this subpattern
3580 oldims previous settings of ims option bits
3581 brackets -> int containing the number of extracting brackets used
3582 codeptr -> the address of the current code pointer
3583 ptrptr -> the address of the current pattern pointer
3584 errorptr -> pointer to error message
3585 lookbehind TRUE if this is a lookbehind assertion
3586 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3587 firstbyteptr place to put the first required character, or a negative number
3588 reqbyteptr place to put the last required character, or a negative number
3589 bcptr pointer to the chain of currently open branches
3590 cd points to the data block with tables pointers etc.
3591
3592 Returns: TRUE on success
3593 */
3594
3595 static BOOL
3596 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3597 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3598 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3599 {
3600 const uschar *ptr = *ptrptr;
3601 uschar *code = *codeptr;
3602 uschar *last_branch = code;
3603 uschar *start_bracket = code;
3604 uschar *reverse_count = NULL;
3605 int firstbyte, reqbyte;
3606 int branchfirstbyte, branchreqbyte;
3607 branch_chain bc;
3608
3609 bc.outer = bcptr;
3610 bc.current = code;
3611
3612 firstbyte = reqbyte = REQ_UNSET;
3613
3614 /* Offset is set zero to mark that this bracket is still open */
3615
3616 PUT(code, 1, 0);
3617 code += 1 + LINK_SIZE + skipbytes;
3618
3619 /* Loop for each alternative branch */
3620
3621 for (;;)
3622 {
3623 /* Handle a change of ims options at the start of the branch */
3624
3625 if ((options & PCRE_IMS) != oldims)
3626 {
3627 *code++ = OP_OPT;
3628 *code++ = options & PCRE_IMS;
3629 }
3630
3631 /* Set up dummy OP_REVERSE if lookbehind assertion */
3632
3633 if (lookbehind)
3634 {
3635 *code++ = OP_REVERSE;
3636 reverse_count = code;
3637 PUTINC(code, 0, 0);
3638 }
3639
3640 /* Now compile the branch */
3641
3642 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3643 &branchfirstbyte, &branchreqbyte, &bc, cd))
3644 {
3645 *ptrptr = ptr;
3646 return FALSE;
3647 }
3648
3649 /* If this is the first branch, the firstbyte and reqbyte values for the
3650 branch become the values for the regex. */
3651
3652 if (*last_branch != OP_ALT)
3653 {
3654 firstbyte = branchfirstbyte;
3655 reqbyte = branchreqbyte;
3656 }
3657
3658 /* If this is not the first branch, the first char and reqbyte have to
3659 match the values from all the previous branches, except that if the previous
3660 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3661 REQ_VARY for the regex. */
3662
3663 else
3664 {
3665 /* If we previously had a firstbyte, but it doesn't match the new branch,
3666 we have to abandon the firstbyte for the regex, but if there was previously
3667 no reqbyte, it takes on the value of the old firstbyte. */
3668
3669 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3670 {
3671 if (reqbyte < 0) reqbyte = firstbyte;
3672 firstbyte = REQ_NONE;
3673 }
3674
3675 /* If we (now or from before) have no firstbyte, a firstbyte from the
3676 branch becomes a reqbyte if there isn't a branch reqbyte. */
3677
3678 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3679 branchreqbyte = branchfirstbyte;
3680
3681 /* Now ensure that the reqbytes match */
3682
3683 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3684 reqbyte = REQ_NONE;
3685 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3686 }
3687
3688 /* If lookbehind, check that this branch matches a fixed-length string,
3689 and put the length into the OP_REVERSE item. Temporarily mark the end of
3690 the branch with OP_END. */
3691
3692 if (lookbehind)
3693 {
3694 int length;
3695 *code = OP_END;
3696 length = find_fixedlength(last_branch, options);
3697 DPRINTF(("fixed length = %d\n", length));
3698 if (length < 0)
3699 {
3700 *errorptr = (length == -2)? ERR36 : ERR25;
3701 *ptrptr = ptr;
3702 return FALSE;
3703 }
3704 PUT(reverse_count, 0, length);
3705 }
3706
3707 /* Reached end of expression, either ')' or end of pattern. Go back through
3708 the alternative branches and reverse the chain of offsets, with the field in
3709 the BRA item now becoming an offset to the first alternative. If there are
3710 no alternatives, it points to the end of the group. The length in the
3711 terminating ket is always the length of the whole bracketed item. If any of
3712 the ims options were changed inside the group, compile a resetting op-code
3713 following, except at the very end of the pattern. Return leaving the pointer
3714 at the terminating char. */
3715
3716 if (*ptr != '|')
3717 {
3718 int length = code - last_branch;
3719 do
3720 {
3721 int prev_length = GET(last_branch, 1);
3722 PUT(last_branch, 1, length);
3723 length = prev_length;
3724 last_branch -= length;
3725 }
3726 while (length > 0);
3727
3728 /* Fill in the ket */
3729
3730 *code = OP_KET;
3731 PUT(code, 1, code - start_bracket);
3732 code += 1 + LINK_SIZE;
3733
3734 /* Resetting option if needed */
3735
3736 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3737 {
3738 *code++ = OP_OPT;
3739 *code++ = oldims;
3740 }
3741
3742 /* Set values to pass back */
3743
3744 *codeptr = code;
3745 *ptrptr = ptr;
3746 *firstbyteptr = firstbyte;
3747 *reqbyteptr = reqbyte;
3748 return TRUE;
3749 }
3750
3751 /* Another branch follows; insert an "or" node. Its length field points back
3752 to the previous branch while the bracket remains open. At the end the chain
3753 is reversed. It's done like this so that the start of the bracket has a
3754 zero offset until it is closed, making it possible to detect recursion. */
3755
3756 *code = OP_ALT;
3757 PUT(code, 1, code - last_branch);
3758 bc.current = last_branch = code;
3759 code += 1 + LINK_SIZE;
3760 ptr++;
3761 }
3762 /* Control never reaches here */
3763 }
3764
3765
3766
3767
3768 /*************************************************
3769 * Check for anchored expression *
3770 *************************************************/
3771
3772 /* Try to find out if this is an anchored regular expression. Consider each
3773 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3774 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3775 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3776 counts, since OP_CIRC can match in the middle.
3777
3778 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3779 This is the code for \G, which means "match at start of match position, taking
3780 into account the match offset".
3781
3782 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3783 because that will try the rest of the pattern at all possible matching points,
3784 so there is no point trying again.... er ....
3785
3786 .... except when the .* appears inside capturing parentheses, and there is a
3787 subsequent back reference to those parentheses. We haven't enough information
3788 to catch that case precisely.
3789
3790 At first, the best we could do was to detect when .* was in capturing brackets
3791 and the highest back reference was greater than or equal to that level.
3792 However, by keeping a bitmap of the first 31 back references, we can catch some
3793 of the more common cases more precisely.
3794
3795 Arguments:
3796 code points to start of expression (the bracket)
3797 options points to the options setting
3798 bracket_map a bitmap of which brackets we are inside while testing; this
3799 handles up to substring 31; after that we just have to take
3800 the less precise approach
3801 backref_map the back reference bitmap
3802
3803 Returns: TRUE or FALSE
3804 */
3805
3806 static BOOL
3807 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3808 unsigned int backref_map)
3809 {
3810 do {
3811 const uschar *scode =
3812 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3813 register int op = *scode;
3814
3815 /* Capturing brackets */
3816
3817 if (op > OP_BRA)
3818 {
3819 int new_map;
3820 op -= OP_BRA;
3821 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3822 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3823 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3824 }
3825
3826 /* Other brackets */
3827
3828 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3829 {
3830 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3831 }
3832
3833 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3834 are or may be referenced. */
3835
3836 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3837 (*options & PCRE_DOTALL) != 0)
3838 {
3839 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3840 }
3841
3842 /* Check for explicit anchoring */
3843
3844 else if (op != OP_SOD && op != OP_SOM &&
3845 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3846 return FALSE;
3847 code += GET(code, 1);
3848 }
3849 while (*code == OP_ALT); /* Loop for each alternative */
3850 return TRUE;
3851 }
3852
3853
3854
3855 /*************************************************
3856 * Check for starting with ^ or .* *
3857 *************************************************/
3858
3859 /* This is called to find out if every branch starts with ^ or .* so that
3860 "first char" processing can be done to speed things up in multiline
3861 matching and for non-DOTALL patterns that start with .* (which must start at
3862 the beginning or after \n). As in the case of is_anchored() (see above), we
3863 have to take account of back references to capturing brackets that contain .*
3864 because in that case we can't make the assumption.
3865
3866 Arguments:
3867 code points to start of expression (the bracket)
3868 bracket_map a bitmap of which brackets we are inside while testing; this
3869 handles up to substring 31; after that we just have to take
3870 the less precise approach
3871 backref_map the back reference bitmap
3872
3873 Returns: TRUE or FALSE
3874 */
3875
3876 static BOOL
3877 is_startline(const uschar *code, unsigned int bracket_map,
3878 unsigned int backref_map)
3879 {
3880 do {
3881 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3882 register int op = *scode;
3883
3884 /* Capturing brackets */
3885
3886 if (op > OP_BRA)
3887 {
3888 int new_map;
3889 op -= OP_BRA;
3890 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3891 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3892 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3893 }
3894
3895 /* Other brackets */
3896
3897 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3898 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3899
3900 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3901 may be referenced. */
3902
3903 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3904 {
3905 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3906 }
3907
3908 /* Check for explicit circumflex */
3909
3910 else if (op != OP_CIRC) return FALSE;
3911 code += GET(code, 1);
3912 }
3913 while (*code == OP_ALT); /* Loop for each alternative */
3914 return TRUE;
3915 }
3916
3917
3918
3919 /*************************************************
3920 * Check for asserted fixed first char *
3921 *************************************************/
3922
3923 /* During compilation, the "first char" settings from forward assertions are
3924 discarded, because they can cause conflicts with actual literals that follow.
3925 However, if we end up without a first char setting for an unanchored pattern,
3926 it is worth scanning the regex to see if there is an initial asserted first
3927 char. If all branches start with the same asserted char, or with a bracket all
3928 of whose alternatives start with the same asserted char (recurse ad lib), then
3929 we return that char, otherwise -1.
3930
3931 Arguments:
3932 code points to start of expression (the bracket)
3933 options pointer to the options (used to check casing changes)
3934 inassert TRUE if in an assertion
3935
3936 Returns: -1 or the fixed first char
3937 */
3938
3939 static int
3940 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3941 {
3942 register int c = -1;
3943 do {
3944 int d;
3945 const uschar *scode =
3946 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3947 register int op = *scode;
3948
3949 if (op >= OP_BRA) op = OP_BRA;
3950
3951 switch(op)
3952 {
3953 default:
3954 return -1;
3955
3956 case OP_BRA:
3957 case OP_ASSERT:
3958 case OP_ONCE:
3959 case OP_COND:
3960 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3961 return -1;
3962 if (c < 0) c = d; else if (c != d) return -1;
3963 break;
3964
3965 case OP_EXACT: /* Fall through */
3966 scode++;
3967
3968 case OP_CHARS: /* Fall through */
3969 scode++;
3970
3971 case OP_PLUS:
3972 case OP_MINPLUS:
3973 if (!inassert) return -1;
3974 if (c < 0)
3975 {
3976 c = scode[1];
3977 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3978 }
3979 else if (c != scode[1]) return -1;
3980 break;
3981 }
3982
3983 code += GET(code, 1);
3984 }
3985 while (*code == OP_ALT);
3986 return c;
3987 }
3988
3989
3990
3991
#ifdef SUPPORT_UTF8
/*************************************************
*           Validate a UTF-8 string              *
*************************************************/

/* This function is called (optionally) at the start of compile or match, to
validate that a supposed UTF-8 string is actually valid. The early check means
that subsequent code can assume it is dealing with a valid string. The check
can be turned off for maximum performance, but the consequences of supplying
an invalid string are then undefined.

Each character is checked for a legal lead byte, for enough remaining bytes,
for continuation bytes of the form 10xxxxxx, and for not being an overlong
encoding.

Arguments:
  string       points to the string
  length       length of string, or -1 if the string is zero-terminated

Returns:       < 0    if the string is a valid UTF-8 string
               >= 0   otherwise; the value is the offset of the bad byte
*/

static int
valid_utf8(const uschar *string, int length)
{
const uschar *cur;

/* Find the length of a zero-terminated string */

if (length < 0)
  {
  cur = string;
  while (*cur != 0) cur++;
  length = cur - string;
  }

for (cur = string; length-- > 0; cur++)
  {
  int lead = *cur;      /* First byte of the character */
  int extra;            /* Number of additional bytes */
  int second;           /* The first of the additional bytes */

  /* Single-byte characters need no further checking */

  if (lead < 128) continue;

  /* A lead byte must have its top two bits set */

  if ((lead & 0xc0) != 0xc0) return cur - string;

  /* All the additional bytes must be present */

  extra = utf8_table4[lead & 0x3f];
  if (length < extra) return cur - string;
  length -= extra;

  /* Check top bits in the second byte */

  cur++;
  second = *cur;
  if ((second & 0xc0) != 0x80) return cur - string;

  /* Two-byte character: overlong if the lead byte is xx00 000x. There are no
  more bytes to look at. */

  if (extra == 1)
    {
    if ((lead & 0x3e) == 0) return cur - string;
    continue;
    }

  /* Overlong tests for the longer forms:
       3 bytes: 1110 0000, xx0x xxxx
       4 bytes: 1111 0000, xx00 xxxx
       5 bytes: 1111 1000, xx00 0xxx
       6 bytes: 1111 1100, xx00 00xx, and a lead byte of 0xfe or 0xff is
                never allowed */

  if ((extra == 2 && lead == 0xe0 && (second & 0x20) == 0) ||
      (extra == 3 && lead == 0xf0 && (second & 0x30) == 0) ||
      (extra == 4 && lead == 0xf8 && (second & 0x38) == 0) ||
      (extra == 5 && (lead == 0xfe || lead == 0xff ||
                     (lead == 0xfc && (second & 0x3c) == 0))))
    return cur - string;

  /* Check for valid bytes after the 2nd, if any; all must start 10 */

  while (--extra > 0)
    {
    cur++;
    if ((*cur & 0xc0) != 0x80) return cur - string;
    }
  }

return -1;
}
#endif
4075
4076
4077
4078 /*************************************************
4079 * Compile a Regular Expression *
4080 *************************************************/
4081
4082 /* This function takes a string and returns a pointer to a block of store
4083 (obtained via pcre_malloc) holding a compiled version of the expression.
4084
4085 Arguments:
4086 pattern the regular expression
4087 options various option bits
4088 errorptr pointer to pointer to error text (if NULL, NULL is returned at once)
4089 erroroffset ptr offset in pattern where error was detected (NULL gives ERR16)
4090 tables pointer to character tables or NULL for pcre_default_tables
4091
4092 Returns: pointer to compiled data block, or NULL on error,
4093 with errorptr and erroroffset set
4094 */
4095
4096 EXPORT pcre *
4097 pcre_compile(const char *pattern, int options, const char **errorptr,
4098 int *erroroffset, const unsigned char *tables)
4099 {
4100 real_pcre *re;
4101 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4102 int runlength;
4103 int c, firstbyte, reqbyte;
4104 int bracount = 0;
4105 int branch_extra = 0;
4106 int branch_newextra;
4107 int item_count = -1;
4108 int name_count = 0;
4109 int max_name_size = 0;
4110 #ifdef SUPPORT_UTF8
4111 int lastcharlength = 0;
4112 BOOL utf8;
4113 BOOL class_utf8;
4114 #endif
4115 BOOL inescq = FALSE;
4116 unsigned int brastackptr = 0;
4117 size_t size;
4118 uschar *code;
4119 const uschar *codestart;
4120 const uschar *ptr;
4121 compile_data compile_block;
4122 int brastack[BRASTACK_SIZE];
4123 uschar bralenstack[BRASTACK_SIZE];
4124
4125 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4126 can do is just return NULL. */
4127
4128 if (errorptr == NULL) return NULL;
4129 *errorptr = NULL;
4130
4131 /* However, we can give a message for this error */
4132
4133 if (erroroffset == NULL)
4134 {
4135 *errorptr = ERR16;
4136 return NULL;
4137 }
4138 *erroroffset = 0;
4139
4140 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4141
4142 #ifdef SUPPORT_UTF8
4143 utf8 = (options & PCRE_UTF8) != 0;
4144 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4145 (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4146 {
4147 *errorptr = ERR44;
4148 return NULL;
4149 }
4150 #else
4151 if ((options & PCRE_UTF8) != 0)
4152 {
4153 *errorptr = ERR32;
4154 return NULL;
4155 }
4156 #endif
4157
4158 if ((options & ~PUBLIC_OPTIONS) != 0)
4159 {
4160 *errorptr = ERR17;
4161 return NULL;
4162 }
4163
4164 /* Set up pointers to the individual character tables */
4165
4166 if (tables == NULL) tables = pcre_default_tables;
4167 compile_block.lcc = tables + lcc_offset;
4168 compile_block.fcc = tables + fcc_offset;
4169 compile_block.cbits = tables + cbits_offset;
4170 compile_block.ctypes = tables + ctypes_offset;
4171
4172 /* Maximum back reference and backref bitmap. This is updated for numeric
4173 references during the first pass, but for named references during the actual
4174 compile pass. The bitmap records up to 31 back references to help in deciding
4175 whether (.*) can be treated as anchored or not. */
4176
4177 compile_block.top_backref = 0;
4178 compile_block.backref_map = 0;
4179
4180 /* Reflect pattern for debugging output */
4181
4182 DPRINTF(("------------------------------------------------------------------\n"));
4183 DPRINTF(("%s\n", pattern));
4184
4185 /* The first thing to do is to make a pass over the pattern to compute the
4186 amount of store required to hold the compiled code. This does not have to be
4187 perfect as long as errors are overestimates. At the same time we can detect any
4188 flag settings right at the start, and extract them. Make an attempt to correct
4189 for any counted white space if an "extended" flag setting appears late in the
4190 pattern. We can't be so clever for #-comments. */
4191
4192 ptr = (const uschar *)(pattern - 1);
4193 while ((c = *(++ptr)) != 0)
4194 {
4195 int min, max;
4196 int class_optcount;
4197 int bracket_length;
4198 int duplength;
4199
4200 /* If we are inside a \Q...\E sequence, all chars are literal */
4201
4202 if (inescq) goto NORMAL_CHAR;
4203
4204 /* Otherwise, first check for ignored whitespace and comments */
4205
4206 if ((options & PCRE_EXTENDED) != 0)
4207 {
4208 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4209 if (c == '#')
4210 {
4211 /* The space before the ; is to avoid a warning on a silly compiler
4212 on the Macintosh. */
4213 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4214 if (c == 0) break;
4215 continue;
4216 }
4217 }
4218
4219 item_count++; /* Is zero for the first non-comment item */
4220
4221 switch(c)
4222 {
4223 /* A backslashed item may be an escaped "normal" character or a
4224 character type. For a "normal" character, put the pointers and
4225 character back so that tests for whitespace etc. in the input
4226 are done correctly. */
4227
4228 case '\\':
4229 {
4230 const uschar *save_ptr = ptr;
4231 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4232 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4233 if (c >= 0)
4234 {
4235 ptr = save_ptr;
4236 c = '\\';
4237 goto NORMAL_CHAR;
4238 }
4239 }
4240
4241 /* If \Q, enter "literal" mode */
4242
4243 if (-c == ESC_Q)
4244 {
4245 inescq = TRUE;
4246 continue;
4247 }
4248
4249 /* Other escapes need one byte, and are of length one for repeats */
4250
4251 length++;
4252 #ifdef SUPPORT_UTF8
4253 lastcharlength = 1;
4254 #endif
4255
4256 /* A back reference needs an additional 2 bytes, plus either one or 5
4257 bytes for a repeat. We also need to keep the value of the highest
4258 back reference. */
4259
4260 if (c <= -ESC_REF)
4261 {
4262 int refnum = -c - ESC_REF;
4263 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4264 if (refnum > compile_block.top_backref)
4265 compile_block.top_backref = refnum;
4266 length += 2; /* For single back reference */
4267 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4268 {
4269 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4270 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4271 if ((min == 0 && (max == 1 || max == -1)) ||
4272 (min == 1 && max == -1))
4273 length++;
4274 else length += 5;
4275 if (ptr[1] == '?') ptr++;
4276 }
4277 }
4278 continue;
4279
4280 case '^': /* Single-byte metacharacters */
4281 case '.':
4282 case '$':
4283 length++;
4284 #ifdef SUPPORT_UTF8
4285 lastcharlength = 1;
4286 #endif
4287 continue;
4288
4289 case '*': /* These repeats won't be after brackets; */
4290 case '+': /* those are handled separately */
4291 case '?':
4292 length++;
4293 goto POSESSIVE; /* A few lines below */
4294
4295 /* This covers the cases of braced repeats after a single char, metachar,
4296 class, or back reference. */
4297
4298 case '{':
4299 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4300 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4301 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4302
4303 /* These special cases just insert one extra opcode */
4304
4305 if ((min == 0 && (max == 1 || max == -1)) ||
4306 (min == 1 && max == -1))
4307 length++;
4308
4309 /* These cases might insert additional copies of a preceding character. */
4310
4311 else
4312 {
4313 #ifdef SUPPORT_UTF8
4314 /* In UTF-8 mode, we should find the length in lastcharlength */
4315 if (utf8)
4316 {
4317 if (min != 1)
4318 {
4319 length -= lastcharlength; /* Uncount the original char or metachar */
4320 if (min > 0) length += 3 + lastcharlength;
4321 }
4322 length += lastcharlength + ((max > 0)? 3 : 1);
4323 }
4324 else
4325 #endif
4326
4327 /* Not UTF-8 mode: all characters are one byte */
4328 {
4329 if (min != 1)
4330 {
4331 length--; /* Uncount the original char or metachar */
4332 if (min > 0) length += 4;
4333 }
4334
4335 length += (max > 0)? 4 : 2;
4336 }
4337 }
4338
4339 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4340
4341 POSESSIVE: /* Test for possessive quantifier */
4342 if (ptr[1] == '+')
4343 {
4344 ptr++;
4345 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4346 }
4347 continue;
4348
4349 /* An alternation contains an offset to the next branch or ket. If any ims
4350 options changed in the previous branch(es), and/or if we are in a
4351 lookbehind assertion, extra space will be needed at the start of the
4352 branch. This is handled by branch_extra. */
4353
4354 case '|':
4355 length += 1 + LINK_SIZE + branch_extra;
4356 continue;
4357
4358 /* A character class uses 33 characters provided that all the character
4359 values are less than 256. Otherwise, it uses a bit map for low valued
4360 characters, and individual items for others. Don't worry about character
4361 types that aren't allowed in classes - they'll get picked up during the
4362 compile. A character class that contains only one single-byte character
4363 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4364 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4365
4366 case '[':
4367 class_optcount = 0;
4368
4369 #ifdef SUPPORT_UTF8
4370 class_utf8 = FALSE;
4371 #endif
4372
4373 if (*(++ptr) == '^') ptr++;
4374
4375 /* Written as a "do" so that an initial ']' is taken as data */
4376
4377 if (*ptr != 0) do
4378 {
4379 /* Inside \Q...\E everything is literal except \E */
4380
4381 if (inescq)
4382 {
4383 if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
4384 inescq = FALSE;
4385 ptr += 1;
4386 continue;
4387 }
4388
4389 /* Outside \Q...\E, check for escapes */
4390
4391 if (*ptr == '\\')
4392 {
4393 #ifdef SUPPORT_UTF8
4394 int prevchar = ptr[-1];
4395 #endif
4396 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
4397 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4398
4399 /* \b is backspace inside a class */
4400
4401 if (-ch == ESC_b) ch = '\b';
4402
4403 /* \Q enters quoting mode */
4404
4405 if (-ch == ESC_Q)
4406 {
4407 inescq = TRUE;
4408 continue;
4409 }
4410
4411 /* Handle escapes that turn into characters */
4412
4413 if (ch >= 0)
4414 {
4415 #ifdef SUPPORT_UTF8
4416 if (utf8)
4417 {
4418 if (ch > 127) class_optcount = 10; /* Ensure > 1 */
4419 if (ch > 255)
4420 {
4421 uschar buffer[6];
4422 if (!class_utf8)
4423 {
4424 class_utf8 = TRUE;
4425 length += LINK_SIZE + 1 + 1;
4426 }
4427 length += 1 + ord2utf8(ch, buffer);
4428
4429 /* If this wide character is preceded by '-', add an extra 2 to
4430 the length in case the previous character was < 128, because in
4431 this case the whole range will be put into the list. */
4432
4433 if (prevchar == '-') length += 2;
4434 }
4435 }
4436 #endif
4437 class_optcount++; /* for possible optimization */
4438 }
4439 else class_optcount = 10; /* \d, \s etc; make sure > 1 */
4440 }
4441
4442 /* Check the syntax for POSIX stuff. The bits we actually handle are
4443 checked during the real compile phase. */
4444
4445 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4446 {
4447 ptr++;
4448 class_optcount = 10; /* Make sure > 1 */
4449 }
4450
4451 /* Anything else just increments the possible optimization count. If
4452 there are wide characters, we are going to have to use an XCLASS. */
4453
4454 else
4455 {
4456 NON_SPECIAL_CHARACTER:
4457 class_optcount++;
4458
4459 #ifdef SUPPORT_UTF8
4460 if (utf8)
4461 {
4462 int ch;
4463 int extra = 0;
4464 GETCHARLEN(ch, ptr, extra);
4465 if (ch > 127) class_optcount = 10; /* No optimization possible */
4466 if (ch > 255)
4467 {
4468 if (!class_utf8)
4469 {
4470 class_utf8 = TRUE;
4471 length += LINK_SIZE + 1 + 1;
4472 }
4473 length += 2 + extra;
4474
4475 /* If this wide character is preceded by '-', add an extra 2 to
4476 the length in case the previous character was < 128, because in
4477 this case the whole range will be put into the list. */
4478
4479 if (ptr[-1] == '-') length += 2;
4480
4481 /* Advance to the end of this character */
4482
4483 ptr += extra;
4484 }
4485 }
4486 #endif
4487 }
4488 }
4489 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4490
4491 if (*ptr == 0) /* Missing terminating ']' */
4492 {
4493 *errorptr = ERR6;
4494 goto PCRE_ERROR_RETURN;
4495 }
4496
4497 /* We can optimize when there was only one optimizable character. Repeats
4498 for positive and negated single one-byte chars are handled by the general
4499 code. Here, we handle repeats for the class opcodes. */
4500
4501 if (class_optcount == 1) length += 3; else
4502 {
4503 length += 33;
4504
4505 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4506 we also need extra for wrapping the whole thing in a sub-pattern. */
4507
4508 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4509 {
4510 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4511 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4512 if ((min == 0 && (max == 1 || max == -1)) ||
4513 (min == 1 && max == -1))
4514 length++;
4515 else length += 5;
4516 if (ptr[1] == '+')
4517 {
4518 ptr++;
4519 length += 2 + 2*LINK_SIZE;
4520 }
4521 else if (ptr[1] == '?') ptr++;
4522 }
4523 }
4524 continue;
4525
4526 /* Brackets may be genuine groups or special things */
4527
4528 case '(':
4529 branch_newextra = 0;
4530 bracket_length = 1 + LINK_SIZE;
4531
4532 /* Handle special forms of bracket, which all start (? */
4533
4534 if (ptr[1] == '?')
4535 {
4536 int set, unset;
4537 int *optset;
4538
4539 switch (c = ptr[2])
4540 {
4541 /* Skip over comments entirely */
4542 case '#':
4543 ptr += 3;
4544 while (*ptr != 0 && *ptr != ')') ptr++;
4545 if (*ptr == 0)
4546 {
4547 *errorptr = ERR18;
4548 goto PCRE_ERROR_RETURN;
4549 }
4550 continue;
4551
4552 /* Non-referencing groups and lookaheads just move the pointer on, and
4553 then behave like a non-special bracket, except that they don't increment
4554 the count of extracting brackets. Ditto for the "once only" bracket,
4555 which is in Perl from version 5.005. */
4556
4557 case ':':
4558 case '=':
4559 case '!':
4560 case '>':
4561 ptr += 2;
4562 break;
4563
4564 /* (?R) specifies a recursive call to the regex, which is an extension
4565 to provide the facility which can be obtained by (?p{perl-code}) in
4566 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4567
4568 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4569 the appropriate numbered brackets. This includes both recursive and
4570 non-recursive calls. (?R) is now synonymous with (?0). */
4571
4572 case 'R':
4573 ptr++;
4574
4575 case '0': case '1': case '2': case '3': case '4':
4576 case '5': case '6': case '7': case '8': case '9':
4577 ptr += 2;
4578 if (c != 'R')
4579 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4580 if (*ptr != ')')
4581 {
4582 *errorptr = ERR29;
4583 goto PCRE_ERROR_RETURN;
4584 }
4585 length += 1 + LINK_SIZE;
4586
4587 /* If this item is quantified, it will get wrapped inside brackets so
4588 as to use the code for quantified brackets. We jump down and use the
4589 code that handles this for real brackets. */
4590
4591 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4592 {
4593 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4594 duplength = 5 + 3 * LINK_SIZE;
4595 goto HANDLE_QUANTIFIED_BRACKETS;
4596 }
4597 continue;
4598
4599 /* (?C) is an extension which provides "callout" - to provide a bit of
4600 the functionality of the Perl (?{...}) feature. An optional number may
4601 follow (default is zero). */
4602
4603 case 'C':
4604 ptr += 2;
4605 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4606 if (*ptr != ')')
4607 {
4608 *errorptr = ERR39;
4609 goto PCRE_ERROR_RETURN;
4610 }
4611 length += 2;
4612 continue;
4613
4614 /* Named subpatterns are an extension copied from Python */
4615
4616 case 'P':
4617 ptr += 3;
4618 if (*ptr == '<')
4619 {
4620 const uschar *p; /* Don't amalgamate; some compilers */
4621 p = ++ptr; /* grumble at autoincrement in declaration */
4622 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4623 if (*ptr != '>')
4624 {
4625 *errorptr = ERR42;
4626 goto PCRE_ERROR_RETURN;
4627 }
4628 name_count++;
4629 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4630 break;
4631 }
4632
4633 if (*ptr == '=' || *ptr == '>')
4634 {
4635 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4636 if (*ptr != ')')
4637 {
4638 *errorptr = ERR42;
4639 goto PCRE_ERROR_RETURN;
4640 }
4641 break;
4642 }
4643
4644 /* Unknown character after (?P */
4645
4646 *errorptr = ERR41;
4647 goto PCRE_ERROR_RETURN;
4648
4649 /* Lookbehinds are in Perl from version 5.005 */
4650
4651 case '<':
4652 ptr += 3;
4653 if (*ptr == '=' || *ptr == '!')
4654 {
4655 branch_newextra = 1 + LINK_SIZE;
4656 length += 1 + LINK_SIZE; /* For the first branch */
4657 break;
4658 }
4659 *errorptr = ERR24;
4660 goto PCRE_ERROR_RETURN;
4661
4662 /* Conditionals are in Perl from version 5.005. The bracket must either
4663 be followed by a number (for bracket reference) or by an assertion
4664 group, or (a PCRE extension) by 'R' for a recursion test. */
4665
4666 case '(':
4667 if (ptr[3] == 'R' && ptr[4] == ')')
4668 {
4669 ptr += 4;
4670 length += 3;
4671 }
4672 else if ((digitab[ptr[3]] & ctype_digit) != 0)
4673 {
4674 ptr += 4;
4675 length += 3;
4676 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4677 if (*ptr != ')')
4678 {
4679 *errorptr = ERR26;
4680 goto PCRE_ERROR_RETURN;
4681 }
4682 }
4683 else /* An assertion must follow */
4684 {
4685 ptr++; /* Can treat like ':' as far as spacing is concerned */
4686 if (ptr[2] != '?' ||
4687 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4688 {
4689 ptr += 2; /* To get right offset in message */
4690 *errorptr = ERR28;
4691 goto PCRE_ERROR_RETURN;
4692 }
4693 }
4694 break;
4695
4696 /* Else loop checking valid options until ) is met. Anything else is an
4697 error. If we are without any brackets, i.e. at top level, the settings
4698 act as if specified in the options, so massage the options immediately.
4699 This is for backward compatibility with Perl 5.004. */
4700
4701 default:
4702 set = unset = 0;
4703 optset = &set;
4704 ptr += 2;
4705
4706 for (;; ptr++)
4707 {
4708 c = *ptr;
4709 switch (c)
4710 {
4711 case 'i':
4712 *optset |= PCRE_CASELESS;
4713 continue;
4714
4715 case 'm':
4716 *optset |= PCRE_MULTILINE;
4717 continue;
4718
4719 case 's':
4720 *optset |= PCRE_DOTALL;
4721 continue;
4722
4723 case 'x':
4724 *optset |= PCRE_EXTENDED;
4725 continue;
4726
4727 case 'X':
4728 *optset |= PCRE_EXTRA;
4729 continue;
4730
4731 case 'U':
4732 *optset |= PCRE_UNGREEDY;
4733 continue;
4734
4735 case '-':
4736 optset = &unset;
4737 continue;
4738
4739 /* A termination by ')' indicates an options-setting-only item; if
4740 this is at the very start of the pattern (indicated by item_count
4741 being zero), we use it to set the global options. This is helpful
4742 when analyzing the pattern for first characters, etc. Otherwise
4743 nothing is done here and it is handled during the compiling
4744 process.
4745
4746 [Historical note: Up to Perl 5.8, options settings at top level
4747 were always global settings, wherever they appeared in the pattern.
4748 That is, they were equivalent to an external setting. From 5.8
4749 onwards, they apply only to what follows (which is what you might
4750 expect).] */
4751
4752 case ')':
4753 if (item_count == 0)
4754 {
4755 options = (options | set) & (~unset);
4756 set = unset = 0; /* To save length */
4757 item_count--; /* To allow for several */
4758 }
4759
4760 /* Fall through */
4761
4762 /* A termination by ':' indicates the start of a nested group with
4763 the given options set. This is again handled at compile time, but
4764 we must allow for compiled space if any of the ims options are
4765 set. We also have to allow for resetting space at the end of
4766 the group, which is why 4 is added to the length and not just 2.
4767 If there are several changes of options within the same group, this
4768 will lead to an over-estimate on the length, but this shouldn't
4769 matter very much. We also have to allow for resetting options at
4770 the start of any alternations, which we do by setting
4771 branch_newextra to 2. Finally, we record whether the case-dependent
4772 flag ever changes within the regex. This is used by the "required
4773 character" code. */
4774
4775 case ':':
4776 if (((set|unset) & PCRE_IMS) != 0)
4777 {
4778 length += 4;
4779 branch_newextra = 2;
4780 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4781 }
4782 goto END_OPTIONS;
4783
4784 /* Unrecognized option character */
4785
4786 default:
4787 *errorptr = ERR12;
4788 goto PCRE_ERROR_RETURN;
4789 }
4790 }
4791
4792 /* If we hit a closing bracket, that's it - this is a freestanding
4793 option-setting. We need to ensure that branch_extra is updated if
4794 necessary. The only values branch_newextra can have here are 0 or 2.
4795 If the value is 2, then branch_extra must either be 2 or 3+LINK_SIZE,
4796 depending on whether this is a lookbehind group or not. */
4797
4798 END_OPTIONS:
4799 if (c == ')')
4800 {
4801 if (branch_newextra == 2 &&
4802 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4803 branch_extra += branch_newextra;
4804 continue;
4805 }
4806
4807 /* If options were terminated by ':' control comes here. Fall through
4808 to handle the group below. */
4809 }
4810 }
4811
4812 /* Extracting brackets must be counted so we can process escapes in a
4813 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4814 need an additional 3 bytes of store per extracting bracket. However, if
4815 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
4816 must leave the count alone (it will always be zero). */
4817
4818 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4819 {
4820 bracount++;
4821 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4822 }
4823
4824 /* Save length for computing whole length at end if there's a repeat that
4825 requires duplication of the group. Also save the current value of
4826 branch_extra, and start the new group with the new value. If non-zero, this
4827 will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind. */
4828
4829 if (brastackptr >= sizeof(brastack)/sizeof(int))
4830 {
4831 *errorptr = ERR19;
4832 goto PCRE_ERROR_RETURN;
4833 }
4834
4835 bralenstack[brastackptr] = branch_extra;
4836 branch_extra = branch_newextra;
4837
4838 brastack[brastackptr++] = length;
4839 length += bracket_length;
4840 continue;
4841
4842 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4843 have to replicate this bracket up to that many times. If brastackptr is
4844 0 this is an unmatched bracket which will generate an error, but take care
4845 not to try to access brastack[-1] when computing the length and restoring
4846 the branch_extra value. */
4847
4848 case ')':
4849 length += 1 + LINK_SIZE;
4850 if (brastackptr > 0)
4851 {
4852 duplength = length - brastack[--brastackptr];
4853 branch_extra = bralenstack[brastackptr];
4854 }
4855 else duplength = 0;
4856
4857 /* The following code is also used when a recursion such as (?3) is
4858 followed by a quantifier, because in that case, it has to be wrapped inside
4859 brackets so that the quantifier works. The value of duplength must be
4860 set before arrival. */
4861
4862 HANDLE_QUANTIFIED_BRACKETS:
4863
4864 /* Leave ptr at the final char; for read_repeat_counts this happens
4865 automatically; for the others we need an increment. */
4866
4867 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4868 {
4869 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4870 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4871 }
4872 else if (c == '*') { min = 0; max = -1; ptr++; }
4873 else if (c == '+') { min = 1; max = -1; ptr++; }
4874 else if (c == '?') { min = 0; max = 1; ptr++; }
4875 else { min = 1; max = 1; }
4876
4877 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4878 group, and if the maximum is greater than zero, we have to replicate
4879 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4880 bracket set. */
4881
4882 if (min == 0)
4883 {
4884 length++;
4885 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4886 }
4887
4888 /* When the minimum is greater than zero, we have to replicate up to
4889 minval-1 times, with no additions required in the copies. Then, if there
4890 is a limited maximum we have to replicate up to maxval-1 times allowing
4891 for a BRAZERO item before each optional copy and nesting brackets for all
4892 but one of the optional copies. */
4893
4894 else
4895 {
4896 length += (min - 1) * duplength;
4897 if (max > min) /* Need this test as max=-1 means no limit */
4898 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4899 - (2 + 2*LINK_SIZE);
4900 }
4901
4902 /* Allow space for once brackets for "possessive quantifier" */
4903
4904 if (ptr[1] == '+')
4905 {
4906 ptr++;
4907 length += 2 + 2*LINK_SIZE;
4908 }
4909 continue;
4910
4911 /* Non-special character. For a run of such characters the length required
4912 is the number of characters + 2, except that the maximum run length is
4913 MAXLIT. We won't get a skipped space or a non-data escape or the start of a
4914 # comment as the first character, so the length can't be zero. */
4915
4916 NORMAL_CHAR:
4917 default:
4918 length += 2;
4919 runlength = 0;
4920 do
4921 {
4922 #ifdef SUPPORT_UTF8
4923 lastcharlength = 1; /* Need length of last char for UTF-8 repeats */
4924 #endif
4925
4926 /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
4927 if (inescq)
4928 {
4929 if (c == '\\' && ptr[1] == 'E')
4930 {
4931 inescq = FALSE;
4932 ptr++;
4933 }
4934 else runlength++;
4935 continue;
4936 }
4937
4938 /* Skip whitespace and comments for /x. A comment that reaches the end of the
4939 pattern ends the run, so the loop test cannot read past the final zero. */
4940 if ((options & PCRE_EXTENDED) != 0)
4941 {
4942 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4943 if (c == '#')
4944 {
4945 /* The space before the ; avoids a warning on a silly Macintosh compiler. */
4946 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4947 if (c == 0) break;
4948 continue;
4949 }
4950 }
4951
4952 /* Backslash may introduce a data char or a metacharacter; stop the
4953 string before the latter. */
4954
4955 if (c == '\\')
4956 {
4957 const uschar *saveptr = ptr;
4958 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4959 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4960 if (c < 0) { ptr = saveptr; break; }
4961
4962 /* In UTF-8 mode, add on the number of additional bytes needed to
4963 encode this character, and save the total length in case this is a
4964 final char that is repeated. */
4965
4966 #ifdef SUPPORT_UTF8
4967 if (utf8 && c > 127)
4968 {
4969 int i;
4970 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4971 if (c <= utf8_table1[i]) break;
4972 runlength += i;
4973 lastcharlength += i;
4974 }
4975 #endif
4976 }
4977
4978 /* Ordinary character or single-char escape */
4979
4980 runlength++;
4981 }
4982
4983 /* This "while" is the end of the "do" above. */
4984
4985 while (runlength < MAXLIT &&
4986 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
4987
4988 /* If we hit a meta-character, back off to point to it */
4989
4990 if (runlength < MAXLIT) ptr--;
4991
4992 /* If the last char in the string is a UTF-8 multibyte character, we must
4993 set lastcharlength correctly. If it was specified as an escape, this will
4994 already have been done above. However, we also have to support in-line
4995 UTF-8 characters, so check backwards from where we are. */
4996
4997 #ifdef SUPPORT_UTF8
4998 if (utf8)
4999 {
5000 const uschar *lastptr = ptr - 1;
5001 if ((*lastptr & 0x80) != 0)
5002 {
5003 while((*lastptr & 0xc0) == 0x80) lastptr--;
5004 lastcharlength = ptr - lastptr;
5005 }
5006 }
5007 #endif
5008
5009 length += runlength;
5010 continue;
5011 }
5012 }
5013
5014 length += 2 + LINK_SIZE; /* For final KET and END */
5015
5016 if (length > MAX_PATTERN_SIZE)
5017 {
5018 *errorptr = ERR20;
5019 return NULL;
5020 }
5021
5022 /* Compute the size of data block needed and get it, either from malloc or
5023 externally provided function. */
5024
5025 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5026 re = (real_pcre *)(pcre_malloc)(size);
5027
5028 if (re == NULL)
5029 {
5030 *errorptr = ERR21;
5031 return NULL;
5032 }
5033
5034 /* Put in the magic number, and save the size, options, and table pointer */
5035
5036 re->magic_number = MAGIC_NUMBER;
5037 re->size = size;
5038 re->options = options;
5039 re->tables = tables;
5040 re->name_entry_size = max_name_size + 3;
5041 re->name_count = name_count;
5042
5043 /* The starting points of the name/number translation table and of the code are
5044 passed around in the compile data block. */
5045
5046 compile_block.names_found = 0;
5047 compile_block.name_entry_size = max_name_size + 3;
5048 compile_block.name_table = (uschar *)re + sizeof(real_pcre);
5049 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5050 compile_block.start_code = codestart;
5051 compile_block.req_varyopt = 0;
5052
5053 /* Set up a starting, non-extracting bracket, then compile the expression. On
5054 error, *errorptr will be set non-NULL, so we don't need to look at the result
5055 of the function here. */
5056
5057 ptr = (const uschar *)pattern;
5058 code = (uschar *)codestart;
5059 *code = OP_BRA;
5060 bracount = 0;
5061 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5062 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5063 re->top_bracket = bracount;
5064 re->top_backref = compile_block.top_backref;
5065
5066 /* If not reached end of pattern on success, there's an excess bracket. */
5067
5068 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5069
5070 /* Fill in the terminating state and check for disastrous overflow, but
5071 if debugging, leave the test till after things are printed out. */
5072
5073 *code++ = OP_END;
5074
5075 #ifndef DEBUG
5076 if (code - codestart > length) *errorptr = ERR23;
5077 #endif
5078
5079 /* Give an error if there's back reference to a non-existent capturing
5080 subpattern. */
5081
5082 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5083
5084 /* Failed to compile, or error while post-processing */
5085
5086 if (*errorptr != NULL)
5087 {
5088 (pcre_free)(re);
5089 PCRE_ERROR_RETURN:
5090 *erroroffset = ptr - (const uschar *)pattern;
5091 return NULL;
5092 }
5093
5094 /* If the anchored option was not passed, set the flag if we can determine that
5095 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5096 as starting with .* when DOTALL is set).
5097
5098 Otherwise, if we know what the first character has to be, save it, because that
5099 speeds up unanchored matches no end. If not, see if we can set the
5100 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5101 start with ^, and also when all branches start with .* for non-DOTALL matches.
5102 */
5103
5104 if ((options & PCRE_ANCHORED) == 0)
5105 {
5106 int temp_options = options;
5107 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5108 re->options |= PCRE_ANCHORED;
5109 else
5110 {
5111 if (firstbyte < 0)
5112 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5113 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5114 {
5115 int ch = firstbyte & 255;
5116 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5117 compile_block.fcc[ch] == ch)? ch : firstbyte;
5118 re->options |= PCRE_FIRSTSET;
5119 }
5120 else if (is_startline(codestart, 0, compile_block.backref_map))
5121 re->options |= PCRE_STARTLINE;
5122 }
5123 }
5124
5125 /* For an anchored pattern, we use the "required byte" only if it follows a
5126 variable length item in the regex. Remove the caseless flag for non-caseable
5127 chars. */
5128
5129 if (reqbyte >= 0 &&
5130 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5131 {
5132 int ch = reqbyte & 255;
5133 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5134 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5135 re->options |= PCRE_REQCHSET;
5136 }
5137
5138 /* Print out the compiled data for debugging */
5139
5140 #ifdef DEBUG
5141
5142 printf("Length = %d top_bracket = %d top_backref = %d\n",
5143 length, re->top_bracket, re->top_backref);
5144
5145 if (re->options != 0)
5146 {
5147 printf("%s%s%s%s%s%s%s%s%s\n",
5148 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5149 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5150 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5151 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5152 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5153 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5154 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5155 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5156 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5157 }
5158
5159 if ((re->options & PCRE_FIRSTSET) != 0)
5160 {
5161 int ch = re->first_byte & 255;
5162 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5163 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5164 else printf("First char = \\x%02x%s\n", ch, caseless);
5165 }
5166
5167 if ((re->options & PCRE_REQCHSET) != 0)
5168 {
5169 int ch = re->req_byte & 255;
5170 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5171 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5172 else printf("Req char = \\x%02x%s\n", ch, caseless);
5173 }
5174
5175 print_internals(re, stdout);
5176
5177 /* This check is done here in the debugging case so that the code that
5178 was compiled can be seen. */
5179
5180 if (code - codestart > length)
5181 {
5182 *errorptr = ERR23;
5183 (pcre_free)(re);
5184 *erroroffset = ptr - (uschar *)pattern;
5185 return NULL;
5186 }
5187 #endif
5188
5189 return (pcre *)re;
5190 }
5191
5192
5193
5194 /*************************************************
5195 * Match a back-reference *
5196 *************************************************/
5197
5198 /* If a back reference hasn't been set, the length that is passed is greater
5199 than the number of characters left in the string, so the match fails.
5200
5201 Arguments:
5202 offset index into the offset vector
5203 eptr points into the subject
5204 length length to be matched
5205 md points to match data block
5206 ims the ims flags
5207
5208 Returns: TRUE if matched
5209 */
5210
5211 static BOOL
5212 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5213 unsigned long int ims)
5214 {
5215 const uschar *p = md->start_subject + md->offset_vector[offset];
5216
5217 #ifdef DEBUG
5218 if (eptr >= md->end_subject)
5219 printf("matching subject <null>");
5220 else
5221 {
5222 printf("matching subject ");
5223 pchars(eptr, length, TRUE, md);
5224 }
5225 printf(" against backref ");
5226 pchars(p, length, FALSE, md);
5227 printf("\n");
5228 #endif
5229
5230 /* Always fail if not enough characters left */
5231
5232 if (length > md->end_subject - eptr) return FALSE;
5233
5234 /* Separate the caselesss case for speed */
5235
5236 if ((ims & PCRE_CASELESS) != 0)
5237 {
5238 while (length-- > 0)
5239 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
5240 }
5241 else
5242 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
5243
5244 return TRUE;
5245 }
5246
5247
#ifdef SUPPORT_UTF8
/*************************************************
*       Match character against an XCLASS        *
*************************************************/

/* This function is called from within the XCLASS code below, to match a
character against an extended class which might match values > 255.

As read by this function, the class data is: a flag byte (XCL_NOT, XCL_MAP),
then a 32-byte bit map if XCL_MAP is set, then a list of items ended by
XCL_END. Each item is a type byte followed by one character (XCL_SINGLE) or
by two characters giving the ends of an inclusive range (any other type).

Arguments:
  c           the character
  data        points to the flag byte of the XCLASS data

Returns:      TRUE if character matches, else FALSE
*/

static BOOL
match_xclass(int c, const uschar *data)
{
/* Work out both possible answers up front: a negated class inverts them. */

const BOOL if_found = (*data & XCL_NOT) == 0;
const BOOL if_absent = !if_found;
const BOOL has_map = (*data & XCL_MAP) != 0;
const uschar *item;

/* Character values < 256 are looked up in the bit map when there is one. A
miss here is not final, because the additional data may still hold single
characters or ranges that start below 256. */

if (has_map)
  {
  if (c < 256 && (data[1 + c/8] & (1 << (c&7))) != 0)
    return if_found;
  }

/* Step over the flag byte, and over the bit map if present, to reach the
list of single characters and ranges. */

item = data + 1;
if (has_map) item += 32;

for (;;)
  {
  int lo, hi;
  int type = *item++;

  if (type == XCL_END) break;

  GETCHARINC(lo, item);
  if (type == XCL_SINGLE)
    {
    if (c == lo) return if_found;
    }
  else
    {
    GETCHARINC(hi, item);
    if (c >= lo && c <= hi) return if_found;
    }
  }

return if_absent;   /* char was not found */
}
#endif
5302
5303
5304 /***************************************************************************
5305 ****************************************************************************
5306 RECURSION IN THE match() FUNCTION
5307
5308 The match() function is highly recursive. Some regular expressions can cause
5309 it to recurse thousands of times. I was writing for Unix, so I just let it
5310 call itself recursively. This uses the stack for saving everything that has
5311 to be saved for a recursive call. On Unix, the stack can be large, and this
5312 works fine.
5313
5314 It turns out that on non-Unix systems there are problems with programs that
5315 use a lot of stack. (This despite the fact that every last chip has oodles
5316 of memory these days, and techniques for extending the stack have been known
5317 for decades.) So....
5318
5319 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
5320 calls by keeping local variables that need to be preserved in blocks of memory
5321 obtained from malloc instead of on the stack. Macros are used to
5322 achieve this so that the actual code doesn't look very different to what it
5323 always used to.
5324 ****************************************************************************
5325 ***************************************************************************/
5326
5327
5328 /* These versions of the macros use the stack, as normal */
5329
#ifndef NO_RECURSE
#define REGISTER register
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
#define RRETURN(ra) return ra
#else


/* These versions of the macros manage a private stack on the heap. Note
that the rd argument of RMATCH isn't actually used. It's the md argument of
match(), which never actually changes.

Both macros expand to a brace-enclosed block and are written at their use
sites with a trailing semicolon. Because of that, a use that is the body of an
"if" which has a following "else" must be wrapped in its own braces. */

#define REGISTER

/* RMATCH: "call" match() without using the C stack.

A new frame is obtained from pcre_stack_malloc. If that fails, the current
incarnation gives up with PCRE_ERROR_NOMEMORY via RRETURN, so the error is
handed back in the same way as any other negative result instead of a null
pointer being dereferenced.

Otherwise the resume point is recorded in the current frame with setjmp, the
"arguments" are copied into the new frame, the new frame is linked back to the
current one, and control jumps to HEAP_RECURSE at the top of match(). When the
callee later longjmps back here, the frame pointer is reloaded from
md->thisframe (set by RRETURN just before the longjmp) and the result is
fetched from that frame. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL)\
    {\
    RRETURN(PCRE_ERROR_NOMEMORY);\
    }\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }

/* RRETURN: "return" from the current incarnation of match().

The current frame is freed with pcre_stack_free. If it had a predecessor, the
result is stored in that predecessor, the predecessor is published through
md->thisframe, and control longjmps to the point its RMATCH recorded. If there
is no predecessor (Xprevframe is NULL), this was the top-level frame and
match() really returns. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = ra;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame. Each
field Xname stands in for the variable "name" in match(); the #defines at the
top of match() redirect the plain names to the current frame. */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* Calling frame; NULL at the top level */

  /* Function arguments that may change */

  const uschar *Xeptr;
  const uschar *Xecode;
  int Xoffset_top;
  unsigned long int Xims;         /* Same type as the ims argument of match() */
  eptrblock *Xeptrb;
  int Xflags;

  /* Function local variables */

  const uschar *Xcallpat;
  const uschar *Xcharptr;
  const uschar *Xdata;
  const uschar *Xlastptr;
  const uschar *Xnext;
  const uschar *Xpp;
  const uschar *Xprev;
  const uschar *Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xminimize;
  BOOL Xprev_is_word;

  unsigned long int Xoriginal_ims;

  int Xctype;
  int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Place to pass back result, and where to jump back to */

  int Xresult;
  jmp_buf Xwhere;

} heapframe;

#endif
5439
5440
5441 /***************************************************************************
5442 ***************************************************************************/
5443
5444
5445
5446 /*************************************************
5447 * Match from current position *
5448 *************************************************/
5449
5450 /* On entry ecode points to the first opcode, and eptr to the first character
5451 in the subject string, while eptrb holds the value of eptr at the start of the
5452 last bracketed group - used for breaking infinite loops matching zero-length
5453 strings. This function is called recursively in many circumstances. Whenever it
5454 returns a negative (error) response, the outer incarnation must also return the
5455 same response.
5456
5457 Performance note: It might be tempting to extract commonly used fields from the
5458 md structure (e.g. utf8, end_subject) into individual variables to improve
5459 performance. Tests using gcc on a SPARC disproved this; in the first case, it
5460 made performance worse.
5461
5462 Arguments:
5463 eptr pointer in subject
5464 ecode position in code
5465 offset_top current top pointer
5466 md pointer to "static" info for the match
5467 ims current /i, /m, and /s options
5468 eptrb pointer to chain of blocks containing eptr at start of
5469 brackets - for testing for empty matches
5470 flags can contain
5471 match_condassert - this is an assertion condition
5472 match_isgroup - this is the start of a bracketed group
5473
5474 Returns: MATCH_MATCH if matched ) these values are >= 0
5475 MATCH_NOMATCH if failed to match )
5476 a negative PCRE_ERROR_xxx value if aborted by an error condition
5477 (e.g. stopped by recursion limit)
5478 */
5479
5480 static int
5481 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5482 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5483 int flags)
5484 {
5485 /* These variables do not need to be preserved over recursion in this function,
5486 so they can be ordinary variables in all cases. Mark them with "register"
5487 because they are used a lot in loops. */
5488
5489 register int rrc; /* Returns from recursive calls */
5490 register int i; /* Used for loops not involving calls to RMATCH() */
5491 register int c; /* Character values not kept over RMATCH() calls */
5492
5493 /* When recursion is not being used, all "local" variables that have to be
5494 preserved over calls to RMATCH() are part of a "frame" which is obtained from
5495 heap storage. Set up the top-level frame here; others are obtained from the
5496 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5497
5498 #ifdef NO_RECURSE
5499 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5500 frame->Xprevframe = NULL; /* Marks the top level */
5501
5502 /* Copy in the original argument variables */
5503
5504 frame->Xeptr = eptr;
5505 frame->Xecode = ecode;
5506 frame->Xoffset_top = offset_top;
5507 frame->Xims = ims;
5508 frame->Xeptrb = eptrb;
5509 frame->Xflags = flags;
5510
5511 /* This is where control jumps back to to effect "recursion" */
5512
5513 HEAP_RECURSE:
5514
5515 /* Macros make the argument variables come from the current frame */
5516
5517 #define eptr frame->Xeptr
5518 #define ecode frame->Xecode
5519 #define offset_top frame->Xoffset_top
5520 #define ims frame->Xims
5521 #define eptrb frame->Xeptrb
5522 #define flags frame->Xflags
5523
5524 /* Ditto for the local variables */
5525
5526 #define callpat frame->Xcallpat
5527 #define charptr frame->Xcharptr
5528 #define data frame->Xdata
5529 #define lastptr frame->Xlastptr
5530 #define next frame->Xnext
5531 #define pp frame->Xpp
5532 #define prev frame->Xprev
5533 #define saved_eptr frame->Xsaved_eptr
5534
5535 #define new_recursive frame->Xnew_recursive
5536
5537 #define cur_is_word frame->Xcur_is_word
5538 #define condition frame->Xcondition
5539 #define minimize frame->Xminimize
5540 #define prev_is_word frame->Xprev_is_word
5541
5542 #define original_ims frame->Xoriginal_ims
5543
5544 #define ctype frame->Xctype
5545 #define fc frame->Xfc
5546 #define fi frame->Xfi
5547 #define length frame->Xlength
5548 #define max frame->Xmax
5549 #define min frame->Xmin
5550 #define number frame->Xnumber
5551 #define offset frame->Xoffset
5552 #define op frame->Xop
5553 #define save_capture_last frame->Xsave_capture_last
5554 #define save_offset1 frame->Xsave_offset1
5555 #define save_offset2 frame->Xsave_offset2
5556 #define save_offset3 frame->Xsave_offset3
5557 #define stacksave frame->Xstacksave
5558
5559 #define newptrb frame->Xnewptrb
5560
5561 /* When recursion is being used, local variables are allocated on the stack and
5562 get preserved during recursion in the normal way. In this environment, fi and
5563 i, and fc and c, can be the same variables. */
5564
5565 #else
5566 #define fi i
5567 #define fc c
5568
5569 const uschar *callpat; /* Many of these variables are used ony */
5570 const uschar *charptr; /* small blocks of the code. My normal */
5571 const uschar *data; /* style of coding would have declared */
5572 const uschar *lastptr; /* them within each of those blocks. */
5573 const uschar *next; /* However, in order to accommodate the */
5574 const uschar *pp; /* version of this code that uses an */
5575 const uschar *prev; /* external "stack" implemented on the */
5576 const uschar *saved_eptr; /* heap, it is easier to declare them */
5577 /* all here, so the declarations can */
5578 recursion_info new_recursive; /* be cut out in a block. The only */
5579 /* declarations within blocks below are */
5580 BOOL cur_is_word; /* for variables that do not have to */
5581 BOOL condition; /* be preserved over a recursive call */
5582 BOOL minimize; /* to RMATCH(). */
5583 BOOL prev_is_word;
5584
5585 unsigned long int original_ims;
5586
5587 int ctype;
5588 int length;
5589 int max;
5590 int min;
5591 int number;
5592 int offset;
5593 int op;
5594 int save_capture_last;
5595 int save_offset1, save_offset2, save_offset3;
5596 int stacksave[REC_STACK_SAVE_MAX];
5597
5598 eptrblock newptrb;
5599 #endif
5600
5601
5602 /* OK, now we can get on with the real code of the function. Recursion is
5603 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
5604 these just turn into a recursive call to match() and a "return", respectively.
5605 However, RMATCH isn't like a function call because it's quite a complicated
5606 macro. It has to be used in one particular way. This shouldn't, however, impact
5607 performance when true recursion is being used. */
5608
5609 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
5610
5611 original_ims = ims; /* Save for resetting on ')' */
5612
5613 /* At the start of a bracketed group, add the current subject pointer to the
5614 stack of such pointers, to be re-instated at the end of the group when we hit
5615 the closing ket. When match() is called in other circumstances, we don't add to
5616 this stack. */
5617
5618 if ((flags & match_isgroup) != 0)
5619 {
5620 newptrb.epb_prev = eptrb;
5621 newptrb.epb_saved_eptr = eptr;
5622 eptrb = &newptrb;
5623 }
5624
5625 /* Now start processing the operations. */
5626
5627 for (;;)
5628 {
5629 op = *ecode;
5630 minimize = FALSE;
5631
5632 /* Opening capturing bracket. If there is space in the offset vector, save
5633 the current subject position in the working slot at the top of the vector. We
5634 mustn't change the current values of the data slot, because they may be set
5635 from a previous iteration of this group, and be referred to by a reference
5636 inside the group.
5637
5638 If the bracket fails to match, we need to restore this value and also the
5639 values of the final offsets, in case they were set by a previous iteration of
5640 the same bracket.
5641
5642 If there isn't enough space in the offset vector, treat this as if it were a
5643 non-capturing bracket. Don't worry about setting the flag for the error case
5644 here; that is handled in the code for KET. */
5645
5646 if (op > OP_BRA)
5647 {
5648 number = op - OP_BRA;
5649
5650 /* For extended extraction brackets (large number), we have to fish out the
5651 number from a dummy opcode at the start. */
5652
5653 if (number > EXTRACT_BASIC_MAX)
5654 number = GET2(ecode, 2+LINK_SIZE);
5655 offset = number << 1;
5656
5657 #ifdef DEBUG
5658 printf("start bracket %d subject=", number);
5659 pchars(eptr, 16, TRUE, md);
5660 printf("\n");
5661 #endif
5662
5663 if (offset < md->offset_max)
5664 {
5665 save_offset1 = md->offset_vector[offset];
5666 save_offset2 = md->offset_vector[offset+1];
5667 save_offset3 = md->offset_vector[md->offset_end - number];
5668 save_capture_last = md->capture_last;
5669
5670 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
5671 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
5672
5673 do
5674 {
5675 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5676 match_isgroup);
5677 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5678 md->capture_last = save_capture_last;
5679 ecode += GET(ecode, 1);
5680 }
5681 while (*ecode == OP_ALT);
5682
5683 DPRINTF(("bracket %d failed\n", number));
5684
5685 md->offset_vector[offset] = save_offset1;
5686 md->offset_vector[offset+1] = save_offset2;
5687 md->offset_vector[md->offset_end - number] = save_offset3;
5688
5689 RRETURN(MATCH_NOMATCH);
5690 }
5691
5692 /* Insufficient room for saving captured contents */
5693
5694 else op = OP_BRA;
5695 }
5696
5697 /* Other types of node can be handled by a switch */
5698
5699 switch(op)
5700 {
5701 case OP_BRA: /* Non-capturing bracket: optimized */
5702 DPRINTF(("start bracket 0\n"));
5703 do
5704 {
5705 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5706 match_isgroup);
5707 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5708 ecode += GET(ecode, 1);
5709 }
5710 while (*ecode == OP_ALT);
5711 DPRINTF(("bracket 0 failed\n"));
5712 RRETURN(MATCH_NOMATCH);
5713
5714 /* Conditional group: compilation checked that there are no more than
5715 two branches. If the condition is false, skipping the first branch takes us
5716 past the end if there is only one branch, but that's OK because that is
5717 exactly what going to the ket would do. */
5718
5719 case OP_COND:
5720 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
5721 {
5722 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
5723 condition = (offset == CREF_RECURSE * 2)?
5724 (md->recursive != NULL) :
5725 (offset < offset_top && md->offset_vector[offset] >= 0);
5726 RMATCH(rrc, eptr, ecode + (condition?
5727 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
5728 offset_top, md, ims, eptrb, match_isgroup);
5729 RRETURN(rrc);
5730 }
5731
5732 /* The condition is an assertion. Call match() to evaluate it - setting
5733 the final argument TRUE causes it to stop at the end of an assertion. */
5734
5735 else
5736 {
5737 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5738 match_condassert | match_isgroup);
5739 if (rrc == MATCH_MATCH)
5740 {
5741 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
5742 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
5743 }
5744 else if (rrc != MATCH_NOMATCH)
5745 {
5746 RRETURN(rrc); /* Need braces because of following else */
5747 }
5748 else ecode += GET(ecode, 1);
5749 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5750 match_isgroup);
5751 RRETURN(rrc);
5752 }
5753 /* Control never reaches here */
5754
5755 /* Skip over conditional reference or large extraction number data if
5756 encountered. */
5757
5758 case OP_CREF:
5759 case OP_BRANUMBER:
5760 ecode += 3;
5761 break;
5762
5763 /* End of the pattern. If we are in a recursion, we should restore the
5764 offsets appropriately and continue from after the call. */
5765
5766 case OP_END:
5767 if (md->recursive != NULL && md->recursive->group_num == 0)
5768 {
5769 recursion_info *rec = md->recursive;
5770 DPRINTF(("Hit the end in a (?0) recursion\n"));
5771 md->recursive = rec->prevrec;
5772 memmove(md->offset_vector, rec->offset_save,
5773 rec->saved_max * sizeof(int));
5774 md->start_match = rec->save_start;
5775 ims = original_ims;
5776 ecode = rec->after_call;
5777 break;
5778 }
5779
5780 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
5781 string - backtracking will then try other alternatives, if any. */
5782
5783 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
5784 md->end_match_ptr = eptr; /* Record where we ended */
5785 md->end_offset_top = offset_top; /* and how many extracts were taken */
5786 RRETURN(MATCH_MATCH);
5787
5788 /* Change option settings */
5789
5790 case OP_OPT:
5791 ims = ecode[1];
5792 ecode += 2;
5793 DPRINTF(("ims set to %02lx\n", ims));
5794 break;
5795
5796 /* Assertion brackets. Check the alternative branches in turn - the
5797 matching won't pass the KET for an assertion. If any one branch matches,
5798 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
5799 start of each branch to move the current point backwards, so the code at
5800 this level is identical to the lookahead case. */
5801
5802 case OP_ASSERT:
5803 case OP_ASSERTBACK:
5804 do
5805 {
5806 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5807 match_isgroup);
5808 if (rrc == MATCH_MATCH) break;
5809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5810 ecode += GET(ecode, 1);
5811 }
5812 while (*ecode == OP_ALT);
5813 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
5814
5815 /* If checking an assertion for a condition, return MATCH_MATCH. */
5816
5817 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
5818
5819 /* Continue from after the assertion, updating the offsets high water
5820 mark, since extracts may have been taken during the assertion. */
5821
5822 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5823 ecode += 1 + LINK_SIZE;
5824 offset_top = md->end_offset_top;
5825 continue;
5826
5827 /* Negative assertion: all branches must fail to match */
5828
5829 case OP_ASSERT_NOT:
5830 case OP_ASSERTBACK_NOT:
5831 do
5832 {
5833 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5834 match_isgroup);
5835 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
5836 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5837 ecode += GET(ecode,1);
5838 }
5839 while (*ecode == OP_ALT);
5840
5841 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
5842
5843 ecode += 1 + LINK_SIZE;
5844 continue;
5845
5846 /* Move the subject pointer back. This occurs only at the start of
5847 each branch of a lookbehind assertion. If we are too close to the start to
5848 move back, this match function fails. When working with UTF-8 we move
5849 back a number of characters, not bytes. */
5850
5851 case OP_REVERSE:
5852 #ifdef SUPPORT_UTF8
5853 if (md->utf8)
5854 {
5855 c = GET(ecode,1);
5856 for (i = 0; i < c; i++)
5857 {
5858 eptr--;
5859 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
5860 BACKCHAR(eptr)
5861 }
5862 }
5863 else
5864 #endif
5865
5866 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
5867
5868 {
5869 eptr -= GET(ecode,1);
5870 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
5871 }
5872
5873 /* Skip to next op code */
5874
5875 ecode += 1 + LINK_SIZE;
5876 break;
5877
5878 /* The callout item calls an external function, if one is provided, passing
5879 details of the match so far. This is mainly for debugging, though the
5880 function is able to force a failure. */
5881
5882 case OP_CALLOUT:
5883 if (pcre_callout != NULL)
5884 {
5885 pcre_callout_block cb;
5886 cb.version = 0; /* Version 0 of the callout block */
5887 cb.callout_number = ecode[1];
5888 cb.offset_vector = md->offset_vector;
5889 cb.subject = (const char *)md->start_subject;
5890 cb.subject_length = md->end_subject - md->start_subject;
5891 cb.start_match = md->start_match - md->start_subject;
5892 cb.current_position = eptr - md->start_subject;
5893 cb.capture_top = offset_top/2;
5894 cb.capture_last = md->capture_last;
5895 cb.callout_data = md->callout_data;
5896 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
5897 if (rrc < 0) RRETURN(rrc);
5898 }
5899 ecode += 2;
5900 break;
5901
5902 /* Recursion either matches the current regex, or some subexpression. The
5903 offset data is the offset to the starting bracket from the start of the
5904 whole pattern. (This is so that it works from duplicated subpatterns.)
5905
5906 If there are any capturing brackets started but not finished, we have to
5907 save their starting points and reinstate them after the recursion. However,
5908 we don't know how many such there are (offset_top records the completed
5909 total) so we just have to save all the potential data. There may be up to
5910 65535 such values, which is too large to put on the stack, but using malloc
5911 for small numbers seems expensive. As a compromise, the stack is used when
5912 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
5913 is used. A problem is what to do if the malloc fails ... there is no way of
5914 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
5915 values on the stack, and accept that the rest may be wrong.
5916
5917 There are also other values that have to be saved. We use a chained
5918 sequence of blocks that actually live on the stack. Thanks to Robin Houston
5919 for the original version of this logic. */
5920
5921 case OP_RECURSE:
5922 {
5923 callpat = md->start_code + GET(ecode, 1);
5924 new_recursive.group_num = *callpat - OP_BRA;
5925
5926 /* For extended extraction brackets (large number), we have to fish out
5927 the number from a dummy opcode at the start. */
5928
5929 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
5930 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
5931
5932 /* Add to "recursing stack" */
5933
5934 new_recursive.prevrec = md->recursive;
5935 md->recursive = &new_recursive;
5936
5937 /* Find where to continue from afterwards */
5938
5939 ecode += 1 + LINK_SIZE;
5940 new_recursive.after_call = ecode;
5941
5942 /* Now save the offset data. */
5943
5944 new_recursive.saved_max = md->offset_end;
5945 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
5946 new_recursive.offset_save = stacksave;
5947 else
5948 {
5949 new_recursive.offset_save =
5950 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
5951 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
5952 }
5953
5954 memcpy(new_recursive.offset_save, md->offset_vector,
5955 new_recursive.saved_max * sizeof(int));
5956 new_recursive.save_start = md->start_match;
5957 md->start_match = eptr;
5958
5959 /* OK, now we can do the recursion. For each top-level alternative we
5960 restore the offset and recursion data. */
5961
5962 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
5963 do
5964 {
5965 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
5966 eptrb, match_isgroup);
5967 if (rrc == MATCH_MATCH)
5968 {
5969 md->recursive = new_recursive.prevrec;
5970 if (new_recursive.offset_save != stacksave)
5971 (pcre_free)(new_recursive.offset_save);
5972 RRETURN(MATCH_MATCH);
5973 }
5974 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5975
5976 md->recursive = &new_recursive;
5977 memcpy(md->offset_vector, new_recursive.offset_save,
5978 new_recursive.saved_max * sizeof(int));
5979 callpat += GET(callpat, 1);
5980 }
5981 while (*callpat == OP_ALT);
5982
5983 DPRINTF(("Recursion didn't match\n"));
5984 md->recursive = new_recursive.prevrec;
5985 if (new_recursive.offset_save != stacksave)
5986 (pcre_free)(new_recursive.offset_save);
5987 RRETURN(MATCH_NOMATCH);
5988 }
5989 /* Control never reaches here */
5990
5991 /* "Once" brackets are like assertion brackets except that after a match,
5992 the point in the subject string is not moved back. Thus there can never be
5993 a move back into the brackets. Friedl calls these "atomic" subpatterns.
5994 Check the alternative branches in turn - the matching won't pass the KET
5995 for this kind of subpattern. If any one branch matches, we carry on as at
5996 the end of a normal bracket, leaving the subject pointer. */
5997
5998 case OP_ONCE:
5999 {
6000 prev = ecode;
6001 saved_eptr = eptr;
6002
6003 do
6004 {
6005 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6006 eptrb, match_isgroup);
6007 if (rrc == MATCH_MATCH) break;
6008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6009 ecode += GET(ecode,1);
6010 }
6011 while (*ecode == OP_ALT);
6012
6013 /* If hit the end of the group (which could be repeated), fail */
6014
6015 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6016
6017 /* Continue as from after the assertion, updating the offsets high water
6018 mark, since extracts may have been taken. */
6019
6020 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6021
6022 offset_top = md->end_offset_top;
6023 eptr = md->end_match_ptr;
6024
6025 /* For a non-repeating ket, just continue at this level. This also
6026 happens for a repeating ket if no characters were matched in the group.
6027 This is the forcible breaking of infinite loops as implemented in Perl
6028 5.005. If there is an options reset, it will get obeyed in the normal
6029 course of events. */
6030
6031 if (*ecode == OP_KET || eptr == saved_eptr)
6032 {
6033 ecode += 1+LINK_SIZE;
6034 break;
6035 }
6036
6037 /* The repeating kets try the rest of the pattern or restart from the
6038 preceding bracket, in the appropriate order. We need to reset any options
6039 that changed within the bracket before re-running it, so check the next
6040 opcode. */
6041
6042 if (ecode[1+LINK_SIZE] == OP_OPT)
6043 {
6044 ims = (ims & ~PCRE_IMS) | ecode[4];
6045 DPRINTF(("ims set to %02lx at group repeat\n", ims));
6046 }
6047
6048 if (*ecode == OP_KETRMIN)
6049 {
6050 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6051 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6052 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6053 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6054 }
6055 else /* OP_KETRMAX */
6056 {
6057 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6058 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6059 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6060 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6061 }
6062 }
6063 RRETURN(MATCH_NOMATCH);
6064
6065 /* An alternation is the end of a branch; scan along to find the end of the
6066 bracketed group and go to there. */
6067
6068 case OP_ALT:
6069 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6070 break;
6071
6072 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6073 that it may occur zero times. It may repeat infinitely, or not at all -
6074 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6075 repeat limits are compiled as a number of copies, with the optional ones
6076 preceded by BRAZERO or BRAMINZERO. */
6077
6078 case OP_BRAZERO:
6079 {
6080 next = ecode+1;
6081 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6082 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6083 do next += GET(next,1); while (*next == OP_ALT);
6084 ecode = next + 1+LINK_SIZE;
6085 }
6086 break;
6087
6088 case OP_BRAMINZERO:
6089 {
6090 next = ecode+1;
6091 do next += GET(next,1); while (*next == OP_ALT);
6092 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6093 match_isgroup);
6094 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6095 ecode++;
6096 }
6097 break;
6098
6099 /* End of a group, repeated or non-repeating. If we are at the end of
6100 an assertion "group", stop matching and return MATCH_MATCH, but record the
6101 current high water mark for use by positive assertions. Do this also
6102 for the "once" (non-backtracking) groups. */
6103
6104 case OP_KET:
6105 case OP_KETRMIN:
6106 case OP_KETRMAX:
6107 {
6108 prev = ecode - GET(ecode, 1);
6109 saved_eptr = eptrb->epb_saved_eptr;
6110
6111 /* Back up the stack of bracket start pointers. */
6112
6113 eptrb = eptrb->epb_prev;
6114
6115 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6116 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6117 *prev == OP_ONCE)
6118 {
6119 md->end_match_ptr = eptr; /* For ONCE */
6120 md->end_offset_top = offset_top;
6121 RRETURN(MATCH_MATCH);
6122 }
6123
6124 /* In all other cases except a conditional group we have to check the
6125 group number back at the start and if necessary complete handling an
6126 extraction by setting the offsets and bumping the high water mark. */
6127
6128 if (*prev != OP_COND)
6129 {
6130 number = *prev - OP_BRA;
6131
6132 /* For extended extraction brackets (large number), we have to fish out
6133 the number from a dummy opcode at the start. */
6134
6135 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6136 offset = number << 1;
6137
6138 #ifdef DEBUG
6139 printf("end bracket %d", number);
6140 printf("\n");
6141 #endif
6142
6143 /* Test for a numbered group. This includes groups called as a result
6144 of recursion. Note that whole-pattern recursion is coded as a recurse
6145 into group 0, so it won't be picked up here. Instead, we catch it when
6146 the OP_END is reached. */
6147
6148 if (number > 0)
6149 {
6150 md->capture_last = number;
6151 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6152 {
6153 md->offset_vector[offset] =
6154 md->offset_vector[md->offset_end - number];
6155 md->offset_vector[offset+1] = eptr - md->start_subject;
6156 if (offset_top <= offset) offset_top = offset + 2;
6157 }
6158
6159 /* Handle a recursively called group. Restore the offsets
6160 appropriately and continue from after the call. */
6161
6162 if (md->recursive != NULL && md->recursive->group_num == number)
6163 {
6164 recursion_info *rec = md->recursive;
6165 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6166 md->recursive = rec->prevrec;
6167 md->start_match = rec->save_start;
6168 memcpy(md->offset_vector, rec->offset_save,
6169 rec->saved_max * sizeof(int));
6170 ecode = rec->after_call;
6171 ims = original_ims;
6172 break;
6173 }
6174 }
6175 }
6176
6177 /* Reset the value of the ims flags, in case they got changed during
6178 the group. */
6179
6180 ims = original_ims;
6181 DPRINTF(("ims reset to %02lx\n", ims));
6182
6183 /* For a non-repeating ket, just continue at this level. This also
6184 happens for a repeating ket if no characters were matched in the group.
6185 This is the forcible breaking of infinite loops as implemented in Perl
6186 5.005. If there is an options reset, it will get obeyed in the normal
6187 course of events. */
6188
6189 if (*ecode == OP_KET || eptr == saved_eptr)
6190 {
6191 ecode += 1 + LINK_SIZE;
6192 break;
6193 }
6194
6195 /* The repeating kets try the rest of the pattern or restart from the
6196 preceding bracket, in the appropriate order. */
6197
6198 if (*ecode == OP_KETRMIN)
6199 {
6200 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6201 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6202 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6203 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6204 }
6205 else /* OP_KETRMAX */
6206 {
6207 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6208 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6209 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6211 }
6212 }
6213
6214 RRETURN(MATCH_NOMATCH);
6215
6216 /* Start of subject unless notbol, or after internal newline if multiline */
6217
6218 case OP_CIRC:
6219 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
6220 if ((ims & PCRE_MULTILINE) != 0)
6221 {
6222 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
6223 RRETURN(MATCH_NOMATCH);
6224 ecode++;
6225 break;
6226 }
6227 /* ... else fall through */
6228
6229 /* Start of subject assertion */
6230
6231 case OP_SOD:
6232 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
6233 ecode++;
6234 break;
6235
6236 /* Start of match assertion */
6237
6238 case OP_SOM:
6239 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
6240 ecode++;
6241 break;
6242
6243 /* Assert before internal newline if multiline, or before a terminating
6244 newline unless endonly is set, else end of subject unless noteol is set. */
6245
6246 case OP_DOLL:
6247 if ((ims & PCRE_MULTILINE) != 0)
6248 {
6249 if (eptr < md->end_subject)
6250 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
6251 else
6252 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
6253 ecode++;
6254 break;
6255 }
6256 else
6257 {
6258 if (md->noteol) RRETURN(MATCH_NOMATCH);
6259 if (!md->endonly)
6260 {
6261 if (eptr < md->end_subject - 1 ||
6262 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
6263 RRETURN(MATCH_NOMATCH);
6264 ecode++;
6265 break;
6266 }
6267 }
6268 /* ... else fall through */
6269
6270 /* End of subject assertion (\z) */
6271
6272 case OP_EOD:
6273 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
6274 ecode++;
6275 break;
6276
6277 /* End of subject or ending \n assertion (\Z) */
6278
6279 case OP_EODN:
6280 if (eptr < md->end_subject - 1 ||
6281 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
6282 ecode++;
6283 break;
6284
6285 /* Word boundary assertions */
6286
6287 case OP_NOT_WORD_BOUNDARY:
6288 case OP_WORD_BOUNDARY:
6289 {
6290
6291 /* Find out if the previous and current characters are "word" characters.
6292 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
6293 be "non-word" characters. */
6294
6295 #ifdef SUPPORT_UTF8
6296 if (md->utf8)
6297 {
6298 if (eptr == md->start_subject) prev_is_word = FALSE; else
6299 {
6300 lastptr = eptr - 1;
6301 while((*lastptr & 0xc0) == 0x80) lastptr--;
6302 GETCHAR(c, lastptr);
6303 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6304 }
6305 if (eptr >= md->end_subject) cur_is_word = FALSE; else
6306 {
6307 GETCHAR(c, eptr);
6308 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6309 }
6310 }
6311 else
6312 #endif
6313
6314 /* More streamlined when not in UTF-8 mode */
6315
6316 {
6317 prev_is_word = (eptr != md->start_subject) &&
6318 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
6319 cur_is_word = (eptr < md->end_subject) &&
6320 ((md->ctypes[*eptr] & ctype_word) != 0);
6321 }
6322
6323 /* Now see if the situation is what we want */
6324
6325 if ((*ecode++ == OP_WORD_BOUNDARY)?
6326 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6327 RRETURN(MATCH_NOMATCH);
6328 }
6329 break;
6330
6331 /* Match a single character type; inline for speed */
6332
6333 case OP_ANY:
6334 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
6335 RRETURN(MATCH_NOMATCH);
6336 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6337 #ifdef SUPPORT_UTF8
6338 if (md->utf8)
6339 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6340 #endif
6341 ecode++;
6342 break;
6343
6344 /* Match a single byte, even in UTF-8 mode. This opcode really does match
6345 any byte, even newline, independent of the setting of PCRE_DOTALL. */
6346
6347 case OP_ANYBYTE:
6348 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6349 ecode++;
6350 break;
6351
6352 case OP_NOT_DIGIT:
6353 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6354 GETCHARINCTEST(c, eptr);
6355 if (
6356 #ifdef SUPPORT_UTF8
6357 c < 256 &&
6358 #endif
6359 (md->ctypes[c] & ctype_digit) != 0
6360 )
6361 RRETURN(MATCH_NOMATCH);
6362 ecode++;
6363 break;
6364
6365 case OP_DIGIT:
6366 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6367 GETCHARINCTEST(c, eptr);
6368 if (
6369 #ifdef SUPPORT_UTF8
6370 c >= 256 ||
6371 #endif
6372 (md->ctypes[c] & ctype_digit) == 0
6373 )
6374 RRETURN(MATCH_NOMATCH);
6375 ecode++;
6376 break;
6377
6378 case OP_NOT_WHITESPACE:
6379 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6380 GETCHARINCTEST(c, eptr);
6381 if (
6382 #ifdef SUPPORT_UTF8
6383 c < 256 &&
6384 #endif
6385 (md->ctypes[c] & ctype_space) != 0
6386 )
6387 RRETURN(MATCH_NOMATCH);
6388 ecode++;
6389 break;
6390
6391 case OP_WHITESPACE:
6392 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6393 GETCHARINCTEST(c, eptr);
6394 if (
6395 #ifdef SUPPORT_UTF8
6396 c >= 256 ||
6397 #endif
6398 (md->ctypes[c] & ctype_space) == 0
6399 )
6400 RRETURN(MATCH_NOMATCH);
6401 ecode++;
6402 break;
6403
6404 case OP_NOT_WORDCHAR:
6405 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6406 GETCHARINCTEST(c, eptr);
6407 if (
6408 #ifdef SUPPORT_UTF8
6409 c < 256 &&
6410 #endif
6411 (md->ctypes[c] & ctype_word) != 0
6412 )
6413 RRETURN(MATCH_NOMATCH);
6414 ecode++;
6415 break;
6416
6417 case OP_WORDCHAR:
6418 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6419 GETCHARINCTEST(c, eptr);
6420 if (
6421 #ifdef SUPPORT_UTF8
6422 c >= 256 ||
6423 #endif
6424 (md->ctypes[c] & ctype_word) == 0
6425 )
6426 RRETURN(MATCH_NOMATCH);
6427 ecode++;
6428 break;
6429
6430 /* Match a back reference, possibly repeatedly. Look past the end of the
6431 item to see if there is repeat information following. The code is similar
6432 to that for character classes, but repeated for efficiency. Then obey
6433 similar code to character type repeats - written out again for speed.
6434 However, if the referenced string is the empty string, always treat
6435 it as matched, any number of times (otherwise there could be infinite
6436 loops). */
6437
6438 case OP_REF:
6439 {
6440 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
6441 ecode += 3; /* Advance past item */
6442
6443 /* If the reference is unset, set the length to be longer than the amount
6444 of subject left; this ensures that every attempt at a match fails. We
6445 can't just fail here, because of the possibility of quantifiers with zero
6446 minima. */
6447
6448 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
6449 md->end_subject - eptr + 1 :
6450 md->offset_vector[offset+1] - md->offset_vector[offset];
6451
6452 /* Set up for repetition, or handle the non-repeated case */
6453
6454 switch (*ecode)
6455 {
6456 case OP_CRSTAR:
6457 case OP_CRMINSTAR:
6458 case OP_CRPLUS:
6459 case OP_CRMINPLUS:
6460 case OP_CRQUERY:
6461 case OP_CRMINQUERY:
6462 c = *ecode++ - OP_CRSTAR;
6463 minimize = (c & 1) != 0;
6464 min = rep_min[c]; /* Pick up values from tables; */
6465 max = rep_max[c]; /* zero for max => infinity */
6466 if (max == 0) max = INT_MAX;
6467 break;
6468
6469 case OP_CRRANGE:
6470 case OP_CRMINRANGE:
6471 minimize = (*ecode == OP_CRMINRANGE);
6472 min = GET2(ecode, 1);
6473 max = GET2(ecode, 3);
6474 if (max == 0) max = INT_MAX;
6475 ecode += 5;
6476 break;
6477
6478 default: /* No repeat follows */
6479 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
6480 eptr += length;
6481 continue; /* With the main loop */
6482 }
6483
6484 /* If the length of the reference is zero, just continue with the
6485 main loop. */
6486
6487 if (length == 0) continue;
6488
6489 /* First, ensure the minimum number of matches are present. We get back
6490 the length of the reference string explicitly rather than passing the
6491 address of eptr, so that eptr can be a register variable. */
6492
6493 for (i = 1; i <= min; i++)
6494 {
6495 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
6496 eptr += length;
6497 }
6498
6499 /* If min = max, continue at the same level without recursion.
6500 They are not both allowed to be zero. */
6501
6502 if (min == max) continue;
6503
6504 /* If minimizing, keep trying and advancing the pointer */
6505
6506 if (minimize)
6507 {
6508 for (fi = min;; fi++)
6509 {
6510 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
6511 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6512 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
6513 RRETURN(MATCH_NOMATCH);
6514 eptr += length;
6515 }
6516 /* Control never gets here */
6517 }
6518
6519 /* If maximizing, find the longest string and work backwards */
6520
6521 else
6522 {
6523 pp = eptr;
6524 for (i = min; i < max; i++)
6525 {
6526 if (!match_ref(offset, eptr, length, md, ims)) break;
6527 eptr += length;
6528 }
6529 while (eptr >= pp)
6530 {
6531 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
6532 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6533 eptr -= length;
6534 }
6535 RRETURN(MATCH_NOMATCH);
6536 }
6537 }
6538 /* Control never gets here */
6539
6540
6541
6542 /* Match a bit-mapped character class, possibly repeatedly. This opcode is
6543 used when all the characters in the class have values in the range 0-255.
6544 The only difference between OP_CLASS and OP_NCLASS occurs when a data
6545 character outside the range is encountered.
6546
6547 First, look past the end of the item to see if there is repeat information
6548 following. Then obey similar code to character type repeats - written out
6549 again for speed. */
6550
6551 case OP_NCLASS:
6552 case OP_CLASS:
6553 {
6554 data = ecode + 1; /* Save for matching */
6555 ecode += 33; /* Advance past the item */
6556
6557 switch (*ecode)
6558 {
6559 case OP_CRSTAR:
6560 case OP_CRMINSTAR:
6561 case OP_CRPLUS:
6562 case OP_CRMINPLUS:
6563 case OP_CRQUERY:
6564 case OP_CRMINQUERY:
6565 c = *ecode++ - OP_CRSTAR;
6566 minimize = (c & 1) != 0;
6567 min = rep_min[c]; /* Pick up values from tables; */
6568 max = rep_max[c]; /* zero for max => infinity */
6569 if (max == 0) max = INT_MAX;
6570 break;
6571
6572 case OP_CRRANGE:
6573 case OP_CRMINRANGE:
6574 minimize = (*ecode == OP_CRMINRANGE);
6575 min = GET2(ecode, 1);
6576 max = GET2(ecode, 3);
6577 if (max == 0) max = INT_MAX;
6578 ecode += 5;
6579 break;
6580
6581 default: /* No repeat follows */
6582 min = max = 1;
6583 break;
6584 }
6585
6586 /* First, ensure the minimum number of matches are present. */
6587
6588 #ifdef SUPPORT_UTF8
6589 /* UTF-8 mode */
6590 if (md->utf8)
6591 {
6592 for (i = 1; i <= min; i++)
6593 {
6594 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6595 GETCHARINC(c, eptr);
6596 if (c > 255)
6597 {
6598 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
6599 }
6600 else
6601 {
6602 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
6603 }
6604 }
6605 }
6606 else
6607 #endif
6608 /* Not UTF-8 mode */
6609 {
6610 for (i = 1; i <= min; i++)
6611 {
6612 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6613 c = *eptr++;
6614 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
6615 }
6616 }
6617
6618 /* If max == min we can continue with the main loop without the
6619 need to recurse. */
6620
6621 if (min == max) continue;
6622
6623 /* If minimizing, keep testing the rest of the expression and advancing
6624 the pointer while it matches the class. */
6625
6626 if (minimize)
6627 {
6628 #ifdef SUPPORT_UTF8
6629 /* UTF-8 mode */
6630 if (md->utf8)
6631 {
6632 for (fi = min;; fi++)
6633 {
6634 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
6635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6636 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6637 GETCHARINC(c, eptr);
6638 if (c > 255)
6639 {
6640 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
6641 }
6642 else
6643 {
6644 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
6645 }
6646 }
6647 }
6648 else
6649 #endif
6650 /* Not UTF-8 mode */
6651 {
6652 for (fi = min;; fi++)
6653 {
6654 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
6655 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6656 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6657 c = *eptr++;
6658 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
6659 }
6660 }
6661 /* Control never gets here */
6662 }
6663
6664 /* If maximizing, find the longest possible run, then work backwards. */
6665
6666 else
6667 {
6668 pp = eptr;
6669
6670 #ifdef SUPPORT_UTF8
6671 /* UTF-8 mode */
6672 if (md->utf8)
6673 {
6674 for (i = min; i < max; i++)
6675 {
6676 int len = 1;
6677 if (eptr >= md->end_subject) break;
6678 GETCHARLEN(c, eptr, len);
6679 if (c > 255)
6680 {
6681 if (op == OP_CLASS) break;
6682 }
6683 else
6684 {
6685 if ((data[c/8] & (1 << (c&7))) == 0) break;
6686 }
6687 eptr += len;
6688 }
6689 for (;;)
6690 {
6691 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
6692 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6693 if (eptr-- == pp) break; /* Stop if tried at original pos */
6694 BACKCHAR(eptr);
6695 }
6696 }
6697 else
6698 #endif
6699 /* Not UTF-8 mode */
6700 {
6701 for (i = min; i < max; i++)