/[pcre]/code/tags/pcre-4.5/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-4.5/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 71 - (show annotations) (download)
Sat Feb 24 21:40:24 2007 UTC (7 years, 5 months ago) by nigel
Original Path: code/trunk/pcre.c
File MIME type: text/plain
File size: 233449 byte(s)
Load pcre-4.4 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2003 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35 /* Define DEBUG to get debugging output on stdout. */
36
37 /* #define DEBUG */
38
/* All debug tracing goes through DPRINTF, which keeps #ifdef DEBUG tests out
of the body of the code (some old compilers dislike indented pre-processor
lines). The argument is pasted directly after "printf", so a call has to carry
its own parentheses, for example DPRINTF(("count=%d\n", n)). When DEBUG is not
defined the whole call expands to nothing. */

#ifndef DEBUG
#define DPRINTF(p) /*nothing*/
#else
#define DPRINTF(p) printf p
#endif
48
49 /* Include the internals header, which itself includes Standard C headers plus
50 the external pcre header. */
51
52 #include "internal.h"
53
54
/* "class" is used as an ordinary identifier in this file. When the file is
compiled as C++, where it is a keyword, rename it out of the way. */

#if defined(__cplusplus)
#define class pcre_class
#endif
60
61
/* Maximum number of items on the nested bracket stacks at compile time. This
applies to the nesting of all kinds of parentheses. It does not limit
un-nested, non-capturing parentheses. This number can be made bigger if
necessary - it is used to dimension one int vector and one unsigned char
vector at compile time. */

#define BRASTACK_SIZE 200


/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long; the value 30 leaves
room for ten such triples. */

#define REC_STACK_SAVE_MAX 30


/* The number of bytes in a literal character string above which we can't add
any more is set at 250 in order to allow for UTF-8 characters. (In theory it
could be 255 when UTF-8 support is excluded, but that means that some of the
test output would be different, which just complicates things.) */

#define MAXLIT 250


/* The maximum remaining length of subject that we are prepared to search for
a req_byte match. */

#define REQ_BYTE_MAX 1000
90
91
/* Table of sizes for the fixed-length opcodes, indexed by opcode value. Its
contents come from the OP_LENGTHS macro so that the definition is next to the
definition of the opcodes in internal.h. */

static uschar OP_lengths[] = { OP_LENGTHS };

/* Min and max values for the common repeats; for the maxima, 0 => infinity.
Read as pairs, the six entries are {0,inf} {0,inf} {1,inf} {1,inf} {0,1} {0,1}.
NOTE(review): these presumably correspond to the greedy and minimizing forms
of *, + and ? in opcode order - confirm against the opcode list in
internal.h. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101
/* Table for handling escaped characters in the range '0'-'z'. It is indexed
by (character - '0'), so it has one entry for every character code from '0' to
'z' inclusive. Positive entries are simple data values that replace the
escape (for example, the entry for 'a' is 7); negative entries are for special
things like \d and so on. Zero means further processing is needed (for things
like \x), or the escape is invalid. */

static const short int escapes[] = {
    0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
    0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
  '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
    0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
    0, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
    0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
  '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
    0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
    0,      0,  ESC_r, -ESC_s, ESC_tee,     0,      0, -ESC_w,   /* p - w */
    0,      0, -ESC_z                                            /* x - z */
};
119
/* Tables of names of POSIX character classes and their lengths. The two
tables run in parallel: posix_name_lengths[n] is the length of posix_names[n].
The list is terminated by a zero length entry, which exists only in the
lengths table. The first three must be alpha, lower, upper (in any order among
themselves, but occupying positions 0 to 2), as this is assumed for handling
case independence. */

static const char *posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  "print", "punct", "space", "word",  "xdigit" };

static const uschar posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };

/* Table of class bit maps for each POSIX class; up to three may be combined
to form the class. There are three entries per class, in the same order as
posix_names, and unused slots hold -1. The table for [:blank:] is dynamically
modified to remove the vertical space characters. */

static const int posix_class_maps[] = {
  cbit_lower, cbit_upper, -1,             /* alpha */
  cbit_lower, -1,         -1,             /* lower */
  cbit_upper, -1,         -1,             /* upper */
  cbit_digit, cbit_lower, cbit_upper,     /* alnum */
  cbit_print, cbit_cntrl, -1,             /* ascii */
  cbit_space, -1,         -1,             /* blank - a GNU extension */
  cbit_cntrl, -1,         -1,             /* cntrl */
  cbit_digit, -1,         -1,             /* digit */
  cbit_graph, -1,         -1,             /* graph */
  cbit_print, -1,         -1,             /* print */
  cbit_punct, -1,         -1,             /* punct */
  cbit_space, -1,         -1,             /* space */
  cbit_word,  -1,         -1,             /* word - a Perl extension */
  cbit_xdigit,-1,         -1              /* xdigit */
};
152
/* Table to identify ASCII digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
a private table here. It costs 256 bytes, but it is a lot faster than doing
character value tests (at least in some simple cases I timed), and in some
applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code. The characters 0-9
carry both bits (0x0c); A-F and a-f carry only the hexadecimal bit (0x08), so
a test for a decimal digit must mask with ctype_digit rather than just test
for a non-zero entry. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
203
/* Forward declaration of compile_regex(), needed because it takes part in
mutual recursion with code that appears before its definition. */

static BOOL
  compile_regex(int, int, int *, uschar **, const uschar **, const char **,
    BOOL, int, int *, int *, branch_chain *, compile_data *);
209
210 /* Structure for building a chain of data that actually lives on the
211 stack, for holding the values of the subject pointer at the start of each
212 subpattern, so as to detect when an empty string has been matched by a
213 subpattern - to break infinite loops. */
214
215 typedef struct eptrblock {
216 struct eptrblock *prev;
217 const uschar *saved_eptr;
218 } eptrblock;
219
/* Flag bits for the match() function. They are distinct bits, so they can be
combined with "|". */

#define match_condassert   0x01    /* Called to check a condition assertion */
#define match_isgroup      0x02    /* Set if start of bracketed group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative, so they can never be
confused with these two values. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0
230
231
232
233 /*************************************************
234 * Global variables *
235 *************************************************/
236
237 /* PCRE is thread-clean and doesn't use any global variables in the normal
238 sense. However, it calls memory allocation and free functions via the two
239 indirections below, and it can optionally do callouts. These values can be
240 changed by the caller, but are shared between all threads. However, when
241 compiling for Virtual Pascal, things are done differently (see pcre.in). */
242
243 #ifndef VPCOMPAT
244 #ifdef __cplusplus
245 extern "C" void *(*pcre_malloc)(size_t) = malloc;
246 extern "C" void (*pcre_free)(void *) = free;
247 extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
248 #else
249 void *(*pcre_malloc)(size_t) = malloc;
250 void (*pcre_free)(void *) = free;
251 int (*pcre_callout)(pcre_callout_block *) = NULL;
252 #endif
253 #endif
254
255
256 /*************************************************
257 * Macros and tables for character handling *
258 *************************************************/
259
/* When UTF-8 encoding is being used, a character is no longer just a single
byte. The macros for character handling generate simple sequences when used in
byte-mode, and more complicated ones for UTF-8 characters.

Notes for users of these macros:

  . Each one expands to one or more complete statements (an assignment,
    followed in the UTF-8 versions by an "if" block), not to an expression.
    Do not use one as the unbraced body of an if/else/while.

  . The arguments are pasted in textually and may be evaluated several times,
    so they must be plain variables without side effects.

  . Without UTF-8 support, GETCHARLEN leaves len untouched and BACKCHAR
    expands to nothing. */

#ifndef SUPPORT_UTF8
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)

#else   /* SUPPORT_UTF8 */

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. A first byte with both top bits set introduces a
multibyte character; utf8_table4 gives the number of bytes that follow it and
utf8_table3 the mask for the data bits in the first byte. Each following byte
contributes its low six bits. */

#define GETCHAR(c, eptr) \
  c = *eptr; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. On exit the pointer is past the last byte of the
character. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
The test reads md->utf8, so a variable called md must be in scope wherever this
macro is used. */

#define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode.
The number of additional bytes is added to len; len is not changed for a
single-byte character. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    len += gcaa; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. Called only in UTF-8 mode. Bytes of the form 10xxxxxx are continuation
bytes, so the loop steps back over them. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;

#endif
349
350
351
352 /*************************************************
353 * Default character tables *
354 *************************************************/
355
356 /* A default set of character tables is included in the PCRE binary. Its source
357 is built by the maketables auxiliary program, which uses the default C ctypes
358 functions, and put in the file chartables.c. These tables are used by PCRE
359 whenever the caller of pcre_compile() does not provide an alternate set of
360 tables. */
361
362 #include "chartables.c"
363
364
365
366 #ifdef SUPPORT_UTF8
367 /*************************************************
368 * Tables for UTF-8 support *
369 *************************************************/
370
/* These are the breakpoints for different numbers of bytes in a UTF-8
character: entry n is the largest value that can be encoded with n additional
bytes (that is, in n+1 bytes in total). */

static const int utf8_table1[] =
  { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};

/* These are the indicator bits and the mask for the data bits to set in the
first byte of a character, indexed by the number of additional bytes.
utf8_table2 holds the indicator bits and utf8_table3 the data-bit mask. */

static const int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};

/* Table of the number of extra bytes that follow the first byte of a
multibyte character, indexed by the first byte masked with 0x3f. The highest
number for a valid UTF-8 character is in fact 0x3d. */

static const uschar utf8_table4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
392
393
394 /*************************************************
395 * Convert character value to UTF-8 *
396 *************************************************/
397
398 /* This function takes an integer value in the range 0 - 0x7fffffff
399 and encodes it as a UTF-8 character in 0 to 6 bytes.
400
401 Arguments:
402 cvalue the character value
403 buffer pointer to buffer for result - at least 6 bytes long
404
405 Returns: number of characters placed in the buffer
406 */
407
408 static int
409 ord2utf8(int cvalue, uschar *buffer)
410 {
411 register int i, j;
412 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
413 if (cvalue <= utf8_table1[i]) break;
414 buffer += i;
415 for (j = i; j > 0; j--)
416 {
417 *buffer-- = 0x80 | (cvalue & 0x3f);
418 cvalue >>= 6;
419 }
420 *buffer = utf8_table2[i] | cvalue;
421 return i + 1;
422 }
423 #endif
424
425
426
427 /*************************************************
428 * Print compiled regex *
429 *************************************************/
430
431 /* The code for doing this is held in a separate file that is also included in
432 pcretest.c. It defines a function called print_internals(). */
433
434 #ifdef DEBUG
435 #include "printint.c"
436 #endif
437
438
439
440 /*************************************************
441 * Return version string *
442 *************************************************/
443
/* Two levels of macro are needed to turn the value of a macro into a string:
XSTRING() lets its argument be expanded before STRING() applies the # operator
to it. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Return the version of PCRE as a string of the form "major.minor date", built
at compile time from PCRE_MAJOR, PCRE_MINOR and PCRE_DATE. The string has
static storage and must not be modified or freed by the caller. */

const char *
pcre_version(void)
{
static const char version_text[] =
  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
return version_text;
}
452
453
454
455
456 /*************************************************
457 * (Obsolete) Return info about compiled pattern *
458 *************************************************/
459
460 /* This is the original "info" function. It picks potentially useful data out
461 of the private structure, but its interface was too rigid. It remains for
462 backwards compatibility. The public options are passed back in an int - though
463 the re->options field has been expanded to a long int, all the public options
464 at the low end of it, and so even on 16-bit systems this will still be OK.
465 Therefore, I haven't changed the API for pcre_info().
466
467 Arguments:
468 external_re points to compiled code
469 optptr where to pass back the options
470 first_byte where to pass back the first character,
471 or -1 if multiline and all branches start ^,
472 or -2 otherwise
473
474 Returns: number of capturing subpatterns
475 or negative values on error
476 */
477
478 int
479 pcre_info(const pcre *external_re, int *optptr, int *first_byte)
480 {
481 const real_pcre *re = (const real_pcre *)external_re;
482 if (re == NULL) return PCRE_ERROR_NULL;
483 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
484 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
485 if (first_byte != NULL)
486 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
487 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
488 return re->top_bracket;
489 }
490
491
492
493 /*************************************************
494 * Return info about compiled pattern *
495 *************************************************/
496
497 /* This is a newer "info" function which has an extensible interface so
498 that additional items can be added compatibly.
499
500 Arguments:
501 external_re points to compiled code
502 extra_data points extra data, or NULL
503 what what information is required
504 where where to put the information
505
506 Returns: 0 if data returned, negative on error
507 */
508
509 int
510 pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
511 void *where)
512 {
513 const real_pcre *re = (const real_pcre *)external_re;
514 const pcre_study_data *study = NULL;
515
516 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
517 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
518
519 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
520 study = (const pcre_study_data *)extra_data->study_data;
521
522 switch (what)
523 {
524 case PCRE_INFO_OPTIONS:
525 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
526 break;
527
528 case PCRE_INFO_SIZE:
529 *((size_t *)where) = re->size;
530 break;
531
532 case PCRE_INFO_STUDYSIZE:
533 *((size_t *)where) = (study == NULL)? 0 : study->size;
534 break;
535
536 case PCRE_INFO_CAPTURECOUNT:
537 *((int *)where) = re->top_bracket;
538 break;
539
540 case PCRE_INFO_BACKREFMAX:
541 *((int *)where) = re->top_backref;
542 break;
543
544 case PCRE_INFO_FIRSTBYTE:
545 *((int *)where) =
546 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
547 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
548 break;
549
550 case PCRE_INFO_FIRSTTABLE:
551 *((const uschar **)where) =
552 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
553 study->start_bits : NULL;
554 break;
555
556 case PCRE_INFO_LASTLITERAL:
557 *((int *)where) =
558 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
559 break;
560
561 case PCRE_INFO_NAMEENTRYSIZE:
562 *((int *)where) = re->name_entry_size;
563 break;
564
565 case PCRE_INFO_NAMECOUNT:
566 *((int *)where) = re->name_count;
567 break;
568
569 case PCRE_INFO_NAMETABLE:
570 *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
571 break;
572
573 default: return PCRE_ERROR_BADOPTION;
574 }
575
576 return 0;
577 }
578
579
580
581 /*************************************************
582 * Return info about what features are configured *
583 *************************************************/
584
585 /* This is function which has an extensible interface so that additional items
586 can be added compatibly.
587
588 Arguments:
589 what what information is required
590 where where to put the information
591
592 Returns: 0 if data returned, negative on error
593 */
594
595 int
596 pcre_config(int what, void *where)
597 {
598 switch (what)
599 {
600 case PCRE_CONFIG_UTF8:
601 #ifdef SUPPORT_UTF8
602 *((int *)where) = 1;
603 #else
604 *((int *)where) = 0;
605 #endif
606 break;
607
608 case PCRE_CONFIG_NEWLINE:
609 *((int *)where) = NEWLINE;
610 break;
611
612 case PCRE_CONFIG_LINK_SIZE:
613 *((int *)where) = LINK_SIZE;
614 break;
615
616 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
617 *((int *)where) = POSIX_MALLOC_THRESHOLD;
618 break;
619
620 case PCRE_CONFIG_MATCH_LIMIT:
621 *((unsigned int *)where) = MATCH_LIMIT;
622 break;
623
624 default: return PCRE_ERROR_BADOPTION;
625 }
626
627 return 0;
628 }
629
630
631
632 #ifdef DEBUG
633 /*************************************************
634 * Debugging function to print chars *
635 *************************************************/
636
637 /* Print a sequence of chars in printable format, stopping at the end of the
638 subject if the requested.
639
640 Arguments:
641 p points to characters
642 length number to print
643 is_subject TRUE if printing from within md->start_subject
644 md pointer to matching data block, if is_subject is TRUE
645
646 Returns: nothing
647 */
648
649 static void
650 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
651 {
652 int c;
653 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
654 while (length-- > 0)
655 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
656 }
657 #endif
658
659
660
661
662 /*************************************************
663 * Handle escapes *
664 *************************************************/
665
666 /* This function is called when a \ has been encountered. It either returns a
667 positive value for a simple escape such as \n, or a negative value which
668 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
669 a positive value greater than 255 may be returned. On entry, ptr is pointing at
670 the \. On exit, it is on the final character of the escape sequence.
671
672 Arguments:
673 ptrptr points to the pattern position pointer
674 errorptr points to the pointer to the error message
675 bracount number of previous extracting brackets
676 options the options bits
677 isclass TRUE if inside a character class
678
679 Returns: zero or positive => a data character
680 negative => a special escape sequence
681 on error, errorptr is set
682 */
683
684 static int
685 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
686 int options, BOOL isclass)
687 {
688 const uschar *ptr = *ptrptr;
689 int c, i;
690
691 /* If backslash is at the end of the pattern, it's an error. */
692
693 c = *(++ptr);
694 if (c == 0) *errorptr = ERR1;
695
696 /* Digits or letters may have special meaning; all others are literals. */
697
698 else if (c < '0' || c > 'z') {}
699
700 /* Do an initial lookup in a table. A non-zero result is something that can be
701 returned immediately. Otherwise further processing may be required. */
702
703 else if ((i = escapes[c - '0']) != 0) c = i;
704
705 /* Escapes that need further processing, or are illegal. */
706
707 else
708 {
709 const uschar *oldptr;
710 switch (c)
711 {
712 /* A number of Perl escapes are not handled by PCRE. We give an explicit
713 error. */
714
715 case 'l':
716 case 'L':
717 case 'N':
718 case 'p':
719 case 'P':
720 case 'u':
721 case 'U':
722 case 'X':
723 *errorptr = ERR37;
724 break;
725
726 /* The handling of escape sequences consisting of a string of digits
727 starting with one that is not zero is not straightforward. By experiment,
728 the way Perl works seems to be as follows:
729
730 Outside a character class, the digits are read as a decimal number. If the
731 number is less than 10, or if there are that many previous extracting
732 left brackets, then it is a back reference. Otherwise, up to three octal
733 digits are read to form an escaped byte. Thus \123 is likely to be octal
734 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
735 value is greater than 377, the least significant 8 bits are taken. Inside a
736 character class, \ followed by a digit is always an octal number. */
737
738 case '1': case '2': case '3': case '4': case '5':
739 case '6': case '7': case '8': case '9':
740
741 if (!isclass)
742 {
743 oldptr = ptr;
744 c -= '0';
745 while ((digitab[ptr[1]] & ctype_digit) != 0)
746 c = c * 10 + *(++ptr) - '0';
747 if (c < 10 || c <= bracount)
748 {
749 c = -(ESC_REF + c);
750 break;
751 }
752 ptr = oldptr; /* Put the pointer back and fall through */
753 }
754
755 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
756 generates a binary zero byte and treats the digit as a following literal.
757 Thus we have to pull back the pointer by one. */
758
759 if ((c = *ptr) >= '8')
760 {
761 ptr--;
762 c = 0;
763 break;
764 }
765
766 /* \0 always starts an octal number, but we may drop through to here with a
767 larger first octal digit. */
768
769 case '0':
770 c -= '0';
771 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
772 c = c * 8 + *(++ptr) - '0';
773 c &= 255; /* Take least significant 8 bits */
774 break;
775
776 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
777 which can be greater than 0xff, but only if the ddd are hex digits. */
778
779 case 'x':
780 #ifdef SUPPORT_UTF8
781 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
782 {
783 const uschar *pt = ptr + 2;
784 register int count = 0;
785 c = 0;
786 while ((digitab[*pt] & ctype_xdigit) != 0)
787 {
788 int cc = *pt++;
789 if (cc >= 'a') cc -= 32; /* Convert to upper case */
790 count++;
791 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
792 }
793 if (*pt == '}')
794 {
795 if (c < 0 || count > 8) *errorptr = ERR34;
796 ptr = pt;
797 break;
798 }
799 /* If the sequence of hex digits does not end with '}', then we don't
800 recognize this construct; fall through to the normal \x handling. */
801 }
802 #endif
803
804 /* Read just a single hex char */
805
806 c = 0;
807 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
808 {
809 int cc; /* Some compilers don't like ++ */
810 cc = *(++ptr); /* in initializers */
811 if (cc >= 'a') cc -= 32; /* Convert to upper case */
812 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
813 }
814 break;
815
816 /* Other special escapes not starting with a digit are straightforward */
817
818 case 'c':
819 c = *(++ptr);
820 if (c == 0)
821 {
822 *errorptr = ERR2;
823 return 0;
824 }
825
826 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
827 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
828
829 if (c >= 'a' && c <= 'z') c -= 32;
830 c ^= 0x40;
831 break;
832
833 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
834 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
835 for Perl compatibility, it is a literal. This code looks a bit odd, but
836 there used to be some cases other than the default, and there may be again
837 in future, so I haven't "optimized" it. */
838
839 default:
840 if ((options & PCRE_EXTRA) != 0) switch(c)
841 {
842 default:
843 *errorptr = ERR3;
844 break;
845 }
846 break;
847 }
848 }
849
850 *ptrptr = ptr;
851 return c;
852 }
853
854
855
856 /*************************************************
857 * Check for counted repeat *
858 *************************************************/
859
860 /* This function is called when a '{' is encountered in a place where it might
861 start a quantifier. It looks ahead to see if it really is a quantifier or not.
862 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
863 where the ddds are digits.
864
865 Arguments:
866 p pointer to the first char after '{'
867
868 Returns: TRUE or FALSE
869 */
870
871 static BOOL
872 is_counted_repeat(const uschar *p)
873 {
874 if ((digitab[*p++] && ctype_digit) == 0) return FALSE;
875 while ((digitab[*p] & ctype_digit) != 0) p++;
876 if (*p == '}') return TRUE;
877
878 if (*p++ != ',') return FALSE;
879 if (*p == '}') return TRUE;
880
881 if ((digitab[*p++] && ctype_digit) == 0) return FALSE;
882 while ((digitab[*p] & ctype_digit) != 0) p++;
883
884 return (*p == '}');
885 }
886
887
888
889 /*************************************************
890 * Read repeat counts *
891 *************************************************/
892
893 /* Read an item of the form {n,m} and return the values. This is called only
894 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
895 so the syntax is guaranteed to be correct, but we need to check the values.
896
897 Arguments:
898 p pointer to first char after '{'
899 minp pointer to int for min
900 maxp pointer to int for max
901 returned as -1 if no max
902 errorptr points to pointer to error message
903
904 Returns: pointer to '}' on success;
905 current ptr on error, with errorptr set
906 */
907
908 static const uschar *
909 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
910 {
911 int min = 0;
912 int max = -1;
913
914 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
915
916 if (*p == '}') max = min; else
917 {
918 if (*(++p) != '}')
919 {
920 max = 0;
921 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
922 if (max < min)
923 {
924 *errorptr = ERR4;
925 return p;
926 }
927 }
928 }
929
930 /* Do paranoid checks, then fill in the required variables, and pass back the
931 pointer to the terminating '}'. */
932
933 if (min > 65535 || max > 65535)
934 *errorptr = ERR5;
935 else
936 {
937 *minp = min;
938 *maxp = max;
939 }
940 return p;
941 }
942
943
944
945 /*************************************************
946 * Find first significant op code *
947 *************************************************/
948
949 /* This is called by several functions that scan a compiled expression looking
950 for a fixed first character, or an anchoring op code etc. It skips over things
951 that do not influence this. For some calls, a change of option is important.
952
953 Arguments:
954 code pointer to the start of the group
955 options pointer to external options
956 optbit the option bit whose changing is significant, or
957 zero if none are
958
959 Returns: pointer to the first significant opcode
960 */
961
962 static const uschar*
963 first_significant_code(const uschar *code, int *options, int optbit)
964 {
965 for (;;)
966 {
967 switch ((int)*code)
968 {
969 case OP_OPT:
970 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
971 *options = (int)code[1];
972 code += 2;
973 break;
974
975 case OP_ASSERT_NOT:
976 case OP_ASSERTBACK:
977 case OP_ASSERTBACK_NOT:
978 do code += GET(code, 1); while (*code == OP_ALT);
979 /* Fall through */
980
981 case OP_CALLOUT:
982 case OP_CREF:
983 case OP_BRANUMBER:
984 case OP_WORD_BOUNDARY:
985 case OP_NOT_WORD_BOUNDARY:
986 code += OP_lengths[*code];
987 break;
988
989 default:
990 return code;
991 }
992 }
993 /* Control never reaches here */
994 }
995
996
997
998
999 /*************************************************
1000 * Find the fixed length of a pattern *
1001 *************************************************/
1002
1003 /* Scan a pattern and compute the fixed length of subject that will match it,
1004 if the length is fixed. This is needed for dealing with backward assertions.
1005 In UTF8 mode, the result is in characters rather than bytes.
1006
1007 Arguments:
1008 code points to the start of the pattern (the bracket)
1009 options the compiling options
1010
1011 Returns: the fixed length, or -1 if there is no fixed length,
1012 or -2 if \C was encountered
1013 */
1014
1015 static int
1016 find_fixedlength(uschar *code, int options)
1017 {
1018 int length = -1;
1019
1020 register int branchlength = 0;
1021 register uschar *cc = code + 1 + LINK_SIZE;
1022
1023 /* Scan along the opcodes for this branch. If we get to the end of the
1024 branch, check the length against that of the other branches. */
1025
1026 for (;;)
1027 {
1028 int d;
1029 register int op = *cc;
1030 if (op >= OP_BRA) op = OP_BRA;
1031
1032 switch (op)
1033 {
1034 case OP_BRA:
1035 case OP_ONCE:
1036 case OP_COND:
1037 d = find_fixedlength(cc, options);
1038 if (d < 0) return d;
1039 branchlength += d;
1040 do cc += GET(cc, 1); while (*cc == OP_ALT);
1041 cc += 1 + LINK_SIZE;
1042 break;
1043
1044 /* Reached end of a branch; if it's a ket it is the end of a nested
1045 call. If it's ALT it is an alternation in a nested call. If it is
1046 END it's the end of the outer call. All can be handled by the same code. */
1047
1048 case OP_ALT:
1049 case OP_KET:
1050 case OP_KETRMAX:
1051 case OP_KETRMIN:
1052 case OP_END:
1053 if (length < 0) length = branchlength;
1054 else if (length != branchlength) return -1;
1055 if (*cc != OP_ALT) return length;
1056 cc += 1 + LINK_SIZE;
1057 branchlength = 0;
1058 break;
1059
1060 /* Skip over assertive subpatterns */
1061
1062 case OP_ASSERT:
1063 case OP_ASSERT_NOT:
1064 case OP_ASSERTBACK:
1065 case OP_ASSERTBACK_NOT:
1066 do cc += GET(cc, 1); while (*cc == OP_ALT);
1067 /* Fall through */
1068
1069 /* Skip over things that don't match chars */
1070
1071 case OP_REVERSE:
1072 case OP_BRANUMBER:
1073 case OP_CREF:
1074 case OP_OPT:
1075 case OP_CALLOUT:
1076 case OP_SOD:
1077 case OP_SOM:
1078 case OP_EOD:
1079 case OP_EODN:
1080 case OP_CIRC:
1081 case OP_DOLL:
1082 case OP_NOT_WORD_BOUNDARY:
1083 case OP_WORD_BOUNDARY:
1084 cc += OP_lengths[*cc];
1085 break;
1086
1087 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1088 This requires a scan of the string, unfortunately. We assume valid UTF-8
1089 strings, so all we do is reduce the length by one for every byte whose bits
1090 are 10xxxxxx. */
1091
1092 case OP_CHARS:
1093 branchlength += *(++cc);
1094 #ifdef SUPPORT_UTF8
1095 if ((options & PCRE_UTF8) != 0)
1096 for (d = 1; d <= *cc; d++)
1097 if ((cc[d] & 0xc0) == 0x80) branchlength--;
1098 #endif
1099 cc += *cc + 1;
1100 break;
1101
1102 /* Handle exact repetitions. The count is already in characters, but we
1103 need to skip over a multibyte character in UTF8 mode. */
1104
1105 case OP_EXACT:
1106 branchlength += GET2(cc,1);
1107 cc += 4;
1108 #ifdef SUPPORT_UTF8
1109 if ((options & PCRE_UTF8) != 0)
1110 {
1111 while((*cc & 0x80) == 0x80) cc++;
1112 }
1113 #endif
1114 break;
1115
1116 case OP_TYPEEXACT:
1117 branchlength += GET2(cc,1);
1118 cc += 4;
1119 break;
1120
1121 /* Handle single-char matchers */
1122
1123 case OP_NOT_DIGIT:
1124 case OP_DIGIT:
1125 case OP_NOT_WHITESPACE:
1126 case OP_WHITESPACE:
1127 case OP_NOT_WORDCHAR:
1128 case OP_WORDCHAR:
1129 case OP_ANY:
1130 branchlength++;
1131 cc++;
1132 break;
1133
1134 /* The single-byte matcher isn't allowed */
1135
1136 case OP_ANYBYTE:
1137 return -2;
1138
1139 /* Check a class for variable quantification */
1140
1141 #ifdef SUPPORT_UTF8
1142 case OP_XCLASS:
1143 cc += GET(cc, 1) - 33;
1144 /* Fall through */
1145 #endif
1146
1147 case OP_CLASS:
1148 case OP_NCLASS:
1149 cc += 33;
1150
1151 switch (*cc)
1152 {
1153 case OP_CRSTAR:
1154 case OP_CRMINSTAR:
1155 case OP_CRQUERY:
1156 case OP_CRMINQUERY:
1157 return -1;
1158
1159 case OP_CRRANGE:
1160 case OP_CRMINRANGE:
1161 if (GET2(cc,1) != GET2(cc,3)) return -1;
1162 branchlength += GET2(cc,1);
1163 cc += 5;
1164 break;
1165
1166 default:
1167 branchlength++;
1168 }
1169 break;
1170
1171 /* Anything else is variable length */
1172
1173 default:
1174 return -1;
1175 }
1176 }
1177 /* Control never gets here */
1178 }
1179
1180
1181
1182
1183 /*************************************************
1184 * Scan compiled regex for numbered bracket *
1185 *************************************************/
1186
1187 /* This little function scans through a compiled pattern until it finds a
1188 capturing bracket with the given number.
1189
1190 Arguments:
1191 code points to start of expression
1192 utf8 TRUE in UTF-8 mode
1193 number the required bracket number
1194
1195 Returns: pointer to the opcode for the bracket, or NULL if not found
1196 */
1197
1198 static const uschar *
1199 find_bracket(const uschar *code, BOOL utf8, int number)
1200 {
1201 #ifndef SUPPORT_UTF8
1202 utf8 = utf8; /* Stop pedantic compilers complaining */
1203 #endif
1204
1205 for (;;)
1206 {
1207 register int c = *code;
1208 if (c == OP_END) return NULL;
1209 else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1210 else if (c > OP_BRA)
1211 {
1212 int n = c - OP_BRA;
1213 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1214 if (n == number) return (uschar *)code;
1215 code += OP_lengths[OP_BRA];
1216 }
1217 else
1218 {
1219 code += OP_lengths[c];
1220
1221 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1222 by a multi-byte character. The length in the table is a minimum, so we have
1223 to scan along to skip the extra characters. All opcodes are less than 128,
1224 so we can use relatively efficient code. */
1225
1226 #ifdef SUPPORT_UTF8
1227 if (utf8) switch(c)
1228 {
1229 case OP_EXACT:
1230 case OP_UPTO:
1231 case OP_MINUPTO:
1232 case OP_STAR:
1233 case OP_MINSTAR:
1234 case OP_PLUS:
1235 case OP_MINPLUS:
1236 case OP_QUERY:
1237 case OP_MINQUERY:
1238 while ((*code & 0xc0) == 0x80) code++;
1239 break;
1240 }
1241 #endif
1242 }
1243 }
1244 }
1245
1246
1247
1248 /*************************************************
1249 * Scan compiled branch for non-emptiness *
1250 *************************************************/
1251
1252 /* This function scans through a branch of a compiled pattern to see whether it
1253 can match the empty string or not. It is called only from could_be_empty()
1254 below. Note that first_significant_code() skips over assertions. If we hit an
1255 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1256 whose current branch will already have been scanned.
1257
1258 Arguments:
1259 code points to start of search
1260 endcode points to where to stop
1261 utf8 TRUE if in UTF8 mode
1262
1263 Returns: TRUE if what is matched could be empty
1264 */
1265
1266 static BOOL
1267 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1268 {
1269 register int c;
1270 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1271 code < endcode;
1272 code = first_significant_code(code + OP_lengths[c], NULL, 0))
1273 {
1274 const uschar *ccode;
1275
1276 c = *code;
1277
1278 if (c >= OP_BRA)
1279 {
1280 BOOL empty_branch;
1281 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1282
1283 /* Scan a closed bracket */
1284
1285 empty_branch = FALSE;
1286 do
1287 {
1288 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1289 empty_branch = TRUE;
1290 code += GET(code, 1);
1291 }
1292 while (*code == OP_ALT);
1293 if (!empty_branch) return FALSE; /* All branches are non-empty */
1294 code += 1 + LINK_SIZE;
1295 c = *code;
1296 }
1297
1298 else switch (c)
1299 {
1300 /* Check for quantifiers after a class */
1301
1302 #ifdef SUPPORT_UTF8
1303 case OP_XCLASS:
1304 ccode = code + GET(code, 1);
1305 goto CHECK_CLASS_REPEAT;
1306 #endif
1307
1308 case OP_CLASS:
1309 case OP_NCLASS:
1310 ccode = code + 33;
1311
1312 #ifdef SUPPORT_UTF8
1313 CHECK_CLASS_REPEAT:
1314 #endif
1315
1316 switch (*ccode)
1317 {
1318 case OP_CRSTAR: /* These could be empty; continue */
1319 case OP_CRMINSTAR:
1320 case OP_CRQUERY:
1321 case OP_CRMINQUERY:
1322 break;
1323
1324 default: /* Non-repeat => class must match */
1325 case OP_CRPLUS: /* These repeats aren't empty */
1326 case OP_CRMINPLUS:
1327 return FALSE;
1328
1329 case OP_CRRANGE:
1330 case OP_CRMINRANGE:
1331 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1332 break;
1333 }
1334 break;
1335
1336 /* Opcodes that must match a character */
1337
1338 case OP_NOT_DIGIT:
1339 case OP_DIGIT:
1340 case OP_NOT_WHITESPACE:
1341 case OP_WHITESPACE:
1342 case OP_NOT_WORDCHAR:
1343 case OP_WORDCHAR:
1344 case OP_ANY:
1345 case OP_ANYBYTE:
1346 case OP_CHARS:
1347 case OP_NOT:
1348 case OP_PLUS:
1349 case OP_MINPLUS:
1350 case OP_EXACT:
1351 case OP_NOTPLUS:
1352 case OP_NOTMINPLUS:
1353 case OP_NOTEXACT:
1354 case OP_TYPEPLUS:
1355 case OP_TYPEMINPLUS:
1356 case OP_TYPEEXACT:
1357 return FALSE;
1358
1359 /* End of branch */
1360
1361 case OP_KET:
1362 case OP_KETRMAX:
1363 case OP_KETRMIN:
1364 case OP_ALT:
1365 return TRUE;
1366
1367 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1368 followed by a multibyte character */
1369
1370 #ifdef SUPPORT_UTF8
1371 case OP_STAR:
1372 case OP_MINSTAR:
1373 case OP_QUERY:
1374 case OP_MINQUERY:
1375 case OP_UPTO:
1376 case OP_MINUPTO:
1377 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1378 break;
1379 #endif
1380 }
1381 }
1382
1383 return TRUE;
1384 }
1385
1386
1387
1388 /*************************************************
1389 * Scan compiled regex for non-emptiness *
1390 *************************************************/
1391
1392 /* This function is called to check for left recursive calls. We want to check
1393 the current branch of the current pattern to see if it could match the empty
1394 string. If it could, we must look outwards for branches at other levels,
1395 stopping when we pass beyond the bracket which is the subject of the recursion.
1396
1397 Arguments:
1398 code points to start of the recursion
1399 endcode points to where to stop (current RECURSE item)
1400 bcptr points to the chain of current (unclosed) branch starts
1401 utf8 TRUE if in UTF-8 mode
1402
1403 Returns: TRUE if what is matched could be empty
1404 */
1405
1406 static BOOL
1407 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1408 BOOL utf8)
1409 {
1410 while (bcptr != NULL && bcptr->current >= code)
1411 {
1412 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1413 bcptr = bcptr->outer;
1414 }
1415 return TRUE;
1416 }
1417
1418
1419
1420 /*************************************************
1421 * Check for POSIX class syntax *
1422 *************************************************/
1423
1424 /* This function is called when the sequence "[:" or "[." or "[=" is
1425 encountered in a character class. It checks whether this is followed by an
1426 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1427 ".]" or "=]".
1428
1429 Arguments:
1430 ptr pointer to the initial [
1431 endptr where to return the end pointer
1432 cd pointer to compile data
1433
1434 Returns: TRUE or FALSE
1435 */
1436
1437 static BOOL
1438 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1439 {
1440 int terminator; /* Don't combine these lines; the Solaris cc */
1441 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1442 if (*(++ptr) == '^') ptr++;
1443 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1444 if (*ptr == terminator && ptr[1] == ']')
1445 {
1446 *endptr = ptr;
1447 return TRUE;
1448 }
1449 return FALSE;
1450 }
1451
1452
1453
1454
1455 /*************************************************
1456 * Check POSIX class name *
1457 *************************************************/
1458
1459 /* This function is called to check the name given in a POSIX-style class entry
1460 such as [:alnum:].
1461
1462 Arguments:
1463 ptr points to the first letter
1464 len the length of the name
1465
1466 Returns: a value representing the name, or -1 if unknown
1467 */
1468
1469 static int
1470 check_posix_name(const uschar *ptr, int len)
1471 {
1472 register int yield = 0;
1473 while (posix_name_lengths[yield] != 0)
1474 {
1475 if (len == posix_name_lengths[yield] &&
1476 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1477 yield++;
1478 }
1479 return -1;
1480 }
1481
1482
1483
1484
1485 /*************************************************
1486 * Compile one branch *
1487 *************************************************/
1488
1489 /* Scan the pattern, compiling it into the code vector. If the options are
1490 changed during the branch, the pointer is used to change the external options
1491 bits.
1492
1493 Arguments:
1494 optionsptr pointer to the option bits
1495 brackets points to number of extracting brackets used
1496 code points to the pointer to the current code point
1497 ptrptr points to the current pattern pointer
1498 errorptr points to pointer to error message
1499 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1500 reqbyteptr set to the last literal character required, else < 0
1501 bcptr points to current branch chain
1502 cd contains pointers to tables etc.
1503
1504 Returns: TRUE on success
1505 FALSE, with *errorptr set on error
1506 */
1507
1508 static BOOL
1509 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1510 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1511 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1512 {
1513 int repeat_type, op_type;
1514 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1515 int bravalue = 0;
1516 int length;
1517 int greedy_default, greedy_non_default;
1518 int firstbyte, reqbyte;
1519 int zeroreqbyte, zerofirstbyte;
1520 int req_caseopt, reqvary, tempreqvary;
1521 int condcount = 0;
1522 int options = *optionsptr;
1523 register int c;
1524 register uschar *code = *codeptr;
1525 uschar *tempcode;
1526 BOOL inescq = FALSE;
1527 BOOL groupsetfirstbyte = FALSE;
1528 const uschar *ptr = *ptrptr;
1529 const uschar *tempptr;
1530 uschar *previous = NULL;
1531 uschar class[32];
1532
1533 #ifdef SUPPORT_UTF8
1534 BOOL class_utf8;
1535 BOOL utf8 = (options & PCRE_UTF8) != 0;
1536 uschar *class_utf8data;
1537 uschar utf8_char[6];
1538 #else
1539 BOOL utf8 = FALSE;
1540 #endif
1541
1542 /* Set up the default and non-default settings for greediness */
1543
1544 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1545 greedy_non_default = greedy_default ^ 1;
1546
1547 /* Initialize no first char, no required char. REQ_UNSET means "no char
1548 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1549 matches a non-fixed char first char; reqbyte just remains unset if we never
1550 find one.
1551
1552 When we hit a repeat whose minimum is zero, we may have to adjust these values
1553 to take the zero repeat into account. This is implemented by setting them to
1554 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1555 item types that can be repeated set these backoff variables appropriately. */
1556
1557 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1558
1559 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1560 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1561 value > 255. It is added into the firstbyte or reqbyte variables to record the
1562 case status of the value. */
1563
1564 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1565
1566 /* Switch on next character until the end of the branch */
1567
1568 for (;; ptr++)
1569 {
1570 BOOL negate_class;
1571 BOOL possessive_quantifier;
1572 int class_charcount;
1573 int class_lastchar;
1574 int newoptions;
1575 int recno;
1576 int skipbytes;
1577 int subreqbyte;
1578 int subfirstbyte;
1579
1580 c = *ptr;
1581 if (inescq && c != 0) goto NORMAL_CHAR;
1582
1583 if ((options & PCRE_EXTENDED) != 0)
1584 {
1585 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1586 if (c == '#')
1587 {
1588 /* The space before the ; is to avoid a warning on a silly compiler
1589 on the Macintosh. */
1590 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1591 if (c != 0) continue; /* Else fall through to handle end of string */
1592 }
1593 }
1594
1595 switch(c)
1596 {
1597 /* The branch terminates at end of string, |, or ). */
1598
1599 case 0:
1600 case '|':
1601 case ')':
1602 *firstbyteptr = firstbyte;
1603 *reqbyteptr = reqbyte;
1604 *codeptr = code;
1605 *ptrptr = ptr;
1606 return TRUE;
1607
1608 /* Handle single-character metacharacters. In multiline mode, ^ disables
1609 the setting of any following char as a first character. */
1610
1611 case '^':
1612 if ((options & PCRE_MULTILINE) != 0)
1613 {
1614 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1615 }
1616 previous = NULL;
1617 *code++ = OP_CIRC;
1618 break;
1619
1620 case '$':
1621 previous = NULL;
1622 *code++ = OP_DOLL;
1623 break;
1624
1625 /* There can never be a first char if '.' is first, whatever happens about
1626 repeats. The value of reqbyte doesn't change either. */
1627
1628 case '.':
1629 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1630 zerofirstbyte = firstbyte;
1631 zeroreqbyte = reqbyte;
1632 previous = code;
1633 *code++ = OP_ANY;
1634 break;
1635
1636 /* Character classes. If the included characters are all < 255 in value, we
1637 build a 32-byte bitmap of the permitted characters, except in the special
1638 case where there is only one such character. For negated classes, we build
1639 the map as usual, then invert it at the end. However, we use a different
1640 opcode so that data characters > 255 can be handled correctly.
1641
1642 If the class contains characters outside the 0-255 range, a different
1643 opcode is compiled. It may optionally have a bit map for characters < 256,
1644 but those above are are explicitly listed afterwards. A flag byte tells
1645 whether the bitmap is present, and whether this is a negated class or not.
1646 */
1647
1648 case '[':
1649 previous = code;
1650
1651 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1652 they are encountered at the top level, so we'll do that too. */
1653
1654 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1655 check_posix_syntax(ptr, &tempptr, cd))
1656 {
1657 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1658 goto FAILED;
1659 }
1660
1661 /* If the first character is '^', set the negation flag and skip it. */
1662
1663 if ((c = *(++ptr)) == '^')
1664 {
1665 negate_class = TRUE;
1666 c = *(++ptr);
1667 }
1668 else
1669 {
1670 negate_class = FALSE;
1671 }
1672
1673 /* Keep a count of chars with values < 256 so that we can optimize the case
1674 of just a single character (as long as it's < 256). For higher valued UTF-8
1675 characters, we don't yet do any optimization. */
1676
1677 class_charcount = 0;
1678 class_lastchar = -1;
1679
1680 #ifdef SUPPORT_UTF8
1681 class_utf8 = FALSE; /* No chars >= 256 */
1682 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1683 #endif
1684
1685 /* Initialize the 32-char bit map to all zeros. We have to build the
1686 map in a temporary bit of store, in case the class contains only 1
1687 character (< 256), because in that case the compiled code doesn't use the
1688 bit map. */
1689
1690 memset(class, 0, 32 * sizeof(uschar));
1691
1692 /* Process characters until ] is reached. By writing this as a "do" it
1693 means that an initial ] is taken as a data character. The first pass
1694 through the regex checked the overall syntax, so we don't need to be very
1695 strict here. At the start of the loop, c contains the first byte of the
1696 character. */
1697
1698 do
1699 {
1700 #ifdef SUPPORT_UTF8
1701 if (utf8 && c > 127)
1702 { /* Braces are required because the */
1703 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1704 }
1705 #endif
1706
1707 /* Inside \Q...\E everything is literal except \E */
1708
1709 if (inescq)
1710 {
1711 if (c == '\\' && ptr[1] == 'E')
1712 {
1713 inescq = FALSE;
1714 ptr++;
1715 continue;
1716 }
1717 else goto LONE_SINGLE_CHARACTER;
1718 }
1719
1720 /* Handle POSIX class names. Perl allows a negation extension of the
1721 form [:^name:]. A square bracket that doesn't match the syntax is
1722 treated as a literal. We also recognize the POSIX constructions
1723 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1724 5.6 and 5.8 do. */
1725
1726 if (c == '[' &&
1727 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1728 check_posix_syntax(ptr, &tempptr, cd))
1729 {
1730 BOOL local_negate = FALSE;
1731 int posix_class, i;
1732 register const uschar *cbits = cd->cbits;
1733
1734 if (ptr[1] != ':')
1735 {
1736 *errorptr = ERR31;
1737 goto FAILED;
1738 }
1739
1740 ptr += 2;
1741 if (*ptr == '^')
1742 {
1743 local_negate = TRUE;
1744 ptr++;
1745 }
1746
1747 posix_class = check_posix_name(ptr, tempptr - ptr);
1748 if (posix_class < 0)
1749 {
1750 *errorptr = ERR30;
1751 goto FAILED;
1752 }
1753
1754 /* If matching is caseless, upper and lower are converted to
1755 alpha. This relies on the fact that the class table starts with
1756 alpha, lower, upper as the first 3 entries. */
1757
1758 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1759 posix_class = 0;
1760
1761 /* Or into the map we are building up to 3 of the static class
1762 tables, or their negations. The [:blank:] class sets up the same
1763 chars as the [:space:] class (all white space). We remove the vertical
1764 white space chars afterwards. */
1765
1766 posix_class *= 3;
1767 for (i = 0; i < 3; i++)
1768 {
1769 BOOL isblank = strncmp((char *)ptr, "blank", 5) == 0;
1770 int taboffset = posix_class_maps[posix_class + i];
1771 if (taboffset < 0) break;
1772 if (local_negate)
1773 {
1774 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1775 if (isblank) class[1] |= 0x3c;
1776 }
1777 else
1778 {
1779 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1780 if (isblank) class[1] &= ~0x3c;
1781 }
1782 }
1783
1784 ptr = tempptr + 1;
1785 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1786 continue; /* End of POSIX syntax handling */
1787 }
1788
1789 /* Backslash may introduce a single character, or it may introduce one
1790 of the specials, which just set a flag. Escaped items are checked for
1791 validity in the pre-compiling pass. The sequence \b is a special case.
1792 Inside a class (and only there) it is treated as backspace. Elsewhere
1793 it marks a word boundary. Other escapes have preset maps ready to
1794 or into the one we are building. We assume they have more than one
1795 character in them, so set class_charcount bigger than one. */
1796
1797 if (c == '\\')
1798 {
1799 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1800 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1801
1802 if (-c == ESC_Q) /* Handle start of quoted string */
1803 {
1804 if (ptr[1] == '\\' && ptr[2] == 'E')
1805 {
1806 ptr += 2; /* avoid empty string */
1807 }
1808 else inescq = TRUE;
1809 continue;
1810 }
1811
1812 else if (c < 0)
1813 {
1814 register const uschar *cbits = cd->cbits;
1815 class_charcount = 10; /* Greater than 1 is what matters */
1816 switch (-c)
1817 {
1818 case ESC_d:
1819 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1820 continue;
1821
1822 case ESC_D:
1823 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1824 continue;
1825
1826 case ESC_w:
1827 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1828 continue;
1829
1830 case ESC_W:
1831 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1832 continue;
1833
1834 case ESC_s:
1835 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1836 class[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1837 continue;
1838
1839 case ESC_S:
1840 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1841 class[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1842 continue;
1843
1844 /* Unrecognized escapes are faulted if PCRE is running in its
1845 strict mode. By default, for compatibility with Perl, they are
1846 treated as literals. */
1847
1848 default:
1849 if ((options & PCRE_EXTRA) != 0)
1850 {
1851 *errorptr = ERR7;
1852 goto FAILED;
1853 }
1854 c = *ptr; /* The final character */
1855 }
1856 }
1857
1858 /* Fall through if we have a single character (c >= 0). This may be
1859 > 256 in UTF-8 mode. */
1860
1861 } /* End of backslash handling */
1862
1863 /* A single character may be followed by '-' to form a range. However,
1864 Perl does not permit ']' to be the end of the range. A '-' character
1865 here is treated as a literal. */
1866
1867 if (ptr[1] == '-' && ptr[2] != ']')
1868 {
1869 int d;
1870 ptr += 2;
1871
1872 #ifdef SUPPORT_UTF8
1873 if (utf8)
1874 { /* Braces are required because the */
1875 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1876 }
1877 else
1878 #endif
1879 d = *ptr;
1880
1881 /* The second part of a range can be a single-character escape, but
1882 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1883 in such circumstances. */
1884
1885 if (d == '\\')
1886 {
1887 const uschar *oldptr = ptr;
1888 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1889
1890 /* \b is backslash; any other special means the '-' was literal */
1891
1892 if (d < 0)
1893 {
1894 if (d == -ESC_b) d = '\b'; else
1895 {
1896 ptr = oldptr - 2;
1897 goto LONE_SINGLE_CHARACTER; /* A few lines below */
1898 }
1899 }
1900 }
1901
1902 /* Check that the two values are in the correct order */
1903
1904 if (d < c)
1905 {
1906 *errorptr = ERR8;
1907 goto FAILED;
1908 }
1909
1910 /* If d is greater than 255, we can't just use the bit map, so set up
1911 for the UTF-8 supporting class type. If we are not caseless, we can
1912 just set up a single range. If we are caseless, the characters < 256
1913 are handled with a bitmap, in order to get the case-insensitive
1914 handling. */
1915
1916 #ifdef SUPPORT_UTF8
1917 if (d > 255)
1918 {
1919 class_utf8 = TRUE;
1920 *class_utf8data++ = XCL_RANGE;
1921 if ((options & PCRE_CASELESS) == 0)
1922 {
1923 class_utf8data += ord2utf8(c, class_utf8data);
1924 class_utf8data += ord2utf8(d, class_utf8data);
1925 continue; /* Go get the next char in the class */
1926 }
1927 class_utf8data += ord2utf8(256, class_utf8data);
1928 class_utf8data += ord2utf8(d, class_utf8data);
1929 d = 255;
1930 /* Fall through */
1931 }
1932 #endif
1933 /* We use the bit map if the range is entirely < 255, or if part of it
1934 is < 255 and matching is caseless. */
1935
1936 for (; c <= d; c++)
1937 {
1938 class[c/8] |= (1 << (c&7));
1939 if ((options & PCRE_CASELESS) != 0)
1940 {
1941 int uc = cd->fcc[c]; /* flip case */
1942 class[uc/8] |= (1 << (uc&7));
1943 }
1944 class_charcount++; /* in case a one-char range */
1945 class_lastchar = c;
1946 }
1947
1948 continue; /* Go get the next char in the class */
1949 }
1950
1951 /* Handle a lone single character - we can get here for a normal
1952 non-escape char, or after \ that introduces a single character. */
1953
1954 LONE_SINGLE_CHARACTER:
1955
1956 /* Handle a multibyte character */
1957
1958 #ifdef SUPPORT_UTF8
1959 if (utf8 && c > 255)
1960 {
1961 class_utf8 = TRUE;
1962 *class_utf8data++ = XCL_SINGLE;
1963 class_utf8data += ord2utf8(c, class_utf8data);
1964 }
1965 else
1966 #endif
1967 /* Handle a single-byte character */
1968 {
1969 class [c/8] |= (1 << (c&7));
1970 if ((options & PCRE_CASELESS) != 0)
1971 {
1972 c = cd->fcc[c]; /* flip case */
1973 class[c/8] |= (1 << (c&7));
1974 }
1975 class_charcount++;
1976 class_lastchar = c;
1977 }
1978 }
1979
1980 /* Loop until ']' reached; the check for end of string happens inside the
1981 loop. This "while" is the end of the "do" above. */
1982
1983 while ((c = *(++ptr)) != ']' || inescq);
1984
1985 /* If class_charcount is 1, we saw precisely one character with a value <
1986 256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
1987 the one character is < 128. In non-UTF-8 mode we can always optimize.
1988
1989 The optimization throws away the bit map. We turn the item into a
1990 1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
1991 that OP_NOT does not support multibyte characters. In the positive case, it
1992 can cause firstbyte to be set. Otherwise, there can be no first char if
1993 this item is first, whatever repeat count may follow. In the case of
1994 reqbyte, save the previous value for reinstating. */
1995
1996 #ifdef SUPPORT_UTF8
1997 if (class_charcount == 1 &&
1998 (!utf8 ||
1999 (!class_utf8 && class_lastchar < 128)))
2000 #else
2001 if (class_charcount == 1)
2002 #endif
2003 {
2004 zeroreqbyte = reqbyte;
2005 if (negate_class)
2006 {
2007 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2008 zerofirstbyte = firstbyte;
2009 *code++ = OP_NOT;
2010 }
2011 else
2012 {
2013 if (firstbyte == REQ_UNSET)
2014 {
2015 zerofirstbyte = REQ_NONE;
2016 firstbyte = class_lastchar | req_caseopt;
2017 }
2018 else
2019 {
2020 zerofirstbyte = firstbyte;
2021 reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
2022 }
2023 *code++ = OP_CHARS;
2024 *code++ = 1;
2025 }
2026 *code++ = class_lastchar;
2027 break; /* End of class handling */
2028 } /* End of 1-byte optimization */
2029
2030 /* Otherwise, if this is the first thing in the branch, there can be no
2031 first char setting, whatever the repeat count. Any reqbyte setting must
2032 remain unchanged after any kind of repeat. */
2033
2034 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2035 zerofirstbyte = firstbyte;
2036 zeroreqbyte = reqbyte;
2037
2038 /* If there are characters with values > 255, we have to compile an
2039 extended class, with its own opcode. If there are no characters < 256,
2040 we can omit the bitmap. */
2041
2042 #ifdef SUPPORT_UTF8
2043 if (class_utf8)
2044 {
2045 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2046 *code++ = OP_XCLASS;
2047 code += LINK_SIZE;
2048 *code = negate_class? XCL_NOT : 0;
2049
2050 /* If the map is required, install it, and move on to the end of
2051 the extra data */
2052
2053 if (class_charcount > 0)
2054 {
2055 *code++ |= XCL_MAP;
2056 memcpy(code, class, 32);
2057 code = class_utf8data;
2058 }
2059
2060 /* If the map is not required, slide down the extra data. */
2061
2062 else
2063 {
2064 int len = class_utf8data - (code + 33);
2065 memmove(code + 1, code + 33, len);
2066 code += len + 1;
2067 }
2068
2069 /* Now fill in the complete length of the item */
2070
2071 PUT(previous, 1, code - previous);
2072 break; /* End of class handling */
2073 }
2074 #endif
2075
2076 /* If there are no characters > 255, negate the 32-byte map if necessary,
2077 and copy it into the code vector. If this is the first thing in the branch,
2078 there can be no first char setting, whatever the repeat count. Any reqbyte
2079 setting must remain unchanged after any kind of repeat. */
2080
2081 if (negate_class)
2082 {
2083 *code++ = OP_NCLASS;
2084 for (c = 0; c < 32; c++) code[c] = ~class[c];
2085 }
2086 else
2087 {
2088 *code++ = OP_CLASS;
2089 memcpy(code, class, 32);
2090 }
2091 code += 32;
2092 break;
2093
2094 /* Various kinds of repeat */
2095
2096 case '{':
2097 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2098 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2099 if (*errorptr != NULL) goto FAILED;
2100 goto REPEAT;
2101
2102 case '*':
2103 repeat_min = 0;
2104 repeat_max = -1;
2105 goto REPEAT;
2106
2107 case '+':
2108 repeat_min = 1;
2109 repeat_max = -1;
2110 goto REPEAT;
2111
2112 case '?':
2113 repeat_min = 0;
2114 repeat_max = 1;
2115
2116 REPEAT:
2117 if (previous == NULL)
2118 {
2119 *errorptr = ERR9;
2120 goto FAILED;
2121 }
2122
2123 if (repeat_min == 0)
2124 {
2125 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2126 reqbyte = zeroreqbyte; /* Ditto */
2127 }
2128
2129 /* Remember whether this is a variable length repeat */
2130
2131 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2132
2133 op_type = 0; /* Default single-char op codes */
2134 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2135
2136 /* Save start of previous item, in case we have to move it up to make space
2137 for an inserted OP_ONCE for the additional '+' extension. */
2138
2139 tempcode = previous;
2140
2141 /* If the next character is '+', we have a possessive quantifier. This
2142 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2143 If the next character is '?' this is a minimizing repeat, by default,
2144 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2145 repeat type to the non-default. */
2146
2147 if (ptr[1] == '+')
2148 {
2149 repeat_type = 0; /* Force greedy */
2150 possessive_quantifier = TRUE;
2151 ptr++;
2152 }
2153 else if (ptr[1] == '?')
2154 {
2155 repeat_type = greedy_non_default;
2156 ptr++;
2157 }
2158 else repeat_type = greedy_default;
2159
2160 /* If previous was a recursion, we need to wrap it inside brackets so that
2161 it can be replicated if necessary. */
2162
2163 if (*previous == OP_RECURSE)
2164 {
2165 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2166 code += 1 + LINK_SIZE;
2167 *previous = OP_BRA;
2168 PUT(previous, 1, code - previous);
2169 *code = OP_KET;
2170 PUT(code, 1, code - previous);
2171 code += 1 + LINK_SIZE;
2172 }
2173
2174 /* If previous was a string of characters, chop off the last one and use it
2175 as the subject of the repeat. If there was only one character, we can
2176 abolish the previous item altogether. If a one-char item has a minimum of
2177 more than one, ensure that it is set in reqbyte - it might not be if a
2178 sequence such as x{3} is the first thing in a branch because the x will
2179 have gone into firstbyte instead. */
2180
2181 if (*previous == OP_CHARS)
2182 {
2183 /* Deal with UTF-8 characters that take up more than one byte. It's
2184 easier to write this out separately than try to macrify it. Use c to
2185 hold the length of the character in bytes, plus 0x80 to flag that it's a
2186 length rather than a small character. */
2187
2188 #ifdef SUPPORT_UTF8
2189 if (utf8 && (code[-1] & 0x80) != 0)
2190 {
2191 uschar *lastchar = code - 1;
2192 while((*lastchar & 0xc0) == 0x80) lastchar--;
2193 c = code - lastchar; /* Length of UTF-8 character */
2194 memcpy(utf8_char, lastchar, c); /* Save the char */
2195 if (lastchar == previous + 2) /* There was only one character */
2196 {
2197 code = previous; /* Abolish the previous item */
2198 }
2199 else
2200 {
2201 previous[1] -= c; /* Adjust length of previous */
2202 code = lastchar; /* Lost char off the end */
2203 tempcode = code; /* Adjust position to be moved for '+' */
2204 }
2205 c |= 0x80; /* Flag c as a length */
2206 }
2207 else
2208 #endif
2209
2210 /* Handle the case of a single byte - either with no UTF8 support, or
2211 with UTF-8 disabled, or for a UTF-8 character < 128. */
2212
2213 {
2214 c = *(--code);
2215 if (code == previous + 2) /* There was only one character */
2216 {
2217 code = previous; /* Abolish the previous item */
2218 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2219 }
2220 else
2221 {
2222 previous[1]--; /* adjust length */
2223 tempcode = code; /* Adjust position to be moved for '+' */
2224 }
2225 }
2226
2227 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2228 }
2229
2230 /* If previous was a single negated character ([^a] or similar), we use
2231 one of the special opcodes, replacing it. The code is shared with single-
2232 character repeats by setting op_type to add a suitable offset into
2233 repeat_type. OP_NOT is currently used only for single-byte chars. */
2234
2235 else if (*previous == OP_NOT)
2236 {
2237 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2238 c = previous[1];
2239 code = previous;
2240 goto OUTPUT_SINGLE_REPEAT;
2241 }
2242
2243 /* If previous was a character type match (\d or similar), abolish it and
2244 create a suitable repeat item. The code is shared with single-character
2245 repeats by setting op_type to add a suitable offset into repeat_type. */
2246
2247 else if (*previous < OP_EODN)
2248 {
2249 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2250 c = *previous;
2251 code = previous;
2252
2253 OUTPUT_SINGLE_REPEAT:
2254
2255 /* If the maximum is zero then the minimum must also be zero; Perl allows
2256 this case, so we do too - by simply omitting the item altogether. */
2257
2258 if (repeat_max == 0) goto END_REPEAT;
2259
2260 /* Combine the op_type with the repeat_type */
2261
2262 repeat_type += op_type;
2263
2264 /* A minimum of zero is handled either as the special case * or ?, or as
2265 an UPTO, with the maximum given. */
2266
2267 if (repeat_min == 0)
2268 {
2269 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2270 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2271 else
2272 {
2273 *code++ = OP_UPTO + repeat_type;
2274 PUT2INC(code, 0, repeat_max);
2275 }
2276 }
2277
2278 /* The case {1,} is handled as the special case + */
2279
2280 else if (repeat_min == 1 && repeat_max == -1)
2281 *code++ = OP_PLUS + repeat_type;
2282
2283 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2284 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
2285
2286 else
2287 {
2288 if (repeat_min != 1)
2289 {
2290 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2291 PUT2INC(code, 0, repeat_min);
2292 }
2293
2294 /* If the minimum is 1 and the previous item was a character string,
2295 we either have to put back the item that got cancelled if the string
2296 length was 1, or add the character back onto the end of a longer
2297 string. For a character type nothing need be done; it will just get
2298 put back naturally. Note that the final character is always going to
2299 get added below, so we leave code ready for its insertion. */
2300
2301 else if (*previous == OP_CHARS)
2302 {
2303 if (code == previous) code += 2; else
2304
2305 /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2306 bit set as a flag. The length will always be between 2 and 6. */
2307
2308 #ifdef SUPPORT_UTF8
2309 if (utf8 && c >= 128) previous[1] += c & 7; else
2310 #endif
2311 previous[1]++;
2312 }
2313
2314 /* For a single negated character we also have to put back the
2315 item that got cancelled. At present this applies only to single byte
2316 characters in any mode. */
2317
2318 else if (*previous == OP_NOT) code++;
2319
2320 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2321 we have to insert the character for the previous code. In UTF-8 mode,
2322 long characters have their length in c, with the 0x80 bit as a flag. */
2323
2324 if (repeat_max < 0)
2325 {
2326 #ifdef SUPPORT_UTF8
2327 if (utf8 && c >= 128)
2328 {
2329 memcpy(code, utf8_char, c & 7);
2330 code += c & 7;
2331 }
2332 else
2333 #endif
2334 *code++ = c;
2335 *code++ = OP_STAR + repeat_type;
2336 }
2337
2338 /* Else insert an UPTO if the max is greater than the min, again
2339 preceded by the character, for the previously inserted code. */
2340
2341 else if (repeat_max != repeat_min)
2342 {
2343 #ifdef SUPPORT_UTF8
2344 if (utf8 && c >= 128)
2345 {
2346 memcpy(code, utf8_char, c & 7);
2347 code += c & 7;
2348 }
2349 else
2350 #endif
2351 *code++ = c;
2352 repeat_max -= repeat_min;
2353 *code++ = OP_UPTO + repeat_type;
2354 PUT2INC(code, 0, repeat_max);
2355 }
2356 }
2357
2358 /* The character or character type itself comes last in all cases. */
2359
2360 #ifdef SUPPORT_UTF8
2361 if (utf8 && c >= 128)
2362 {
2363 memcpy(code, utf8_char, c & 7);
2364 code += c & 7;
2365 }
2366 else
2367 #endif
2368
2369 *code++ = c;
2370 }
2371
2372 /* If previous was a character class or a back reference, we put the repeat
2373 stuff after it, but just skip the item if the repeat was {0,0}. */
2374
2375 else if (*previous == OP_CLASS ||
2376 *previous == OP_NCLASS ||
2377 #ifdef SUPPORT_UTF8
2378 *previous == OP_XCLASS ||
2379 #endif
2380 *previous == OP_REF)
2381 {
2382 if (repeat_max == 0)
2383 {
2384 code = previous;
2385 goto END_REPEAT;
2386 }
2387 if (repeat_min == 0 && repeat_max == -1)
2388 *code++ = OP_CRSTAR + repeat_type;
2389 else if (repeat_min == 1 && repeat_max == -1)
2390 *code++ = OP_CRPLUS + repeat_type;
2391 else if (repeat_min == 0 && repeat_max == 1)
2392 *code++ = OP_CRQUERY + repeat_type;
2393 else
2394 {
2395 *code++ = OP_CRRANGE + repeat_type;
2396 PUT2INC(code, 0, repeat_min);
2397 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2398 PUT2INC(code, 0, repeat_max);
2399 }
2400 }
2401
2402 /* If previous was a bracket group, we may have to replicate it in certain
2403 cases. */
2404
2405 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2406 *previous == OP_COND)
2407 {
2408 register int i;
2409 int ketoffset = 0;
2410 int len = code - previous;
2411 uschar *bralink = NULL;
2412
2413 /* If the maximum repeat count is unlimited, find the end of the bracket
2414 by scanning through from the start, and compute the offset back to it
2415 from the current code pointer. There may be an OP_OPT setting following
2416 the final KET, so we can't find the end just by going back from the code
2417 pointer. */
2418
2419 if (repeat_max == -1)
2420 {
2421 register uschar *ket = previous;
2422 do ket += GET(ket, 1); while (*ket != OP_KET);
2423 ketoffset = code - ket;
2424 }
2425
2426 /* The case of a zero minimum is special because of the need to stick
2427 OP_BRAZERO in front of it, and because the group appears once in the
2428 data, whereas in other cases it appears the minimum number of times. For
2429 this reason, it is simplest to treat this case separately, as otherwise
2430 the code gets far too messy. There are several special subcases when the
2431 minimum is zero. */
2432
2433 if (repeat_min == 0)
2434 {
2435 /* If the maximum is also zero, we just omit the group from the output
2436 altogether. */
2437
2438 if (repeat_max == 0)
2439 {
2440 code = previous;
2441 goto END_REPEAT;
2442 }
2443
2444 /* If the maximum is 1 or unlimited, we just have to stick in the
2445 BRAZERO and do no more at this point. */
2446
2447 if (repeat_max <= 1)
2448 {
2449 memmove(previous+1, previous, len);
2450 code++;
2451 *previous++ = OP_BRAZERO + repeat_type;
2452 }
2453
2454 /* If the maximum is greater than 1 and limited, we have to replicate
2455 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2456 The first one has to be handled carefully because it's the original
2457 copy, which has to be moved up. The remainder can be handled by code
2458 that is common with the non-zero minimum case below. We just have to
2459 adjust the value of repeat_max, since one less copy is required. */
2460
2461 else
2462 {
2463 int offset;
2464 memmove(previous + 2 + LINK_SIZE, previous, len);
2465 code += 2 + LINK_SIZE;
2466 *previous++ = OP_BRAZERO + repeat_type;
2467 *previous++ = OP_BRA;
2468
2469 /* We chain together the bracket offset fields that have to be
2470 filled in later when the ends of the brackets are reached. */
2471
2472 offset = (bralink == NULL)? 0 : previous - bralink;
2473 bralink = previous;
2474 PUTINC(previous, 0, offset);
2475 }
2476
2477 repeat_max--;
2478 }
2479
2480 /* If the minimum is greater than zero, replicate the group as many
2481 times as necessary, and adjust the maximum to the number of subsequent
2482 copies that we need. If we set a first char from the group, and didn't
2483 set a required char, copy the latter from the former. */
2484
2485 else
2486 {
2487 if (repeat_min > 1)
2488 {
2489 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2490 for (i = 1; i < repeat_min; i++)
2491 {
2492 memcpy(code, previous, len);
2493 code += len;
2494 }
2495 }
2496 if (repeat_max > 0) repeat_max -= repeat_min;
2497 }
2498
2499 /* This code is common to both the zero and non-zero minimum cases. If
2500 the maximum is limited, it replicates the group in a nested fashion,
2501 remembering the bracket starts on a stack. In the case of a zero minimum,
2502 the first one was set up above. In all cases the repeat_max now specifies
2503 the number of additional copies needed. */
2504
2505 if (repeat_max >= 0)
2506 {
2507 for (i = repeat_max - 1; i >= 0; i--)
2508 {
2509 *code++ = OP_BRAZERO + repeat_type;
2510
2511 /* All but the final copy start a new nesting, maintaining the
2512 chain of brackets outstanding. */
2513
2514 if (i != 0)
2515 {
2516 int offset;
2517 *code++ = OP_BRA;
2518 offset = (bralink == NULL)? 0 : code - bralink;
2519 bralink = code;
2520 PUTINC(code, 0, offset);
2521 }
2522
2523 memcpy(code, previous, len);
2524 code += len;
2525 }
2526
2527 /* Now chain through the pending brackets, and fill in their length
2528 fields (which are holding the chain links pro tem). */
2529
2530 while (bralink != NULL)
2531 {
2532 int oldlinkoffset;
2533 int offset = code - bralink + 1;
2534 uschar *bra = code - offset;
2535 oldlinkoffset = GET(bra, 1);
2536 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2537 *code++ = OP_KET;
2538 PUTINC(code, 0, offset);
2539 PUT(bra, 1, offset);
2540 }
2541 }
2542
2543 /* If the maximum is unlimited, set a repeater in the final copy. We
2544 can't just offset backwards from the current code point, because we
2545 don't know if there's been an options resetting after the ket. The
2546 correct offset was computed above. */
2547
2548 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2549 }
2550
2551 /* Else there's some kind of shambles */
2552
2553 else
2554 {
2555 *errorptr = ERR11;
2556 goto FAILED;
2557 }
2558
2559 /* If the character following a repeat is '+', we wrap the entire repeated
2560 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2561 Sun's Java package. The repeated item starts at tempcode, not at previous,
2562 which might be the first part of a string whose (former) last char we
2563 repeated. However, we don't support '+' after a greediness '?'. */
2564
2565 if (possessive_quantifier)
2566 {
2567 int len = code - tempcode;
2568 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2569 code += 1 + LINK_SIZE;
2570 len += 1 + LINK_SIZE;
2571 tempcode[0] = OP_ONCE;
2572 *code++ = OP_KET;
2573 PUTINC(code, 0, len);
2574 PUT(tempcode, 1, len);
2575 }
2576
2577 /* In all cases we no longer have a previous item. We also set the
2578 "follows varying string" flag for subsequently encountered reqbytes if
2579 it isn't already set and we have just passed a varying length item. */
2580
2581 END_REPEAT:
2582 previous = NULL;
2583 cd->req_varyopt |= reqvary;
2584 break;
2585
2586
2587 /* Start of nested bracket sub-expression, or comment or lookahead or
2588 lookbehind or option setting or condition. First deal with special things
2589 that can come after a bracket; all are introduced by ?, and the appearance
2590 of any of them means that this is not a referencing group. They were
2591 checked for validity in the first pass over the string, so we don't have to
2592 check for syntax errors here. */
2593
2594 case '(':
2595 newoptions = options;
2596 skipbytes = 0;
2597
2598 if (*(++ptr) == '?')
2599 {
2600 int set, unset;
2601 int *optset;
2602
2603 switch (*(++ptr))
2604 {
2605 case '#': /* Comment; skip to ket */
2606 ptr++;
2607 while (*ptr != ')') ptr++;
2608 continue;
2609
2610 case ':': /* Non-extracting bracket */
2611 bravalue = OP_BRA;
2612 ptr++;
2613 break;
2614
2615 case '(':
2616 bravalue = OP_COND; /* Conditional group */
2617
2618 /* Condition to test for recursion */
2619
2620 if (ptr[1] == 'R')
2621 {
2622 code[1+LINK_SIZE] = OP_CREF;
2623 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2624 skipbytes = 3;
2625 ptr += 3;
2626 }
2627
2628 /* Condition to test for a numbered subpattern match. We know that
2629 if a digit follows ( then there will just be digits until ) because
2630 the syntax was checked in the first pass. */
2631
2632 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2633 {
2634 int condref; /* Don't amalgamate; some compilers */
2635 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2636 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2637 if (condref == 0)
2638 {
2639 *errorptr = ERR35;
2640 goto FAILED;
2641 }
2642 ptr++;
2643 code[1+LINK_SIZE] = OP_CREF;
2644 PUT2(code, 2+LINK_SIZE, condref);
2645 skipbytes = 3;
2646 }
2647 /* For conditions that are assertions, we just fall through, having
2648 set bravalue above. */
2649 break;
2650
2651 case '=': /* Positive lookahead */
2652 bravalue = OP_ASSERT;
2653 ptr++;
2654 break;
2655
2656 case '!': /* Negative lookahead */
2657 bravalue = OP_ASSERT_NOT;
2658 ptr++;
2659 break;
2660
2661 case '<': /* Lookbehinds */
2662 switch (*(++ptr))
2663 {
2664 case '=': /* Positive lookbehind */
2665 bravalue = OP_ASSERTBACK;
2666 ptr++;
2667 break;
2668
2669 case '!': /* Negative lookbehind */
2670 bravalue = OP_ASSERTBACK_NOT;
2671 ptr++;
2672 break;
2673 }
2674 break;
2675
2676 case '>': /* One-time brackets */
2677 bravalue = OP_ONCE;
2678 ptr++;
2679 break;
2680
2681 case 'C': /* Callout - may be followed by digits */
2682 *code++ = OP_CALLOUT;
2683 {
2684 int n = 0;
2685 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2686 n = n * 10 + *ptr - '0';
2687 if (n > 255)
2688 {
2689 *errorptr = ERR38;
2690 goto FAILED;
2691 }
2692 *code++ = n;
2693 }
2694 previous = NULL;
2695 continue;
2696
2697 case 'P': /* Named subpattern handling */
2698 if (*(++ptr) == '<') /* Definition */
2699 {
2700 int i, namelen;
2701 uschar *slot = cd->name_table;
2702 const uschar *name; /* Don't amalgamate; some compilers */
2703 name = ++ptr; /* grumble at autoincrement in declaration */
2704
2705 while (*ptr++ != '>');
2706 namelen = ptr - name - 1;
2707
2708 for (i = 0; i < cd->names_found; i++)
2709 {
2710 int crc = memcmp(name, slot+2, namelen);
2711 if (crc == 0)
2712 {
2713 if (slot[2+namelen] == 0)
2714 {
2715 *errorptr = ERR43;
2716 goto FAILED;
2717 }
2718 crc = -1; /* Current name is substring */
2719 }
2720 if (crc < 0)
2721 {
2722 memmove(slot + cd->name_entry_size, slot,
2723 (cd->names_found - i) * cd->name_entry_size);
2724 break;
2725 }
2726 slot += cd->name_entry_size;
2727 }
2728
2729 PUT2(slot, 0, *brackets + 1);
2730 memcpy(slot + 2, name, namelen);
2731 slot[2+namelen] = 0;
2732 cd->names_found++;
2733 goto NUMBERED_GROUP;
2734 }
2735
2736 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2737 {
2738 int i, namelen;
2739 int type = *ptr++;
2740 const uschar *name = ptr;
2741 uschar *slot = cd->name_table;
2742
2743 while (*ptr != ')') ptr++;
2744 namelen = ptr - name;
2745
2746 for (i = 0; i < cd->names_found; i++)
2747 {
2748 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2749 slot += cd->name_entry_size;
2750 }
2751 if (i >= cd->names_found)
2752 {
2753 *errorptr = ERR15;
2754 goto FAILED;
2755 }
2756
2757 recno = GET2(slot, 0);
2758
2759 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2760
2761 /* Back reference */
2762
2763 previous = code;
2764 *code++ = OP_REF;
2765 PUT2INC(code, 0, recno);
2766 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2767 if (recno > cd->top_backref) cd->top_backref = recno;
2768 continue;
2769 }
2770
2771 /* Should never happen */
2772 break;
2773
2774 case 'R': /* Pattern recursion */
2775 ptr++; /* Same as (?0) */
2776 /* Fall through */
2777
2778 /* Recursion or "subroutine" call */
2779
2780 case '0': case '1': case '2': case '3': case '4':
2781 case '5': case '6': case '7': case '8': case '9':
2782 {
2783 const uschar *called;
2784 recno = 0;
2785 while((digitab[*ptr] & ctype_digit) != 0)
2786 recno = recno * 10 + *ptr++ - '0';
2787
2788 /* Come here from code above that handles a named recursion */
2789
2790 HANDLE_RECURSION:
2791
2792 previous = code;
2793
2794 /* Find the bracket that is being referenced. Temporarily end the
2795 regex in case it doesn't exist. */
2796
2797 *code = OP_END;
2798 called = (recno == 0)?
2799 cd->start_code : find_bracket(cd->start_code, utf8, recno);
2800
2801 if (called == NULL)
2802 {
2803 *errorptr = ERR15;
2804 goto FAILED;
2805 }
2806
2807 /* If the subpattern is still open, this is a recursive call. We
2808 check to see if this is a left recursion that could loop for ever,
2809 and diagnose that case. */
2810
2811 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2812 {
2813 *errorptr = ERR40;
2814 goto FAILED;
2815 }
2816
2817 /* Insert the recursion/subroutine item */
2818
2819 *code = OP_RECURSE;
2820 PUT(code, 1, called - cd->start_code);
2821 code += 1 + LINK_SIZE;
2822 }
2823 continue;
2824
2825 /* Character after (? not specially recognized */
2826
2827 default: /* Option setting */
2828 set = unset = 0;
2829 optset = &set;
2830
2831 while (*ptr != ')' && *ptr != ':')
2832 {
2833 switch (*ptr++)
2834 {
2835 case '-': optset = &unset; break;
2836
2837 case 'i': *optset |= PCRE_CASELESS; break;
2838 case 'm': *optset |= PCRE_MULTILINE; break;
2839 case 's': *optset |= PCRE_DOTALL; break;
2840 case 'x': *optset |= PCRE_EXTENDED; break;
2841 case 'U': *optset |= PCRE_UNGREEDY; break;
2842 case 'X': *optset |= PCRE_EXTRA; break;
2843 }
2844 }
2845
2846 /* Set up the changed option bits, but don't change anything yet. */
2847
2848 newoptions = (options | set) & (~unset);
2849
2850 /* If the options ended with ')' this is not the start of a nested
2851 group with option changes, so the options change at this level. Compile
2852 code to change the ims options if this setting actually changes any of
2853 them. We also pass the new setting back so that it can be put at the
2854 start of any following branches, and when this group ends (if we are in
2855 a group), a resetting item can be compiled.
2856
2857 Note that if this item is right at the start of the pattern, the
2858 options will have been abstracted and made global, so there will be no
2859 change to compile. */
2860
2861 if (*ptr == ')')
2862 {
2863 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
2864 {
2865 *code++ = OP_OPT;
2866 *code++ = newoptions & PCRE_IMS;
2867 }
2868
2869 /* Change options at this level, and pass them back for use
2870 in subsequent branches. Reset the greedy defaults and the case
2871 value for firstbyte and reqbyte. */
2872
2873 *optionsptr = options = newoptions;
2874 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
2875 greedy_non_default = greedy_default ^ 1;
2876 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2877
2878 previous = NULL; /* This item can't be repeated */
2879 continue; /* It is complete */
2880 }
2881
2882 /* If the options ended with ':' we are heading into a nested group
2883 with possible change of options. Such groups are non-capturing and are
2884 not assertions of any kind. All we need to do is skip over the ':';
2885 the newoptions value is handled below. */
2886
2887 bravalue = OP_BRA;
2888 ptr++;
2889 }
2890 }
2891
2892 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
2893 non-capturing and behave like (?:...) brackets */
2894
2895 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
2896 {
2897 bravalue = OP_BRA;
2898 }
2899
2900 /* Else we have a referencing group; adjust the opcode. If the bracket
2901 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
2902 arrange for the true number to follow later, in an OP_BRANUMBER item. */
2903
2904 else
2905 {
2906 NUMBERED_GROUP:
2907 if (++(*brackets) > EXTRACT_BASIC_MAX)
2908 {
2909 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
2910 code[1+LINK_SIZE] = OP_BRANUMBER;
2911 PUT2(code, 2+LINK_SIZE, *brackets);
2912 skipbytes = 3;
2913 }
2914 else bravalue = OP_BRA + *brackets;
2915 }
2916
2917 /* Process nested bracketed re. Assertions may not be repeated, but other
2918 kinds can be. We copy code into a non-register variable in order to be able
2919 to pass its address because some compilers complain otherwise. Pass in a
2920 new setting for the ims options if they have changed. */
2921
2922 previous = (bravalue >= OP_ONCE)? code : NULL;
2923 *code = bravalue;
2924 tempcode = code;
2925 tempreqvary = cd->req_varyopt; /* Save value before bracket */
2926
2927 if (!compile_regex(
2928 newoptions, /* The complete new option state */
2929 options & PCRE_IMS, /* The previous ims option state */
2930 brackets, /* Extracting bracket count */
2931 &tempcode, /* Where to put code (updated) */
2932 &ptr, /* Input pointer (updated) */
2933 errorptr, /* Where to put an error message */
2934 (bravalue == OP_ASSERTBACK ||
2935 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
2936 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
2937 &subfirstbyte, /* For possible first char */
2938 &subreqbyte, /* For possible last char */
2939 bcptr, /* Current branch chain */
2940 cd)) /* Tables block */
2941 goto FAILED;
2942
2943 /* At the end of compiling, code is still pointing to the start of the
2944 group, while tempcode has been updated to point past the end of the group
2945 and any option resetting that may follow it. The pattern pointer (ptr)
2946 is on the bracket. */
2947
2948 /* If this is a conditional bracket, check that there are no more than
2949 two branches in the group. */
2950
2951 else if (bravalue == OP_COND)
2952 {
2953 uschar *tc = code;
2954 condcount = 0;
2955
2956 do {
2957 condcount++;
2958 tc += GET(tc,1);
2959 }
2960 while (*tc != OP_KET);
2961
2962 if (condcount > 2)
2963 {
2964 *errorptr = ERR27;
2965 goto FAILED;
2966 }
2967
2968 /* If there is just one branch, we must not make use of its firstbyte or
2969 reqbyte, because this is equivalent to an empty second branch. */
2970
2971 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
2972 }
2973
2974 /* Handle updating of the required and first characters. Update for normal
2975 brackets of all kinds, and conditions with two branches (see code above).
2976 If the bracket is followed by a quantifier with zero repeat, we have to
2977 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
2978 main loop so that they can be accessed for the back off. */
2979
2980 zeroreqbyte = reqbyte;
2981 zerofirstbyte = firstbyte;
2982 groupsetfirstbyte = FALSE;
2983
2984 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
2985 {
2986 /* If we have not yet set a firstbyte in this branch, take it from the
2987 subpattern, remembering that it was set here so that a repeat of more
2988 than one can replicate it as reqbyte if necessary. If the subpattern has
2989 no firstbyte, set "none" for the whole branch. In both cases, a zero
2990 repeat forces firstbyte to "none". */
2991
2992 if (firstbyte == REQ_UNSET)
2993 {
2994 if (subfirstbyte >= 0)
2995 {
2996 firstbyte = subfirstbyte;
2997 groupsetfirstbyte = TRUE;
2998 }
2999 else firstbyte = REQ_NONE;
3000 zerofirstbyte = REQ_NONE;
3001 }
3002
3003 /* If firstbyte was previously set, convert the subpattern's firstbyte
3004 into reqbyte if there wasn't one, using the vary flag that was in
3005 existence beforehand. */
3006
3007 else if (subfirstbyte >= 0 && subreqbyte < 0)
3008 subreqbyte = subfirstbyte | tempreqvary;
3009
3010 /* If the subpattern set a required byte (or set a first byte that isn't
3011 really the first byte - see above), set it. */
3012
3013 if (subreqbyte >= 0) reqbyte = subreqbyte;
3014 }
3015
3016 /* For a forward assertion, we take the reqbyte, if set. This can be
3017 helpful if the pattern that follows the assertion doesn't set a different
3018 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3019 for an assertion, however, because it leads to incorrect effects for patterns
3020 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3021 of a firstbyte. This is overcome by a scan at the end if there's no
3022 firstbyte, looking for an asserted first char. */
3023
3024 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3025
3026 /* Now update the main code pointer to the end of the group. */
3027
3028 code = tempcode;
3029
3030 /* Error if hit end of pattern */
3031
3032 if (*ptr != ')')
3033 {
3034 *errorptr = ERR14;
3035 goto FAILED;
3036 }
3037 break;
3038
3039 /* Check \ for being a real metacharacter; if not, fall through and handle
3040 it as a data character at the start of a string. Escape items are checked
3041 for validity in the pre-compiling pass. */
3042
3043 case '\\':
3044 tempptr = ptr;
3045 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3046
3047 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3048 are arranged to be the negation of the corresponding OP_values. For the
3049 back references, the values are ESC_REF plus the reference number. Only
3050 back references and those types that consume a character may be repeated.
3051 We can test for values between ESC_b and ESC_Z for the latter; this may
3052 have to change if any new ones are ever created. */
3053
3054 if (c < 0)
3055 {
3056 if (-c == ESC_Q) /* Handle start of quoted string */
3057 {
3058 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3059 else inescq = TRUE;
3060 continue;
3061 }
3062
3063 /* For metasequences that actually match a character, we disable the
3064 setting of a first character if it hasn't already been set. */
3065
3066 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3067 firstbyte = REQ_NONE;
3068
3069 /* Set values to reset to if this is followed by a zero repeat. */
3070
3071 zerofirstbyte = firstbyte;
3072 zeroreqbyte = reqbyte;
3073
3074 /* Back references are handled specially */
3075
3076 if (-c >= ESC_REF)
3077 {
3078 int number = -c - ESC_REF;
3079 previous = code;
3080 *code++ = OP_REF;
3081 PUT2INC(code, 0, number);
3082 }
3083 else
3084 {
3085 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3086 *code++ = -c;
3087 }
3088 continue;
3089 }
3090
3091 /* Data character: reset and fall through */
3092
3093 ptr = tempptr;
3094 c = '\\';
3095
3096 /* Handle a run of data characters until a metacharacter is encountered.
3097 The first character is guaranteed not to be whitespace or # when the
3098 extended flag is set. */
3099
3100 NORMAL_CHAR:
3101 default:
3102 previous = code;
3103 *code = OP_CHARS;
3104 code += 2;
3105 length = 0;
3106
3107 do
3108 {
3109 /* If in \Q...\E, check for the end; if not, we always have a literal */
3110
3111 if (inescq)
3112 {
3113 if (c == '\\' && ptr[1] == 'E')
3114 {
3115 inescq = FALSE;
3116 ptr++;
3117 }
3118 else
3119 {
3120 *code++ = c;
3121 length++;
3122 }
3123 continue;
3124 }
3125
3126 /* Skip white space and comments for /x patterns */
3127
3128 if ((options & PCRE_EXTENDED) != 0)
3129 {
3130 if ((cd->ctypes[c] & ctype_space) != 0) continue;
3131 if (c == '#')
3132 {
3133 /* The space before the ; is to avoid a warning on a silly compiler
3134 on the Macintosh. */
3135 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3136 if (c == 0) break;
3137 continue;
3138 }
3139 }
3140
3141 /* Backslash may introduce a data char or a metacharacter. Escaped items
3142 are checked for validity in the pre-compiling pass. Stop the string
3143 before a metaitem. */
3144
3145 if (c == '\\')
3146 {
3147 tempptr = ptr;
3148 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3149 if (c < 0) { ptr = tempptr; break; }
3150
3151 /* If a character is > 127 in UTF-8 mode, we have to turn it into
3152 two or more characters in the UTF-8 encoding. */
3153
3154 #ifdef SUPPORT_UTF8
3155 if (utf8 && c > 127)
3156 {
3157 uschar buffer[8];
3158 int len = ord2utf8(c, buffer);
3159 for (c = 0; c < len; c++) *code++ = buffer[c];
3160 length += len;
3161 continue;
3162 }
3163 #endif
3164 }
3165
3166 /* Ordinary character or single-char escape */
3167
3168 *code++ = c;
3169 length++;
3170 }
3171
3172 /* This "while" is the end of the "do" above. */
3173
3174 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3175
3176 /* Update the first and last requirements. These are always bytes, even in
3177 UTF-8 mode. However, there is a special case to be considered when there
3178 are only one or two characters. Because this gets messy in UTF-8 mode, the
3179 code is kept separate. When we get here "length" contains the number of
3180 bytes. */
3181
3182 #ifdef SUPPORT_UTF8
3183 if (utf8 && length > 1)
3184 {
3185 uschar *t = previous + 3; /* After this code, t */
3186 while (t < code && (*t & 0xc0) == 0x80) t++; /* follows the 1st char */
3187
3188 /* Handle the case when there is only one multibyte character. It must
3189 have at least two bytes because of the "length > 1" test above. */
3190
3191 if (t == code)
3192 {
3193 /* If no previous first byte, set it from this character, but revert to
3194 none on a zero repeat. */
3195
3196 if (firstbyte == REQ_UNSET)
3197 {
3198 zerofirstbyte = REQ_NONE;
3199 firstbyte = previous[2];
3200 }
3201
3202 /* Otherwise, leave the first byte value alone, and don't change it on
3203 a zero repeat */
3204
3205 else zerofirstbyte = firstbyte;
3206
3207 /* In both cases, a zero repeat resets the previous required byte */
3208
3209 zeroreqbyte = reqbyte;
3210 }
3211
3212 /* Handle the case when there is more than one character. These may be
3213 single-byte or multibyte characters */
3214
3215 else
3216 {
3217 t = code - 1; /* After this code, t is at the */
3218 while ((*t & 0xc0) == 0x80) t--; /* start of the last character */
3219
3220 /* If no previous first byte, set it from the first character, and
3221 retain it on a zero repeat (of the last character). The required byte
3222 is reset on a zero repeat, either to the byte before the last
3223 character, unless this is the first byte of the string. In that case,
3224 it reverts to its previous value. */
3225
3226 if (firstbyte == REQ_UNSET)
3227 {
3228 zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3229 zeroreqbyte = (t - 1 == previous + 2)?
3230 reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
3231 }
3232
3233 /* If there was a previous first byte, leave it alone, and don't change
3234 it on a zero repeat. The required byte is reset on a zero repeat to the
3235 byte before the last character. */
3236
3237 else
3238 {
3239 zerofirstbyte = firstbyte;
3240 zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
3241 }
3242 }
3243
3244 /* In all cases (we know length > 1), the new required byte is the last
3245 byte of the string. */
3246
3247 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3248 }
3249
3250 else /* End of UTF-8 coding */
3251 #endif
3252
3253 /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3254 or when UTF-8 is not enabled. */
3255
3256 {
3257 /* firstbyte was not previously set; take it from this string */
3258
3259 if (firstbyte == REQ_UNSET)
3260 {
3261 if (length == 1)
3262 {
3263 zerofirstbyte = REQ_NONE;
3264 firstbyte = previous[2] | req_caseopt;
3265 zeroreqbyte = reqbyte;
3266 }
3267 else
3268 {
3269 zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3270 zeroreqbyte = (length > 2)?
3271 (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
3272 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3273 }
3274 }
3275
3276 /* firstbyte was previously set */
3277
3278 else
3279 {
3280 zerofirstbyte = firstbyte;
3281 zeroreqbyte = (length == 1)? reqbyte :
3282 code[-2] | req_caseopt | cd->req_varyopt;
3283 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3284 }
3285 }
3286
3287 /* Set the length in the data vector, and advance to the next state. */
3288
3289 previous[1] = length;
3290 if (length < MAXLIT) ptr--;
3291 break;
3292 }
3293 } /* end of big loop */
3294
3295 /* Control never reaches here by falling through, only by a goto for all the
3296 error states. Pass back the position in the pattern so that it can be displayed
3297 to the user for diagnosing the error. */
3298
3299 FAILED:
3300 *ptrptr = ptr;
3301 return FALSE;
3302 }
3303
3304
3305
3306
3307 /*************************************************
3308 * Compile sequence of alternatives *
3309 *************************************************/
3310
3311 /* On entry, ptr is pointing past the bracket character, but on return
3312 it points to the closing bracket, or vertical bar, or end of string.
3313 The code variable is pointing at the byte into which the BRA operator has been
3314 stored. If the ims options are changed at the start (for a (?ims: group) or
3315 during any branch, we need to insert an OP_OPT item at the start of every
3316 following branch to ensure they get set correctly at run time, and also pass
3317 the new options into every subsequent branch compile.
3318
3319 Argument:
3320 options option bits, including any changes for this subpattern
3321 oldims previous settings of ims option bits
3322 brackets -> int containing the number of extracting brackets used
3323 codeptr -> the address of the current code pointer
3324 ptrptr -> the address of the current pattern pointer
3325 errorptr -> pointer to error message
3326 lookbehind TRUE if this is a lookbehind assertion
3327 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3328 firstbyteptr place to put the first required character, or a negative number
3329 reqbyteptr place to put the last required character, or a negative number
3330 bcptr pointer to the chain of currently open branches
3331 cd points to the data block with tables pointers etc.
3332
3333 Returns: TRUE on success
3334 */
3335
3336 static BOOL
3337 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3338 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3339 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3340 {
3341 const uschar *ptr = *ptrptr;
3342 uschar *code = *codeptr;
3343 uschar *last_branch = code;
3344 uschar *start_bracket = code;
3345 uschar *reverse_count = NULL;
3346 int firstbyte, reqbyte;
3347 int branchfirstbyte, branchreqbyte;
3348 branch_chain bc;
3349
3350 bc.outer = bcptr;
3351 bc.current = code;
3352
3353 firstbyte = reqbyte = REQ_UNSET;
3354
3355 /* Offset is set zero to mark that this bracket is still open */
3356
3357 PUT(code, 1, 0);
3358 code += 1 + LINK_SIZE + skipbytes;
3359
3360 /* Loop for each alternative branch */
3361
3362 for (;;)
3363 {
3364 /* Handle a change of ims options at the start of the branch */
3365
3366 if ((options & PCRE_IMS) != oldims)
3367 {
3368 *code++ = OP_OPT;
3369 *code++ = options & PCRE_IMS;
3370 }
3371
3372 /* Set up dummy OP_REVERSE if lookbehind assertion */
3373
3374 if (lookbehind)
3375 {
3376 *code++ = OP_REVERSE;
3377 reverse_count = code;
3378 PUTINC(code, 0, 0);
3379 }
3380
3381 /* Now compile the branch */
3382
3383 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3384 &branchfirstbyte, &branchreqbyte, &bc, cd))
3385 {
3386 *ptrptr = ptr;
3387 return FALSE;
3388 }
3389
3390 /* If this is the first branch, the firstbyte and reqbyte values for the
3391 branch become the values for the regex. */
3392
3393 if (*last_branch != OP_ALT)
3394 {
3395 firstbyte = branchfirstbyte;
3396 reqbyte = branchreqbyte;
3397 }
3398
3399 /* If this is not the first branch, the first char and reqbyte have to
3400 match the values from all the previous branches, except that if the previous
3401 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3402 REQ_VARY for the regex. */
3403
3404 else
3405 {
3406 /* If we previously had a firstbyte, but it doesn't match the new branch,
3407 we have to abandon the firstbyte for the regex, but if there was previously
3408 no reqbyte, it takes on the value of the old firstbyte. */
3409
3410 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3411 {
3412 if (reqbyte < 0) reqbyte = firstbyte;
3413 firstbyte = REQ_NONE;
3414 }
3415
3416 /* If we (now or from before) have no firstbyte, a firstbyte from the
3417 branch becomes a reqbyte if there isn't a branch reqbyte. */
3418
3419 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3420 branchreqbyte = branchfirstbyte;
3421
3422 /* Now ensure that the reqbytes match */
3423
3424 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3425 reqbyte = REQ_NONE;
3426 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3427 }
3428
3429 /* If lookbehind, check that this branch matches a fixed-length string,
3430 and put the length into the OP_REVERSE item. Temporarily mark the end of
3431 the branch with OP_END. */
3432
3433 if (lookbehind)
3434 {
3435 int length;
3436 *code = OP_END;
3437 length = find_fixedlength(last_branch, options);
3438 DPRINTF(("fixed length = %d\n", length));
3439 if (length < 0)
3440 {
3441 *errorptr = (length == -2)? ERR36 : ERR25;
3442 *ptrptr = ptr;
3443 return FALSE;
3444 }
3445 PUT(reverse_count, 0, length);
3446 }
3447
3448 /* Reached end of expression, either ')' or end of pattern. Go back through
3449 the alternative branches and reverse the chain of offsets, with the field in
3450 the BRA item now becoming an offset to the first alternative. If there are
3451 no alternatives, it points to the end of the group. The length in the
3452 terminating ket is always the length of the whole bracketed item. If any of
3453 the ims options were changed inside the group, compile a resetting op-code
3454 following, except at the very end of the pattern. Return leaving the pointer
3455 at the terminating char. */
3456
3457 if (*ptr != '|')
3458 {
3459 int length = code - last_branch;
3460 do
3461 {
3462 int prev_length = GET(last_branch, 1);
3463 PUT(last_branch, 1, length);
3464 length = prev_length;
3465 last_branch -= length;
3466 }
3467 while (length > 0);
3468
3469 /* Fill in the ket */
3470
3471 *code = OP_KET;
3472 PUT(code, 1, code - start_bracket);
3473 code += 1 + LINK_SIZE;
3474
3475 /* Resetting option if needed */
3476
3477 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3478 {
3479 *code++ = OP_OPT;
3480 *code++ = oldims;
3481 }
3482
3483 /* Set values to pass back */
3484
3485 *codeptr = code;
3486 *ptrptr = ptr;
3487 *firstbyteptr = firstbyte;
3488 *reqbyteptr = reqbyte;
3489 return TRUE;
3490 }
3491
3492 /* Another branch follows; insert an "or" node. Its length field points back
3493 to the previous branch while the bracket remains open. At the end the chain
3494 is reversed. It's done like this so that the start of the bracket has a
3495 zero offset until it is closed, making it possible to detect recursion. */
3496
3497 *code = OP_ALT;
3498 PUT(code, 1, code - last_branch);
3499 bc.current = last_branch = code;
3500 code += 1 + LINK_SIZE;
3501 ptr++;
3502 }
3503 /* Control never reaches here */
3504 }
3505
3506
3507
3508
3509 /*************************************************
3510 * Check for anchored expression *
3511 *************************************************/
3512
3513 /* Try to find out if this is an anchored regular expression. Consider each
3514 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3515 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3516 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3517 counts, since OP_CIRC can match in the middle.
3518
3519 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3520 This is the code for \G, which means "match at start of match position, taking
3521 into account the match offset".
3522
3523 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3524 because that will try the rest of the pattern at all possible matching points,
3525 so there is no point trying again.... er ....
3526
3527 .... except when the .* appears inside capturing parentheses, and there is a
3528 subsequent back reference to those parentheses. We haven't enough information
3529 to catch that case precisely.
3530
3531 At first, the best we could do was to detect when .* was in capturing brackets
3532 and the highest back reference was greater than or equal to that level.
3533 However, by keeping a bitmap of the first 31 back references, we can catch some
3534 of the more common cases more precisely.
3535
3536 Arguments:
3537 code points to start of expression (the bracket)
3538 options points to the options setting
3539 bracket_map a bitmap of which brackets we are inside while testing; this
3540 handles up to substring 31; after that we just have to take
3541 the less precise approach
3542 backref_map the back reference bitmap
3543
3544 Returns: TRUE or FALSE
3545 */
3546
3547 static BOOL
3548 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3549 unsigned int backref_map)
3550 {
3551 do {
3552 const uschar *scode =
3553 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3554 register int op = *scode;
3555
3556 /* Capturing brackets */
3557
3558 if (op > OP_BRA)
3559 {
3560 int new_map;
3561 op -= OP_BRA;
3562 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3563 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3564 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3565 }
3566
3567 /* Other brackets */
3568
3569 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3570 {
3571 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3572 }
3573
3574 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3575 are or may be referenced. */
3576
3577 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3578 (*options & PCRE_DOTALL) != 0)
3579 {
3580 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3581 }
3582
3583 /* Check for explicit anchoring */
3584
3585 else if (op != OP_SOD && op != OP_SOM &&
3586 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3587 return FALSE;
3588 code += GET(code, 1);
3589 }
3590 while (*code == OP_ALT); /* Loop for each alternative */
3591 return TRUE;
3592 }
3593
3594
3595
3596 /*************************************************
3597 * Check for starting with ^ or .* *
3598 *************************************************/
3599
3600 /* This is called to find out if every branch starts with ^ or .* so that
3601 "first char" processing can be done to speed things up in multiline
3602 matching and for non-DOTALL patterns that start with .* (which must start at
3603 the beginning or after \n). As in the case of is_anchored() (see above), we
3604 have to take account of back references to capturing brackets that contain .*
3605 because in that case we can't make the assumption.
3606
3607 Arguments:
3608 code points to start of expression (the bracket)
3609 bracket_map a bitmap of which brackets we are inside while testing; this
3610 handles up to substring 31; after that we just have to take
3611 the less precise approach
3612 backref_map the back reference bitmap
3613
3614 Returns: TRUE or FALSE
3615 */
3616
3617 static BOOL
3618 is_startline(const uschar *code, unsigned int bracket_map,
3619 unsigned int backref_map)
3620 {
3621 do {
3622 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3623 register int op = *scode;
3624
3625 /* Capturing brackets */
3626
3627 if (op > OP_BRA)
3628 {
3629 int new_map;
3630 op -= OP_BRA;
3631 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3632 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3633 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3634 }
3635
3636 /* Other brackets */
3637
3638 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3639 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3640
3641 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3642 may be referenced. */
3643
3644 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3645 {
3646 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3647 }
3648
3649 /* Check for explicit circumflex */
3650
3651 else if (op != OP_CIRC) return FALSE;
3652 code += GET(code, 1);
3653 }
3654 while (*code == OP_ALT); /* Loop for each alternative */
3655 return TRUE;
3656 }
3657
3658
3659
3660 /*************************************************
3661 * Check for asserted fixed first char *
3662 *************************************************/
3663
3664 /* During compilation, the "first char" settings from forward assertions are
3665 discarded, because they can cause conflicts with actual literals that follow.
3666 However, if we end up without a first char setting for an unanchored pattern,
3667 it is worth scanning the regex to see if there is an initial asserted first
3668 char. If all branches start with the same asserted char, or with a bracket all
3669 of whose alternatives start with the same asserted char (recurse ad lib), then
3670 we return that char, otherwise -1.
3671
3672 Arguments:
3673 code points to start of expression (the bracket)
3674 options pointer to the options (used to check casing changes)
3675 inassert TRUE if in an assertion
3676
3677 Returns: -1 or the fixed first char
3678 */
3679
3680 static int
3681 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3682 {
3683 register int c = -1;
3684 do {
3685 int d;
3686 const uschar *scode =
3687 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3688 register int op = *scode;
3689
3690 if (op >= OP_BRA) op = OP_BRA;
3691
3692 switch(op)
3693 {
3694 default:
3695 return -1;
3696
3697 case OP_BRA:
3698 case OP_ASSERT:
3699 case OP_ONCE:
3700 case OP_COND:
3701 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3702 return -1;
3703 if (c < 0) c = d; else if (c != d) return -1;
3704 break;
3705
3706 case OP_EXACT: /* Fall through */
3707 scode++;
3708
3709 case OP_CHARS: /* Fall through */
3710 scode++;
3711
3712 case OP_PLUS:
3713 case OP_MINPLUS:
3714 if (!inassert) return -1;
3715 if (c < 0)
3716 {
3717 c = scode[1];
3718 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3719 }
3720 else if (c != scode[1]) return -1;
3721 break;
3722 }
3723
3724 code += GET(code, 1);
3725 }
3726 while (*code == OP_ALT);
3727 return c;
3728 }
3729
3730
3731
3732
#ifdef SUPPORT_UTF8
/*************************************************
*           Validate a UTF-8 string              *
*************************************************/

/* This function is called (optionally) at the start of compile or match, to
validate that a supposed UTF-8 string is actually valid. The early check means
that subsequent code can assume it is dealing with a valid string. The check
can be turned off for maximum performance, but the consequences of supplying
an invalid string are then undefined.

The test is structural only: each byte of 128 or more must be a lead byte
(top two bits both set), it must be followed by the number of additional bytes
that utf8_table4 gives for it, all of them within the string, and each of those
must have the bit pattern 10xxxxxx.

Arguments:
  string       points to the string
  length       length of string, or -1 if the string is zero-terminated

Returns:       < 0    if the string is a valid UTF-8 string
               >= 0   otherwise; the value is the offset of the bad byte
*/

static int
valid_utf8(const uschar *string, int length)
{
register const uschar *p = string;

/* For a zero-terminated string, find the length first */

if (length < 0)
  {
  while (*p != 0) p++;
  length = p - string;
  p = string;
  }

while (length > 0)
  {
  int lead = *p;
  length--;                        /* Now the count of bytes after this one */

  if (lead >= 128)
    {
    int extra;

    /* A byte of the form 10xxxxxx cannot start a character */

    if ((lead & 0xc0) != 0xc0) return p - string;

    /* The whole character must lie within the string */

    extra = utf8_table4[lead & 0x3f];   /* Number of additional bytes */
    if (length < extra) return p - string;
    length -= extra;

    /* Every additional byte must be of the form 10xxxxxx */

    for (; extra > 0; extra--)
      {
      p++;
      if ((*p & 0xc0) != 0x80) return p - string;
      }
    }

  p++;
  }

return -1;
}
#endif
3780
3781
3782
3783 /*************************************************
3784 * Compile a Regular Expression *
3785 *************************************************/
3786
3787 /* This function takes a string and returns a pointer to a block of store
3788 holding a compiled version of the expression.
3789
3790 Arguments:
3791 pattern the regular expression
3792 options various option bits
3793 errorptr pointer to pointer to error text
3794 erroroffset ptr offset in pattern where error was detected
3795 tables pointer to character tables or NULL
3796
3797 Returns: pointer to compiled data block, or NULL on error,
3798 with errorptr and erroroffset set
3799 */
3800
3801 pcre *
3802 pcre_compile(const char *pattern, int options, const char **errorptr,
3803 int *erroroffset, const unsigned char *tables)
3804 {
3805 real_pcre *re;
3806 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3807 int runlength;
3808 int c, firstbyte, reqbyte;
3809 int bracount = 0;
3810 int branch_extra = 0;
3811 int branch_newextra;
3812 int item_count = -1;
3813 int name_count = 0;
3814 int max_name_size = 0;
3815 #ifdef SUPPORT_UTF8
3816 int lastcharlength = 0;
3817 BOOL utf8;
3818 BOOL class_utf8;
3819 #endif
3820 BOOL inescq = FALSE;
3821 unsigned int brastackptr = 0;
3822 size_t size;
3823 uschar *code;
3824 const uschar *codestart;
3825 const uschar *ptr;
3826 compile_data compile_block;
3827 int brastack[BRASTACK_SIZE];
3828 uschar bralenstack[BRASTACK_SIZE];
3829
3830 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3831 can do is just return NULL. */
3832
3833 if (errorptr == NULL) return NULL;
3834 *errorptr = NULL;
3835
3836 /* However, we can give a message for this error */
3837
3838 if (erroroffset == NULL)
3839 {
3840 *errorptr = ERR16;
3841 return NULL;
3842 }
3843 *erroroffset = 0;
3844
3845 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3846
3847 #ifdef SUPPORT_UTF8
3848 utf8 = (options & PCRE_UTF8) != 0;
3849 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3850 (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
3851 {
3852 *errorptr = ERR44;
3853 return NULL;
3854 }
3855 #else
3856 if ((options & PCRE_UTF8) != 0)
3857 {
3858 *errorptr = ERR32;
3859 return NULL;
3860 }
3861 #endif
3862
3863 if ((options & ~PUBLIC_OPTIONS) != 0)
3864 {
3865 *errorptr = ERR17;
3866 return NULL;
3867 }
3868
3869 /* Set up pointers to the individual character tables */
3870
3871 if (tables == NULL) tables = pcre_default_tables;
3872 compile_block.lcc = tables + lcc_offset;
3873 compile_block.fcc = tables + fcc_offset;
3874 compile_block.cbits = tables + cbits_offset;
3875 compile_block.ctypes = tables + ctypes_offset;
3876
3877 /* Maximum back reference and backref bitmap. This is updated for numeric
3878 references during the first pass, but for named references during the actual
3879 compile pass. The bitmap records up to 31 back references to help in deciding
3880 whether (.*) can be treated as anchored or not. */
3881
3882 compile_block.top_backref = 0;
3883 compile_block.backref_map = 0;
3884
3885 /* Reflect pattern for debugging output */
3886
3887 DPRINTF(("------------------------------------------------------------------\n"));
3888 DPRINTF(("%s\n", pattern));
3889
3890 /* The first thing to do is to make a pass over the pattern to compute the
3891 amount of store required to hold the compiled code. This does not have to be
3892 perfect as long as errors are overestimates. At the same time we can detect any
3893 flag settings right at the start, and extract them. Make an attempt to correct
3894 for any counted white space if an "extended" flag setting appears late in the
3895 pattern. We can't be so clever for #-comments. */
3896
3897 ptr = (const uschar *)(pattern - 1);
3898 while ((c = *(++ptr)) != 0)
3899 {
3900 int min, max;
3901 int class_optcount;
3902 int bracket_length;
3903 int duplength;
3904
3905 /* If we are inside a \Q...\E sequence, all chars are literal */
3906
3907 if (inescq) goto NORMAL_CHAR;
3908
3909 /* Otherwise, first check for ignored whitespace and comments */
3910
3911 if ((options & PCRE_EXTENDED) != 0)
3912 {
3913 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3914 if (c == '#')
3915 {
3916 /* The space before the ; is to avoid a warning on a silly compiler
3917 on the Macintosh. */
3918 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3919 if (c == 0) break;
3920 continue;
3921 }
3922 }
3923
3924 item_count++; /* Is zero for the first non-comment item */
3925
3926 switch(c)
3927 {
3928 /* A backslashed item may be an escaped "normal" character or a
3929 character type. For a "normal" character, put the pointers and
3930 character back so that tests for whitespace etc. in the input
3931 are done correctly. */
3932
3933 case '\\':
3934 {
3935 const uschar *save_ptr = ptr;
3936 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
3937 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3938 if (c >= 0)
3939 {
3940 ptr = save_ptr;
3941 c = '\\';
3942 goto NORMAL_CHAR;
3943 }
3944 }
3945
3946 /* If \Q, enter "literal" mode */
3947
3948 if (-c == ESC_Q)
3949 {
3950 inescq = TRUE;
3951 continue;
3952 }
3953
3954 /* Other escapes need one byte, and are of length one for repeats */
3955
3956 length++;
3957 #ifdef SUPPORT_UTF8
3958 lastcharlength = 1;
3959 #endif
3960
3961 /* A back reference needs an additional 2 bytes, plus either one or 5
3962 bytes for a repeat. We also need to keep the value of the highest
3963 back reference. */
3964
3965 if (c <= -ESC_REF)
3966 {
3967 int refnum = -c - ESC_REF;
3968 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
3969 if (refnum > compile_block.top_backref)
3970 compile_block.top_backref = refnum;
3971 length += 2; /* For single back reference */
3972 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
3973 {
3974 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
3975 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3976 if ((min == 0 && (max == 1 || max == -1)) ||
3977 (min == 1 && max == -1))
3978 length++;
3979 else length += 5;
3980 if (ptr[1] == '?') ptr++;
3981 }
3982 }
3983 continue;
3984
3985 case '^': /* Single-byte metacharacters */
3986 case '.':
3987 case '$':
3988 length++;
3989 #ifdef SUPPORT_UTF8
3990 lastcharlength = 1;
3991 #endif
3992 continue;
3993
3994 case '*': /* These repeats won't be after brackets; */
3995 case '+': /* those are handled separately */
3996 case '?':
3997 length++;
3998 goto POSESSIVE; /* A few lines below */
3999
4000 /* This covers the cases of braced repeats after a single char, metachar,
4001 class, or back reference. */
4002
4003 case '{':
4004 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4005 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4006 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4007
4008 /* These special cases just insert one extra opcode */
4009
4010 if ((min == 0 && (max == 1 || max == -1)) ||
4011 (min == 1 && max == -1))
4012 length++;
4013
4014 /* These cases might insert additional copies of a preceding character. */
4015
4016 else
4017 {
4018 #ifdef SUPPORT_UTF8
4019 /* In UTF-8 mode, we should find the length in lastcharlength */
4020 if (utf8)
4021 {
4022 if (min != 1)
4023 {
4024 length -= lastcharlength; /* Uncount the original char or metachar */
4025 if (min > 0) length += 3 + lastcharlength;
4026 }
4027 length += lastcharlength + ((max > 0)? 3 : 1);
4028 }
4029 else
4030 #endif
4031
4032 /* Not UTF-8 mode: all characters are one byte */
4033 {
4034 if (min != 1)
4035 {
4036 length--; /* Uncount the original char or metachar */
4037 if (min > 0) length += 4;
4038 }
4039
4040 length += (max > 0)? 4 : 2;
4041 }
4042 }
4043
4044 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4045
4046 POSESSIVE: /* Test for possessive quantifier */
4047 if (ptr[1] == '+')
4048 {
4049 ptr++;
4050 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4051 }
4052 continue;
4053
4054 /* An alternation contains an offset to the next branch or ket. If any ims
4055 options changed in the previous branch(es), and/or if we are in a
4056 lookbehind assertion, extra space will be needed at the start of the
4057 branch. This is handled by branch_extra. */
4058
4059 case '|':
4060 length += 1 + LINK_SIZE + branch_extra;
4061 continue;
4062
4063 /* A character class uses 33 characters provided that all the character
4064 values are less than 256. Otherwise, it uses a bit map for low valued
4065 characters, and individual items for others. Don't worry about character
4066 types that aren't allowed in classes - they'll get picked up during the
4067 compile. A character class that contains only one single-byte character
4068 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4069 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4070
4071 case '[':
4072 class_optcount = 0;
4073
4074 #ifdef SUPPORT_UTF8
4075 class_utf8 = FALSE;
4076 #endif
4077
4078 if (*(++ptr) == '^') ptr++;
4079
4080 /* Written as a "do" so that an initial ']' is taken as data */
4081
4082 if (*ptr != 0) do
4083 {
4084 /* Inside \Q...\E everything is literal except \E */
4085
4086 if (inescq)
4087 {
4088 if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
4089 inescq = FALSE;
4090 ptr += 1;
4091 continue;
4092 }
4093
4094 /* Outside \Q...\E, check for escapes */
4095
4096 if (*ptr == '\\')
4097 {
4098 #ifdef SUPPORT_UTF8
4099 int prevchar = ptr[-1];
4100 #endif
4101 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
4102 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4103
4104 /* \b is backspace inside a class */
4105
4106 if (-ch == ESC_b) ch = '\b';
4107
4108 /* \Q enters quoting mode */
4109
4110 if (-ch == ESC_Q)
4111 {
4112 inescq = TRUE;
4113 continue;
4114 }
4115
4116 /* Handle escapes that turn into characters */
4117
4118 if (ch >= 0)
4119 {
4120 #ifdef SUPPORT_UTF8
4121 if (utf8)
4122 {
4123 if (ch > 127) class_optcount = 10; /* Ensure > 1 */
4124 if (ch > 255)
4125 {
4126 uschar buffer[6];
4127 if (!class_utf8)
4128 {
4129 class_utf8 = TRUE;
4130 length += LINK_SIZE + 1 + 1;
4131 }
4132 length += 1 + ord2utf8(ch, buffer);
4133
4134 /* If this wide character is preceded by '-', add an extra 2 to
4135 the length in case the previous character was < 128, because in
4136 this case the whole range will be put into the list. */
4137
4138 if (prevchar == '-') length += 2;
4139 }
4140 }
4141 #endif
4142 class_optcount++; /* for possible optimization */
4143 }
4144 else class_optcount = 10; /* \d, \s etc; make sure > 1 */
4145 }
4146
4147 /* Check the syntax for POSIX stuff. The bits we actually handle are
4148 checked during the real compile phase. */
4149
4150 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4151 {
4152 ptr++;
4153 class_optcount = 10; /* Make sure > 1 */
4154 }
4155
4156 /* Anything else just increments the possible optimization count. If
4157 there are wide characters, we are going to have to use an XCLASS. */
4158
4159 else
4160 {
4161 NON_SPECIAL_CHARACTER:
4162 class_optcount++;
4163
4164 #ifdef SUPPORT_UTF8
4165 if (utf8)
4166 {
4167 int ch;
4168 int extra = 0;
4169 GETCHARLEN(ch, ptr, extra);
4170 if (ch > 127) class_optcount = 10; /* No optimization possible */
4171 if (ch > 255)
4172 {
4173 if (!class_utf8)
4174 {
4175 class_utf8 = TRUE;
4176 length += LINK_SIZE + 1 + 1;
4177 }
4178 length += 2 + extra;
4179
4180 /* If this wide character is preceded by '-', add an extra 2 to
4181 the length in case the previous character was < 128, because in
4182 this case the whole range will be put into the list. */
4183
4184 if (ptr[-1] == '-') length += 2;
4185
4186 /* Advance to the end of this character */
4187
4188 ptr += extra;
4189 }
4190 }
4191 #endif
4192 }
4193 }
4194 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4195
4196 if (*ptr == 0) /* Missing terminating ']' */
4197 {
4198 *errorptr = ERR6;
4199 goto PCRE_ERROR_RETURN;
4200 }
4201
4202 /* We can optimize when there was only one optimizable character. Repeats
4203 for positive and negated single one-byte chars are handled by the general
4204 code. Here, we handle repeats for the class opcodes. */
4205
4206 if (class_optcount == 1) length += 3; else
4207 {
4208 length += 33;
4209
4210 /* A repeat needs either 1 or 5 bytes. */
4211
4212 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4213 {
4214 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4215 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4216 if ((min == 0 && (max == 1 || max == -1)) ||
4217 (min == 1 && max == -1))
4218 length++;
4219 else length += 5;
4220 if (ptr[1] == '?') ptr++;
4221 }
4222 }
4223 continue;
4224
4225 /* Brackets may be genuine groups or special things */
4226
4227 case '(':
4228 branch_newextra = 0;
4229 bracket_length = 1 + LINK_SIZE;
4230
4231 /* Handle special forms of bracket, which all start (? */
4232
4233 if (ptr[1] == '?')
4234 {
4235 int set, unset;
4236 int *optset;
4237
4238 switch (c = ptr[2])
4239 {
4240 /* Skip over comments entirely */
4241 case '#':
4242 ptr += 3;
4243 while (*ptr != 0 && *ptr != ')') ptr++;
4244 if (*ptr == 0)
4245 {
4246 *errorptr = ERR18;
4247 goto PCRE_ERROR_RETURN;
4248 }
4249 continue;
4250
4251 /* Non-referencing groups and lookaheads just move the pointer on, and
4252 then behave like a non-special bracket, except that they don't increment
4253 the count of extracting brackets. Ditto for the "once only" bracket,
4254 which is in Perl from version 5.005. */
4255
4256 case ':':
4257 case '=':
4258 case '!':
4259 case '>':
4260 ptr += 2;
4261 break;
4262
4263 /* (?R) specifies a recursive call to the regex, which is an extension
4264 to provide the facility which can be obtained by (?p{perl-code}) in
4265 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4266
4267 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4268 the appropriate numbered brackets. This includes both recursive and
4269 non-recursive calls. (?R) is now synonymous with (?0). */
4270
4271 case 'R':
4272 ptr++;
4273
4274 case '0': case '1': case '2': case '3': case '4':
4275 case '5': case '6': case '7': case '8': case '9':
4276 ptr += 2;
4277 if (c != 'R')
4278 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4279 if (*ptr != ')')
4280 {
4281 *errorptr = ERR29;
4282 goto PCRE_ERROR_RETURN;
4283 }
4284 length += 1 + LINK_SIZE;
4285
4286 /* If this item is quantified, it will get wrapped inside brackets so
4287 as to use the code for quantified brackets. We jump down and use the
4288 code that handles this for real brackets. */
4289
4290 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4291 {
4292 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4293 duplength = 5 + 3 * LINK_SIZE;
4294 goto HANDLE_QUANTIFIED_BRACKETS;
4295 }
4296 continue;
4297
4298 /* (?C) is an extension which provides "callout" - to provide a bit of
4299 the functionality of the Perl (?{...}) feature. An optional number may
4300 follow (default is zero). */
4301
4302 case 'C':
4303 ptr += 2;
4304 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4305 if (*ptr != ')')
4306 {
4307 *errorptr = ERR39;
4308 goto PCRE_ERROR_RETURN;
4309 }
4310 length += 2;
4311 continue;
4312
4313 /* Named subpatterns are an extension copied from Python */
4314
4315 case 'P':
4316 ptr += 3;
4317 if (*ptr == '<')
4318 {
4319 const uschar *p; /* Don't amalgamate; some compilers */
4320 p = ++ptr; /* grumble at autoincrement in declaration */
4321 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4322 if (*ptr != '>')
4323 {
4324 *errorptr = ERR42;
4325 goto PCRE_ERROR_RETURN;
4326 }
4327 name_count++;
4328 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4329 break;
4330 }
4331
4332 if (*ptr == '=' || *ptr == '>')
4333 {
4334 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4335 if (*ptr != ')')
4336 {
4337 *errorptr = ERR42;
4338 goto PCRE_ERROR_RETURN;
4339 }
4340 break;
4341 }
4342
4343 /* Unknown character after (?P */
4344
4345 *errorptr = ERR41;
4346 goto PCRE_ERROR_RETURN;
4347
4348 /* Lookbehinds are in Perl from version 5.005 */
4349
4350 case '<':
4351 ptr += 3;
4352 if (*ptr == '=' || *ptr == '!')
4353 {
4354 branch_newextra = 1 + LINK_SIZE;
4355 length += 1 + LINK_SIZE; /* For the first branch */
4356 break;
4357 }
4358 *errorptr = ERR24;
4359 goto PCRE_ERROR_RETURN;
4360
4361 /* Conditionals are in Perl from version 5.005. The bracket must either
4362 be followed by a number (for bracket reference) or by an assertion
4363 group, or (a PCRE extension) by 'R' for a recursion test. */
4364
4365 case '(':
4366 if (ptr[3] == 'R' && ptr[4] == ')')
4367 {
4368 ptr += 4;
4369 length += 3;
4370 }
4371 else if ((digitab[ptr[3]] & ctype_digit) != 0)
4372 {
4373 ptr += 4;
4374 length += 3;
4375 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4376 if (*ptr != ')')
4377 {
4378 *errorptr = ERR26;
4379 goto PCRE_ERROR_RETURN;
4380 }
4381 }
4382 else /* An assertion must follow */
4383 {
4384 ptr++; /* Can treat like ':' as far as spacing is concerned */
4385 if (ptr[2] != '?' ||
4386 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4387 {
4388 ptr += 2; /* To get right offset in message */
4389 *errorptr = ERR28;
4390 goto PCRE_ERROR_RETURN;
4391 }
4392 }
4393 break;
4394
4395 /* Else loop checking valid options until ) is met. Anything else is an
4396 error. If we are without any brackets, i.e. at top level, the settings
4397 act as if specified in the options, so massage the options immediately.
4398 This is for backward compatibility with Perl 5.004. */
4399
4400 default:
4401 set = unset = 0;
4402 optset = &set;
4403 ptr += 2;
4404
4405 for (;; ptr++)
4406 {
4407 c = *ptr;
4408 switch (c)
4409 {
4410 case 'i':
4411 *optset |= PCRE_CASELESS;
4412 continue;
4413
4414 case 'm':
4415 *optset |= PCRE_MULTILINE;
4416 continue;
4417
4418 case 's':
4419 *optset |= PCRE_DOTALL;
4420 continue;
4421
4422 case 'x':
4423 *optset |= PCRE_EXTENDED;
4424 continue;
4425
4426 case 'X':
4427 *optset |= PCRE_EXTRA;
4428 continue;
4429
4430 case 'U':
4431 *optset |= PCRE_UNGREEDY;
4432 continue;
4433
4434 case '-':
4435 optset = &unset;
4436 continue;
4437
4438 /* A termination by ')' indicates an options-setting-only item; if
4439 this is at the very start of the pattern (indicated by item_count
4440 being zero), we use it to set the global options. This is helpful
4441 when analyzing the pattern for first characters, etc. Otherwise
4442 nothing is done here and it is handled during the compiling
4443 process.
4444
4445 [Historical note: Up to Perl 5.8, options settings at top level
4446 were always global settings, wherever they appeared in the pattern.
4447 That is, they were equivalent to an external setting. From 5.8
4448 onwards, they apply only to what follows (which is what you might
4449 expect).] */
4450
4451 case ')':
4452 if (item_count == 0)
4453 {
4454 options = (options | set) & (~unset);
4455 set = unset = 0; /* To save length */
4456 item_count--; /* To allow for several */
4457 }
4458
4459 /* Fall through */
4460
4461 /* A termination by ':' indicates the start of a nested group with
4462 the given options set. This is again handled at compile time, but
4463 we must allow for compiled space if any of the ims options are
4464 set. We also have to allow for resetting space at the end of
4465 the group, which is why 4 is added to the length and not just 2.
4466 If there are several changes of options within the same group, this
4467 will lead to an over-estimate on the length, but this shouldn't
4468 matter very much. We also have to allow for resetting options at
4469 the start of any alternations, which we do by setting
4470 branch_newextra to 2. Finally, we record whether the case-dependent
4471 flag ever changes within the regex. This is used by the "required
4472 character" code. */
4473
4474 case ':':
4475 if (((set|unset) & PCRE_IMS) != 0)
4476 {
4477 length += 4;
4478 branch_newextra = 2;
4479 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4480 }
4481 goto END_OPTIONS;
4482
4483 /* Unrecognized option character */
4484
4485 default:
4486 *errorptr = ERR12;
4487 goto PCRE_ERROR_RETURN;
4488 }
4489 }
4490
4491 /* If we hit a closing bracket, that's it - this is a freestanding
4492 option-setting. We need to ensure that branch_extra is updated if
4493 necessary. The only values branch_newextra can have here are 0 or 2.
4494 If the value is 2, then branch_extra must either be 2 or 5, depending
4495 on whether this is a lookbehind group or not. */
4496
4497 END_OPTIONS:
4498 if (c == ')')
4499 {
4500 if (branch_newextra == 2 &&
4501 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4502 branch_extra += branch_newextra;
4503 continue;
4504 }
4505
4506 /* If options were terminated by ':' control comes here. Fall through
4507 to handle the group below. */
4508 }
4509 }
4510
4511 /* Extracting brackets must be counted so we can process escapes in a
4512 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4513 need an additional 3 bytes of store per extracting bracket. However, if
4514 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
4515 must leave the count alone (it will always be zero). */
4516
4517 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4518 {
4519 bracount++;
4520 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4521 }
4522
4523 /* Save length for computing whole length at end if there's a repeat that
4524 requires duplication of the group. Also save the current value of
4525 branch_extra, and start the new group with the new value. If non-zero, this
4526 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
4527
4528 if (brastackptr >= sizeof(brastack)/sizeof(int))
4529 {
4530 *errorptr = ERR19;
4531 goto PCRE_ERROR_RETURN;
4532 }
4533
4534 bralenstack[brastackptr] = branch_extra;
4535 branch_extra = branch_newextra;
4536
4537 brastack[brastackptr++] = length;
4538 length += bracket_length;
4539 continue;
4540
4541 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4542 have to replicate this bracket up to that many times. If brastackptr is
4543 0 this is an unmatched bracket which will generate an error, but take care
4544 not to try to access brastack[-1] when computing the length and restoring
4545 the branch_extra value. */
4546
4547 case ')':
4548 length += 1 + LINK_SIZE;
4549 if (brastackptr > 0)
4550 {
4551 duplength = length - brastack[--brastackptr];
4552 branch_extra = bralenstack[brastackptr];
4553 }
4554 else duplength = 0;
4555
4556 /* The following code is also used when a recursion such as (?3) is
4557 followed by a quantifier, because in that case, it has to be wrapped inside
4558 brackets so that the quantifier works. The value of duplength must be
4559 set before arrival. */
4560
4561 HANDLE_QUANTIFIED_BRACKETS:
4562
4563 /* Leave ptr at the final char; for read_repeat_counts this happens
4564 automatically; for the others we need an increment. */
4565
4566 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4567 {
4568 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4569 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4570 }
4571 else if (c == '*') { min = 0; max = -1; ptr++; }
4572 else if (c == '+') { min = 1; max = -1; ptr++; }
4573 else if (c == '?') { min = 0; max = 1; ptr++; }
4574 else { min = 1; max = 1; }
4575
4576 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4577 group, and if the maximum is greater than zero, we have to replicate
4578 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4579 bracket set. */
4580
4581 if (min == 0)
4582 {
4583 length++;
4584 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4585 }
4586
4587 /* When the minimum is greater than zero, we have to replicate up to
4588 minval-1 times, with no additions required in the copies. Then, if there
4589 is a limited maximum we have to replicate up to maxval-1 times allowing
4590 for a BRAZERO item before each optional copy and nesting brackets for all
4591 but one of the optional copies. */
4592
4593 else
4594 {
4595 length += (min - 1) * duplength;
4596 if (max > min) /* Need this test as max=-1 means no limit */
4597 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4598 - (2 + 2*LINK_SIZE);
4599 }
4600
4601 /* Allow space for once brackets for "possessive quantifier" */
4602
4603 if (ptr[1] == '+')
4604 {
4605 ptr++;
4606 length += 2 + 2*LINK_SIZE;
4607 }
4608 continue;
4609
4610 /* Non-special character. For a run of such characters the length required
4611 is the number of characters + 2, except that the maximum run length is
4612 MAXLIT. We won't get a skipped space or a non-data escape or the start of a
4613 # comment as the first character, so the length can't be zero. */
4614
4615 NORMAL_CHAR:
4616 default:
4617 length += 2;
4618 runlength = 0;
4619 do
4620 {
4621 #ifdef SUPPORT_UTF8
4622 lastcharlength = 1; /* Need length of last char for UTF-8 repeats */
4623 #endif
4624
4625 /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
4626 if (inescq)
4627 {
4628 if (c == '\\' && ptr[1] == 'E')
4629 {
4630 inescq = FALSE;
4631 ptr++;
4632 }
4633 else runlength++;
4634 continue;
4635 }
4636
4637 /* Skip whitespace and comments for /x */
4638
4639 if ((options & PCRE_EXTENDED) != 0)
4640 {
4641 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4642 if (c == '#')
4643 {
4644 /* The space before the ; is to avoid a warning on a silly compiler
4645 on the Macintosh. */
4646 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4647 continue;
4648 }
4649 }
4650
4651 /* Backslash may introduce a data char or a metacharacter; stop the
4652 string before the latter. */
4653
4654 if (c == '\\')
4655 {
4656 const uschar *saveptr = ptr;
4657 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4658 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4659 if (c < 0) { ptr = saveptr; break; }
4660
4661 /* In UTF-8 mode, add on the number of additional bytes needed to
4662 encode this character, and save the total length in case this is a
4663 final char that is repeated. */
4664
4665 #ifdef SUPPORT_UTF8
4666 if (utf8 && c > 127)
4667 {
4668 int i;
4669 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4670 if (c <= utf8_table1[i]) break;
4671 runlength += i;
4672 lastcharlength += i;
4673 }
4674 #endif
4675 }
4676
4677 /* Ordinary character or single-char escape */
4678
4679 runlength++;
4680 }
4681
4682 /* This "while" is the end of the "do" above. */
4683
4684 while (runlength < MAXLIT &&
4685 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
4686
4687 /* If we hit a meta-character, back off to point to it */
4688
4689 if (runlength < MAXLIT) ptr--;
4690
4691 /* If the last char in the string is a UTF-8 multibyte character, we must
4692 set lastcharlength correctly. If it was specified as an escape, this will
4693 already have been done above. However, we also have to support in-line
4694 UTF-8 characters, so check backwards from where we are. */
4695
4696 #ifdef SUPPORT_UTF8
4697 if (utf8)
4698 {
4699 const uschar *lastptr = ptr - 1;
4700 if ((*lastptr & 0x80) != 0)
4701 {
4702 while((*lastptr & 0xc0) == 0x80) lastptr--;
4703 lastcharlength = ptr - lastptr;
4704 }
4705 }
4706 #endif
4707
4708 length += runlength;
4709 continue;
4710 }
4711 }
4712
4713 length += 2 + LINK_SIZE; /* For final KET and END */
4714
4715 if (length > MAX_PATTERN_SIZE)
4716 {
4717 *errorptr = ERR20;
4718 return NULL;
4719 }
4720
4721 /* Compute the size of data block needed and get it, either from malloc or
4722 externally provided function. */
4723
4724 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4725 re = (real_pcre *)(pcre_malloc)(size);
4726
4727 if (re == NULL)
4728 {
4729 *errorptr = ERR21;
4730 return NULL;
4731 }
4732
4733 /* Put in the magic number, and save the size, options, and table pointer */
4734
4735 re->magic_number = MAGIC_NUMBER;
4736 re->size = size;
4737 re->options = options;
4738 re->tables = tables;
4739 re->name_entry_size = max_name_size + 3;
4740 re->name_count = name_count;
4741
4742 /* The starting points of the name/number translation table and of the code are
4743 passed around in the compile data block. */
4744
4745 compile_block.names_found = 0;
4746 compile_block.name_entry_size = max_name_size + 3;
4747 compile_block.name_table = (uschar *)re + sizeof(real_pcre);
4748 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4749 compile_block.start_code = codestart;
4750 compile_block.req_varyopt = 0;
4751
4752 /* Set up a starting, non-extracting bracket, then compile the expression. On
4753 error, *errorptr will be set non-NULL, so we don't need to look at the result
4754 of the function here. */
4755
4756 ptr = (const uschar *)pattern;
4757 code = (uschar *)codestart;
4758 *code = OP_BRA;
4759 bracount = 0;
4760 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4761 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4762 re->top_bracket = bracount;
4763 re->top_backref = compile_block.top_backref;
4764
4765 /* If not reached end of pattern on success, there's an excess bracket. */
4766
4767 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
4768
4769 /* Fill in the terminating state and check for disastrous overflow, but
4770 if debugging, leave the test till after things are printed out. */
4771
4772 *code++ = OP_END;
4773
4774 #ifndef DEBUG
4775 if (code - codestart > length) *errorptr = ERR23;
4776 #endif
4777
4778 /* Give an error if there's back reference to a non-existent capturing
4779 subpattern. */
4780
4781 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
4782
4783 /* Failed to compile, or error while post-processing */
4784
4785 if (*errorptr != NULL)
4786 {
4787 (pcre_free)(re);
4788 PCRE_ERROR_RETURN:
4789 *erroroffset = ptr - (const uschar *)pattern;
4790 return NULL;
4791 }
4792
4793 /* If the anchored option was not passed, set the flag if we can determine that
4794 the pattern is anchored by virtue of ^ characters or \A or anything else (such
4795 as starting with .* when DOTALL is set).
4796
4797 Otherwise, if we know what the first character has to be, save it, because that
4798 speeds up unanchored matches no end. If not, see if we can set the
4799 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4800 start with ^. and also when all branches start with .* for non-DOTALL matches.
4801 */
4802
4803 if ((options & PCRE_ANCHORED) == 0)
4804 {
4805 int temp_options = options;
4806 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4807 re->options |= PCRE_ANCHORED;
4808 else
4809 {
4810 if (firstbyte < 0)
4811 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4812 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
4813 {
4814 int ch = firstbyte & 255;
4815 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4816 compile_block.fcc[ch] == ch)? ch : firstbyte;
4817 re->options |= PCRE_FIRSTSET;
4818 }
4819 else if (is_startline(codestart, 0, compile_block.backref_map))
4820 re->options |= PCRE_STARTLINE;
4821 }
4822 }
4823
4824 /* For an anchored pattern, we use the "required byte" only if it follows a
4825 variable length item in the regex. Remove the caseless flag for non-caseable
4826 chars. */
4827
4828 if (reqbyte >= 0 &&
4829 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4830 {
4831 int ch = reqbyte & 255;
4832 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4833 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
4834 re->options |= PCRE_REQCHSET;
4835 }
4836
4837 /* Print out the compiled data for debugging */
4838
4839 #ifdef DEBUG
4840
4841 printf("Length = %d top_bracket = %d top_backref = %d\n",
4842 length, re->top_bracket, re->top_backref);
4843
4844 if (re->options != 0)
4845 {
4846 printf("%s%s%s%s%s%s%s%s%s\n",
4847 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
4848 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
4849 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
4850 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
4851 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
4852 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
4853 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
4854 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
4855 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
4856 }
4857
4858 if ((re->options & PCRE_FIRSTSET) != 0)
4859 {
4860 int ch = re->first_byte & 255;
4861 char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4862 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
4863 else printf("First char = \\x%02x%s\n", ch, caseless);
4864 }
4865
4866 if ((re->options & PCRE_REQCHSET) != 0)
4867 {
4868 int ch = re->req_byte & 255;
4869 char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4870 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
4871 else printf("Req char = \\x%02x%s\n", ch, caseless);
4872 }
4873
4874 print_internals(re, stdout);
4875
4876 /* This check is done here in the debugging case so that the code that
4877 was compiled can be seen. */
4878
4879 if (code - codestart > length)
4880 {
4881 *errorptr = ERR23;
4882 (pcre_free)(re);
4883 *erroroffset = ptr - (uschar *)pattern;
4884 return NULL;
4885 }
4886 #endif
4887
4888 return (pcre *)re;
4889 }
4890
4891
4892
4893 /*************************************************
4894 * Match a back-reference *
4895 *************************************************/
4896
4897 /* If a back reference hasn't been set, the length that is passed is greater
4898 than the number of characters left in the string, so the match fails.
4899
4900 Arguments:
4901 offset index into the offset vector
4902 eptr points into the subject
4903 length length to be matched
4904 md points to match data block
4905 ims the ims flags
4906
4907 Returns: TRUE if matched
4908 */
4909
4910 static BOOL
4911 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
4912 unsigned long int ims)
4913 {
4914 const uschar *p = md->start_subject + md->offset_vector[offset];
4915
4916 #ifdef DEBUG
4917 if (eptr >= md->end_subject)
4918 printf("matching subject <null>");
4919 else
4920 {
4921 printf("matching subject ");
4922 pchars(eptr, length, TRUE, md);
4923 }
4924 printf(" against backref ");
4925 pchars(p, length, FALSE, md);
4926 printf("\n");
4927 #endif
4928
4929 /* Always fail if not enough characters left */
4930
4931 if (length > md->end_subject - eptr) return FALSE;
4932
4933 /* Separate the caselesss case for speed */
4934
4935 if ((ims & PCRE_CASELESS) != 0)
4936 {
4937 while (length-- > 0)
4938 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
4939 }
4940 else
4941 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
4942
4943 return TRUE;
4944 }
4945
4946
4947 #ifdef SUPPORT_UTF8
4948 /*************************************************
4949 * Match character against an XCLASS *
4950 *************************************************/
4951
4952 /* This function is called from within the XCLASS code below, to match a
4953 character against an extended class which might match values > 255.
4954
4955 Arguments:
4956 c the character
4957 data points to the flag byte of the XCLASS data
4958
4959 Returns: TRUE if character matches, else FALSE
4960 */
4961
4962 static BOOL
4963 match_xclass(int c, const uschar *data)
4964 {
4965 int t;
4966 BOOL negated = (*data & XCL_NOT) != 0;
4967
4968 /* Character values < 256 are matched against a bitmap, if one is present. If
4969 not, we still carry on, because there may be ranges that start below 256 in the
4970 additional data. */
4971
4972 if (c < 256)
4973 {
4974 if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
4975 return !negated; /* char found */
4976 }
4977
4978 /* Now match against the list of large chars or ranges that end with a large
4979 char. First skip the bit map if present. */
4980
4981 if ((*data++ & XCL_MAP) != 0) data += 32;
4982
4983 while ((t = *data++) != XCL_END)
4984 {
4985 int x, y;
4986 GETCHARINC(x, data);
4987 if (t == XCL_SINGLE)
4988 {
4989 if (c == x) return !negated;
4990 }
4991 else
4992 {
4993 GETCHARINC(y, data);
4994 if (c >= x && c <= y) return !negated;
4995 }
4996 }
4997
4998 return negated; /* char was not found */
4999 }
5000 #endif
5001
5002
5003
5004
5005 /*************************************************
5006 * Match from current position *
5007 *************************************************/
5008
5009 /* On entry ecode points to the first opcode, and eptr to the first character
5010 in the subject string, while eptrb holds the value of eptr at the start of the
5011 last bracketed group - used for breaking infinite loops matching zero-length
5012 strings. This function is called recursively in many circumstances. Whenever it
5013 returns a negative (error) response, the outer incarnation must also return the
5014 same response.
5015
5016 Performance note: It might be tempting to extract commonly used fields from the
5017 md structure (e.g. utf8, end_subject) into individual variables to improve
5018 performance. Tests using gcc on a SPARC disproved this; in the first case, it
5019 made performance worse.
5020
5021 Arguments:
5022 eptr pointer in subject
5023 ecode position in code
5024 offset_top current top pointer
5025 md pointer to "static" info for the match
5026 ims current /i, /m, and /s options
5027 eptrb pointer to chain of blocks containing eptr at start of
5028 brackets - for testing for empty matches
5029 flags can contain
5030 match_condassert - this is an assertion condition
5031 match_isgroup - this is the start of a bracketed group
5032
5033 Returns: MATCH_MATCH if matched ) these values are >= 0
5034 MATCH_NOMATCH if failed to match )
5035 a negative PCRE_ERROR_xxx value if aborted by an error condition
5036 (e.g. stopped by recursion limit)
5037 */
5038
5039 static int
5040 match(register const uschar *eptr, register const uschar *ecode,
5041 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5042 int flags)
5043 {
5044 unsigned long int original_ims = ims; /* Save for resetting on ')' */
5045 register int rrc;
5046 eptrblock newptrb;
5047
5048 if (md->match_call_count++ >= md->match_limit) return PCRE_ERROR_MATCHLIMIT;
5049
5050 /* At the start of a bracketed group, add the current subject pointer to the
5051 stack of such pointers, to be re-instated at the end of the group when we hit
5052 the closing ket. When match() is called in other circumstances, we don't add to
5053 the stack. */
5054
5055 if ((flags & match_isgroup) != 0)
5056 {
5057 newptrb.prev = eptrb;
5058 newptrb.saved_eptr = eptr;
5059 eptrb = &newptrb;
5060 }
5061
5062 /* Now start processing the operations. */
5063
5064 for (;;)
5065 {
5066 int op = (int)*ecode;
5067 int min, max, ctype;
5068 register int i;
5069 register int c;
5070 BOOL minimize = FALSE;
5071
5072 /* Opening capturing bracket. If there is space in the offset vector, save
5073 the current subject position in the working slot at the top of the vector. We
5074 mustn't change the current values of the data slot, because they may be set
5075 from a previous iteration of this group, and be referred to by a reference
5076 inside the group.
5077
5078 If the bracket fails to match, we need to restore this value and also the
5079 values of the final offsets, in case they were set by a previous iteration of
5080 the same bracket.
5081
5082 If there isn't enough space in the offset vector, treat this as if it were a
5083 non-capturing bracket. Don't worry about setting the flag for the error case
5084 here; that is handled in the code for KET. */
5085
5086 if (op > OP_BRA)
5087 {
5088 int offset;
5089 int number = op - OP_BRA;
5090
5091 /* For extended extraction brackets (large number), we have to fish out the
5092 number from a dummy opcode at the start. */
5093
5094 if (number > EXTRACT_BASIC_MAX)
5095 number = GET2(ecode, 2+LINK_SIZE);
5096 offset = number << 1;
5097
5098 #ifdef DEBUG
5099 printf("start bracket %d subject=", number);
5100 pchars(eptr, 16, TRUE, md);
5101 printf("\n");
5102 #endif
5103
5104 if (offset < md->offset_max)
5105 {
5106 int save_offset1 = md->offset_vector[offset];
5107 int save_offset2 = md->offset_vector[offset+1];
5108 int save_offset3 = md->offset_vector[md->offset_end - number];
5109 int save_capture_last = md->capture_last;
5110
5111 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
5112 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
5113
5114 do
5115 {
5116 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5117 eptrb, match_isgroup)) != MATCH_NOMATCH) return rrc;
5118 md->capture_last = save_capture_last;
5119 ecode += GET(ecode, 1);
5120 }
5121 while (*ecode == OP_ALT);
5122
5123 DPRINTF(("bracket %d failed\n", number));
5124
5125 md->offset_vector[offset] = save_offset1;
5126 md->offset_vector[offset+1] = save_offset2;
5127 md->offset_vector[md->offset_end - number] = save_offset3;
5128
5129 return MATCH_NOMATCH;
5130 }
5131
5132 /* Insufficient room for saving captured contents */
5133
5134 else op = OP_BRA;
5135 }
5136
5137 /* Other types of node can be handled by a switch */
5138
5139 switch(op)
5140 {
5141 case OP_BRA: /* Non-capturing bracket: optimized */
5142 DPRINTF(("start bracket 0\n"));
5143 do
5144 {
5145 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5146 match_isgroup)) != MATCH_NOMATCH) return rrc;
5147 ecode += GET(ecode, 1);
5148 }
5149 while (*ecode == OP_ALT);
5150 DPRINTF(("bracket 0 failed\n"));
5151 return MATCH_NOMATCH;
5152
5153 /* Conditional group: compilation checked that there are no more than
5154 two branches. If the condition is false, skipping the first branch takes us
5155 past the end if there is only one branch, but that's OK because that is
5156 exactly what going to the ket would do. */
5157
5158 case OP_COND:
5159 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
5160 {
5161 int offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
5162 BOOL condition = (offset == CREF_RECURSE * 2)?
5163 (md->recursive != NULL) :
5164 (offset < offset_top && md->offset_vector[offset] >= 0);
5165 return match(eptr, ecode + (condition?
5166 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
5167 offset_top, md, ims, eptrb, match_isgroup);
5168 }
5169
5170 /* The condition is an assertion. Call match() to evaluate it - setting
5171 the final argument TRUE causes it to stop at the end of an assertion. */
5172
5173 else
5174 {
5175 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5176 match_condassert | match_isgroup)) == MATCH_MATCH)
5177 {
5178 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
5179 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
5180 }
5181 else if (rrc != MATCH_NOMATCH) return rrc;
5182 else ecode += GET(ecode, 1);
5183 return match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5184 match_isgroup);
5185 }
5186 /* Control never reaches here */
5187
5188 /* Skip over conditional reference or large extraction number data if
5189 encountered. */
5190
5191 case OP_CREF:
5192 case OP_BRANUMBER:
5193 ecode += 3;
5194 break;
5195
5196 /* End of the pattern. If we are in a recursion, we should restore the
5197 offsets appropriately and continue from after the call. */
5198
5199 case OP_END:
5200 if (md->recursive != NULL && md->recursive->group_num == 0)
5201 {
5202 recursion_info *rec = md->recursive;
5203 DPRINTF(("Hit the end in a (?0) recursion\n"));
5204 md->recursive = rec->prev;
5205 memmove(md->offset_vector, rec->offset_save,
5206 rec->saved_max * sizeof(int));
5207 md->start_match = rec->save_start;
5208 ims = original_ims;
5209 ecode = rec->after_call;
5210 break;
5211 }
5212
5213 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
5214 string - backtracking will then try other alternatives, if any. */
5215
5216 if (md->notempty && eptr == md->start_match) return MATCH_NOMATCH;
5217 md->end_match_ptr = eptr; /* Record where we ended */
5218 md->end_offset_top = offset_top; /* and how many extracts were taken */
5219 return MATCH_MATCH;
5220
5221 /* Change option settings */
5222
5223 case OP_OPT:
5224 ims = ecode[1];
5225 ecode += 2;
5226 DPRINTF(("ims set to %02lx\n", ims));
5227 break;
5228
5229 /* Assertion brackets. Check the alternative branches in turn - the
5230 matching won't pass the KET for an assertion. If any one branch matches,
5231 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
5232 start of each branch to move the current point backwards, so the code at
5233 this level is identical to the lookahead case. */
5234
5235 case OP_ASSERT:
5236 case OP_ASSERTBACK:
5237 do
5238 {
5239 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5240 match_isgroup)) == MATCH_MATCH) break;
5241 if (rrc != MATCH_NOMATCH) return rrc;
5242 ecode += GET(ecode, 1);
5243 }
5244 while (*ecode == OP_ALT);
5245 if (*ecode == OP_KET) return MATCH_NOMATCH;
5246
5247 /* If checking an assertion for a condition, return MATCH_MATCH. */
5248
5249 if ((flags & match_condassert) != 0) return MATCH_MATCH;
5250
5251 /* Continue from after the assertion, updating the offsets high water
5252 mark, since extracts may have been taken during the assertion. */
5253
5254 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5255 ecode += 1 + LINK_SIZE;
5256 offset_top = md->end_offset_top;
5257 continue;
5258
5259 /* Negative assertion: all branches must fail to match */
5260
5261 case OP_ASSERT_NOT:
5262 case OP_ASSERTBACK_NOT:
5263 do
5264 {
5265 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5266 match_isgroup)) == MATCH_MATCH) return MATCH_NOMATCH;
5267 if (rrc != MATCH_NOMATCH) return rrc;
5268 ecode += GET(ecode,1);
5269 }
5270 while (*ecode == OP_ALT);
5271
5272 if ((flags & match_condassert) != 0) return MATCH_MATCH;
5273
5274 ecode += 1 + LINK_SIZE;
5275 continue;
5276
5277 /* Move the subject pointer back. This occurs only at the start of
5278 each branch of a lookbehind assertion. If we are too close to the start to
5279 move back, this match function fails. When working with UTF-8 we move
5280 back a number of characters, not bytes. */
5281
5282 case OP_REVERSE:
5283 #ifdef SUPPORT_UTF8
5284 if (md->utf8)
5285 {
5286 c = GET(ecode,1);
5287 for (i = 0; i < c; i++)
5288 {
5289 eptr--;
5290 if (eptr < md->start_subject) return MATCH_NOMATCH;
5291 BACKCHAR(eptr)
5292 }
5293 }
5294 else
5295 #endif
5296
5297 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
5298
5299 {
5300 eptr -= GET(ecode,1);
5301 if (eptr < md->start_subject) return MATCH_NOMATCH;
5302 }
5303
5304 /* Skip to next op code */
5305
5306 ecode += 1 + LINK_SIZE;
5307 break;
5308
5309 /* The callout item calls an external function, if one is provided, passing
5310 details of the match so far. This is mainly for debugging, though the
5311 function is able to force a failure. */
5312
5313 case OP_CALLOUT:
5314 if (pcre_callout != NULL)
5315 {
5316 pcre_callout_block cb;
5317 cb.version = 0; /* Version 0 of the callout block */
5318 cb.callout_number = ecode[1];
5319 cb.offset_vector = md->offset_vector;
5320 cb.subject = (const char *)md->start_subject;
5321 cb.subject_length = md->end_subject - md->start_subject;
5322 cb.start_match = md->start_match - md->start_subject;
5323 cb.current_position = eptr - md->start_subject;
5324 cb.capture_top = offset_top/2;
5325 cb.capture_last = md->capture_last;
5326 cb.callout_data = md->callout_data;
5327 if ((rrc = (*pcre_callout)(&cb)) > 0) return MATCH_NOMATCH;
5328 if (rrc < 0) return rrc;
5329 }
5330 ecode += 2;
5331 break;
5332
5333 /* Recursion either matches the current regex, or some subexpression. The
5334 offset data is the offset to the starting bracket from the start of the
5335 whole pattern. However, it is possible that a BRAZERO was inserted before
5336 this bracket after we took the offset - we just skip it if encountered.
5337
5338 If there are any capturing brackets started but not finished, we have to
5339 save their starting points and reinstate them after the recursion. However,
5340 we don't know how many such there are (offset_top records the completed
5341 total) so we just have to save all the potential data. There may be up to
5342 65535 such values, which is too large to put on the stack, but using malloc
5343 for small numbers seems expensive. As a compromise, the stack is used when
5344 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
5345 is used. If the malloc fails, the saved offsets cannot be held anywhere, so
5346 match() gives up and returns PCRE_ERROR_NOMEMORY, which the calling levels
5347 pass back up like any other error code.
5348
5349 There are also other values that have to be saved. We use a chained
5350 sequence of blocks that actually live on the stack. Thanks to Robin Houston
5351 for the original version of this logic. */
5352
5353 case OP_RECURSE:
5354 {
5355 int stacksave[REC_STACK_SAVE_MAX];
5356 recursion_info new_recursive;
5357 const uschar *callpat = md->start_code + GET(ecode, 1);
5358
5359 if (*callpat == OP_BRAZERO) callpat++;
5360
5361 new_recursive.group_num = *callpat - OP_BRA;
5362
5363 /* For extended extraction brackets (large number), we have to fish out
5364 the number from a dummy opcode at the start. */
5365
5366 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
5367 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
5368
5369 /* Add to "recursing stack" */
5370
5371 new_recursive.prev = md->recursive;
5372 md->recursive = &new_recursive;
5373
5374 /* Find where to continue from afterwards */
5375
5376 ecode += 1 + LINK_SIZE;
5377 new_recursive.after_call = ecode;
5378
5379 /* Now save the offset data. */
5380
5381 new_recursive.saved_max = md->offset_end;
5382 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
5383 new_recursive.offset_save = stacksave;
5384 else
5385 {
5386 new_recursive.offset_save =
5387 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
5388 if (new_recursive.offset_save == NULL) return PCRE_ERROR_NOMEMORY;
5389 }
5390
5391 memcpy(new_recursive.offset_save, md->offset_vector,
5392 new_recursive.saved_max * sizeof(int));
5393 new_recursive.save_start = md->start_match;
5394 md->start_match = eptr;
5395
5396 /* OK, now we can do the recursion. For each top-level alternative we
5397 restore the offset and recursion data. */
5398
5399 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
5400 do
5401 {
5402 if ((rrc = match(eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
5403 eptrb, match_isgroup)) == MATCH_MATCH)
5404 {
5405 md->recursive = new_recursive.prev;
5406 if (new_recursive.offset_save != stacksave)
5407 (pcre_free)(new_recursive.offset_save);
5408 return MATCH_MATCH;
5409 }
5410 else if (rrc != MATCH_NOMATCH) return rrc;
5411
5412 md->recursive = &new_recursive;
5413 memcpy(md->offset_vector, new_recursive.offset_save,
5414 new_recursive.saved_max * sizeof(int));
5415 callpat += GET(callpat, 1);
5416 }
5417 while (*callpat == OP_ALT);
5418
5419 DPRINTF(("Recursion didn't match\n"));
5420 md->recursive = new_recursive.prev;
5421 if (new_recursive.offset_save != stacksave)
5422 (pcre_free)(new_recursive.offset_save);
5423 return MATCH_NOMATCH;
5424 }
5425 /* Control never reaches here */
5426
5427 /* "Once" brackets are like assertion brackets except that after a match,
5428 the point in the subject string is not moved back. Thus there can never be
5429 a move back into the brackets. Friedl calls these "atomic" subpatterns.
5430 Check the alternative branches in turn - the matching won't pass the KET
5431 for this kind of subpattern. If any one branch matches, we carry on as at
5432 the end of a normal bracket, leaving the subject pointer. */
5433
5434 case OP_ONCE:
5435 {
5436 const uschar *prev = ecode;
5437 const uschar *saved_eptr = eptr;
5438
5439 do
5440 {
5441 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5442 eptrb, match_isgroup)) == MATCH_MATCH) break;
5443 if (rrc != MATCH_NOMATCH) return rrc;
5444 ecode += GET(ecode,1);
5445 }
5446 while (*ecode == OP_ALT);
5447
5448 /* If we hit the end of the group (which could be repeated), fail */
5449
5450 if (*ecode != OP_ONCE && *ecode != OP_ALT) return MATCH_NOMATCH;
5451
5452 /* Continue as from after the assertion, updating the offsets high water
5453 mark, since extracts may have been taken. */
5454
5455 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5456
5457 offset_top = md->end_offset_top;
5458 eptr = md->end_match_ptr;
5459
5460 /* For a non-repeating ket, just continue at this level. This also
5461 happens for a repeating ket if no characters were matched in the group.
5462 This is the forcible breaking of infinite loops as implemented in Perl
5463 5.005. If there is an options reset, it will get obeyed in the normal
5464 course of events. */
5465
5466 if (*ecode == OP_KET || eptr == saved_eptr)
5467 {
5468 ecode += 1+LINK_SIZE;
5469 break;
5470 }
5471
5472 /* The repeating kets try the rest of the pattern or restart from the
5473 preceding bracket, in the appropriate order. We need to reset any options
5474 that changed within the bracket before re-running it, so check the next
5475 opcode. */
5476
5477 if (ecode[1+LINK_SIZE] == OP_OPT)
5478 {
5479 ims = (ims & ~PCRE_IMS) | ecode[4];
5480 DPRINTF(("ims set to %02lx at group repeat\n", ims));
5481 }
5482
5483 if (*ecode == OP_KETRMIN)
5484 {
5485 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5486 eptrb, 0)) != MATCH_NOMATCH) return rrc;
5487 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5488 match_isgroup)) != MATCH_NOMATCH) return rrc;
5489 }
5490 else /* OP_KETRMAX */
5491 {
5492 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5493 match_isgroup)) != MATCH_NOMATCH) return rrc;
5494 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5495 0)) != MATCH_NOMATCH) return rrc;
5496 }
5497 }
5498 return MATCH_NOMATCH;
5499
5500 /* An alternation is the end of a branch; scan along to find the end of the
5501 bracketed group and go to there. */
5502
5503 case OP_ALT:
5504 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5505 break;
5506
5507 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
5508 that it may occur zero times. It may repeat infinitely, or not at all -
5509 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
5510 repeat limits are compiled as a number of copies, with the optional ones
5511 preceded by BRAZERO or BRAMINZERO. */
5512
5513 case OP_BRAZERO:
5514 {
5515 const uschar *next = ecode+1;
5516 if ((rrc = match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
5517 != MATCH_NOMATCH) return rrc;
5518 do next += GET(next,1); while (*next == OP_ALT);
5519 ecode = next + 1+LINK_SIZE;
5520 }
5521 break;
5522
5523 case OP_BRAMINZERO:
5524 {
5525 const uschar *next = ecode+1;
5526 do next += GET(next,1); while (*next == OP_ALT);
5527 if ((rrc = match(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5528 match_isgroup)) != MATCH_NOMATCH) return rrc;
5529 ecode++;
5530 }
5531 break;
5532
5533 /* End of a group, repeated or non-repeating. If we are at the end of
5534 an assertion "group", stop matching and return MATCH_MATCH, but record the
5535 current high water mark for use by positive assertions. Do this also
5536 for the "once" (non-backing-up) groups. */
5537
5538 case OP_KET:
5539 case OP_KETRMIN:
5540 case OP_KETRMAX:
5541 {
5542 const uschar *prev = ecode - GET(ecode, 1);
5543 const uschar *saved_eptr = eptrb->saved_eptr;
5544
5545 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
5546
5547 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
5548 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
5549 *prev == OP_ONCE)
5550 {
5551 md->end_match_ptr = eptr; /* For ONCE */
5552 md->end_offset_top = offset_top;
5553 return MATCH_MATCH;
5554 }
5555
5556 /* In all other cases except a conditional group we have to check the
5557 group number back at the start and if necessary complete handling an
5558 extraction by setting the offsets and bumping the high water mark. */
5559
5560 if (*prev != OP_COND)
5561 {
5562 int offset;
5563 int number = *prev - OP_BRA;
5564
5565 /* For extended extraction brackets (large number), we have to fish out
5566 the number from a dummy opcode at the start. */
5567
5568 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
5569 offset = number << 1;
5570
5571 #ifdef DEBUG
5572 printf("end bracket %d", number);
5573 printf("\n");
5574 #endif
5575
5576 /* Test for a numbered group. This includes groups called as a result
5577 of recursion. Note that whole-pattern recursion is coded as a recurse
5578 into group 0, so it won't be picked up here. Instead, we catch it when
5579 the OP_END is reached. */
5580
5581 if (number > 0)
5582 {
5583 md->capture_last = number;
5584 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
5585 {
5586 md->offset_vector[offset] =
5587 md->offset_vector[md->offset_end - number];
5588 md->offset_vector[offset+1] = eptr - md->start_subject;
5589 if (offset_top <= offset) offset_top = offset + 2;
5590 }
5591
5592 /* Handle a recursively called group. Restore the offsets
5593 appropriately and continue from after the call. */
5594
5595 if (md->recursive != NULL && md->recursive->group_num == number)
5596 {
5597 recursion_info *rec = md->recursive;
5598 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
5599 md->recursive = rec->prev;
5600 md->start_match = rec->save_start;
5601 memcpy(md->offset_vector, rec->offset_save,
5602 rec->saved_max * sizeof(int));
5603 ecode = rec->after_call;
5604 ims = original_ims;
5605 break;
5606 }
5607 }
5608 }
5609
5610 /* Reset the value of the ims flags, in case they got changed during
5611 the group. */
5612
5613 ims = original_ims;
5614 DPRINTF(("ims reset to %02lx\n", ims));
5615
5616 /* For a non-repeating ket, just continue at this level. This also
5617 happens for a repeating ket if no characters were matched in the group.
5618 This is the forcible breaking of infinite loops as implemented in Perl
5619 5.005. If there is an options reset, it will get obeyed in the normal
5620 course of events. */
5621
5622 if (*ecode == OP_KET || eptr == saved_eptr)
5623 {
5624 ecode += 1 + LINK_SIZE;
5625 break;
5626 }
5627
5628 /* The repeating kets try the rest of the pattern or restart from the
5629 preceding bracket, in the appropriate order. */
5630
5631 if (*ecode == OP_KETRMIN)
5632 {
5633 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5634 0)) != MATCH_NOMATCH) return rrc;
5635 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5636 match_isgroup)) != MATCH_NOMATCH) return rrc;
5637 }
5638 else /* OP_KETRMAX */
5639 {
5640 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5641 match_isgroup)) != MATCH_NOMATCH) return rrc;
5642 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5643 0)) != MATCH_NOMATCH) return rrc;
5644 }
5645 }
5646 return MATCH_NOMATCH;
5647
5648 /* Start of subject unless notbol, or after internal newline if multiline */
5649
5650 case OP_CIRC:
5651 if (md->notbol && eptr == md->start_subject) return MATCH_NOMATCH;
5652 if ((ims & PCRE_MULTILINE) != 0)
5653 {
5654 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
5655 return MATCH_NOMATCH;
5656 ecode++;
5657 break;
5658 }
5659 /* ... else fall through */
5660
5661 /* Start of subject assertion */
5662
5663 case OP_SOD:
5664 if (eptr != md->start_subject) return MATCH_NOMATCH;
5665 ecode++;
5666 break;
5667
5668 /* Start of match assertion */
5669
5670 case OP_SOM:
5671 if (eptr != md->start_subject + md->start_offset) return MATCH_NOMATCH;
5672 ecode++;
5673 break;
5674
5675 /* Assert before internal newline if multiline, or before a terminating
5676 newline unless endonly is set, else end of subject unless noteol is set. */
5677
5678 case OP_DOLL:
5679 if ((ims & PCRE_MULTILINE) != 0)
5680 {
5681 if (eptr < md->end_subject)
5682 { if (*eptr != NEWLINE) return MATCH_NOMATCH; }
5683 else
5684 { if (md->noteol) return MATCH_NOMATCH; }
5685 ecode++;
5686 break;
5687 }
5688 else
5689 {
5690 if (md->noteol) return MATCH_NOMATCH;
5691 if (!md->endonly)
5692 {
5693 if (eptr < md->end_subject - 1 ||
5694 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
5695 return MATCH_NOMATCH;
5696 ecode++;
5697 break;
5698 }
5699 }
5700 /* ... else fall through */
5701
5702 /* End of subject assertion (\z) */
5703
5704 case OP_EOD:
5705 if (eptr < md->end_subject) return MATCH_NOMATCH;
5706 ecode++;
5707 break;
5708
5709 /* End of subject or ending \n assertion (\Z) */
5710
5711 case OP_EODN:
5712 if (eptr < md->end_subject - 1 ||
5713 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return MATCH_NOMATCH;
5714 ecode++;
5715 break;
5716
5717 /* Word boundary assertions */
5718
5719 case OP_NOT_WORD_BOUNDARY:
5720 case OP_WORD_BOUNDARY:
5721 {
5722 BOOL prev_is_word, cur_is_word;
5723
5724 /* Find out if the previous and current characters are "word" characters.
5725 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
5726 be "non-word" characters. */
5727
5728 #ifdef SUPPORT_UTF8
5729 if (md->utf8)
5730 {
5731 if (eptr == md->start_subject) prev_is_word = FALSE; else
5732 {
5733 const uschar *lastptr = eptr - 1;
5734 while((*lastptr & 0xc0) == 0x80) lastptr--;
5735 GETCHAR(c, lastptr);
5736 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5737 }
5738 if (eptr >= md->end_subject) cur_is_word = FALSE; else
5739 {
5740 GETCHAR(c, eptr);
5741 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5742 }
5743 }
5744 else
5745 #endif
5746
5747 /* More streamlined when not in UTF-8 mode */
5748
5749 {
5750 prev_is_word = (eptr != md->start_subject) &&
5751 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
5752 cur_is_word = (eptr < md->end_subject) &&
5753 ((md->ctypes[*eptr] & ctype_word) != 0);
5754 }
5755
5756 /* Now see if the situation is what we want */
5757
5758 if ((*ecode++ == OP_WORD_BOUNDARY)?
5759 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
5760 return MATCH_NOMATCH;
5761 }
5762 break;
5763
5764 /* Match a single character type; inline for speed */
5765
5766 case OP_ANY:
5767 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
5768 return MATCH_NOMATCH;
5769 if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5770 #ifdef SUPPORT_UTF8
5771 if (md->utf8)
5772 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5773 #endif
5774 ecode++;
5775 break;
5776
5777 /* Match a single byte, even in UTF-8 mode. This opcode really does match
5778 any byte, even newline, independent of the setting of PCRE_DOTALL. */
5779
5780 case OP_ANYBYTE:
5781 if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5782 ecode++;
5783 break;
5784
5785 case OP_NOT_DIGIT:
5786 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5787 GETCHARINCTEST(c, eptr);
5788 if (
5789 #ifdef SUPPORT_UTF8
5790 c < 256 &&
5791 #endif
5792 (md->ctypes[c] & ctype_digit) != 0
5793 )
5794 return MATCH_NOMATCH;
5795 ecode++;
5796 break;
5797
5798 case OP_DIGIT:
5799 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5800 GETCHARINCTEST(c, eptr);
5801 if (
5802 #ifdef SUPPORT_UTF8
5803 c >= 256 ||
5804 #endif
5805 (md->ctypes[c] & ctype_digit) == 0
5806 )
5807 return MATCH_NOMATCH;
5808 ecode++;
5809 break;
5810
5811 case OP_NOT_WHITESPACE:
5812 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5813 GETCHARINCTEST(c, eptr);
5814 if (
5815 #ifdef SUPPORT_UTF8
5816 c < 256 &&
5817 #endif
5818 (md->ctypes[c] & ctype_space) != 0
5819 )
5820 return MATCH_NOMATCH;
5821 ecode++;
5822 break;
5823
5824 case OP_WHITESPACE:
5825 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5826 GETCHARINCTEST(c, eptr);
5827 if (
5828 #ifdef SUPPORT_UTF8
5829 c >= 256 ||
5830 #endif
5831 (md->ctypes[c] & ctype_space) == 0
5832 )
5833 return MATCH_NOMATCH;
5834 ecode++;
5835 break;
5836
5837 case OP_NOT_WORDCHAR:
5838 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5839 GETCHARINCTEST(c, eptr);
5840 if (
5841 #ifdef SUPPORT_UTF8
5842 c < 256 &&
5843 #endif
5844 (md->ctypes[c] & ctype_word) != 0
5845 )
5846 return MATCH_NOMATCH;
5847 ecode++;
5848 break;
5849
5850 case OP_WORDCHAR:
5851 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5852 GETCHARINCTEST(c, eptr);
5853 if (
5854 #ifdef SUPPORT_UTF8
5855 c >= 256 ||
5856 #endif
5857 (md->ctypes[c] & ctype_word) == 0
5858 )
5859 return MATCH_NOMATCH;
5860 ecode++;
5861 break;
5862
5863 /* Match a back reference, possibly repeatedly. Look past the end of the
5864 item to see if there is repeat information following. The code is similar
5865 to that for character classes, but repeated for efficiency. Then obey
5866 similar code to character type repeats - written out again for speed.
5867 However, if the referenced string is the empty string, always treat
5868 it as matched, any number of times (otherwise there could be infinite
5869 loops). */
5870
5871 case OP_REF:
5872 {
5873 int length;
5874 int offset = GET2(ecode, 1) << 1; /* Doubled ref number */
5875 ecode += 3; /* Advance past item */
5876
5877 /* If the reference is unset, set the length to be longer than the amount
5878 of subject left; this ensures that every attempt at a match fails. We
5879 can't just fail here, because of the possibility of quantifiers with zero
5880 minima. */
5881
5882 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
5883 md->end_subject - eptr + 1 :
5884 md->offset_vector[offset+1] - md->offset_vector[offset];
5885
5886 /* Set up for repetition, or handle the non-repeated case */
5887
5888 switch (*ecode)
5889 {
5890 case OP_CRSTAR:
5891 case OP_CRMINSTAR:
5892 case OP_CRPLUS:
5893 case OP_CRMINPLUS:
5894 case OP_CRQUERY:
5895 case OP_CRMINQUERY:
5896 c = *ecode++ - OP_CRSTAR;
5897 minimize = (c & 1) != 0;
5898 min = rep_min[c]; /* Pick up values from tables; */
5899 max = rep_max[c]; /* zero for max => infinity */
5900 if (max == 0) max = INT_MAX;
5901 break;
5902
5903 case OP_CRRANGE:
5904 case OP_CRMINRANGE:
5905 minimize = (*ecode == OP_CRMINRANGE);
5906 min = GET2(ecode, 1);
5907 max = GET2(ecode, 3);
5908 if (max == 0) max = INT_MAX;
5909 ecode += 5;
5910 break;
5911
5912 default: /* No repeat follows */
5913 if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5914 eptr += length;
5915 continue; /* With the main loop */
5916 }
5917
5918 /* If the length of the reference is zero, just continue with the
5919 main loop. */
5920
5921 if (length == 0) continue;
5922
5923 /* First, ensure the minimum number of matches are present. We get back
5924 the length of the reference string explicitly rather than passing the
5925 address of eptr, so that eptr can be a register variable. */
5926
5927 for (i = 1; i <= min; i++)
5928 {
5929 if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5930 eptr += length;
5931 }
5932
5933 /* If min = max, continue at the same level without recursion.
5934 They are not both allowed to be zero. */
5935
5936 if (min == max) continue;
5937
5938 /* If minimizing, keep trying and advancing the pointer */
5939
5940 if (minimize)
5941 {
5942 for (i = min;; i++)
5943 {
5944 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5945 MATCH_NOMATCH) return rrc;
5946 if (i >= max || !match_ref(offset, eptr, length, md, ims))
5947 return MATCH_NOMATCH;
5948 eptr += length;
5949 }
5950 /* Control never gets here */
5951 }
5952
5953 /* If maximizing, find the longest string and work backwards */
5954
5955 else
5956 {
5957 const uschar *pp = eptr;
5958 for (i = min; i < max; i++)
5959 {
5960 if (!match_ref(offset, eptr, length, md, ims)) break;
5961 eptr += length;
5962 }
5963 while (eptr >= pp)
5964 {
5965 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5966 MATCH_NOMATCH) return rrc;
5967 eptr -= length;
5968 }
5969 return MATCH_NOMATCH;
5970 }
5971 }
5972 /* Control never gets here */
5973
5974
5975
5976 /* Match a bit-mapped character class, possibly repeatedly. This op code is
5977 used when all the characters in the class have values in the range 0-255.
5978 The only difference between OP_CLASS and OP_NCLASS occurs when a data
5979 character outside the range is encountered.
5980
5981 First, look past the end of the item to see if there is repeat information
5982 following. Then obey similar code to character type repeats - written out
5983 again for speed. */
5984
5985 case OP_NCLASS:
5986 case OP_CLASS:
5987 {
5988 const uschar *data = ecode + 1; /* Save for matching */
5989 ecode += 33; /* Advance past the item */
5990
5991 switch (*ecode)
5992 {
5993 case OP_CRSTAR:
5994 case OP_CRMINSTAR:
5995 case OP_CRPLUS:
5996 case OP_CRMINPLUS:
5997 case OP_CRQUERY:
5998 case OP_CRMINQUERY:
5999 c = *ecode++ - OP_CRSTAR;
6000 minimize = (c & 1) != 0;
6001 min = rep_min[c]; /* Pick up values from tables; */
6002 max = rep_max[c]; /* zero for max => infinity */
6003 if (max == 0) max = INT_MAX;
6004 break;
6005
6006 case OP_CRRANGE:
6007 case OP_CRMINRANGE:
6008 minimize = (*ecode == OP_CRMINRANGE);
6009 min = GET2(ecode, 1);
6010 max = GET2(ecode, 3);
6011 if (max == 0) max = INT_MAX;
6012 ecode += 5;
6013 break;
6014
6015 default: /* No repeat follows */
6016 min = max = 1;
6017 break;
6018 }
6019
6020 /* First, ensure the minimum number of matches are present. */
6021
6022 #ifdef SUPPORT_UTF8
6023 /* UTF-8 mode */
6024 if (md->utf8)
6025 {
6026 for (i = 1; i <= min; i++)
6027 {
6028 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6029 GETCHARINC(c, eptr);
6030 if (c > 255)
6031 {
6032 if (op == OP_CLASS) return MATCH_NOMATCH;
6033 }
6034 else
6035 {
6036 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
6037 }
6038 }
6039 }
6040 else
6041 #endif
6042 /* Not UTF-8 mode */
6043 {
6044 for (i = 1; i <= min; i++)
6045 {
6046 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6047 c = *eptr++;
6048 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
6049 }
6050 }
6051
6052 /* If max == min we can continue with the main loop without the
6053 need to recurse. */
6054
6055 if (min == max) continue;
6056
6057 /* If minimizing, keep testing the rest of the expression and advancing
6058 the pointer while it matches the class. */
6059
6060 if (minimize)
6061 {
6062 #ifdef SUPPORT_UTF8
6063 /* UTF-8 mode */
6064 if (md->utf8)
6065 {
6066 for (i = min;; i++)
6067 {
6068 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6069 MATCH_NOMATCH) return rrc;
6070 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6071 GETCHARINC(c, eptr);
6072 if (c > 255)
6073 {
6074 if (op == OP_CLASS) return MATCH_NOMATCH;
6075 }
6076 else
6077 {
6078 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
6079 }
6080 }
6081 }
6082 else
6083 #endif
6084 /* Not UTF-8 mode */
6085 {
6086 for (i = min;; i++)
6087 {
6088 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6089 MATCH_NOMATCH) return rrc;
6090 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6091 c = *eptr++;
6092 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
6093 }
6094 }
6095 /* Control never gets here */
6096 }
6097
6098 /* If maximizing, find the longest possible run, then work backwards. */
6099
6100 else
6101 {
6102 const uschar *pp = eptr;
6103
6104 #ifdef SUPPORT_UTF8
6105 /* UTF-8 mode */
6106 if (md->utf8)
6107 {
6108 for (i = min; i < max; i++)
6109 {
6110 int len = 1;
6111 if (eptr >= md->end_subject) break;
6112 GETCHARLEN(c, eptr, len);
6113 if (c > 255)
6114 {
6115 if (op == OP_CLASS) break;
6116 }
6117 else
6118 {
6119 if ((data[c/8] & (1 << (c&7))) == 0) break;
6120 }
6121 eptr += len;
6122 }
6123 for (;;)
6124 {
6125 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6126 MATCH_NOMATCH) return rrc;
6127 if (eptr-- == pp) break; /* Stop if tried at original pos */
6128 BACKCHAR(eptr);
6129 }
6130 }
6131 else
6132 #endif
6133 /* Not UTF-8 mode */
6134 {
6135 for (i = min; i < max; i++)
6136 {
6137 if (eptr >= md->end_subject) break;
6138 c = *eptr;
6139 if ((data[c/8] & (1 << (c&7))) == 0) break;
6140 eptr++;
6141 }
6142 while (eptr >= pp)
6143 {
6144 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6145 MATCH_NOMATCH) return rrc;
6146 }
6147 }
6148
6149 return MATCH_NOMATCH;
6150 }
6151 }
6152 /* Control never gets here */
6153
6154
6155 /* Match an extended character class. This opcode is encountered only
6156 in UTF-8 mode, because that's the only time it is compiled. */
6157
6158 #ifdef SUPPORT_UTF8
6159 case OP_XCLASS:
6160 {
6161 const uschar *data = ecode + 1 + LINK_SIZE; /* Save for matching */
6162 ecode += GET(ecode, 1); /* Advance past the item */
6163
6164 switch (*ecode)
6165 {
6166 case OP_CRSTAR:
6167 case OP_CRMINSTAR:
6168 case OP_CRPLUS:
6169 case OP_CRMINPLUS:
6170 case OP_CRQUERY:
6171 case OP_CRMINQUERY:
6172 c = *ecode++ - OP_CRSTAR;
6173 minimize = (c & 1) != 0;
6174 min = rep_min[c]; /* Pick up values from tables; */
6175 max = rep_max[c]; /* zero for max => infinity */
6176 if (max == 0) max = INT_MAX;
6177 break;
6178
6179 case OP_CRRANGE:
6180 case OP_CRMINRANGE:
6181 minimize = (*ecode == OP_CRMINRANGE);
6182 min = GET2(ecode, 1);
6183 max = GET2(ecode, 3);
6184 if (max == 0) max = INT_MAX;
6185 ecode += 5;
6186 break;
6187
6188 default: /* No repeat follows */
6189 min = max = 1;
6190 break;
6191 }
6192
6193 /* First, ensure the minimum number of matches are present. */
6194
6195 for (i = 1; i <= min; i++)
6196 {
6197 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6198 GETCHARINC(c, eptr);
6199 if (!match_xclass(c, data)) return MATCH_NOMATCH;
6200 }
6201
6202 /* If max == min we can continue with the main loop without the
6203 need to recurse. */
6204
6205 if (min == max) continue;
6206
6207 /* If minimizing, keep testing the rest of the expression and advancing
6208 the pointer while it matches the class. */
6209
6210 if (minimize)
6211 {
6212 for (i = min;; i++)
6213 {
6214 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6215 MATCH_NOMATCH) return rrc;
6216 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6217 GETCHARINC(c, eptr);
6218 if (!match_xclass(c, data)) return MATCH_NOMATCH;
6219 }
6220 /* Control never gets here */
6221 }
6222
6223 /* If maximizing, find the longest possible run, then work backwards. */
6224
6225 else
6226 {
6227 const uschar *pp = eptr;
6228 for (i = min; i < max; i++)
6229 {
6230 int len = 1;
6231 if (eptr >= md->end_subject) break;
6232 GETCHARLEN(c, eptr, len);
6233 if (!match_xclass(c, data)) break;
6234 eptr += len;
6235 }
6236 for(;;)
6237 {
6238 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6239 MATCH_NOMATCH) return rrc;
6240 if (eptr-- == pp) break; /* Stop if tried at original pos */
6241 BACKCHAR(eptr)
6242 }
6243 return MATCH_NOMATCH;
6244 }
6245
6246 /* Control never gets here */
6247 }
6248 #endif /* End of XCLASS */
6249
6250 /* Match a run of characters */
6251
6252 case OP_CHARS:
6253 {
6254 register int length = ecode[1];
6255 ecode += 2;
6256
6257 #ifdef DEBUG /* Sigh. Some compilers never learn. */
6258 if (eptr >= md->end_subject)
6259 printf("matching subject <null> against pattern ");
6260 else
6261 {
6262 printf("matching subject ");
6263 pchars(eptr, length, TRUE, md);
6264 printf(" against pattern ");
6265 }
6266 pchars(ecode, length, FALSE, md);
6267 printf("\n");
6268 #endif
6269
6270 if (length > md->end_subject - eptr) return MATCH_NOMATCH;
6271 if ((ims & PCRE_CASELESS) != 0)
6272 {
6273 while (length-- > 0)
6274 if (md->lcc[*ecode++] != md->lcc[*eptr++])
6275 return MATCH_NOMATCH;
6276 }
6277 else
6278 {
6279 while (length-- > 0) if (*ecode++ != *eptr++) return MATCH_NOMATCH;
6280 }
6281 }
6282 break;
6283
6284 /* Match a single character repeatedly; different opcodes share code. */
6285
6286 case OP_EXACT:
6287 min = max = GET2(ecode, 1);
6288 ecode += 3;
6289 goto REPEATCHAR;
6290
6291 case OP_UPTO:
6292 case OP_MINUPTO:
6293 min = 0;
6294 max = GET2(ecode, 1);
6295 minimize = *ecode == OP_MINUPTO;
6296 ecode += 3;
6297 goto REPEATCHAR;
6298
6299 case OP_STAR:
6300 case OP_MINSTAR:
6301 case OP_PLUS:
6302 case OP_MINPLUS:
6303 case OP_QUERY:
6304 case OP_MINQUERY:
6305 c = *ecode++ - OP_STAR;
6306 minimize = (c & 1) != 0;
6307 min = rep_min[c]; /* Pick up values from tables; */
6308 max = rep_max[c]; /* zero for max => infinity */
6309 if (max == 0) max = INT_MAX;
6310
6311 /* Common code for all repeated single-character matches. We can give
6312 up quickly if there are fewer than the minimum number of characters left in
6313 the subject. */
6314
6315 REPEATCHAR:
6316 #ifdef SUPPORT_UTF8
6317 if (md->utf8)
6318 {
6319 int len = 1;
6320 const uschar *charptr = ecode;
6321 GETCHARLEN(c, ecode, len);
6322 if (min * len > md->end_subject - eptr) return MATCH_NOMATCH;
6323 ecode += len;
6324
6325 /* Handle multibyte character matching specially here. There is no
6326 support for any kind of casing for multibyte characters. */
6327
6328 if (len > 1)
6329 {
6330 for (i = 1; i <= min; i++)
6331 {
6332 if (memcmp(eptr, charptr, len) != 0) return MATCH_NOMATCH;
6333 eptr += len;
6334 }
6335
6336 if (min == max) continue;
6337
6338 if (minimize)
6339 {
6340 for (i = min;; i++)
6341 {
6342 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6343 MATCH_NOMATCH) return rrc;
6344 if (i >= max ||
6345 eptr >= md->end_subject ||
6346 memcmp(eptr, charptr, len) != 0)
6347 return MATCH_NOMATCH;
6348 eptr += len;
6349 }
6350 /* Control never gets here */
6351 }
6352 else
6353 {
6354 const uschar *pp = eptr;
6355 for (i = min; i < max; i++)
6356 {
6357 if (eptr > md->end_subject - len ||
6358 memcmp(eptr, charptr, len) != 0)
6359 break;
6360 eptr += len;
6361 }
6362 while (eptr >= pp)
6363 {
6364 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6365 MATCH_NOMATCH) return rrc;
6366 eptr -= len;
6367 }
6368 return MATCH_NOMATCH;
6369 }
6370 /* Control never gets here */
6371 }
6372
6373 /* If the length of a UTF-8 character is 1, we fall through here, and
6374 obey the code as for non-UTF-8 characters below, though in this case the
6375 value of c will always be < 128. */
6376 }
6377 else
6378 #endif
6379
6380 /* When not in UTF-8 mode, load a single-byte character. */
6381 {
6382 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6383 c = *ecode++;
6384 }
6385
6386 /* The value of c at this point is always less than 256, though we may or
6387 may not be in UTF-8 mode. The code is duplicated for the caseless and
6388 caseful cases, for speed, since matching characters is likely to be quite
6389 common. First, ensure the minimum number of matches are present. If min =
6390 max, continue at the same level without recursing. Otherwise, if
6391 minimizing, keep trying the rest of the expression and advancing one
6392 matching character if failing, up to the maximum. Alternatively, if
6393 maximizing, find the maximum number of characters and work backwards. */
6394
6395 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
6396 max, eptr));
6397
6398 if ((ims & PCRE_CASELESS) != 0)
6399 {
6400 c = md->lcc[c];
6401 for (i = 1; i <= min; i++)
6402 if (c != md->lcc[*eptr++]) return MATCH_NOMATCH;
6403 if (min == max) continue;
6404 if (minimize)
6405 {
6406 for (i = min;; i++)
6407 {
6408 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6409 MATCH_NOMATCH) return rrc;
6410 if (i >= max || eptr >= md->end_subject ||
6411 c != md->lcc[*eptr++])
6412 return MATCH_NOMATCH;
6413 }
6414 /* Control never gets here */
6415 }
6416 else
6417 {
6418 const uschar *pp = eptr;
6419 for (i = min; i < max; i++)
6420 {
6421 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
6422 eptr++;
6423 }
6424 while (eptr >= pp)
6425 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6426 MATCH_NOMATCH) return rrc;
6427 return MATCH_NOMATCH;
6428 }
6429 /* Control never gets here */
6430 }
6431
6432 /* Caseful comparisons (c is a single byte; multibyte characters were handled above) */
6433
6434 else
6435 {
6436 for (i = 1; i <= min; i++) if (c != *eptr++) return MATCH_NOMATCH;
6437 if (min == max) continue;
6438 if (minimize)
6439 {
6440 for (i = min;; i++)
6441 {
6442 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6443 MATCH_NOMATCH) return rrc;
6444 if (i >= max || eptr >= md->end_subject || c != *eptr++)
6445 return MATCH_NOMATCH;
6446 }
6447 /* Control never gets here */
6448 }
6449 else
6450 {
6451 const uschar *pp = eptr;
6452 for (i = min; i < max; i++)
6453 {
6454 if (eptr >= md->end_subject || c != *eptr) break;
6455 eptr++;
6456 }
6457 while (eptr >= pp)
6458 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6459 MATCH_NOMATCH) return rrc;
6460 return MATCH_NOMATCH;
6461 }
6462 }
6463 /* Control never gets here */
6464
6465 /* Match a negated single one-byte character. The character we are
6466 checking can be multibyte. */
6467
6468 case OP_NOT:
6469 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6470 ecode++;
6471 GETCHARINCTEST(c, eptr);
6472 if ((ims & PCRE_CASELESS) != 0)
6473 {
6474 #ifdef SUPPORT_UTF8
6475 if (c < 256)
6476 #endif
6477 c = md->lcc[c];
6478 if (md->lcc[*ecode++] == c) return MATCH_NOMATCH;
6479 }
6480 else
6481 {
6482 if (*ecode++ == c) return MATCH_NOMATCH;
6483 }
6484 break;
6485
6486 /* Match a negated single one-byte character repeatedly. This is almost a
6487 repeat of the code for a repeated single character, but I haven't found a
6488 nice way of commoning these up that doesn't require a test of the
6489 positive/negative option for each character match. Maybe that wouldn't add
6490 very much to the time taken, but character matching *is* what this is all
6491 about... */
6492
6493 case OP_NOTEXACT:
6494 min = max = GET2(ecode, 1);
6495 ecode += 3;
6496 goto REPEATNOTCHAR;
6497
6498 case OP_NOTUPTO:
6499 case OP_NOTMINUPTO:
6500 min = 0;
6501 max = GET2(ecode, 1);
6502 minimize = *ecode == OP_NOTMINUPTO;
6503 ecode += 3;
6504 goto REPEATNOTCHAR;
6505
6506 case OP_NOTSTAR:
6507 case OP_NOTMINSTAR:
6508 case OP_NOTPLUS:
6509 case OP_NOTMINPLUS:
6510 case OP_NOTQUERY:
6511 case OP_NOTMINQUERY:
6512 c = *ecode++ - OP_NOTSTAR;
6513 minimize = (c & 1) != 0;
6514 min = rep_min[c]; /* Pick up values from tables; */
6515 max = rep_max[c]; /* zero for max => infinity */
6516 if (max == 0) max = INT_MAX;
6517
6518 /* Common code for all repeated single-character (less than 256) matches.
6519 We can give up quickly if there are fewer than the minimum number of
6520 characters left in the subject. */
6521
6522 REPEATNOTCHAR:
6523 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6524 c = *ecode++;
6525
6526 /* The code is duplicated for the caseless and caseful cases, for speed,
6527 since matching characters is likely to be quite common. First, ensure the
6528 minimum number of matches are present. If min = max, continue at the same
6529 level without recursing. Otherwise, if minimizing, keep trying the rest of
6530 the expression and advancing one matching character if failing, up to the
6531 maximum. Alternatively, if maximizing, find the maximum number of
6532 characters and work backwards. */
6533
6534 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
6535 max, eptr));
6536
6537 if ((ims & PCRE_CASELESS) != 0)
6538 {
6539 c = md->lcc[c];
6540
6541 #ifdef SUPPORT_UTF8
6542 /* UTF-8 mode */
6543 if (md->utf8)
6544 {
6545 register int d;
6546 for (i = 1; i <= min; i++)
6547 {
6548 GETCHARINC(d, eptr);
6549 if (d < 256) d = md->lcc[d];
6550 if (c == d) return MATCH_NOMATCH;
6551 }
6552 }
6553 else
6554 #endif
6555
6556 /* Not UTF-8 mode */
6557 {
6558 for (i = 1; i <= min; i++)
6559 if (c == md->lcc[*eptr++]) return MATCH_NOMATCH;
6560 }
6561
6562 if (min == max) continue;
6563
6564 if (minimize)
6565 {
6566 #ifdef SUPPORT_UTF8
6567 /* UTF-8 mode */
6568 if (md->utf8)
6569 {
6570 register int d;
6571 for (i = min;; i++)
6572 {
6573 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6574 MATCH_NOMATCH) return rrc;
6575 GETCHARINC(d, eptr);
6576 if (d < 256) d = md->lcc[d];
6577 if (i >= max || eptr >= md->end_subject || c == d)
6578 return MATCH_NOMATCH;
6579 }
6580 }
6581 else
6582 #endif
6583 /* Not UTF-8 mode */
6584 {
6585 for (i = min;; i++)
6586 {
6587 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6588 MATCH_NOMATCH) return rrc;
6589 if (i >= max || eptr >= md->end_subject || c == md->lcc[*eptr++])
6590 return MATCH_NOMATCH;
6591 }
6592 }
6593 /* Control never gets here */
6594 }
6595
6596 /* Maximize case */
6597
6598 else
6599 {
6600 const uschar *pp = eptr;
6601
6602 #ifdef SUPPORT_UTF8
6603 /* UTF-8 mode */
6604 if (md->utf8)
6605 {
6606 register int d;
6607 for (i = min; i < max; i++)
6608 {
6609 int len = 1;
6610 if (eptr >= md->end_subject) break;
6611 GETCHARLEN(d, eptr, len);
6612 if (d < 256) d = md->lcc[d];
6613 if (c == d) break;
6614 eptr += len;
6615 }
6616 for(;;)
6617 {
6618 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6619 MATCH_NOMATCH) return rrc;
6620 if (eptr-- == pp) break; /* Stop if tried at original pos */
6621 BACKCHAR(eptr);
6622 }
6623 }
6624 else
6625 #endif
6626 /* Not UTF-8 mode */
6627 {
6628 for (i = min; i < max; i++)
6629 {
6630 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
6631 eptr++;
6632 }
6633 while (eptr >= pp)
6634 {
6635 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6636 MATCH_NOMATCH) return rrc;
6637 eptr--;
6638 }
6639 }
6640
6641 return MATCH_NOMATCH;
6642 }
6643 /* Control never gets here */
6644 }
6645
6646 /* Caseful comparisons */
6647
6648 else
6649 {
6650 #ifdef SUPPORT_UTF8
6651 /* UTF-8 mode */
6652 if (md->utf8)
6653 {
6654 register int d;
6655 for (i = 1; i <= min; i++)
6656 {
6657 GETCHARINC(d, eptr);
6658 if (c == d) return MATCH_NOMATCH;
6659 }
6660 }
6661 else
6662 #endif
6663 /* Not UTF-8 mode */
6664 {
6665 for (i = 1; i <= min; i++)
6666 if (c == *eptr++) return MATCH_NOMATCH;
6667 }
6668
6669 if (min == max) continue;
6670
6671 if (minimize)
6672 {
6673 #ifdef SUPPORT_UTF8
6674 /* UTF-8 mode */
6675 if (md->utf8)
6676 {
6677 register int d;
6678 for (i = min;; i++)
6679 {
6680 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6681 MATCH_NOMATCH) return rrc;
6682 GETCHARINC(d, eptr);
6683 if (i >= max || eptr >= md->end_subject || c == d)
6684 return MATCH_NOMATCH;
6685 }
6686 }
6687 else
6688 #endif
6689 /* Not UTF-8 mode */
6690 {
6691 for (i = min;; i++)
6692 {
6693 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6694 MATCH_NOMATCH) return rrc;
6695 if (i >= max || eptr >= md->end_subject || c == *eptr++)
6696 return MATCH_NOMATCH;
6697 }
6698 }
6699 /* Control never gets here */
6700 }
6701
6702 /* Maximize case */
6703
6704 else
6705 {
6706 const uschar *pp = eptr;
6707
6708 #ifdef SUPPORT_UTF8
6709 /* UTF-8 mode */
6710 if (md->utf8)
6711 {
6712 register int d;
6713 for (i = min; i < max; i++)
6714 {
6715 int len = 1;
6716 if (eptr >= md->end_subject) break;
6717 GETCHARLEN(d, eptr, len);
6718 if (c == d) break;
6719 eptr += len;
6720 }
6721 for(;;)
6722 {
6723 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6724 MATCH_NOMATCH) return rrc;
6725 if (eptr-- == pp) break; /* Stop if tried at original pos */
6726 BACKCHAR(eptr);
6727 }
6728 }
6729 else
6730 #endif
6731 /* Not UTF-8 mode */
6732 {
6733 for (i = min; i < max; i++)
6734 {
6735 if (eptr >= md->end_subject || c == *eptr) break;
6736 eptr++;
6737 }
6738 while (eptr >= pp)
6739 {
6740 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=