/[pcre]/code/tags/pcre-4.3/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-4.3/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 70 - (show annotations) (download)
Sat Feb 24 21:40:20 2007 UTC (7 years, 4 months ago) by nigel
File MIME type: text/plain
File size: 231769 byte(s)
Tag code/trunk as code/tags/pcre-4.3.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2003 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35 /* Define DEBUG to get debugging output on stdout. */
36
37 /* #define DEBUG */
38
39 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
40 inline, and there are *still* stupid compilers about that don't like indented
41 pre-processor statements. I suppose it's only been 10 years... */
42
43 #ifdef DEBUG
44 #define DPRINTF(p) printf p
45 #else
46 #define DPRINTF(p) /*nothing*/
47 #endif
48
49 /* Include the internals header, which itself includes Standard C headers plus
50 the external pcre header. */
51
52 #include "internal.h"
53
54
55 /* Allow compilation as C++ source code, should anybody want to do that. */
56
57 #ifdef __cplusplus
58 #define class pcre_class
59 #endif
60
61
62 /* Maximum number of items on the nested bracket stacks at compile time. This
63 applies to the nesting of all kinds of parentheses. It does not limit
64 un-nested, non-capturing parentheses. This number can be made bigger if
65 necessary - it is used to dimension one int and one unsigned char vector at
66 compile time. */
67
68 #define BRASTACK_SIZE 200
69
70
71 /* Maximum number of ints of offset to save on the stack for recursive calls.
72 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
73 because the offset vector is always a multiple of 3 long. */
74
75 #define REC_STACK_SAVE_MAX 30
76
77
78 /* The number of bytes in a literal character string above which we can't add
79 any more is set at 250 in order to allow for UTF-8 characters. (In theory it
80 could be 255 when UTF-8 support is excluded, but that means that some of the
81 test output would be different, which just complicates things.) */
82
83 #define MAXLIT 250
84
85
86 /* The maximum remaining length of subject we are prepared to search for a
87 req_byte match. */
88
89 #define REQ_BYTE_MAX 1000
90
91
92 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93 the definition is next to the definition of the opcodes in internal.h. */
94
95 static uschar OP_lengths[] = { OP_LENGTHS };
96
97 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
98
99 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101
102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape
105 is invalid. */
106
107 static const short int escapes[] = {
108 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
109 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
110 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
111 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
112 0, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
113 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
114 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
115 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
116 0, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
117 0, 0, -ESC_z /* x - z */
118 };
119
120 /* Tables of names of POSIX character classes and their lengths. The list is
121 terminated by a zero length entry. The first three must be alpha, lower, upper,
122 as this is assumed for handling case independence. */
123
/* Names of the POSIX character classes, in the same order as the entries in
the tables of name lengths and class bit maps. Both the strings and the array
of pointers are read-only, so both levels are const. */

static const char *const posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  "print", "punct", "space", "word",  "xdigit" };
128
129 static const uschar posix_name_lengths[] = {
130 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131
132 /* Table of class bit maps for each POSIX class; up to three may be combined
133 to form the class. The table for [:blank:] is dynamically modified to remove
134 the vertical space characters. */
135
136 static const int posix_class_maps[] = {
137 cbit_lower, cbit_upper, -1, /* alpha */
138 cbit_lower, -1, -1, /* lower */
139 cbit_upper, -1, -1, /* upper */
140 cbit_digit, cbit_lower, cbit_upper, /* alnum */
141 cbit_print, cbit_cntrl, -1, /* ascii */
142 cbit_space, -1, -1, /* blank - a GNU extension */
143 cbit_cntrl, -1, -1, /* cntrl */
144 cbit_digit, -1, -1, /* digit */
145 cbit_graph, -1, -1, /* graph */
146 cbit_print, -1, -1, /* print */
147 cbit_punct, -1, -1, /* punct */
148 cbit_space, -1, -1, /* space */
149 cbit_word, -1, -1, /* word - a Perl extension */
150 cbit_xdigit,-1, -1 /* xdigit */
151 };
152
153 /* Table to identify ASCII digits and hex digits. This is used when compiling
154 patterns. Note that the tables in chartables are dependent on the locale, and
155 may mark arbitrary characters as digits - but the PCRE compiling code expects
156 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
157 a private table here. It costs 256 bytes, but it is a lot faster than doing
158 character value tests (at least in some simple cases I timed), and in some
159 applications one wants PCRE to compile efficiently as well as match
160 efficiently.
161
162 For convenience, we use the same bit definitions as in chartables:
163
164 0x04 decimal digit
165 0x08 hexadecimal digit
166
167 Then we can use ctype_digit and ctype_xdigit in the code. */
168
/* Locale-independent digit table, 256 entries laid out 16 per row. An entry
is 0x0c (decimal digit bit 0x04 plus hexadecimal digit bit 0x08) for '0'-'9',
0x08 (hexadecimal digit only) for 'A'-'F' and 'a'-'f', and zero otherwise. */

static const unsigned char digitab[] =
  {
  /*   0- 15 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /*  16- 31 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /*  32- 47 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* '0'-'?' */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,
  /* '@'-'O' */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 'P'-'_' */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* '`'-'o' */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 'p'-127 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 128-143 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 144-159 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 160-175 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 176-191 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 192-207 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 208-223 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 224-239 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 240-255 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };
203
204 /* Definition to allow mutual recursion */
205
206 static BOOL
207 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
208 BOOL, int, int *, int *, branch_chain *, compile_data *);
209
210 /* Structure for building a chain of data that actually lives on the
211 stack, for holding the values of the subject pointer at the start of each
212 subpattern, so as to detect when an empty string has been matched by a
213 subpattern - to break infinite loops. */
214
215 typedef struct eptrblock {
216 struct eptrblock *prev;
217 const uschar *saved_eptr;
218 } eptrblock;
219
220 /* Flag bits for the match() function */
221
222 #define match_condassert 0x01 /* Called to check a condition assertion */
223 #define match_isgroup 0x02 /* Set if start of bracketed group */
224
225 /* Non-error returns from the match() function. Error returns are externally
226 defined PCRE_ERROR_xxx codes, which are all negative. */
227
228 #define MATCH_MATCH 1
229 #define MATCH_NOMATCH 0
230
231
232
233 /*************************************************
234 * Global variables *
235 *************************************************/
236
237 /* PCRE is thread-clean and doesn't use any global variables in the normal
238 sense. However, it calls memory allocation and free functions via the two
239 indirections below, and it can optionally do callouts. These values can be
240 changed by the caller, but are shared between all threads. However, when
241 compiling for Virtual Pascal, things are done differently (see pcre.in). */
242
243 #ifndef VPCOMPAT
244 void *(*pcre_malloc)(size_t) = malloc;
245 void (*pcre_free)(void *) = free;
246 int (*pcre_callout)(pcre_callout_block *) = NULL;
247 #endif
248
249
250 /*************************************************
251 * Macros and tables for character handling *
252 *************************************************/
253
254 /* When UTF-8 encoding is being used, a character is no longer just a single
255 byte. The macros for character handling generate simple sequences when used in
256 byte-mode, and more complicated ones for UTF-8 characters. */
257
258 #ifndef SUPPORT_UTF8
259 #define GETCHAR(c, eptr) c = *eptr;
260 #define GETCHARINC(c, eptr) c = *eptr++;
261 #define GETCHARINCTEST(c, eptr) c = *eptr++;
262 #define GETCHARLEN(c, eptr, len) c = *eptr;
263 #define BACKCHAR(eptr)
264
265 #else /* SUPPORT_UTF8 */
266
267 /* Get the next UTF-8 character, not advancing the pointer. This is called when
268 we know we are in UTF-8 mode. */
269
270 #define GETCHAR(c, eptr) \
271 c = *eptr; \
272 if ((c & 0xc0) == 0xc0) \
273 { \
274 int gcii; \
275 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
276 int gcss = 6*gcaa; \
277 c = (c & utf8_table3[gcaa]) << gcss; \
278 for (gcii = 1; gcii <= gcaa; gcii++) \
279 { \
280 gcss -= 6; \
281 c |= (eptr[gcii] & 0x3f) << gcss; \
282 } \
283 }
284
285 /* Get the next UTF-8 character, advancing the pointer. This is called when we
286 know we are in UTF-8 mode. */
287
288 #define GETCHARINC(c, eptr) \
289 c = *eptr++; \
290 if ((c & 0xc0) == 0xc0) \
291 { \
292 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
293 int gcss = 6*gcaa; \
294 c = (c & utf8_table3[gcaa]) << gcss; \
295 while (gcaa-- > 0) \
296 { \
297 gcss -= 6; \
298 c |= (*eptr++ & 0x3f) << gcss; \
299 } \
300 }
301
302 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
303
304 #define GETCHARINCTEST(c, eptr) \
305 c = *eptr++; \
306 if (md->utf8 && (c & 0xc0) == 0xc0) \
307 { \
308 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
309 int gcss = 6*gcaa; \
310 c = (c & utf8_table3[gcaa]) << gcss; \
311 while (gcaa-- > 0) \
312 { \
313 gcss -= 6; \
314 c |= (*eptr++ & 0x3f) << gcss; \
315 } \
316 }
317
318 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
319 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
320
321 #define GETCHARLEN(c, eptr, len) \
322 c = *eptr; \
323 if ((c & 0xc0) == 0xc0) \
324 { \
325 int gcii; \
326 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
327 int gcss = 6*gcaa; \
328 c = (c & utf8_table3[gcaa]) << gcss; \
329 for (gcii = 1; gcii <= gcaa; gcii++) \
330 { \
331 gcss -= 6; \
332 c |= (eptr[gcii] & 0x3f) << gcss; \
333 } \
334 len += gcaa; \
335 }
336
337 /* If the pointer is not at the start of a character, move it back until
338 it is. Called only in UTF-8 mode. */
339
340 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
341
342 #endif
343
344
345
346 /*************************************************
347 * Default character tables *
348 *************************************************/
349
350 /* A default set of character tables is included in the PCRE binary. Its source
351 is built by the maketables auxiliary program, which uses the default C ctypes
352 functions, and put in the file chartables.c. These tables are used by PCRE
353 whenever the caller of pcre_compile() does not provide an alternate set of
354 tables. */
355
356 #include "chartables.c"
357
358
359
360 #ifdef SUPPORT_UTF8
361 /*************************************************
362 * Tables for UTF-8 support *
363 *************************************************/
364
365 /* These are the breakpoints for different numbers of bytes in a UTF-8
366 character. */
367
368 static const int utf8_table1[] =
369 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
370
371 /* These are the indicator bits and the mask for the data bits to set in the
372 first byte of a character, indexed by the number of additional bytes. */
373
374 static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
375 static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
376
377 /* Table of the number of extra characters, indexed by the first character
378 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
379 0x3d. */
380
381 static const uschar utf8_table4[] = {
382 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
383 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
384 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
385 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
386
387
388 /*************************************************
389 * Convert character value to UTF-8 *
390 *************************************************/
391
392 /* This function takes an integer value in the range 0 - 0x7fffffff
393 and encodes it as a UTF-8 character in 1 to 6 bytes.
394
395 Arguments:
396 cvalue the character value
397 buffer pointer to buffer for result - at least 6 bytes long
398
399 Returns: number of bytes placed in the buffer
400 */
401
402 static int
403 ord2utf8(int cvalue, uschar *buffer)
404 {
405 register int i, j;
406 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
407 if (cvalue <= utf8_table1[i]) break;
408 buffer += i;
409 for (j = i; j > 0; j--)
410 {
411 *buffer-- = 0x80 | (cvalue & 0x3f);
412 cvalue >>= 6;
413 }
414 *buffer = utf8_table2[i] | cvalue;
415 return i + 1;
416 }
417 #endif
418
419
420
421 /*************************************************
422 * Print compiled regex *
423 *************************************************/
424
425 /* The code for doing this is held in a separate file that is also included in
426 pcretest.c. It defines a function called print_internals(). */
427
428 #ifdef DEBUG
429 #include "printint.c"
430 #endif
431
432
433
434 /*************************************************
435 * Return version string *
436 *************************************************/
437
/* Two levels of macro are needed so that the version macros are expanded
before they are turned into strings. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Returns a pointer to a static string of the form "major.minor date". */

const char *
pcre_version(void)
{
static const char version_text[] =
  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
return version_text;
}
446
447
448
449
450 /*************************************************
451 * (Obsolete) Return info about compiled pattern *
452 *************************************************/
453
454 /* This is the original "info" function. It picks potentially useful data out
455 of the private structure, but its interface was too rigid. It remains for
456 backwards compatibility. The public options are passed back in an int - though
457 the re->options field has been expanded to a long int, all the public options are
458 at the low end of it, and so even on 16-bit systems this will still be OK.
459 Therefore, I haven't changed the API for pcre_info().
460
461 Arguments:
462 external_re points to compiled code
463 optptr where to pass back the options
464 first_byte where to pass back the first character,
465 or -1 if multiline and all branches start ^,
466 or -2 otherwise
467
468 Returns: number of capturing subpatterns
469 or negative values on error
470 */
471
472 int
473 pcre_info(const pcre *external_re, int *optptr, int *first_byte)
474 {
475 const real_pcre *re = (const real_pcre *)external_re;
476 if (re == NULL) return PCRE_ERROR_NULL;
477 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
478 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
479 if (first_byte != NULL)
480 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
481 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
482 return re->top_bracket;
483 }
484
485
486
487 /*************************************************
488 * Return info about compiled pattern *
489 *************************************************/
490
491 /* This is a newer "info" function which has an extensible interface so
492 that additional items can be added compatibly.
493
494 Arguments:
495 external_re points to compiled code
496 extra_data points to extra data, or NULL
497 what what information is required
498 where where to put the information
499
500 Returns: 0 if data returned, negative on error
501 */
502
503 int
504 pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
505 void *where)
506 {
507 const real_pcre *re = (const real_pcre *)external_re;
508 const pcre_study_data *study = NULL;
509
510 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
511 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
512
513 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
514 study = extra_data->study_data;
515
516 switch (what)
517 {
518 case PCRE_INFO_OPTIONS:
519 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
520 break;
521
522 case PCRE_INFO_SIZE:
523 *((size_t *)where) = re->size;
524 break;
525
526 case PCRE_INFO_STUDYSIZE:
527 *((size_t *)where) = (study == NULL)? 0 : study->size;
528 break;
529
530 case PCRE_INFO_CAPTURECOUNT:
531 *((int *)where) = re->top_bracket;
532 break;
533
534 case PCRE_INFO_BACKREFMAX:
535 *((int *)where) = re->top_backref;
536 break;
537
538 case PCRE_INFO_FIRSTBYTE:
539 *((int *)where) =
540 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
541 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
542 break;
543
544 case PCRE_INFO_FIRSTTABLE:
545 *((const uschar **)where) =
546 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
547 study->start_bits : NULL;
548 break;
549
550 case PCRE_INFO_LASTLITERAL:
551 *((int *)where) =
552 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
553 break;
554
555 case PCRE_INFO_NAMEENTRYSIZE:
556 *((int *)where) = re->name_entry_size;
557 break;
558
559 case PCRE_INFO_NAMECOUNT:
560 *((int *)where) = re->name_count;
561 break;
562
563 case PCRE_INFO_NAMETABLE:
564 *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
565 break;
566
567 default: return PCRE_ERROR_BADOPTION;
568 }
569
570 return 0;
571 }
572
573
574
575 /*************************************************
576 * Return info about what features are configured *
577 *************************************************/
578
579 /* This is a function which has an extensible interface so that additional items
580 can be added compatibly.
581
582 Arguments:
583 what what information is required
584 where where to put the information
585
586 Returns: 0 if data returned, negative on error
587 */
588
589 int
590 pcre_config(int what, void *where)
591 {
592 switch (what)
593 {
594 case PCRE_CONFIG_UTF8:
595 #ifdef SUPPORT_UTF8
596 *((int *)where) = 1;
597 #else
598 *((int *)where) = 0;
599 #endif
600 break;
601
602 case PCRE_CONFIG_NEWLINE:
603 *((int *)where) = NEWLINE;
604 break;
605
606 case PCRE_CONFIG_LINK_SIZE:
607 *((int *)where) = LINK_SIZE;
608 break;
609
610 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
611 *((int *)where) = POSIX_MALLOC_THRESHOLD;
612 break;
613
614 case PCRE_CONFIG_MATCH_LIMIT:
615 *((unsigned int *)where) = MATCH_LIMIT;
616 break;
617
618 default: return PCRE_ERROR_BADOPTION;
619 }
620
621 return 0;
622 }
623
624
625
626 #ifdef DEBUG
627 /*************************************************
628 * Debugging function to print chars *
629 *************************************************/
630
631 /* Print a sequence of chars in printable format, stopping at the end of the
632 subject if requested.
633
634 Arguments:
635 p points to characters
636 length number to print
637 is_subject TRUE if printing from within md->start_subject
638 md pointer to matching data block, if is_subject is TRUE
639
640 Returns: nothing
641 */
642
643 static void
644 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
645 {
646 int c;
647 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
648 while (length-- > 0)
649 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
650 }
651 #endif
652
653
654
655
656 /*************************************************
657 * Handle escapes *
658 *************************************************/
659
660 /* This function is called when a \ has been encountered. It either returns a
661 positive value for a simple escape such as \n, or a negative value which
662 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
663 a positive value greater than 255 may be returned. On entry, ptr is pointing at
664 the \. On exit, it is on the final character of the escape sequence.
665
666 Arguments:
667 ptrptr points to the pattern position pointer
668 errorptr points to the pointer to the error message
669 bracount number of previous extracting brackets
670 options the options bits
671 isclass TRUE if inside a character class
672 cd pointer to char tables block
673
674 Returns: zero or positive => a data character
675 negative => a special escape sequence
676 on error, errorptr is set
677 */
678
679 static int
680 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
681 int options, BOOL isclass, compile_data *cd)
682 {
683 const uschar *ptr = *ptrptr;
684 int c, i;
685
686 /* If backslash is at the end of the pattern, it's an error. */
687
688 c = *(++ptr);
689 if (c == 0) *errorptr = ERR1;
690
691 /* Digits or letters may have special meaning; all others are literals. */
692
693 else if (c < '0' || c > 'z') {}
694
695 /* Do an initial lookup in a table. A non-zero result is something that can be
696 returned immediately. Otherwise further processing may be required. */
697
698 else if ((i = escapes[c - '0']) != 0) c = i;
699
700 /* Escapes that need further processing, or are illegal. */
701
702 else
703 {
704 const uschar *oldptr;
705 switch (c)
706 {
707 /* A number of Perl escapes are not handled by PCRE. We give an explicit
708 error. */
709
710 case 'l':
711 case 'L':
712 case 'N':
713 case 'p':
714 case 'P':
715 case 'u':
716 case 'U':
717 case 'X':
718 *errorptr = ERR37;
719 break;
720
721 /* The handling of escape sequences consisting of a string of digits
722 starting with one that is not zero is not straightforward. By experiment,
723 the way Perl works seems to be as follows:
724
725 Outside a character class, the digits are read as a decimal number. If the
726 number is less than 10, or if there are that many previous extracting
727 left brackets, then it is a back reference. Otherwise, up to three octal
728 digits are read to form an escaped byte. Thus \123 is likely to be octal
729 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
730 value is greater than 377, the least significant 8 bits are taken. Inside a
731 character class, \ followed by a digit is always an octal number. */
732
733 case '1': case '2': case '3': case '4': case '5':
734 case '6': case '7': case '8': case '9':
735
736 if (!isclass)
737 {
738 oldptr = ptr;
739 c -= '0';
740 while ((digitab[ptr[1]] & ctype_digit) != 0)
741 c = c * 10 + *(++ptr) - '0';
742 if (c < 10 || c <= bracount)
743 {
744 c = -(ESC_REF + c);
745 break;
746 }
747 ptr = oldptr; /* Put the pointer back and fall through */
748 }
749
750 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
751 generates a binary zero byte and treats the digit as a following literal.
752 Thus we have to pull back the pointer by one. */
753
754 if ((c = *ptr) >= '8')
755 {
756 ptr--;
757 c = 0;
758 break;
759 }
760
761 /* \0 always starts an octal number, but we may drop through to here with a
762 larger first octal digit. */
763
764 case '0':
765 c -= '0';
766 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
767 c = c * 8 + *(++ptr) - '0';
768 c &= 255; /* Take least significant 8 bits */
769 break;
770
771 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
772 which can be greater than 0xff, but only if the ddd are hex digits. */
773
774 case 'x':
775 #ifdef SUPPORT_UTF8
776 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
777 {
778 const uschar *pt = ptr + 2;
779 register int count = 0;
780 c = 0;
781 while ((digitab[*pt] & ctype_xdigit) != 0)
782 {
783 int cc = *pt++;
784 if (cc >= 'a') cc -= 32; /* Convert to upper case */
785 count++;
786 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
787 }
788 if (*pt == '}')
789 {
790 if (c < 0 || count > 8) *errorptr = ERR34;
791 ptr = pt;
792 break;
793 }
794 /* If the sequence of hex digits does not end with '}', then we don't
795 recognize this construct; fall through to the normal \x handling. */
796 }
797 #endif
798
799 /* Read just a single hex char */
800
801 c = 0;
802 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
803 {
804 int cc = *(++ptr);
805 if (cc >= 'a') cc -= 32; /* Convert to upper case */
806 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
807 }
808 break;
809
810 /* Other special escapes not starting with a digit are straightforward */
811
812 case 'c':
813 c = *(++ptr);
814 if (c == 0)
815 {
816 *errorptr = ERR2;
817 return 0;
818 }
819
820 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
821 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
822
823 if (c >= 'a' && c <= 'z') c -= 32;
824 c ^= 0x40;
825 break;
826
827 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
828 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
829 for Perl compatibility, it is a literal. This code looks a bit odd, but
830 there used to be some cases other than the default, and there may be again
831 in future, so I haven't "optimized" it. */
832
833 default:
834 if ((options & PCRE_EXTRA) != 0) switch(c)
835 {
836 default:
837 *errorptr = ERR3;
838 break;
839 }
840 break;
841 }
842 }
843
844 *ptrptr = ptr;
845 return c;
846 }
847
848
849
850 /*************************************************
851 * Check for counted repeat *
852 *************************************************/
853
854 /* This function is called when a '{' is encountered in a place where it might
855 start a quantifier. It looks ahead to see if it really is a quantifier or not.
856 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
857 where the ddds are digits.
858
859 Arguments:
860 p pointer to the first char after '{'
861 cd pointer to char tables block
862
863 Returns: TRUE or FALSE
864 */
865
866 static BOOL
867 is_counted_repeat(const uschar *p, compile_data *cd)
868 {
869 if ((digitab[*p++] && ctype_digit) == 0) return FALSE;
870 while ((digitab[*p] & ctype_digit) != 0) p++;
871 if (*p == '}') return TRUE;
872
873 if (*p++ != ',') return FALSE;
874 if (*p == '}') return TRUE;
875
876 if ((digitab[*p++] && ctype_digit) == 0) return FALSE;
877 while ((digitab[*p] & ctype_digit) != 0) p++;
878
879 return (*p == '}');
880 }
881
882
883
884 /*************************************************
885 * Read repeat counts *
886 *************************************************/
887
888 /* Read an item of the form {n,m} and return the values. This is called only
889 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
890 so the syntax is guaranteed to be correct, but we need to check the values.
891
892 Arguments:
893 p pointer to first char after '{'
894 minp pointer to int for min
895 maxp pointer to int for max
896 returned as -1 if no max
897 errorptr points to pointer to error message
898 cd pointer to character tables block
899
900 Returns: pointer to '}' on success;
901 current ptr on error, with errorptr set
902 */
903
904 static const uschar *
905 read_repeat_counts(const uschar *p, int *minp, int *maxp,
906 const char **errorptr, compile_data *cd)
907 {
908 int min = 0;
909 int max = -1;
910
911 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
912
913 if (*p == '}') max = min; else
914 {
915 if (*(++p) != '}')
916 {
917 max = 0;
918 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919 if (max < min)
920 {
921 *errorptr = ERR4;
922 return p;
923 }
924 }
925 }
926
927 /* Do paranoid checks, then fill in the required variables, and pass back the
928 pointer to the terminating '}'. */
929
930 if (min > 65535 || max > 65535)
931 *errorptr = ERR5;
932 else
933 {
934 *minp = min;
935 *maxp = max;
936 }
937 return p;
938 }
939
940
941
942 /*************************************************
943 * Find first significant op code *
944 *************************************************/
945
946 /* This is called by several functions that scan a compiled expression looking
947 for a fixed first character, or an anchoring op code etc. It skips over things
948 that do not influence this. For some calls, a change of option is important.
949
950 Arguments:
951 code pointer to the start of the group
952 options pointer to external options
953 optbit the option bit whose changing is significant, or
954 zero if none are
955
956 Returns: pointer to the first significant opcode
957 */
958
959 static const uschar*
960 first_significant_code(const uschar *code, int *options, int optbit)
961 {
962 for (;;)
963 {
964 switch ((int)*code)
965 {
966 case OP_OPT:
967 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
968 *options = (int)code[1];
969 code += 2;
970 break;
971
972 case OP_ASSERT_NOT:
973 case OP_ASSERTBACK:
974 case OP_ASSERTBACK_NOT:
975 do code += GET(code, 1); while (*code == OP_ALT);
976 /* Fall through */
977
978 case OP_CALLOUT:
979 case OP_CREF:
980 case OP_BRANUMBER:
981 case OP_WORD_BOUNDARY:
982 case OP_NOT_WORD_BOUNDARY:
983 code += OP_lengths[*code];
984 break;
985
986 default:
987 return code;
988 }
989 }
990 /* Control never reaches here */
991 }
992
993
994
995
996 /*************************************************
997 * Find the fixed length of a pattern *
998 *************************************************/
999
1000 /* Scan a pattern and compute the fixed length of subject that will match it,
1001 if the length is fixed. This is needed for dealing with backward assertions.
1002 In UTF8 mode, the result is in characters rather than bytes.
1003
1004 Arguments:
1005 code points to the start of the pattern (the bracket)
1006 options the compiling options
1007
1008 Returns: the fixed length, or -1 if there is no fixed length,
1009 or -2 if \C was encountered
1010 */
1011
1012 static int
1013 find_fixedlength(uschar *code, int options)
1014 {
1015 int length = -1;
1016
1017 register int branchlength = 0;
1018 register uschar *cc = code + 1 + LINK_SIZE;
1019
1020 /* Scan along the opcodes for this branch. If we get to the end of the
1021 branch, check the length against that of the other branches. */
1022
1023 for (;;)
1024 {
1025 int d;
1026 register int op = *cc;
1027 if (op >= OP_BRA) op = OP_BRA;
1028
1029 switch (op)
1030 {
1031 case OP_BRA:
1032 case OP_ONCE:
1033 case OP_COND:
1034 d = find_fixedlength(cc, options);
1035 if (d < 0) return d;
1036 branchlength += d;
1037 do cc += GET(cc, 1); while (*cc == OP_ALT);
1038 cc += 1 + LINK_SIZE;
1039 break;
1040
1041 /* Reached end of a branch; if it's a ket it is the end of a nested
1042 call. If it's ALT it is an alternation in a nested call. If it is
1043 END it's the end of the outer call. All can be handled by the same code. */
1044
1045 case OP_ALT:
1046 case OP_KET:
1047 case OP_KETRMAX:
1048 case OP_KETRMIN:
1049 case OP_END:
1050 if (length < 0) length = branchlength;
1051 else if (length != branchlength) return -1;
1052 if (*cc != OP_ALT) return length;
1053 cc += 1 + LINK_SIZE;
1054 branchlength = 0;
1055 break;
1056
1057 /* Skip over assertive subpatterns */
1058
1059 case OP_ASSERT:
1060 case OP_ASSERT_NOT:
1061 case OP_ASSERTBACK:
1062 case OP_ASSERTBACK_NOT:
1063 do cc += GET(cc, 1); while (*cc == OP_ALT);
1064 /* Fall through */
1065
1066 /* Skip over things that don't match chars */
1067
1068 case OP_REVERSE:
1069 case OP_BRANUMBER:
1070 case OP_CREF:
1071 case OP_OPT:
1072 case OP_CALLOUT:
1073 case OP_SOD:
1074 case OP_SOM:
1075 case OP_EOD:
1076 case OP_EODN:
1077 case OP_CIRC:
1078 case OP_DOLL:
1079 case OP_NOT_WORD_BOUNDARY:
1080 case OP_WORD_BOUNDARY:
1081 cc += OP_lengths[*cc];
1082 break;
1083
1084 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1085 This requires a scan of the string, unfortunately. We assume valid UTF-8
1086 strings, so all we do is reduce the length by one for every byte whose bits
1087 are 10xxxxxx. */
1088
1089 case OP_CHARS:
1090 branchlength += *(++cc);
1091 #ifdef SUPPORT_UTF8
1092 if ((options & PCRE_UTF8) != 0)
1093 for (d = 1; d <= *cc; d++)
1094 if ((cc[d] & 0xc0) == 0x80) branchlength--;
1095 #endif
1096 cc += *cc + 1;
1097 break;
1098
1099 /* Handle exact repetitions. The count is already in characters, but we
1100 need to skip over a multibyte character in UTF8 mode. */
1101
1102 case OP_EXACT:
1103 branchlength += GET2(cc,1);
1104 cc += 4;
1105 #ifdef SUPPORT_UTF8
1106 if ((options & PCRE_UTF8) != 0)
1107 {
1108 while((*cc & 0x80) == 0x80) cc++;
1109 }
1110 #endif
1111 break;
1112
1113 case OP_TYPEEXACT:
1114 branchlength += GET2(cc,1);
1115 cc += 4;
1116 break;
1117
1118 /* Handle single-char matchers */
1119
1120 case OP_NOT_DIGIT:
1121 case OP_DIGIT:
1122 case OP_NOT_WHITESPACE:
1123 case OP_WHITESPACE:
1124 case OP_NOT_WORDCHAR:
1125 case OP_WORDCHAR:
1126 case OP_ANY:
1127 branchlength++;
1128 cc++;
1129 break;
1130
1131 /* The single-byte matcher isn't allowed */
1132
1133 case OP_ANYBYTE:
1134 return -2;
1135
1136 /* Check a class for variable quantification */
1137
1138 #ifdef SUPPORT_UTF8
1139 case OP_XCLASS:
1140 cc += GET(cc, 1) - 33;
1141 /* Fall through */
1142 #endif
1143
1144 case OP_CLASS:
1145 case OP_NCLASS:
1146 cc += 33;
1147
1148 switch (*cc)
1149 {
1150 case OP_CRSTAR:
1151 case OP_CRMINSTAR:
1152 case OP_CRQUERY:
1153 case OP_CRMINQUERY:
1154 return -1;
1155
1156 case OP_CRRANGE:
1157 case OP_CRMINRANGE:
1158 if (GET2(cc,1) != GET2(cc,3)) return -1;
1159 branchlength += GET2(cc,1);
1160 cc += 5;
1161 break;
1162
1163 default:
1164 branchlength++;
1165 }
1166 break;
1167
1168 /* Anything else is variable length */
1169
1170 default:
1171 return -1;
1172 }
1173 }
1174 /* Control never gets here */
1175 }
1176
1177
1178
1179
1180 /*************************************************
1181 * Scan compiled regex for numbered bracket *
1182 *************************************************/
1183
1184 /* This little function scans through a compiled pattern until it finds a
1185 capturing bracket with the given number.
1186
1187 Arguments:
1188 code points to start of expression
1189 utf8 TRUE in UTF-8 mode
1190 number the required bracket number
1191
1192 Returns: pointer to the opcode for the bracket, or NULL if not found
1193 */
1194
1195 static const uschar *
1196 find_bracket(const uschar *code, BOOL utf8, int number)
1197 {
1198 #ifndef SUPPORT_UTF8
1199 utf8 = utf8; /* Stop pedantic compilers complaining */
1200 #endif
1201
1202 for (;;)
1203 {
1204 register int c = *code;
1205 if (c == OP_END) return NULL;
1206 else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1207 else if (c > OP_BRA)
1208 {
1209 int n = c - OP_BRA;
1210 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1211 if (n == number) return (uschar *)code;
1212 code += OP_lengths[OP_BRA];
1213 }
1214 else
1215 {
1216 code += OP_lengths[c];
1217
1218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1219 by a multi-byte character. The length in the table is a minimum, so we have
1220 to scan along to skip the extra characters. All opcodes are less than 128,
1221 so we can use relatively efficient code. */
1222
1223 #ifdef SUPPORT_UTF8
1224 if (utf8) switch(c)
1225 {
1226 case OP_EXACT:
1227 case OP_UPTO:
1228 case OP_MINUPTO:
1229 case OP_STAR:
1230 case OP_MINSTAR:
1231 case OP_PLUS:
1232 case OP_MINPLUS:
1233 case OP_QUERY:
1234 case OP_MINQUERY:
1235 while ((*code & 0xc0) == 0x80) code++;
1236 break;
1237 }
1238 #endif
1239 }
1240 }
1241 }
1242
1243
1244
1245 /*************************************************
1246 * Scan compiled branch for non-emptiness *
1247 *************************************************/
1248
1249 /* This function scans through a branch of a compiled pattern to see whether it
1250 can match the empty string or not. It is called only from could_be_empty()
1251 below. Note that first_significant_code() skips over assertions. If we hit an
1252 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1253 whose current branch will already have been scanned.
1254
1255 Arguments:
1256 code points to start of search
1257 endcode points to where to stop
1258 utf8 TRUE if in UTF8 mode
1259
1260 Returns: TRUE if what is matched could be empty
1261 */
1262
1263 static BOOL
1264 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1265 {
1266 register int c;
1267 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1268 code < endcode;
1269 code = first_significant_code(code + OP_lengths[c], NULL, 0))
1270 {
1271 const uschar *ccode;
1272
1273 c = *code;
1274
1275 if (c >= OP_BRA)
1276 {
1277 BOOL empty_branch;
1278 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1279
1280 /* Scan a closed bracket */
1281
1282 empty_branch = FALSE;
1283 do
1284 {
1285 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1286 empty_branch = TRUE;
1287 code += GET(code, 1);
1288 }
1289 while (*code == OP_ALT);
1290 if (!empty_branch) return FALSE; /* All branches are non-empty */
1291 code += 1 + LINK_SIZE;
1292 c = *code;
1293 }
1294
1295 else switch (c)
1296 {
1297 /* Check for quantifiers after a class */
1298
1299 #ifdef SUPPORT_UTF8
1300 case OP_XCLASS:
1301 ccode = code + GET(code, 1);
1302 goto CHECK_CLASS_REPEAT;
1303 #endif
1304
1305 case OP_CLASS:
1306 case OP_NCLASS:
1307 ccode = code + 33;
1308
1309 #ifdef SUPPORT_UTF8
1310 CHECK_CLASS_REPEAT:
1311 #endif
1312
1313 switch (*ccode)
1314 {
1315 case OP_CRSTAR: /* These could be empty; continue */
1316 case OP_CRMINSTAR:
1317 case OP_CRQUERY:
1318 case OP_CRMINQUERY:
1319 break;
1320
1321 default: /* Non-repeat => class must match */
1322 case OP_CRPLUS: /* These repeats aren't empty */
1323 case OP_CRMINPLUS:
1324 return FALSE;
1325
1326 case OP_CRRANGE:
1327 case OP_CRMINRANGE:
1328 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1329 break;
1330 }
1331 break;
1332
1333 /* Opcodes that must match a character */
1334
1335 case OP_NOT_DIGIT:
1336 case OP_DIGIT:
1337 case OP_NOT_WHITESPACE:
1338 case OP_WHITESPACE:
1339 case OP_NOT_WORDCHAR:
1340 case OP_WORDCHAR:
1341 case OP_ANY:
1342 case OP_ANYBYTE:
1343 case OP_CHARS:
1344 case OP_NOT:
1345 case OP_PLUS:
1346 case OP_MINPLUS:
1347 case OP_EXACT:
1348 case OP_NOTPLUS:
1349 case OP_NOTMINPLUS:
1350 case OP_NOTEXACT:
1351 case OP_TYPEPLUS:
1352 case OP_TYPEMINPLUS:
1353 case OP_TYPEEXACT:
1354 return FALSE;
1355
1356 /* End of branch */
1357
1358 case OP_KET:
1359 case OP_KETRMAX:
1360 case OP_KETRMIN:
1361 case OP_ALT:
1362 return TRUE;
1363
1364 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1365 followed by a multibyte character */
1366
1367 #ifdef SUPPORT_UTF8
1368 case OP_STAR:
1369 case OP_MINSTAR:
1370 case OP_QUERY:
1371 case OP_MINQUERY:
1372 case OP_UPTO:
1373 case OP_MINUPTO:
1374 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1375 break;
1376 #endif
1377 }
1378 }
1379
1380 return TRUE;
1381 }
1382
1383
1384
1385 /*************************************************
1386 * Scan compiled regex for non-emptiness *
1387 *************************************************/
1388
1389 /* This function is called to check for left recursive calls. We want to check
1390 the current branch of the current pattern to see if it could match the empty
1391 string. If it could, we must look outwards for branches at other levels,
1392 stopping when we pass beyond the bracket which is the subject of the recursion.
1393
1394 Arguments:
1395 code points to start of the recursion
1396 endcode points to where to stop (current RECURSE item)
1397 bcptr points to the chain of current (unclosed) branch starts
1398 utf8 TRUE if in UTF-8 mode
1399
1400 Returns: TRUE if what is matched could be empty
1401 */
1402
1403 static BOOL
1404 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1405 BOOL utf8)
1406 {
1407 while (bcptr != NULL && bcptr->current >= code)
1408 {
1409 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1410 bcptr = bcptr->outer;
1411 }
1412 return TRUE;
1413 }
1414
1415
1416
1417 /*************************************************
1418 * Check for POSIX class syntax *
1419 *************************************************/
1420
1421 /* This function is called when the sequence "[:" or "[." or "[=" is
1422 encountered in a character class. It checks whether this is followed by an
1423 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1424 ".]" or "=]".
1425
1426 Argument:
1427 ptr pointer to the initial [
1428 endptr where to return the end pointer
1429 cd pointer to compile data
1430
1431 Returns: TRUE or FALSE
1432 */
1433
1434 static BOOL
1435 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1436 {
1437 int terminator; /* Don't combine these lines; the Solaris cc */
1438 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1439 if (*(++ptr) == '^') ptr++;
1440 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1441 if (*ptr == terminator && ptr[1] == ']')
1442 {
1443 *endptr = ptr;
1444 return TRUE;
1445 }
1446 return FALSE;
1447 }
1448
1449
1450
1451
1452 /*************************************************
1453 * Check POSIX class name *
1454 *************************************************/
1455
1456 /* This function is called to check the name given in a POSIX-style class entry
1457 such as [:alnum:].
1458
1459 Arguments:
1460 ptr points to the first letter
1461 len the length of the name
1462
1463 Returns: a value representing the name, or -1 if unknown
1464 */
1465
1466 static int
1467 check_posix_name(const uschar *ptr, int len)
1468 {
1469 register int yield = 0;
1470 while (posix_name_lengths[yield] != 0)
1471 {
1472 if (len == posix_name_lengths[yield] &&
1473 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1474 yield++;
1475 }
1476 return -1;
1477 }
1478
1479
1480
1481
1482 /*************************************************
1483 * Compile one branch *
1484 *************************************************/
1485
1486 /* Scan the pattern, compiling it into the code vector. If the options are
1487 changed during the branch, the pointer is used to change the external options
1488 bits.
1489
1490 Arguments:
1491 optionsptr pointer to the option bits
1492 brackets points to number of extracting brackets used
1493 code points to the pointer to the current code point
1494 ptrptr points to the current pattern pointer
1495 errorptr points to pointer to error message
1496 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1497 reqbyteptr set to the last literal character required, else < 0
1498 bcptr points to current branch chain
1499 cd contains pointers to tables etc.
1500
1501 Returns: TRUE on success
1502 FALSE, with *errorptr set on error
1503 */
1504
1505 static BOOL
1506 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1507 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1508 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1509 {
1510 int repeat_type, op_type;
1511 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1512 int bravalue = 0;
1513 int length;
1514 int greedy_default, greedy_non_default;
1515 int firstbyte, reqbyte;
1516 int zeroreqbyte, zerofirstbyte;
1517 int req_caseopt, reqvary, tempreqvary;
1518 int condcount = 0;
1519 int options = *optionsptr;
1520 register int c;
1521 register uschar *code = *codeptr;
1522 uschar *tempcode;
1523 BOOL inescq = FALSE;
1524 BOOL groupsetfirstbyte = FALSE;
1525 const uschar *ptr = *ptrptr;
1526 const uschar *tempptr;
1527 uschar *previous = NULL;
1528 uschar class[32];
1529
1530 #ifdef SUPPORT_UTF8
1531 BOOL class_utf8;
1532 BOOL utf8 = (options & PCRE_UTF8) != 0;
1533 uschar *class_utf8data;
1534 uschar utf8_char[6];
1535 #else
1536 BOOL utf8 = FALSE;
1537 #endif
1538
1539 /* Set up the default and non-default settings for greediness */
1540
1541 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1542 greedy_non_default = greedy_default ^ 1;
1543
1544 /* Initialize no first char, no required char. REQ_UNSET means "no char
1545 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1546 matches a non-fixed char first char; reqbyte just remains unset if we never
1547 find one.
1548
1549 When we hit a repeat whose minimum is zero, we may have to adjust these values
1550 to take the zero repeat into account. This is implemented by setting them to
1551 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1552 item types that can be repeated set these backoff variables appropriately. */
1553
1554 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1555
1556 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1557 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1558 value > 255. It is added into the firstbyte or reqbyte variables to record the
1559 case status of the value. */
1560
1561 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1562
1563 /* Switch on next character until the end of the branch */
1564
1565 for (;; ptr++)
1566 {
1567 BOOL negate_class;
1568 BOOL possessive_quantifier;
1569 int class_charcount;
1570 int class_lastchar;
1571 int newoptions;
1572 int recno;
1573 int skipbytes;
1574 int subreqbyte;
1575 int subfirstbyte;
1576
1577 c = *ptr;
1578 if (inescq && c != 0) goto NORMAL_CHAR;
1579
1580 if ((options & PCRE_EXTENDED) != 0)
1581 {
1582 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1583 if (c == '#')
1584 {
1585 /* The space before the ; is to avoid a warning on a silly compiler
1586 on the Macintosh. */
1587 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1588 if (c != 0) continue; /* Else fall through to handle end of string */
1589 }
1590 }
1591
1592 switch(c)
1593 {
1594 /* The branch terminates at end of string, |, or ). */
1595
1596 case 0:
1597 case '|':
1598 case ')':
1599 *firstbyteptr = firstbyte;
1600 *reqbyteptr = reqbyte;
1601 *codeptr = code;
1602 *ptrptr = ptr;
1603 return TRUE;
1604
1605 /* Handle single-character metacharacters. In multiline mode, ^ disables
1606 the setting of any following char as a first character. */
1607
1608 case '^':
1609 if ((options & PCRE_MULTILINE) != 0)
1610 {
1611 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1612 }
1613 previous = NULL;
1614 *code++ = OP_CIRC;
1615 break;
1616
1617 case '$':
1618 previous = NULL;
1619 *code++ = OP_DOLL;
1620 break;
1621
1622 /* There can never be a first char if '.' is first, whatever happens about
1623 repeats. The value of reqbyte doesn't change either. */
1624
1625 case '.':
1626 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1627 zerofirstbyte = firstbyte;
1628 zeroreqbyte = reqbyte;
1629 previous = code;
1630 *code++ = OP_ANY;
1631 break;
1632
1633 /* Character classes. If the included characters are all < 255 in value, we
1634 build a 32-byte bitmap of the permitted characters, except in the special
1635 case where there is only one such character. For negated classes, we build
1636 the map as usual, then invert it at the end. However, we use a different
1637 opcode so that data characters > 255 can be handled correctly.
1638
1639 If the class contains characters outside the 0-255 range, a different
1640 opcode is compiled. It may optionally have a bit map for characters < 256,
1641 but those above are explicitly listed afterwards. A flag byte tells
1642 whether the bitmap is present, and whether this is a negated class or not.
1643 */
1644
1645 case '[':
1646 previous = code;
1647
1648 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1649 they are encountered at the top level, so we'll do that too. */
1650
1651 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1652 check_posix_syntax(ptr, &tempptr, cd))
1653 {
1654 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1655 goto FAILED;
1656 }
1657
1658 /* If the first character is '^', set the negation flag and skip it. */
1659
1660 if ((c = *(++ptr)) == '^')
1661 {
1662 negate_class = TRUE;
1663 c = *(++ptr);
1664 }
1665 else
1666 {
1667 negate_class = FALSE;
1668 }
1669
1670 /* Keep a count of chars with values < 256 so that we can optimize the case
1671 of just a single character (as long as it's < 256). For higher valued UTF-8
1672 characters, we don't yet do any optimization. */
1673
1674 class_charcount = 0;
1675 class_lastchar = -1;
1676
1677 #ifdef SUPPORT_UTF8
1678 class_utf8 = FALSE; /* No chars >= 256 */
1679 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1680 #endif
1681
1682 /* Initialize the 32-char bit map to all zeros. We have to build the
1683 map in a temporary bit of store, in case the class contains only 1
1684 character (< 256), because in that case the compiled code doesn't use the
1685 bit map. */
1686
1687 memset(class, 0, 32 * sizeof(uschar));
1688
1689 /* Process characters until ] is reached. By writing this as a "do" it
1690 means that an initial ] is taken as a data character. The first pass
1691 through the regex checked the overall syntax, so we don't need to be very
1692 strict here. At the start of the loop, c contains the first byte of the
1693 character. */
1694
1695 do
1696 {
1697 #ifdef SUPPORT_UTF8
1698 if (utf8 && c > 127)
1699 { /* Braces are required because the */
1700 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1701 }
1702 #endif
1703
1704 /* Inside \Q...\E everything is literal except \E */
1705
1706 if (inescq)
1707 {
1708 if (c == '\\' && ptr[1] == 'E')
1709 {
1710 inescq = FALSE;
1711 ptr++;
1712 continue;
1713 }
1714 else goto LONE_SINGLE_CHARACTER;
1715 }
1716
1717 /* Handle POSIX class names. Perl allows a negation extension of the
1718 form [:^name:]. A square bracket that doesn't match the syntax is
1719 treated as a literal. We also recognize the POSIX constructions
1720 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1721 5.6 and 5.8 do. */
1722
1723 if (c == '[' &&
1724 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1725 check_posix_syntax(ptr, &tempptr, cd))
1726 {
1727 BOOL local_negate = FALSE;
1728 int posix_class, i;
1729 register const uschar *cbits = cd->cbits;
1730
1731 if (ptr[1] != ':')
1732 {
1733 *errorptr = ERR31;
1734 goto FAILED;
1735 }
1736
1737 ptr += 2;
1738 if (*ptr == '^')
1739 {
1740 local_negate = TRUE;
1741 ptr++;
1742 }
1743
1744 posix_class = check_posix_name(ptr, tempptr - ptr);
1745 if (posix_class < 0)
1746 {
1747 *errorptr = ERR30;
1748 goto FAILED;
1749 }
1750
1751 /* If matching is caseless, upper and lower are converted to
1752 alpha. This relies on the fact that the class table starts with
1753 alpha, lower, upper as the first 3 entries. */
1754
1755 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1756 posix_class = 0;
1757
1758 /* Or into the map we are building up to 3 of the static class
1759 tables, or their negations. The [:blank:] class sets up the same
1760 chars as the [:space:] class (all white space). We remove the vertical
1761 white space chars afterwards. */
1762
1763 posix_class *= 3;
1764 for (i = 0; i < 3; i++)
1765 {
1766 BOOL isblank = strncmp((char *)ptr, "blank", 5) == 0;
1767 int taboffset = posix_class_maps[posix_class + i];
1768 if (taboffset < 0) break;
1769 if (local_negate)
1770 {
1771 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1772 if (isblank) class[1] |= 0x3c;
1773 }
1774 else
1775 {
1776 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1777 if (isblank) class[1] &= ~0x3c;
1778 }
1779 }
1780
1781 ptr = tempptr + 1;
1782 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1783 continue; /* End of POSIX syntax handling */
1784 }
1785
1786 /* Backslash may introduce a single character, or it may introduce one
1787 of the specials, which just set a flag. Escaped items are checked for
1788 validity in the pre-compiling pass. The sequence \b is a special case.
1789 Inside a class (and only there) it is treated as backspace. Elsewhere
1790 it marks a word boundary. Other escapes have preset maps ready to
1791 or into the one we are building. We assume they have more than one
1792 character in them, so set class_charcount bigger than one. */
1793
1794 if (c == '\\')
1795 {
1796 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1797 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1798
1799 if (-c == ESC_Q) /* Handle start of quoted string */
1800 {
1801 if (ptr[1] == '\\' && ptr[2] == 'E')
1802 {
1803 ptr += 2; /* avoid empty string */
1804 }
1805 else inescq = TRUE;
1806 continue;
1807 }
1808
1809 else if (c < 0)
1810 {
1811 register const uschar *cbits = cd->cbits;
1812 class_charcount = 10; /* Greater than 1 is what matters */
1813 switch (-c)
1814 {
1815 case ESC_d:
1816 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1817 continue;
1818
1819 case ESC_D:
1820 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1821 continue;
1822
1823 case ESC_w:
1824 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1825 continue;
1826
1827 case ESC_W:
1828 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1829 continue;
1830
1831 case ESC_s:
1832 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1833 class[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1834 continue;
1835
1836 case ESC_S:
1837 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1838 class[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1839 continue;
1840
1841 /* Unrecognized escapes are faulted if PCRE is running in its
1842 strict mode. By default, for compatibility with Perl, they are
1843 treated as literals. */
1844
1845 default:
1846 if ((options & PCRE_EXTRA) != 0)
1847 {
1848 *errorptr = ERR7;
1849 goto FAILED;
1850 }
1851 c = *ptr; /* The final character */
1852 }
1853 }
1854
1855 /* Fall through if we have a single character (c >= 0). This may be
1856 > 256 in UTF-8 mode. */
1857
1858 } /* End of backslash handling */
1859
1860 /* A single character may be followed by '-' to form a range. However,
1861 Perl does not permit ']' to be the end of the range. A '-' character
1862 here is treated as a literal. */
1863
1864 if (ptr[1] == '-' && ptr[2] != ']')
1865 {
1866 int d;
1867 ptr += 2;
1868
1869 #ifdef SUPPORT_UTF8
1870 if (utf8)
1871 { /* Braces are required because the */
1872 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1873 }
1874 else
1875 #endif
1876 d = *ptr;
1877
1878 /* The second part of a range can be a single-character escape, but
1879 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1880 in such circumstances. */
1881
1882 if (d == '\\')
1883 {
1884 const uschar *oldptr = ptr;
1885 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1886
1887 /* \b is backspace; any other special means the '-' was literal */
1888
1889 if (d < 0)
1890 {
1891 if (d == -ESC_b) d = '\b'; else
1892 {
1893 ptr = oldptr - 2;
1894 goto LONE_SINGLE_CHARACTER; /* A few lines below */
1895 }
1896 }
1897 }
1898
1899 /* Check that the two values are in the correct order */
1900
1901 if (d < c)
1902 {
1903 *errorptr = ERR8;
1904 goto FAILED;
1905 }
1906
1907 /* If d is greater than 255, we can't just use the bit map, so set up
1908 for the UTF-8 supporting class type. If we are not caseless, we can
1909 just set up a single range. If we are caseless, the characters < 256
1910 are handled with a bitmap, in order to get the case-insensitive
1911 handling. */
1912
1913 #ifdef SUPPORT_UTF8
1914 if (d > 255)
1915 {
1916 class_utf8 = TRUE;
1917 *class_utf8data++ = XCL_RANGE;
1918 if ((options & PCRE_CASELESS) == 0)
1919 {
1920 class_utf8data += ord2utf8(c, class_utf8data);
1921 class_utf8data += ord2utf8(d, class_utf8data);
1922 continue; /* Go get the next char in the class */
1923 }
1924 class_utf8data += ord2utf8(256, class_utf8data);
1925 class_utf8data += ord2utf8(d, class_utf8data);
1926 d = 255;
1927 /* Fall through */
1928 }
1929 #endif
1930 /* We use the bit map if the range is entirely < 255, or if part of it
1931 is < 255 and matching is caseless. */
1932
1933 for (; c <= d; c++)
1934 {
1935 class[c/8] |= (1 << (c&7));
1936 if ((options & PCRE_CASELESS) != 0)
1937 {
1938 int uc = cd->fcc[c]; /* flip case */
1939 class[uc/8] |= (1 << (uc&7));
1940 }
1941 class_charcount++; /* in case a one-char range */
1942 class_lastchar = c;
1943 }
1944
1945 continue; /* Go get the next char in the class */
1946 }
1947
1948 /* Handle a lone single character - we can get here for a normal
1949 non-escape char, or after \ that introduces a single character. */
1950
1951 LONE_SINGLE_CHARACTER:
1952
1953 /* Handle a multibyte character */
1954
1955 #ifdef SUPPORT_UTF8
1956 if (utf8 && c > 255)
1957 {
1958 class_utf8 = TRUE;
1959 *class_utf8data++ = XCL_SINGLE;
1960 class_utf8data += ord2utf8(c, class_utf8data);
1961 }
1962 else
1963 #endif
1964 /* Handle a single-byte character */
1965 {
1966 class [c/8] |= (1 << (c&7));
1967 if ((options & PCRE_CASELESS) != 0)
1968 {
1969 c = cd->fcc[c]; /* flip case */
1970 class[c/8] |= (1 << (c&7));
1971 }
1972 class_charcount++;
1973 class_lastchar = c;
1974 }
1975 }
1976
1977 /* Loop until ']' reached; the check for end of string happens inside the
1978 loop. This "while" is the end of the "do" above. */
1979
1980 while ((c = *(++ptr)) != ']' || inescq);
1981
1982 /* If class_charcount is 1, we saw precisely one character with a value <
1983 256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
1984 the one character is < 128. In non-UTF-8 mode we can always optimize.
1985
1986 The optimization throws away the bit map. We turn the item into a
1987 1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
1988 that OP_NOT does not support multibyte characters. In the positive case, it
1989 can cause firstbyte to be set. Otherwise, there can be no first char if
1990 this item is first, whatever repeat count may follow. In the case of
1991 reqbyte, save the previous value for reinstating. */
1992
1993 #ifdef SUPPORT_UTF8
1994 if (class_charcount == 1 &&
1995 (!utf8 ||
1996 (!class_utf8 && class_lastchar < 128)))
1997 #else
1998 if (class_charcount == 1)
1999 #endif
2000 {
2001 zeroreqbyte = reqbyte;
2002 if (negate_class)
2003 {
2004 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2005 zerofirstbyte = firstbyte;
2006 *code++ = OP_NOT;
2007 }
2008 else
2009 {
2010 if (firstbyte == REQ_UNSET)
2011 {
2012 zerofirstbyte = REQ_NONE;
2013 firstbyte = class_lastchar | req_caseopt;
2014 }
2015 else
2016 {
2017 zerofirstbyte = firstbyte;
2018 reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
2019 }
2020 *code++ = OP_CHARS;
2021 *code++ = 1;
2022 }
2023 *code++ = class_lastchar;
2024 break; /* End of class handling */
2025 } /* End of 1-byte optimization */
2026
2027 /* Otherwise, if this is the first thing in the branch, there can be no
2028 first char setting, whatever the repeat count. Any reqbyte setting must
2029 remain unchanged after any kind of repeat. */
2030
2031 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2032 zerofirstbyte = firstbyte;
2033 zeroreqbyte = reqbyte;
2034
2035 /* If there are characters with values > 255, we have to compile an
2036 extended class, with its own opcode. If there are no characters < 256,
2037 we can omit the bitmap. */
2038
2039 #ifdef SUPPORT_UTF8
2040 if (class_utf8)
2041 {
2042 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2043 *code++ = OP_XCLASS;
2044 code += LINK_SIZE;
2045 *code = negate_class? XCL_NOT : 0;
2046
2047 /* If the map is required, install it, and move on to the end of
2048 the extra data */
2049
2050 if (class_charcount > 0)
2051 {
2052 *code++ |= XCL_MAP;
2053 memcpy(code, class, 32);
2054 code = class_utf8data;
2055 }
2056
2057 /* If the map is not required, slide down the extra data. */
2058
2059 else
2060 {
2061 int len = class_utf8data - (code + 33);
2062 memmove(code + 1, code + 33, len);
2063 code += len + 1;
2064 }
2065
2066 /* Now fill in the complete length of the item */
2067
2068 PUT(previous, 1, code - previous);
2069 break; /* End of class handling */
2070 }
2071 #endif
2072
2073 /* If there are no characters > 255, negate the 32-byte map if necessary,
2074 and copy it into the code vector. If this is the first thing in the branch,
2075 there can be no first char setting, whatever the repeat count. Any reqbyte
2076 setting must remain unchanged after any kind of repeat. */
2077
2078 if (negate_class)
2079 {
2080 *code++ = OP_NCLASS;
2081 for (c = 0; c < 32; c++) code[c] = ~class[c];
2082 }
2083 else
2084 {
2085 *code++ = OP_CLASS;
2086 memcpy(code, class, 32);
2087 }
2088 code += 32;
2089 break;
2090
2091 /* Various kinds of repeat */
2092
2093 case '{':
2094 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
2095 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
2096 if (*errorptr != NULL) goto FAILED;
2097 goto REPEAT;
2098
2099 case '*':
2100 repeat_min = 0;
2101 repeat_max = -1;
2102 goto REPEAT;
2103
2104 case '+':
2105 repeat_min = 1;
2106 repeat_max = -1;
2107 goto REPEAT;
2108
2109 case '?':
2110 repeat_min = 0;
2111 repeat_max = 1;
2112
2113 REPEAT:
2114 if (previous == NULL)
2115 {
2116 *errorptr = ERR9;
2117 goto FAILED;
2118 }
2119
2120 if (repeat_min == 0)
2121 {
2122 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2123 reqbyte = zeroreqbyte; /* Ditto */
2124 }
2125
2126 /* Remember whether this is a variable length repeat */
2127
2128 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2129
2130 op_type = 0; /* Default single-char op codes */
2131 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2132
2133 /* Save start of previous item, in case we have to move it up to make space
2134 for an inserted OP_ONCE for the additional '+' extension. */
2135
2136 tempcode = previous;
2137
2138 /* If the next character is '+', we have a possessive quantifier. This
2139 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2140 If the next character is '?' this is a minimizing repeat, by default,
2141 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2142 repeat type to the non-default. */
2143
2144 if (ptr[1] == '+')
2145 {
2146 repeat_type = 0; /* Force greedy */
2147 possessive_quantifier = TRUE;
2148 ptr++;
2149 }
2150 else if (ptr[1] == '?')
2151 {
2152 repeat_type = greedy_non_default;
2153 ptr++;
2154 }
2155 else repeat_type = greedy_default;
2156
2157 /* If previous was a recursion, we need to wrap it inside brackets so that
2158 it can be replicated if necessary. */
2159
2160 if (*previous == OP_RECURSE)
2161 {
2162 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2163 code += 1 + LINK_SIZE;
2164 *previous = OP_BRA;
2165 PUT(previous, 1, code - previous);
2166 *code = OP_KET;
2167 PUT(code, 1, code - previous);
2168 code += 1 + LINK_SIZE;
2169 }
2170
2171 /* If previous was a string of characters, chop off the last one and use it
2172 as the subject of the repeat. If there was only one character, we can
2173 abolish the previous item altogether. If a one-char item has a minimum of
2174 more than one, ensure that it is set in reqbyte - it might not be if a
2175 sequence such as x{3} is the first thing in a branch because the x will
2176 have gone into firstbyte instead. */
2177
2178 if (*previous == OP_CHARS)
2179 {
2180 /* Deal with UTF-8 characters that take up more than one byte. It's
2181 easier to write this out separately than try to macrify it. Use c to
2182 hold the length of the character in bytes, plus 0x80 to flag that it's a
2183 length rather than a small character. */
2184
2185 #ifdef SUPPORT_UTF8
2186 if (utf8 && (code[-1] & 0x80) != 0)
2187 {
2188 uschar *lastchar = code - 1;
2189 while((*lastchar & 0xc0) == 0x80) lastchar--;
2190 c = code - lastchar; /* Length of UTF-8 character */
2191 memcpy(utf8_char, lastchar, c); /* Save the char */
2192 if (lastchar == previous + 2) /* There was only one character */
2193 {
2194 code = previous; /* Abolish the previous item */
2195 }
2196 else
2197 {
2198 previous[1] -= c; /* Adjust length of previous */
2199 code = lastchar; /* Lost char off the end */
2200 tempcode = code; /* Adjust position to be moved for '+' */
2201 }
2202 c |= 0x80; /* Flag c as a length */
2203 }
2204 else
2205 #endif
2206
2207 /* Handle the case of a single byte - either with no UTF8 support, or
2208 with UTF-8 disabled, or for a UTF-8 character < 128. */
2209
2210 {
2211 c = *(--code);
2212 if (code == previous + 2) /* There was only one character */
2213 {
2214 code = previous; /* Abolish the previous item */
2215 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2216 }
2217 else
2218 {
2219 previous[1]--; /* adjust length */
2220 tempcode = code; /* Adjust position to be moved for '+' */
2221 }
2222 }
2223
2224 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2225 }
2226
2227 /* If previous was a single negated character ([^a] or similar), we use
2228 one of the special opcodes, replacing it. The code is shared with single-
2229 character repeats by setting op_type to add a suitable offset into
2230 repeat_type. OP_NOT is currently used only for single-byte chars. */
2231
2232 else if (*previous == OP_NOT)
2233 {
2234 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2235 c = previous[1];
2236 code = previous;
2237 goto OUTPUT_SINGLE_REPEAT;
2238 }
2239
2240 /* If previous was a character type match (\d or similar), abolish it and
2241 create a suitable repeat item. The code is shared with single-character
2242 repeats by setting op_type to add a suitable offset into repeat_type. */
2243
2244 else if (*previous < OP_EODN)
2245 {
2246 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2247 c = *previous;
2248 code = previous;
2249
2250 OUTPUT_SINGLE_REPEAT:
2251
2252 /* If the maximum is zero then the minimum must also be zero; Perl allows
2253 this case, so we do too - by simply omitting the item altogether. */
2254
2255 if (repeat_max == 0) goto END_REPEAT;
2256
2257 /* Combine the op_type with the repeat_type */
2258
2259 repeat_type += op_type;
2260
2261 /* A minimum of zero is handled either as the special case * or ?, or as
2262 an UPTO, with the maximum given. */
2263
2264 if (repeat_min == 0)
2265 {
2266 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2267 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2268 else
2269 {
2270 *code++ = OP_UPTO + repeat_type;
2271 PUT2INC(code, 0, repeat_max);
2272 }
2273 }
2274
2275 /* The case {1,} is handled as the special case + */
2276
2277 else if (repeat_min == 1 && repeat_max == -1)
2278 *code++ = OP_PLUS + repeat_type;
2279
2280 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2281 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
2282
2283 else
2284 {
2285 if (repeat_min != 1)
2286 {
2287 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2288 PUT2INC(code, 0, repeat_min);
2289 }
2290
2291 /* If the minimum is 1 and the previous item was a character string,
2292 we either have to put back the item that got cancelled if the string
2293 length was 1, or add the character back onto the end of a longer
2294 string. For a character type nothing need be done; it will just get
2295 put back naturally. Note that the final character is always going to
2296 get added below, so we leave code ready for its insertion. */
2297
2298 else if (*previous == OP_CHARS)
2299 {
2300 if (code == previous) code += 2; else
2301
2302 /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2303 bit set as a flag. The length will always be between 2 and 6. */
2304
2305 #ifdef SUPPORT_UTF8
2306 if (utf8 && c >= 128) previous[1] += c & 7; else
2307 #endif
2308 previous[1]++;
2309 }
2310
2311 /* For a single negated character we also have to put back the
2312 item that got cancelled. At present this applies only to single byte
2313 characters in any mode. */
2314
2315 else if (*previous == OP_NOT) code++;
2316
2317 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2318 we have to insert the character for the previous code. In UTF-8 mode,
2319 long characters have their length in c, with the 0x80 bit as a flag. */
2320
2321 if (repeat_max < 0)
2322 {
2323 #ifdef SUPPORT_UTF8
2324 if (utf8 && c >= 128)
2325 {
2326 memcpy(code, utf8_char, c & 7);
2327 code += c & 7;
2328 }
2329 else
2330 #endif
2331 *code++ = c;
2332 *code++ = OP_STAR + repeat_type;
2333 }
2334
2335 /* Else insert an UPTO if the max is greater than the min, again
2336 preceded by the character, for the previously inserted code. */
2337
2338 else if (repeat_max != repeat_min)
2339 {
2340 #ifdef SUPPORT_UTF8
2341 if (utf8 && c >= 128)
2342 {
2343 memcpy(code, utf8_char, c & 7);
2344 code += c & 7;
2345 }
2346 else
2347 #endif
2348 *code++ = c;
2349 repeat_max -= repeat_min;
2350 *code++ = OP_UPTO + repeat_type;
2351 PUT2INC(code, 0, repeat_max);
2352 }
2353 }
2354
2355 /* The character or character type itself comes last in all cases. */
2356
2357 #ifdef SUPPORT_UTF8
2358 if (utf8 && c >= 128)
2359 {
2360 memcpy(code, utf8_char, c & 7);
2361 code += c & 7;
2362 }
2363 else
2364 #endif
2365
2366 *code++ = c;
2367 }
2368
2369 /* If previous was a character class or a back reference, we put the repeat
2370 stuff after it, but just skip the item if the repeat was {0,0}. */
2371
2372 else if (*previous == OP_CLASS ||
2373 *previous == OP_NCLASS ||
2374 #ifdef SUPPORT_UTF8
2375 *previous == OP_XCLASS ||
2376 #endif
2377 *previous == OP_REF)
2378 {
2379 if (repeat_max == 0)
2380 {
2381 code = previous;
2382 goto END_REPEAT;
2383 }
2384 if (repeat_min == 0 && repeat_max == -1)
2385 *code++ = OP_CRSTAR + repeat_type;
2386 else if (repeat_min == 1 && repeat_max == -1)
2387 *code++ = OP_CRPLUS + repeat_type;
2388 else if (repeat_min == 0 && repeat_max == 1)
2389 *code++ = OP_CRQUERY + repeat_type;
2390 else
2391 {
2392 *code++ = OP_CRRANGE + repeat_type;
2393 PUT2INC(code, 0, repeat_min);
2394 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2395 PUT2INC(code, 0, repeat_max);
2396 }
2397 }
2398
2399 /* If previous was a bracket group, we may have to replicate it in certain
2400 cases. */
2401
2402 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2403 *previous == OP_COND)
2404 {
2405 register int i;
2406 int ketoffset = 0;
2407 int len = code - previous;
2408 uschar *bralink = NULL;
2409
2410 /* If the maximum repeat count is unlimited, find the end of the bracket
2411 by scanning through from the start, and compute the offset back to it
2412 from the current code pointer. There may be an OP_OPT setting following
2413 the final KET, so we can't find the end just by going back from the code
2414 pointer. */
2415
2416 if (repeat_max == -1)
2417 {
2418 register uschar *ket = previous;
2419 do ket += GET(ket, 1); while (*ket != OP_KET);
2420 ketoffset = code - ket;
2421 }
2422
2423 /* The case of a zero minimum is special because of the need to stick
2424 OP_BRAZERO in front of it, and because the group appears once in the
2425 data, whereas in other cases it appears the minimum number of times. For
2426 this reason, it is simplest to treat this case separately, as otherwise
2427 the code gets far too messy. There are several special subcases when the
2428 minimum is zero. */
2429
2430 if (repeat_min == 0)
2431 {
2432 /* If the maximum is also zero, we just omit the group from the output
2433 altogether. */
2434
2435 if (repeat_max == 0)
2436 {
2437 code = previous;
2438 goto END_REPEAT;
2439 }
2440
2441 /* If the maximum is 1 or unlimited, we just have to stick in the
2442 BRAZERO and do no more at this point. */
2443
2444 if (repeat_max <= 1)
2445 {
2446 memmove(previous+1, previous, len);
2447 code++;
2448 *previous++ = OP_BRAZERO + repeat_type;
2449 }
2450
2451 /* If the maximum is greater than 1 and limited, we have to replicate
2452 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2453 The first one has to be handled carefully because it's the original
2454 copy, which has to be moved up. The remainder can be handled by code
2455 that is common with the non-zero minimum case below. We just have to
2456 adjust the value of repeat_max, since one less copy is required. */
2457
2458 else
2459 {
2460 int offset;
2461 memmove(previous + 2 + LINK_SIZE, previous, len);
2462 code += 2 + LINK_SIZE;
2463 *previous++ = OP_BRAZERO + repeat_type;
2464 *previous++ = OP_BRA;
2465
2466 /* We chain together the bracket offset fields that have to be
2467 filled in later when the ends of the brackets are reached. */
2468
2469 offset = (bralink == NULL)? 0 : previous - bralink;
2470 bralink = previous;
2471 PUTINC(previous, 0, offset);
2472 }
2473
2474 repeat_max--;
2475 }
2476
2477 /* If the minimum is greater than zero, replicate the group as many
2478 times as necessary, and adjust the maximum to the number of subsequent
2479 copies that we need. If we set a first char from the group, and didn't
2480 set a required char, copy the latter from the former. */
2481
2482 else
2483 {
2484 if (repeat_min > 1)
2485 {
2486 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2487 for (i = 1; i < repeat_min; i++)
2488 {
2489 memcpy(code, previous, len);
2490 code += len;
2491 }
2492 }
2493 if (repeat_max > 0) repeat_max -= repeat_min;
2494 }
2495
2496 /* This code is common to both the zero and non-zero minimum cases. If
2497 the maximum is limited, it replicates the group in a nested fashion,
2498 remembering the bracket starts on a stack. In the case of a zero minimum,
2499 the first one was set up above. In all cases the repeat_max now specifies
2500 the number of additional copies needed. */
2501
2502 if (repeat_max >= 0)
2503 {
2504 for (i = repeat_max - 1; i >= 0; i--)
2505 {
2506 *code++ = OP_BRAZERO + repeat_type;
2507
2508 /* All but the final copy start a new nesting, maintaining the
2509 chain of brackets outstanding. */
2510
2511 if (i != 0)
2512 {
2513 int offset;
2514 *code++ = OP_BRA;
2515 offset = (bralink == NULL)? 0 : code - bralink;
2516 bralink = code;
2517 PUTINC(code, 0, offset);
2518 }
2519
2520 memcpy(code, previous, len);
2521 code += len;
2522 }
2523
2524 /* Now chain through the pending brackets, and fill in their length
2525 fields (which are holding the chain links pro tem). */
2526
2527 while (bralink != NULL)
2528 {
2529 int oldlinkoffset;
2530 int offset = code - bralink + 1;
2531 uschar *bra = code - offset;
2532 oldlinkoffset = GET(bra, 1);
2533 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2534 *code++ = OP_KET;
2535 PUTINC(code, 0, offset);
2536 PUT(bra, 1, offset);
2537 }
2538 }
2539
2540 /* If the maximum is unlimited, set a repeater in the final copy. We
2541 can't just offset backwards from the current code point, because we
2542 don't know if there's been an options resetting after the ket. The
2543 correct offset was computed above. */
2544
2545 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2546 }
2547
2548 /* Else there's some kind of shambles */
2549
2550 else
2551 {
2552 *errorptr = ERR11;
2553 goto FAILED;
2554 }
2555
2556 /* If the character following a repeat is '+', we wrap the entire repeated
2557 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2558 Sun's Java package. The repeated item starts at tempcode, not at previous,
2559 which might be the first part of a string whose (former) last char we
2560 repeated. However, we don't support '+' after a greediness '?'. */
2561
2562 if (possessive_quantifier)
2563 {
2564 int len = code - tempcode;
2565 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2566 code += 1 + LINK_SIZE;
2567 len += 1 + LINK_SIZE;
2568 tempcode[0] = OP_ONCE;
2569 *code++ = OP_KET;
2570 PUTINC(code, 0, len);
2571 PUT(tempcode, 1, len);
2572 }
2573
2574 /* In all cases we no longer have a previous item. We also set the
2575 "follows varying string" flag for subsequently encountered reqbytes if
2576 it isn't already set and we have just passed a varying length item. */
2577
2578 END_REPEAT:
2579 previous = NULL;
2580 cd->req_varyopt |= reqvary;
2581 break;
2582
2583
2584 /* Start of nested bracket sub-expression, or comment or lookahead or
2585 lookbehind or option setting or condition. First deal with special things
2586 that can come after a bracket; all are introduced by ?, and the appearance
2587 of any of them means that this is not a referencing group. They were
2588 checked for validity in the first pass over the string, so we don't have to
2589 check for syntax errors here. */
2590
2591 case '(':
2592 newoptions = options;
2593 skipbytes = 0;
2594
2595 if (*(++ptr) == '?')
2596 {
2597 int set, unset;
2598 int *optset;
2599
2600 switch (*(++ptr))
2601 {
2602 case '#': /* Comment; skip to ket */
2603 ptr++;
2604 while (*ptr != ')') ptr++;
2605 continue;
2606
2607 case ':': /* Non-extracting bracket */
2608 bravalue = OP_BRA;
2609 ptr++;
2610 break;
2611
2612 case '(':
2613 bravalue = OP_COND; /* Conditional group */
2614
2615 /* Condition to test for recursion */
2616
2617 if (ptr[1] == 'R')
2618 {
2619 code[1+LINK_SIZE] = OP_CREF;
2620 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2621 skipbytes = 3;
2622 ptr += 3;
2623 }
2624
2625 /* Condition to test for a numbered subpattern match. We know that
2626 if a digit follows ( then there will just be digits until ) because
2627 the syntax was checked in the first pass. */
2628
2629 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2630 {
2631 int condref; /* Don't amalgamate; some compilers */
2632 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2633 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2634 if (condref == 0)
2635 {
2636 *errorptr = ERR35;
2637 goto FAILED;
2638 }
2639 ptr++;
2640 code[1+LINK_SIZE] = OP_CREF;
2641 PUT2(code, 2+LINK_SIZE, condref);
2642 skipbytes = 3;
2643 }
2644 /* For conditions that are assertions, we just fall through, having
2645 set bravalue above. */
2646 break;
2647
2648 case '=': /* Positive lookahead */
2649 bravalue = OP_ASSERT;
2650 ptr++;
2651 break;
2652
2653 case '!': /* Negative lookahead */
2654 bravalue = OP_ASSERT_NOT;
2655 ptr++;
2656 break;
2657
2658 case '<': /* Lookbehinds */
2659 switch (*(++ptr))
2660 {
2661 case '=': /* Positive lookbehind */
2662 bravalue = OP_ASSERTBACK;
2663 ptr++;
2664 break;
2665
2666 case '!': /* Negative lookbehind */
2667 bravalue = OP_ASSERTBACK_NOT;
2668 ptr++;
2669 break;
2670 }
2671 break;
2672
2673 case '>': /* One-time brackets */
2674 bravalue = OP_ONCE;
2675 ptr++;
2676 break;
2677
2678 case 'C': /* Callout - may be followed by digits */
2679 *code++ = OP_CALLOUT;
2680 {
2681 int n = 0;
2682 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2683 n = n * 10 + *ptr - '0';
2684 if (n > 255)
2685 {
2686 *errorptr = ERR38;
2687 goto FAILED;
2688 }
2689 *code++ = n;
2690 }
2691 previous = NULL;
2692 continue;
2693
2694 case 'P': /* Named subpattern handling */
2695 if (*(++ptr) == '<') /* Definition */
2696 {
2697 int i, namelen;
2698 uschar *slot = cd->name_table;
2699 const uschar *name; /* Don't amalgamate; some compilers */
2700 name = ++ptr; /* grumble at autoincrement in declaration */
2701
2702 while (*ptr++ != '>');
2703 namelen = ptr - name - 1;
2704
2705 for (i = 0; i < cd->names_found; i++)
2706 {
2707 int crc = memcmp(name, slot+2, namelen);
2708 if (crc == 0)
2709 {
2710 if (slot[2+namelen] == 0)
2711 {
2712 *errorptr = ERR43;
2713 goto FAILED;
2714 }
2715 crc = -1; /* Current name is substring */
2716 }
2717 if (crc < 0)
2718 {
2719 memmove(slot + cd->name_entry_size, slot,
2720 (cd->names_found - i) * cd->name_entry_size);
2721 break;
2722 }
2723 slot += cd->name_entry_size;
2724 }
2725
2726 PUT2(slot, 0, *brackets + 1);
2727 memcpy(slot + 2, name, namelen);
2728 slot[2+namelen] = 0;
2729 cd->names_found++;
2730 goto NUMBERED_GROUP;
2731 }
2732
2733 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2734 {
2735 int i, namelen;
2736 int type = *ptr++;
2737 const uschar *name = ptr;
2738 uschar *slot = cd->name_table;
2739
2740 while (*ptr != ')') ptr++;
2741 namelen = ptr - name;
2742
2743 for (i = 0; i < cd->names_found; i++)
2744 {
2745 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2746 slot += cd->name_entry_size;
2747 }
2748 if (i >= cd->names_found)
2749 {
2750 *errorptr = ERR15;
2751 goto FAILED;
2752 }
2753
2754 recno = GET2(slot, 0);
2755
2756 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2757
2758 /* Back reference */
2759
2760 previous = code;
2761 *code++ = OP_REF;
2762 PUT2INC(code, 0, recno);
2763 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2764 if (recno > cd->top_backref) cd->top_backref = recno;
2765 continue;
2766 }
2767
2768 /* Should never happen */
2769 break;
2770
2771 case 'R': /* Pattern recursion */
2772 ptr++; /* Same as (?0) */
2773 /* Fall through */
2774
2775 /* Recursion or "subroutine" call */
2776
2777 case '0': case '1': case '2': case '3': case '4':
2778 case '5': case '6': case '7': case '8': case '9':
2779 {
2780 const uschar *called;
2781 recno = 0;
2782 while((digitab[*ptr] & ctype_digit) != 0)
2783 recno = recno * 10 + *ptr++ - '0';
2784
2785 /* Come here from code above that handles a named recursion */
2786
2787 HANDLE_RECURSION:
2788
2789 previous = code;
2790
2791 /* Find the bracket that is being referenced. Temporarily end the
2792 regex in case it doesn't exist. */
2793
2794 *code = OP_END;
2795 called = (recno == 0)?
2796 cd->start_code : find_bracket(cd->start_code, utf8, recno);
2797
2798 if (called == NULL)
2799 {
2800 *errorptr = ERR15;
2801 goto FAILED;
2802 }
2803
2804 /* If the subpattern is still open, this is a recursive call. We
2805 check to see if this is a left recursion that could loop for ever,
2806 and diagnose that case. */
2807
2808 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2809 {
2810 *errorptr = ERR40;
2811 goto FAILED;
2812 }
2813
2814 /* Insert the recursion/subroutine item */
2815
2816 *code = OP_RECURSE;
2817 PUT(code, 1, called - cd->start_code);
2818 code += 1 + LINK_SIZE;
2819 }
2820 continue;
2821
2822 /* Character after (? not specially recognized */
2823
2824 default: /* Option setting */
2825 set = unset = 0;
2826 optset = &set;
2827
2828 while (*ptr != ')' && *ptr != ':')
2829 {
2830 switch (*ptr++)
2831 {
2832 case '-': optset = &unset; break;
2833
2834 case 'i': *optset |= PCRE_CASELESS; break;
2835 case 'm': *optset |= PCRE_MULTILINE; break;
2836 case 's': *optset |= PCRE_DOTALL; break;
2837 case 'x': *optset |= PCRE_EXTENDED; break;
2838 case 'U': *optset |= PCRE_UNGREEDY; break;
2839 case 'X': *optset |= PCRE_EXTRA; break;
2840 }
2841 }
2842
2843 /* Set up the changed option bits, but don't change anything yet. */
2844
2845 newoptions = (options | set) & (~unset);
2846
2847 /* If the options ended with ')' this is not the start of a nested
2848 group with option changes, so the options change at this level. Compile
2849 code to change the ims options if this setting actually changes any of
2850 them. We also pass the new setting back so that it can be put at the
2851 start of any following branches, and when this group ends (if we are in
2852 a group), a resetting item can be compiled.
2853
2854 Note that if this item is right at the start of the pattern, the
2855 options will have been abstracted and made global, so there will be no
2856 change to compile. */
2857
2858 if (*ptr == ')')
2859 {
2860 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
2861 {
2862 *code++ = OP_OPT;
2863 *code++ = newoptions & PCRE_IMS;
2864 }
2865
2866 /* Change options at this level, and pass them back for use
2867 in subsequent branches. Reset the greedy defaults and the case
2868 value for firstbyte and reqbyte. */
2869
2870 *optionsptr = options = newoptions;
2871 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
2872 greedy_non_default = greedy_default ^ 1;
2873 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2874
2875 previous = NULL; /* This item can't be repeated */
2876 continue; /* It is complete */
2877 }
2878
2879 /* If the options ended with ':' we are heading into a nested group
2880 with possible change of options. Such groups are non-capturing and are
2881 not assertions of any kind. All we need to do is skip over the ':';
2882 the newoptions value is handled below. */
2883
2884 bravalue = OP_BRA;
2885 ptr++;
2886 }
2887 }
2888
2889 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
2890 non-capturing and behave like (?:...) brackets */
2891
2892 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
2893 {
2894 bravalue = OP_BRA;
2895 }
2896
2897 /* Else we have a referencing group; adjust the opcode. If the bracket
2898 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
2899 arrange for the true number to follow later, in an OP_BRANUMBER item. */
2900
2901 else
2902 {
2903 NUMBERED_GROUP:
2904 if (++(*brackets) > EXTRACT_BASIC_MAX)
2905 {
2906 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
2907 code[1+LINK_SIZE] = OP_BRANUMBER;
2908 PUT2(code, 2+LINK_SIZE, *brackets);
2909 skipbytes = 3;
2910 }
2911 else bravalue = OP_BRA + *brackets;
2912 }
2913
2914 /* Process nested bracketed re. Assertions may not be repeated, but other
2915 kinds can be. We copy code into a non-register variable in order to be able
2916 to pass its address because some compilers complain otherwise. Pass in a
2917 new setting for the ims options if they have changed. */
2918
2919 previous = (bravalue >= OP_ONCE)? code : NULL;
2920 *code = bravalue;
2921 tempcode = code;
2922 tempreqvary = cd->req_varyopt; /* Save value before bracket */
2923
2924 if (!compile_regex(
2925 newoptions, /* The complete new option state */
2926 options & PCRE_IMS, /* The previous ims option state */
2927 brackets, /* Extracting bracket count */
2928 &tempcode, /* Where to put code (updated) */
2929 &ptr, /* Input pointer (updated) */
2930 errorptr, /* Where to put an error message */
2931 (bravalue == OP_ASSERTBACK ||
2932 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
2933 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
2934 &subfirstbyte, /* For possible first char */
2935 &subreqbyte, /* For possible last char */
2936 bcptr, /* Current branch chain */
2937 cd)) /* Tables block */
2938 goto FAILED;
2939
2940 /* At the end of compiling, code is still pointing to the start of the
2941 group, while tempcode has been updated to point past the end of the group
2942 and any option resetting that may follow it. The pattern pointer (ptr)
2943 is on the bracket. */
2944
2945 /* If this is a conditional bracket, check that there are no more than
2946 two branches in the group. */
2947
2948 else if (bravalue == OP_COND)
2949 {
2950 uschar *tc = code;
2951 condcount = 0;
2952
2953 do {
2954 condcount++;
2955 tc += GET(tc,1);
2956 }
2957 while (*tc != OP_KET);
2958
2959 if (condcount > 2)
2960 {
2961 *errorptr = ERR27;
2962 goto FAILED;
2963 }
2964
2965 /* If there is just one branch, we must not make use of its firstbyte or
2966 reqbyte, because this is equivalent to an empty second branch. */
2967
2968 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
2969 }
2970
2971 /* Handle updating of the required and first characters. Update for normal
2972 brackets of all kinds, and conditions with two branches (see code above).
2973 If the bracket is followed by a quantifier with zero repeat, we have to
2974 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
2975 main loop so that they can be accessed for the back off. */
2976
2977 zeroreqbyte = reqbyte;
2978 zerofirstbyte = firstbyte;
2979 groupsetfirstbyte = FALSE;
2980
2981 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
2982 {
2983 /* If we have not yet set a firstbyte in this branch, take it from the
2984 subpattern, remembering that it was set here so that a repeat of more
2985 than one can replicate it as reqbyte if necessary. If the subpattern has
2986 no firstbyte, set "none" for the whole branch. In both cases, a zero
2987 repeat forces firstbyte to "none". */
2988
2989 if (firstbyte == REQ_UNSET)
2990 {
2991 if (subfirstbyte >= 0)
2992 {
2993 firstbyte = subfirstbyte;
2994 groupsetfirstbyte = TRUE;
2995 }
2996 else firstbyte = REQ_NONE;
2997 zerofirstbyte = REQ_NONE;
2998 }
2999
3000 /* If firstbyte was previously set, convert the subpattern's firstbyte
3001 into reqbyte if there wasn't one, using the vary flag that was in
3002 existence beforehand. */
3003
3004 else if (subfirstbyte >= 0 && subreqbyte < 0)
3005 subreqbyte = subfirstbyte | tempreqvary;
3006
3007 /* If the subpattern set a required byte (or set a first byte that isn't
3008 really the first byte - see above), set it. */
3009
3010 if (subreqbyte >= 0) reqbyte = subreqbyte;
3011 }
3012
3013 /* For a forward assertion, we take the reqbyte, if set. This can be
3014 helpful if the pattern that follows the assertion doesn't set a different
3015 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3016 for an assertion, however because it leads to incorrect effect for patterns
3017 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3018 of a firstbyte. This is overcome by a scan at the end if there's no
3019 firstbyte, looking for an asserted first char. */
3020
3021 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3022
3023 /* Now update the main code pointer to the end of the group. */
3024
3025 code = tempcode;
3026
3027 /* Error if hit end of pattern */
3028
3029 if (*ptr != ')')
3030 {
3031 *errorptr = ERR14;
3032 goto FAILED;
3033 }
3034 break;
3035
3036 /* Check \ for being a real metacharacter; if not, fall through and handle
3037 it as a data character at the start of a string. Escape items are checked
3038 for validity in the pre-compiling pass. */
3039
3040 case '\\':
3041 tempptr = ptr;
3042 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3043
3044 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3045 are arranged to be the negation of the corresponding OP_values. For the
3046 back references, the values are ESC_REF plus the reference number. Only
3047 back references and those types that consume a character may be repeated.
3048 We can test for values between ESC_b and ESC_Z for the latter; this may
3049 have to change if any new ones are ever created. */
3050
3051 if (c < 0)
3052 {
3053 if (-c == ESC_Q) /* Handle start of quoted string */
3054 {
3055 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3056 else inescq = TRUE;
3057 continue;
3058 }
3059
3060 /* For metasequences that actually match a character, we disable the
3061 setting of a first character if it hasn't already been set. */
3062
3063 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3064 firstbyte = REQ_NONE;
3065
3066 /* Set values to reset to if this is followed by a zero repeat. */
3067
3068 zerofirstbyte = firstbyte;
3069 zeroreqbyte = reqbyte;
3070
3071 /* Back references are handled specially */
3072
3073 if (-c >= ESC_REF)
3074 {
3075 int number = -c - ESC_REF;
3076 previous = code;
3077 *code++ = OP_REF;
3078 PUT2INC(code, 0, number);
3079 }
3080 else
3081 {
3082 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3083 *code++ = -c;
3084 }
3085 continue;
3086 }
3087
3088 /* Data character: reset and fall through */
3089
3090 ptr = tempptr;
3091 c = '\\';
3092
3093 /* Handle a run of data characters until a metacharacter is encountered.
3094 The first character is guaranteed not to be whitespace or # when the
3095 extended flag is set. */
3096
3097 NORMAL_CHAR:
3098 default:
3099 previous = code;
3100 *code = OP_CHARS;
3101 code += 2;
3102 length = 0;
3103
3104 do
3105 {
3106 /* If in \Q...\E, check for the end; if not, we always have a literal */
3107
3108 if (inescq)
3109 {
3110 if (c == '\\' && ptr[1] == 'E')
3111 {
3112 inescq = FALSE;
3113 ptr++;
3114 }
3115 else
3116 {
3117 *code++ = c;
3118 length++;
3119 }
3120 continue;
3121 }
3122
3123 /* Skip white space and comments for /x patterns */
3124
3125 if ((options & PCRE_EXTENDED) != 0)
3126 {
3127 if ((cd->ctypes[c] & ctype_space) != 0) continue;
3128 if (c == '#')
3129 {
3130 /* The space before the ; is to avoid a warning on a silly compiler
3131 on the Macintosh. */
3132 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3133 if (c == 0) break;
3134 continue;
3135 }
3136 }
3137
3138 /* Backslash may introduce a data char or a metacharacter. Escaped items
3139 are checked for validity in the pre-compiling pass. Stop the string
3140 before a metaitem. */
3141
3142 if (c == '\\')
3143 {
3144 tempptr = ptr;
3145 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3146 if (c < 0) { ptr = tempptr; break; }
3147
3148 /* If a character is > 127 in UTF-8 mode, we have to turn it into
3149 two or more characters in the UTF-8 encoding. */
3150
3151 #ifdef SUPPORT_UTF8
3152 if (utf8 && c > 127)
3153 {
3154 uschar buffer[8];
3155 int len = ord2utf8(c, buffer);
3156 for (c = 0; c < len; c++) *code++ = buffer[c];
3157 length += len;
3158 continue;
3159 }
3160 #endif
3161 }
3162
3163 /* Ordinary character or single-char escape */
3164
3165 *code++ = c;
3166 length++;
3167 }
3168
3169 /* This "while" is the end of the "do" above. */
3170
3171 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3172
3173 /* Update the first and last requirements. These are always bytes, even in
3174 UTF-8 mode. However, there is a special case to be considered when there
3175 are only one or two characters. Because this gets messy in UTF-8 mode, the
3176 code is kept separate. When we get here "length" contains the number of
3177 bytes. */
3178
3179 #ifdef SUPPORT_UTF8
3180 if (utf8 && length > 1)
3181 {
3182 uschar *t = previous + 3; /* After this code, t */
3183 while (t < code && (*t & 0xc0) == 0x80) t++; /* follows the 1st char */
3184
3185 /* Handle the case when there is only one multibyte character. It must
3186 have at least two bytes because of the "length > 1" test above. */
3187
3188 if (t == code)
3189 {
3190 /* If no previous first byte, set it from this character, but revert to
3191 none on a zero repeat. */
3192
3193 if (firstbyte == REQ_UNSET)
3194 {
3195 zerofirstbyte = REQ_NONE;
3196 firstbyte = previous[2];
3197 }
3198
3199 /* Otherwise, leave the first byte value alone, and don't change it on
3200 a zero repeat */
3201
3202 else zerofirstbyte = firstbyte;
3203
3204 /* In both cases, a zero repeat resets the previous required byte */
3205
3206 zeroreqbyte = reqbyte;
3207 }
3208
3209 /* Handle the case when there is more than one character. These may be
3210 single-byte or multibyte characters */
3211
3212 else
3213 {
3214 t = code - 1; /* After this code, t is at the */
3215 while ((*t & 0xc0) == 0x80) t--; /* start of the last character */
3216
3217 /* If no previous first byte, set it from the first character, and
3218 retain it on a zero repeat (of the last character). The required byte
3219 is reset on a zero repeat, either to the byte before the last
3220 character, unless this is the first byte of the string. In that case,
3221 it reverts to its previous value. */
3222
3223 if (firstbyte == REQ_UNSET)
3224 {
3225 zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3226 zeroreqbyte = (t - 1 == previous + 2)?
3227 reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
3228 }
3229
3230 /* If there was a previous first byte, leave it alone, and don't change
3231 it on a zero repeat. The required byte is reset on a zero repeat to the
3232 byte before the last character. */
3233
3234 else
3235 {
3236 zerofirstbyte = firstbyte;
3237 zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
3238 }
3239 }
3240
3241 /* In all cases (we know length > 1), the new required byte is the last
3242 byte of the string. */
3243
3244 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3245 }
3246
3247 else /* End of UTF-8 coding */
3248 #endif
3249
3250 /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3251 or when UTF-8 is not enabled. */
3252
3253 {
3254 /* firstbyte was not previously set; take it from this string */
3255
3256 if (firstbyte == REQ_UNSET)
3257 {
3258 if (length == 1)
3259 {
3260 zerofirstbyte = REQ_NONE;
3261 firstbyte = previous[2] | req_caseopt;
3262 zeroreqbyte = reqbyte;
3263 }
3264 else
3265 {
3266 zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3267 zeroreqbyte = (length > 2)?
3268 (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
3269 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3270 }
3271 }
3272
3273 /* firstbyte was previously set */
3274
3275 else
3276 {
3277 zerofirstbyte = firstbyte;
3278 zeroreqbyte = (length == 1)? reqbyte :
3279 code[-2] | req_caseopt | cd->req_varyopt;
3280 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3281 }
3282 }
3283
3284 /* Set the length in the data vector, and advance to the next state. */
3285
3286 previous[1] = length;
3287 if (length < MAXLIT) ptr--;
3288 break;
3289 }
3290 } /* end of big loop */
3291
3292 /* Control never reaches here by falling through, only by a goto for all the
3293 error states. Pass back the position in the pattern so that it can be displayed
3294 to the user for diagnosing the error. */
3295
3296 FAILED:
3297 *ptrptr = ptr;
3298 return FALSE;
3299 }
3300
3301
3302
3303
3304 /*************************************************
3305 * Compile sequence of alternatives *
3306 *************************************************/
3307
3308 /* On entry, ptr is pointing past the bracket character, but on return
3309 it points to the closing bracket, or vertical bar, or end of string.
3310 The code variable is pointing at the byte into which the BRA operator has been
3311 stored. If the ims options are changed at the start (for a (?ims: group) or
3312 during any branch, we need to insert an OP_OPT item at the start of every
3313 following branch to ensure they get set correctly at run time, and also pass
3314 the new options into every subsequent branch compile.
3315
3316 Arguments:
3317 options option bits, including any changes for this subpattern
3318 oldims previous settings of ims option bits
3319 brackets -> int containing the number of extracting brackets used
3320 codeptr -> the address of the current code pointer
3321 ptrptr -> the address of the current pattern pointer
3322 errorptr -> pointer to error message
3323 lookbehind TRUE if this is a lookbehind assertion
3324 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3325 firstbyteptr place to put the first required character, or a negative number
3326 reqbyteptr place to put the last required character, or a negative number
3327 bcptr pointer to the chain of currently open branches
3328 cd points to the data block with tables pointers etc.
3329
3330 Returns: TRUE on success
3331 */
3332
3333 static BOOL
3334 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3335 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3336 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3337 {
3338 const uschar *ptr = *ptrptr;
3339 uschar *code = *codeptr;
3340 uschar *last_branch = code;
3341 uschar *start_bracket = code;
3342 uschar *reverse_count = NULL;
3343 int firstbyte, reqbyte;
3344 int branchfirstbyte, branchreqbyte;
3345 branch_chain bc;
3346
3347 bc.outer = bcptr;
3348 bc.current = code;
3349
3350 firstbyte = reqbyte = REQ_UNSET;
3351
3352 /* Offset is set zero to mark that this bracket is still open */
3353
3354 PUT(code, 1, 0);
3355 code += 1 + LINK_SIZE + skipbytes;
3356
3357 /* Loop for each alternative branch */
3358
3359 for (;;)
3360 {
3361 /* Handle a change of ims options at the start of the branch */
3362
3363 if ((options & PCRE_IMS) != oldims)
3364 {
3365 *code++ = OP_OPT;
3366 *code++ = options & PCRE_IMS;
3367 }
3368
3369 /* Set up dummy OP_REVERSE if lookbehind assertion */
3370
3371 if (lookbehind)
3372 {
3373 *code++ = OP_REVERSE;
3374 reverse_count = code;
3375 PUTINC(code, 0, 0);
3376 }
3377
3378 /* Now compile the branch */
3379
3380 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3381 &branchfirstbyte, &branchreqbyte, &bc, cd))
3382 {
3383 *ptrptr = ptr;
3384 return FALSE;
3385 }
3386
3387 /* If this is the first branch, the firstbyte and reqbyte values for the
3388 branch become the values for the regex. */
3389
3390 if (*last_branch != OP_ALT)
3391 {
3392 firstbyte = branchfirstbyte;
3393 reqbyte = branchreqbyte;
3394 }
3395
3396 /* If this is not the first branch, the first char and reqbyte have to
3397 match the values from all the previous branches, except that if the previous
3398 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3399 REQ_VARY for the regex. */
3400
3401 else
3402 {
3403 /* If we previously had a firstbyte, but it doesn't match the new branch,
3404 we have to abandon the firstbyte for the regex, but if there was previously
3405 no reqbyte, it takes on the value of the old firstbyte. */
3406
3407 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3408 {
3409 if (reqbyte < 0) reqbyte = firstbyte;
3410 firstbyte = REQ_NONE;
3411 }
3412
3413 /* If we (now or from before) have no firstbyte, a firstbyte from the
3414 branch becomes a reqbyte if there isn't a branch reqbyte. */
3415
3416 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3417 branchreqbyte = branchfirstbyte;
3418
3419 /* Now ensure that the reqbytes match */
3420
3421 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3422 reqbyte = REQ_NONE;
3423 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3424 }
3425
3426 /* If lookbehind, check that this branch matches a fixed-length string,
3427 and put the length into the OP_REVERSE item. Temporarily mark the end of
3428 the branch with OP_END. */
3429
3430 if (lookbehind)
3431 {
3432 int length;
3433 *code = OP_END;
3434 length = find_fixedlength(last_branch, options);
3435 DPRINTF(("fixed length = %d\n", length));
3436 if (length < 0)
3437 {
3438 *errorptr = (length == -2)? ERR36 : ERR25;
3439 *ptrptr = ptr;
3440 return FALSE;
3441 }
3442 PUT(reverse_count, 0, length);
3443 }
3444
3445 /* Reached end of expression, either ')' or end of pattern. Go back through
3446 the alternative branches and reverse the chain of offsets, with the field in
3447 the BRA item now becoming an offset to the first alternative. If there are
3448 no alternatives, it points to the end of the group. The length in the
3449 terminating ket is always the length of the whole bracketed item. If any of
3450 the ims options were changed inside the group, compile a resetting op-code
3451 following, except at the very end of the pattern. Return leaving the pointer
3452 at the terminating char. */
3453
3454 if (*ptr != '|')
3455 {
3456 int length = code - last_branch;
3457 do
3458 {
3459 int prev_length = GET(last_branch, 1);
3460 PUT(last_branch, 1, length);
3461 length = prev_length;
3462 last_branch -= length;
3463 }
3464 while (length > 0);
3465
3466 /* Fill in the ket */
3467
3468 *code = OP_KET;
3469 PUT(code, 1, code - start_bracket);
3470 code += 1 + LINK_SIZE;
3471
3472 /* Resetting option if needed */
3473
3474 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3475 {
3476 *code++ = OP_OPT;
3477 *code++ = oldims;
3478 }
3479
3480 /* Set values to pass back */
3481
3482 *codeptr = code;
3483 *ptrptr = ptr;
3484 *firstbyteptr = firstbyte;
3485 *reqbyteptr = reqbyte;
3486 return TRUE;
3487 }
3488
3489 /* Another branch follows; insert an "or" node. Its length field points back
3490 to the previous branch while the bracket remains open. At the end the chain
3491 is reversed. It's done like this so that the start of the bracket has a
3492 zero offset until it is closed, making it possible to detect recursion. */
3493
3494 *code = OP_ALT;
3495 PUT(code, 1, code - last_branch);
3496 bc.current = last_branch = code;
3497 code += 1 + LINK_SIZE;
3498 ptr++;
3499 }
3500 /* Control never reaches here */
3501 }
3502
3503
3504
3505
3506 /*************************************************
3507 * Check for anchored expression *
3508 *************************************************/
3509
3510 /* Try to find out if this is an anchored regular expression. Consider each
3511 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3512 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3513 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3514 counts, since OP_CIRC can match in the middle.
3515
3516 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3517 This is the code for \G, which means "match at start of match position, taking
3518 into account the match offset".
3519
3520 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3521 because that will try the rest of the pattern at all possible matching points,
3522 so there is no point trying again.... er ....
3523
3524 .... except when the .* appears inside capturing parentheses, and there is a
3525 subsequent back reference to those parentheses. We haven't enough information
3526 to catch that case precisely.
3527
3528 At first, the best we could do was to detect when .* was in capturing brackets
3529 and the highest back reference was greater than or equal to that level.
3530 However, by keeping a bitmap of the first 31 back references, we can catch some
3531 of the more common cases more precisely.
3532
3533 Arguments:
3534 code points to start of expression (the bracket)
3535 options points to the options setting
3536 bracket_map a bitmap of which brackets we are inside while testing; this
3537 handles up to substring 31; after that we just have to take
3538 the less precise approach
3539 backref_map the back reference bitmap
3540
3541 Returns: TRUE or FALSE
3542 */
3543
3544 static BOOL
3545 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3546 unsigned int backref_map)
3547 {
3548 do {
3549 const uschar *scode =
3550 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3551 register int op = *scode;
3552
3553 /* Capturing brackets */
3554
3555 if (op > OP_BRA)
3556 {
3557 int new_map;
3558 op -= OP_BRA;
3559 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3560 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3561 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3562 }
3563
3564 /* Other brackets */
3565
3566 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3567 {
3568 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3569 }
3570
3571 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3572 are or may be referenced. */
3573
3574 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3575 (*options & PCRE_DOTALL) != 0)
3576 {
3577 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3578 }
3579
3580 /* Check for explicit anchoring */
3581
3582 else if (op != OP_SOD && op != OP_SOM &&
3583 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3584 return FALSE;
3585 code += GET(code, 1);
3586 }
3587 while (*code == OP_ALT); /* Loop for each alternative */
3588 return TRUE;
3589 }
3590
3591
3592
3593 /*************************************************
3594 * Check for starting with ^ or .* *
3595 *************************************************/
3596
3597 /* This is called to find out if every branch starts with ^ or .* so that
3598 "first char" processing can be done to speed things up in multiline
3599 matching and for non-DOTALL patterns that start with .* (which must start at
3600 the beginning or after \n). As in the case of is_anchored() (see above), we
3601 have to take account of back references to capturing brackets that contain .*
3602 because in that case we can't make the assumption.
3603
3604 Arguments:
3605 code points to start of expression (the bracket)
3606 bracket_map a bitmap of which brackets we are inside while testing; this
3607 handles up to substring 31; after that we just have to take
3608 the less precise approach
3609 backref_map the back reference bitmap
3610
3611 Returns: TRUE or FALSE
3612 */
3613
3614 static BOOL
3615 is_startline(const uschar *code, unsigned int bracket_map,
3616 unsigned int backref_map)
3617 {
3618 do {
3619 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3620 register int op = *scode;
3621
3622 /* Capturing brackets */
3623
3624 if (op > OP_BRA)
3625 {
3626 int new_map;
3627 op -= OP_BRA;
3628 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3629 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3630 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3631 }
3632
3633 /* Other brackets */
3634
3635 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3636 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3637
3638 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3639 may be referenced. */
3640
3641 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3642 {
3643 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3644 }
3645
3646 /* Check for explicit circumflex */
3647
3648 else if (op != OP_CIRC) return FALSE;
3649 code += GET(code, 1);
3650 }
3651 while (*code == OP_ALT); /* Loop for each alternative */
3652 return TRUE;
3653 }
3654
3655
3656
3657 /*************************************************
3658 * Check for asserted fixed first char *
3659 *************************************************/
3660
3661 /* During compilation, the "first char" settings from forward assertions are
3662 discarded, because they can cause conflicts with actual literals that follow.
3663 However, if we end up without a first char setting for an unanchored pattern,
3664 it is worth scanning the regex to see if there is an initial asserted first
3665 char. If all branches start with the same asserted char, or with a bracket all
3666 of whose alternatives start with the same asserted char (recurse ad lib), then
3667 we return that char, otherwise -1.
3668
3669 Arguments:
3670 code points to start of expression (the bracket)
3671 options pointer to the options (used to check casing changes)
3672 inassert TRUE if in an assertion
3673
3674 Returns: -1 or the fixed first char
3675 */
3676
3677 static int
3678 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3679 {
3680 register int c = -1;
3681 do {
3682 int d;
3683 const uschar *scode =
3684 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3685 register int op = *scode;
3686
3687 if (op >= OP_BRA) op = OP_BRA;
3688
3689 switch(op)
3690 {
3691 default:
3692 return -1;
3693
3694 case OP_BRA:
3695 case OP_ASSERT:
3696 case OP_ONCE:
3697 case OP_COND:
3698 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3699 return -1;
3700 if (c < 0) c = d; else if (c != d) return -1;
3701 break;
3702
3703 case OP_EXACT: /* Fall through */
3704 scode++;
3705
3706 case OP_CHARS: /* Fall through */
3707 scode++;
3708
3709 case OP_PLUS:
3710 case OP_MINPLUS:
3711 if (!inassert) return -1;
3712 if (c < 0)
3713 {
3714 c = scode[1];
3715 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3716 }
3717 else if (c != scode[1]) return -1;
3718 break;
3719 }
3720
3721 code += GET(code, 1);
3722 }
3723 while (*code == OP_ALT);
3724 return c;
3725 }
3726
3727
3728
3729
3730 /*************************************************
3731 * Compile a Regular Expression *
3732 *************************************************/
3733
3734 /* This function takes a string and returns a pointer to a block of store
3735 holding a compiled version of the expression.
3736
3737 Arguments:
3738 pattern the regular expression
3739 options various option bits
3740 errorptr pointer to pointer to error text
3741 erroroffset ptr offset in pattern where error was detected
3742 tables pointer to character tables or NULL
3743
3744 Returns: pointer to compiled data block, or NULL on error,
3745 with errorptr and erroroffset set
3746 */
3747
3748 pcre *
3749 pcre_compile(const char *pattern, int options, const char **errorptr,
3750 int *erroroffset, const unsigned char *tables)
3751 {
3752 real_pcre *re;
3753 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3754 int runlength;
3755 int c, firstbyte, reqbyte;
3756 int bracount = 0;
3757 int branch_extra = 0;
3758 int branch_newextra;
3759 int item_count = -1;
3760 int name_count = 0;
3761 int max_name_size = 0;
3762 #ifdef SUPPORT_UTF8
3763 int lastcharlength = 0;
3764 BOOL utf8;
3765 BOOL class_utf8;
3766 #endif
3767 BOOL inescq = FALSE;
3768 unsigned int brastackptr = 0;
3769 size_t size;
3770 uschar *code;
3771 const uschar *codestart;
3772 const uschar *ptr;
3773 compile_data compile_block;
3774 int brastack[BRASTACK_SIZE];
3775 uschar bralenstack[BRASTACK_SIZE];
3776
3777 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3778 can do is just return NULL. */
3779
3780 if (errorptr == NULL) return NULL;
3781 *errorptr = NULL;
3782
3783 /* However, we can give a message for this error */
3784
3785 if (erroroffset == NULL)
3786 {
3787 *errorptr = ERR16;
3788 return NULL;
3789 }
3790 *erroroffset = 0;
3791
3792 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3793
3794 #ifdef SUPPORT_UTF8
3795 utf8 = (options & PCRE_UTF8) != 0;
3796 #else
3797 if ((options & PCRE_UTF8) != 0)
3798 {
3799 *errorptr = ERR32;
3800 return NULL;
3801 }
3802 #endif
3803
3804 if ((options & ~PUBLIC_OPTIONS) != 0)
3805 {
3806 *errorptr = ERR17;
3807 return NULL;
3808 }
3809
3810 /* Set up pointers to the individual character tables */
3811
3812 if (tables == NULL) tables = pcre_default_tables;
3813 compile_block.lcc = tables + lcc_offset;
3814 compile_block.fcc = tables + fcc_offset;
3815 compile_block.cbits = tables + cbits_offset;
3816 compile_block.ctypes = tables + ctypes_offset;
3817
3818 /* Maximum back reference and backref bitmap. This is updated for numeric
3819 references during the first pass, but for named references during the actual
3820 compile pass. The bitmap records up to 31 back references to help in deciding
3821 whether (.*) can be treated as anchored or not. */
3822
3823 compile_block.top_backref = 0;
3824 compile_block.backref_map = 0;
3825
3826 /* Reflect pattern for debugging output */
3827
3828 DPRINTF(("------------------------------------------------------------------\n"));
3829 DPRINTF(("%s\n", pattern));
3830
3831 /* The first thing to do is to make a pass over the pattern to compute the
3832 amount of store required to hold the compiled code. This does not have to be
3833 perfect as long as errors are overestimates. At the same time we can detect any
3834 flag settings right at the start, and extract them. Make an attempt to correct
3835 for any counted white space if an "extended" flag setting appears late in the
3836 pattern. We can't be so clever for #-comments. */
3837
3838 ptr = (const uschar *)(pattern - 1);
3839 while ((c = *(++ptr)) != 0)
3840 {
3841 int min, max;
3842 int class_optcount;
3843 int bracket_length;
3844 int duplength;
3845
3846 /* If we are inside a \Q...\E sequence, all chars are literal */
3847
3848 if (inescq) goto NORMAL_CHAR;
3849
3850 /* Otherwise, first check for ignored whitespace and comments */
3851
3852 if ((options & PCRE_EXTENDED) != 0)
3853 {
3854 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3855 if (c == '#')
3856 {
3857 /* The space before the ; is to avoid a warning on a silly compiler
3858 on the Macintosh. */
3859 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3860 if (c == 0) break;
3861 continue;
3862 }
3863 }
3864
3865 item_count++; /* Is zero for the first non-comment item */
3866
3867 switch(c)
3868 {
3869 /* A backslashed item may be an escaped "normal" character or a
3870 character type. For a "normal" character, put the pointers and
3871 character back so that tests for whitespace etc. in the input
3872 are done correctly. */
3873
3874 case '\\':
3875 {
3876 const uschar *save_ptr = ptr;
3877 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
3878 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3879 if (c >= 0)
3880 {
3881 ptr = save_ptr;
3882 c = '\\';
3883 goto NORMAL_CHAR;
3884 }
3885 }
3886
3887 /* If \Q, enter "literal" mode */
3888
3889 if (-c == ESC_Q)
3890 {
3891 inescq = TRUE;
3892 continue;
3893 }
3894
3895 /* Other escapes need one byte, and are of length one for repeats */
3896
3897 length++;
3898 #ifdef SUPPORT_UTF8
3899 lastcharlength = 1;
3900 #endif
3901
3902 /* A back reference needs an additional 2 bytes, plus either one or 5
3903 bytes for a repeat. We also need to keep the value of the highest
3904 back reference. */
3905
3906 if (c <= -ESC_REF)
3907 {
3908 int refnum = -c - ESC_REF;
3909 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
3910 if (refnum > compile_block.top_backref)
3911 compile_block.top_backref = refnum;
3912 length += 2; /* For single back reference */
3913 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
3914 {
3915 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
3916 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3917 if ((min == 0 && (max == 1 || max == -1)) ||
3918 (min == 1 && max == -1))
3919 length++;
3920 else length += 5;
3921 if (ptr[1] == '?') ptr++;
3922 }
3923 }
3924 continue;
3925
3926 case '^': /* Single-byte metacharacters */
3927 case '.':
3928 case '$':
3929 length++;
3930 #ifdef SUPPORT_UTF8
3931 lastcharlength = 1;
3932 #endif
3933 continue;
3934
3935 case '*': /* These repeats won't be after brackets; */
3936 case '+': /* those are handled separately */
3937 case '?':
3938 length++;
3939 goto POSESSIVE; /* A few lines below */
3940
3941 /* This covers the cases of braced repeats after a single char, metachar,
3942 class, or back reference. */
3943
3944 case '{':
3945 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
3946 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
3947 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3948
3949 /* These special cases just insert one extra opcode */
3950
3951 if ((min == 0 && (max == 1 || max == -1)) ||
3952 (min == 1 && max == -1))
3953 length++;
3954
3955 /* These cases might insert additional copies of a preceding character. */
3956
3957 else
3958 {
3959 #ifdef SUPPORT_UTF8
3960 /* In UTF-8 mode, we should find the length in lastcharlength */
3961 if (utf8)
3962 {
3963 if (min != 1)
3964 {
3965 length -= lastcharlength; /* Uncount the original char or metachar */
3966 if (min > 0) length += 3 + lastcharlength;
3967 }
3968 length += lastcharlength + ((max > 0)? 3 : 1);
3969 }
3970 else
3971 #endif
3972
3973 /* Not UTF-8 mode: all characters are one byte */
3974 {
3975 if (min != 1)
3976 {
3977 length--; /* Uncount the original char or metachar */
3978 if (min > 0) length += 4;
3979 }
3980
3981 length += (max > 0)? 4 : 2;
3982 }
3983 }
3984
3985 if (ptr[1] == '?') ptr++; /* Needs no extra length */
3986
3987 POSESSIVE: /* Test for possessive quantifier */
3988 if (ptr[1] == '+')
3989 {
3990 ptr++;
3991 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
3992 }
3993 continue;
3994
3995 /* An alternation contains an offset to the next branch or ket. If any ims
3996 options changed in the previous branch(es), and/or if we are in a
3997 lookbehind assertion, extra space will be needed at the start of the
3998 branch. This is handled by branch_extra. */
3999
4000 case '|':
4001 length += 1 + LINK_SIZE + branch_extra;
4002 continue;
4003
4004 /* A character class uses 33 characters provided that all the character
4005 values are less than 256. Otherwise, it uses a bit map for low valued
4006 characters, and individual items for others. Don't worry about character
4007 types that aren't allowed in classes - they'll get picked up during the
4008 compile. A character class that contains only one single-byte character
4009 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4010 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4011
4012 case '[':
4013 class_optcount = 0;
4014
4015 #ifdef SUPPORT_UTF8
4016 class_utf8 = FALSE;
4017 #endif
4018
4019 if (*(++ptr) == '^') ptr++;
4020
4021 /* Written as a "do" so that an initial ']' is taken as data */
4022
4023 if (*ptr != 0) do
4024 {
4025 /* Inside \Q...\E everything is literal except \E */
4026
4027 if (inescq)
4028 {
4029 if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
4030 inescq = FALSE;
4031 ptr += 1;
4032 continue;
4033 }
4034
4035 /* Outside \Q...\E, check for escapes */
4036
4037 if (*ptr == '\\')
4038 {
4039 #ifdef SUPPORT_UTF8
4040 int prevchar = ptr[-1];
4041 #endif
4042 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
4043 &compile_block);
4044 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4045
4046 /* \b is backspace inside a class */
4047
4048 if (-ch == ESC_b) ch = '\b';
4049
4050 /* \Q enters quoting mode */
4051
4052 if (-ch == ESC_Q)
4053 {
4054 inescq = TRUE;
4055 continue;
4056 }
4057
4058 /* Handle escapes that turn into characters */
4059
4060 if (ch >= 0)
4061 {
4062 #ifdef SUPPORT_UTF8
4063 if (utf8)
4064 {
4065 if (ch > 127) class_optcount = 10; /* Ensure > 1 */
4066 if (ch > 255)
4067 {
4068 uschar buffer[6];
4069 if (!class_utf8)
4070 {
4071 class_utf8 = TRUE;
4072 length += LINK_SIZE + 1 + 1;
4073 }
4074 length += 1 + ord2utf8(ch, buffer);
4075
4076 /* If this wide character is preceded by '-', add an extra 2 to
4077 the length in case the previous character was < 128, because in
4078 this case the whole range will be put into the list. */
4079
4080 if (prevchar == '-') length += 2;
4081 }
4082 }
4083 #endif
4084 class_optcount++; /* for possible optimization */
4085 }
4086 else class_optcount = 10; /* \d, \s etc; make sure > 1 */
4087 }
4088
4089 /* Check the syntax for POSIX stuff. The bits we actually handle are
4090 checked during the real compile phase. */
4091
4092 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4093 {
4094 ptr++;
4095 class_optcount = 10; /* Make sure > 1 */
4096 }
4097
4098 /* Anything else just increments the possible optimization count. If
4099 there are wide characters, we are going to have to use an XCLASS. */
4100
4101 else
4102 {
4103 NON_SPECIAL_CHARACTER:
4104 class_optcount++;
4105
4106 #ifdef SUPPORT_UTF8
4107 if (utf8)
4108 {
4109 int ch;
4110 int extra = 0;
4111 GETCHARLEN(ch, ptr, extra);
4112 if (ch > 127) class_optcount = 10; /* No optimization possible */
4113 if (ch > 255)
4114 {
4115 if (!class_utf8)
4116 {
4117 class_utf8 = TRUE;
4118 length += LINK_SIZE + 1 + 1;
4119 }
4120 length += 2 + extra;
4121
4122 /* If this wide character is preceded by '-', add an extra 2 to
4123 the length in case the previous character was < 128, because in
4124 this case the whole range will be put into the list. */
4125
4126 if (ptr[-1] == '-') length += 2;
4127
4128 /* Advance to the end of this character */
4129
4130 ptr += extra;
4131 }
4132 }
4133 #endif
4134 }
4135 }
4136 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4137
4138 if (*ptr == 0) /* Missing terminating ']' */
4139 {
4140 *errorptr = ERR6;
4141 goto PCRE_ERROR_RETURN;
4142 }
4143
4144 /* We can optimize when there was only one optimizable character. Repeats
4145 for positive and negated single one-byte chars are handled by the general
4146 code. Here, we handle repeats for the class opcodes. */
4147
4148 if (class_optcount == 1) length += 3; else
4149 {
4150 length += 33;
4151
4152 /* A repeat needs either 1 or 5 bytes. */
4153
4154 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
4155 {
4156 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4157 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4158 if ((min == 0 && (max == 1 || max == -1)) ||
4159 (min == 1 && max == -1))
4160 length++;
4161 else length += 5;
4162 if (ptr[1] == '?') ptr++;
4163 }
4164 }
4165 continue;
4166
4167 /* Brackets may be genuine groups or special things */
4168
4169 case '(':
4170 branch_newextra = 0;
4171 bracket_length = 1 + LINK_SIZE;
4172
4173 /* Handle special forms of bracket, which all start (? */
4174
4175 if (ptr[1] == '?')
4176 {
4177 int set, unset;
4178 int *optset;
4179
4180 switch (c = ptr[2])
4181 {
4182 /* Skip over comments entirely */
4183 case '#':
4184 ptr += 3;
4185 while (*ptr != 0 && *ptr != ')') ptr++;
4186 if (*ptr == 0)
4187 {
4188 *errorptr = ERR18;
4189 goto PCRE_ERROR_RETURN;
4190 }
4191 continue;
4192
4193 /* Non-referencing groups and lookaheads just move the pointer on, and
4194 then behave like a non-special bracket, except that they don't increment
4195 the count of extracting brackets. Ditto for the "once only" bracket,
4196 which is in Perl from version 5.005. */
4197
4198 case ':':
4199 case '=':
4200 case '!':
4201 case '>':
4202 ptr += 2;
4203 break;
4204
4205 /* (?R) specifies a recursive call to the regex, which is an extension
4206 to provide the facility which can be obtained by (?p{perl-code}) in
4207 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4208
4209 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4210 the appropriate numbered brackets. This includes both recursive and
4211 non-recursive calls. (?R) is now synonymous with (?0). */
4212
4213 case 'R':
4214 ptr++;
4215
4216 case '0': case '1': case '2': case '3': case '4':
4217 case '5': case '6': case '7': case '8': case '9':
4218 ptr += 2;
4219 if (c != 'R')
4220 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4221 if (*ptr != ')')
4222 {
4223 *errorptr = ERR29;
4224 goto PCRE_ERROR_RETURN;
4225 }
4226 length += 1 + LINK_SIZE;
4227
4228 /* If this item is quantified, it will get wrapped inside brackets so
4229 as to use the code for quantified brackets. We jump down and use the
4230 code that handles this for real brackets. */
4231
4232 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4233 {
4234 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4235 duplength = 5 + 3 * LINK_SIZE;
4236 goto HANDLE_QUANTIFIED_BRACKETS;
4237 }
4238 continue;
4239
4240 /* (?C) is an extension which provides "callout" - to provide a bit of
4241 the functionality of the Perl (?{...}) feature. An optional number may
4242 follow (default is zero). */
4243
4244 case 'C':
4245 ptr += 2;
4246 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4247 if (*ptr != ')')
4248 {
4249 *errorptr = ERR39;
4250 goto PCRE_ERROR_RETURN;
4251 }
4252 length += 2;
4253 continue;
4254
4255 /* Named subpatterns are an extension copied from Python */
4256
4257 case 'P':
4258 ptr += 3;
4259 if (*ptr == '<')
4260 {
4261 const uschar *p; /* Don't amalgamate; some compilers */
4262 p = ++ptr; /* grumble at autoincrement in declaration */
4263 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4264 if (*ptr != '>')
4265 {
4266 *errorptr = ERR42;
4267 goto PCRE_ERROR_RETURN;
4268 }
4269 name_count++;
4270 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4271 break;
4272 }
4273
4274 if (*ptr == '=' || *ptr == '>')
4275 {
4276 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4277 if (*ptr != ')')
4278 {
4279 *errorptr = ERR42;
4280 goto PCRE_ERROR_RETURN;
4281 }
4282 break;
4283 }
4284
4285 /* Unknown character after (?P */
4286
4287 *errorptr = ERR41;
4288 goto PCRE_ERROR_RETURN;
4289
4290 /* Lookbehinds are in Perl from version 5.005 */
4291
4292 case '<':
4293 ptr += 3;
4294 if (*ptr == '=' || *ptr == '!')
4295 {
4296 branch_newextra = 1 + LINK_SIZE;
4297 length += 1 + LINK_SIZE; /* For the first branch */
4298 break;
4299 }
4300 *errorptr = ERR24;
4301 goto PCRE_ERROR_RETURN;
4302
4303 /* Conditionals are in Perl from version 5.005. The bracket must either
4304 be followed by a number (for bracket reference) or by an assertion
4305 group, or (a PCRE extension) by 'R' for a recursion test. */
4306
4307 case '(':
4308 if (ptr[3] == 'R' && ptr[4] == ')')
4309 {
4310 ptr += 4;
4311 length += 3;
4312 }
4313 else if ((digitab[ptr[3]] & ctype_digit) != 0)
4314 {
4315 ptr += 4;
4316 length += 3;
4317 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4318 if (*ptr != ')')
4319 {
4320 *errorptr = ERR26;
4321 goto PCRE_ERROR_RETURN;
4322 }
4323 }
4324 else /* An assertion must follow */
4325 {
4326 ptr++; /* Can treat like ':' as far as spacing is concerned */
4327 if (ptr[2] != '?' ||
4328 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4329 {
4330 ptr += 2; /* To get right offset in message */
4331 *errorptr = ERR28;
4332 goto PCRE_ERROR_RETURN;
4333 }
4334 }
4335 break;
4336
4337 /* Else loop checking valid options until ) is met. Anything else is an
4338 error. If we are without any brackets, i.e. at top level, the settings
4339 act as if specified in the options, so massage the options immediately.
4340 This is for backward compatibility with Perl 5.004. */
4341
4342 default:
4343 set = unset = 0;
4344 optset = &set;
4345 ptr += 2;
4346
4347 for (;; ptr++)
4348 {
4349 c = *ptr;
4350 switch (c)
4351 {
4352 case 'i':
4353 *optset |= PCRE_CASELESS;
4354 continue;
4355
4356 case 'm':
4357 *optset |= PCRE_MULTILINE;
4358 continue;
4359
4360 case 's':
4361 *optset |= PCRE_DOTALL;
4362 continue;
4363
4364 case 'x':
4365 *optset |= PCRE_EXTENDED;
4366 continue;
4367
4368 case 'X':
4369 *optset |= PCRE_EXTRA;
4370 continue;
4371
4372 case 'U':
4373 *optset |= PCRE_UNGREEDY;
4374 continue;
4375
4376 case '-':
4377 optset = &unset;
4378 continue;
4379
4380 /* A termination by ')' indicates an options-setting-only item; if
4381 this is at the very start of the pattern (indicated by item_count
4382 being zero), we use it to set the global options. This is helpful
4383 when analyzing the pattern for first characters, etc. Otherwise
4384 nothing is done here and it is handled during the compiling
4385 process.
4386
4387 [Historical note: Up to Perl 5.8, options settings at top level
4388 were always global settings, wherever they appeared in the pattern.
4389 That is, they were equivalent to an external setting. From 5.8
4390 onwards, they apply only to what follows (which is what you might
4391 expect).] */
4392
4393 case ')':
4394 if (item_count == 0)
4395 {
4396 options = (options | set) & (~unset);
4397 set = unset = 0; /* To save length */
4398 item_count--; /* To allow for several */
4399 }
4400
4401 /* Fall through */
4402
4403 /* A termination by ':' indicates the start of a nested group with
4404 the given options set. This is again handled at compile time, but
4405 we must allow for compiled space if any of the ims options are
4406 set. We also have to allow for resetting space at the end of
4407 the group, which is why 4 is added to the length and not just 2.
4408 If there are several changes of options within the same group, this
4409 will lead to an over-estimate on the length, but this shouldn't
4410 matter very much. We also have to allow for resetting options at
4411 the start of any alternations, which we do by setting
4412 branch_newextra to 2. Finally, we record whether the case-dependent
4413 flag ever changes within the regex. This is used by the "required
4414 character" code. */
4415
4416 case ':':
4417 if (((set|unset) & PCRE_IMS) != 0)
4418 {
4419 length += 4;
4420 branch_newextra = 2;
4421 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4422 }
4423 goto END_OPTIONS;
4424
4425 /* Unrecognized option character */
4426
4427 default:
4428 *errorptr = ERR12;
4429 goto PCRE_ERROR_RETURN;
4430 }
4431 }
4432
4433 /* If we hit a closing bracket, that's it - this is a freestanding
4434 option-setting. We need to ensure that branch_extra is updated if
4435 necessary. The only values branch_newextra can have here are 0 or 2.
4436 If the value is 2, then branch_extra must either be 2 or 5, depending
4437 on whether this is a lookbehind group or not. */
4438
4439 END_OPTIONS:
4440 if (c == ')')
4441 {
4442 if (branch_newextra == 2 &&
4443 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4444 branch_extra += branch_newextra;
4445 continue;
4446 }
4447
4448 /* If options were terminated by ':' control comes here. Fall through
4449 to handle the group below. */
4450 }
4451 }
4452
4453 /* Extracting brackets must be counted so we can process escapes in a
4454 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4455 need an additional 3 bytes of store per extracting bracket. However, if
4456 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
4457 must leave the count alone (it will always be zero). */
4458
4459 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4460 {
4461 bracount++;
4462 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4463 }
4464
4465 /* Save length for computing whole length at end if there's a repeat that
4466 requires duplication of the group. Also save the current value of
4467 branch_extra, and start the new group with the new value. If non-zero, this
4468 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
4469
4470 if (brastackptr >= sizeof(brastack)/sizeof(int))
4471 {
4472 *errorptr = ERR19;
4473 goto PCRE_ERROR_RETURN;
4474 }
4475
4476 bralenstack[brastackptr] = branch_extra;
4477 branch_extra = branch_newextra;
4478
4479 brastack[brastackptr++] = length;
4480 length += bracket_length;
4481 continue;
4482
4483 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4484 have to replicate this bracket up to that many times. If brastackptr is
4485 0 this is an unmatched bracket which will generate an error, but take care
4486 not to try to access brastack[-1] when computing the length and restoring
4487 the branch_extra value. */
4488
4489 case ')':
4490 length += 1 + LINK_SIZE;
4491 if (brastackptr > 0)
4492 {
4493 duplength = length - brastack[--brastackptr];
4494 branch_extra = bralenstack[brastackptr];
4495 }
4496 else duplength = 0;
4497
4498 /* The following code is also used when a recursion such as (?3) is
4499 followed by a quantifier, because in that case, it has to be wrapped inside
4500 brackets so that the quantifier works. The value of duplength must be
4501 set before arrival. */
4502
4503 HANDLE_QUANTIFIED_BRACKETS:
4504
4505 /* Leave ptr at the final char; for read_repeat_counts this happens
4506 automatically; for the others we need an increment. */
4507
4508 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
4509 {
4510 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4511 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4512 }
4513 else if (c == '*') { min = 0; max = -1; ptr++; }
4514 else if (c == '+') { min = 1; max = -1; ptr++; }
4515 else if (c == '?') { min = 0; max = 1; ptr++; }
4516 else { min = 1; max = 1; }
4517
4518 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4519 group, and if the maximum is greater than zero, we have to replicate
4520 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4521 bracket set. */
4522
4523 if (min == 0)
4524 {
4525 length++;
4526 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4527 }
4528
4529 /* When the minimum is greater than zero, we have to replicate up to
4530 minval-1 times, with no additions required in the copies. Then, if there
4531 is a limited maximum we have to replicate up to maxval-1 times allowing
4532 for a BRAZERO item before each optional copy and nesting brackets for all
4533 but one of the optional copies. */
4534
4535 else
4536 {
4537 length += (min - 1) * duplength;
4538 if (max > min) /* Need this test as max=-1 means no limit */
4539 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4540 - (2 + 2*LINK_SIZE);
4541 }
4542
4543 /* Allow space for once brackets for "possessive quantifier" */
4544
4545 if (ptr[1] == '+')
4546 {
4547 ptr++;
4548 length += 2 + 2*LINK_SIZE;
4549 }
4550 continue;
4551
4552 /* Non-special character. For a run of such characters the length required
4553 is the number of characters + 2, except that the maximum run length is
4554 MAXLIT. We won't get a skipped space or a non-data escape or the start of a
4555 # comment as the first character, so the length can't be zero. */
4556
4557 NORMAL_CHAR:
4558 default:
4559 length += 2;
4560 runlength = 0;
4561 do
4562 {
4563 #ifdef SUPPORT_UTF8
4564 lastcharlength = 1; /* Need length of last char for UTF-8 repeats */
4565 #endif
4566
4567 /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
4568 if (inescq)
4569 {
4570 if (c == '\\' && ptr[1] == 'E')
4571 {
4572 inescq = FALSE;
4573 ptr++;
4574 }
4575 else runlength++;
4576 continue;
4577 }
4578
4579 /* Skip whitespace and comments for /x */
4580
4581 if ((options & PCRE_EXTENDED) != 0)
4582 {
4583 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4584 if (c == '#')
4585 {
4586 /* The space before the ; is to avoid a warning on a silly compiler
4587 on the Macintosh. */
4588 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4589 continue;
4590 }
4591 }
4592
4593 /* Backslash may introduce a data char or a metacharacter; stop the
4594 string before the latter. */
4595
4596 if (c == '\\')
4597 {
4598 const uschar *saveptr = ptr;
4599 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
4600 &compile_block);
4601 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4602 if (c < 0) { ptr = saveptr; break; }
4603
4604 /* In UTF-8 mode, add on the number of additional bytes needed to
4605 encode this character, and save the total length in case this is a
4606 final char that is repeated. */
4607
4608 #ifdef SUPPORT_UTF8
4609 if (utf8 && c > 127)
4610 {
4611 int i;
4612 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4613 if (c <= utf8_table1[i]) break;
4614 runlength += i;
4615 lastcharlength += i;
4616 }
4617 #endif
4618 }
4619
4620 /* Ordinary character or single-char escape */
4621
4622 runlength++;
4623 }
4624
4625 /* This "while" is the end of the "do" above. */
4626
4627 while (runlength < MAXLIT &&
4628 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
4629
4630 /* If we hit a meta-character, back off to point to it */
4631
4632 if (runlength < MAXLIT) ptr--;
4633
4634 /* If the last char in the string is a UTF-8 multibyte character, we must
4635 set lastcharlength correctly. If it was specified as an escape, this will
4636 already have been done above. However, we also have to support in-line
4637 UTF-8 characters, so check backwards from where we are. */
4638
4639 #ifdef SUPPORT_UTF8
4640 if (utf8)
4641 {
4642 const uschar *lastptr = ptr - 1;
4643 if ((*lastptr & 0x80) != 0)
4644 {
4645 while((*lastptr & 0xc0) == 0x80) lastptr--;
4646 lastcharlength = ptr - lastptr;
4647 }
4648 }
4649 #endif
4650
4651 length += runlength;
4652 continue;
4653 }
4654 }
4655
4656 length += 2 + LINK_SIZE; /* For final KET and END */
4657
4658 if (length > MAX_PATTERN_SIZE)
4659 {
4660 *errorptr = ERR20;
4661 return NULL;
4662 }
4663
4664 /* Compute the size of data block needed and get it, either from malloc or
4665 externally provided function. */
4666
4667 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4668 re = (real_pcre *)(pcre_malloc)(size);
4669
4670 if (re == NULL)
4671 {
4672 *errorptr = ERR21;
4673 return NULL;
4674 }
4675
4676 /* Put in the magic number, and save the size, options, and table pointer */
4677
4678 re->magic_number = MAGIC_NUMBER;
4679 re->size = size;
4680 re->options = options;
4681 re->tables = tables;
4682 re->name_entry_size = max_name_size + 3;
4683 re->name_count = name_count;
4684
4685 /* The starting points of the name/number translation table and of the code are
4686 passed around in the compile data block. */
4687
4688 compile_block.names_found = 0;
4689 compile_block.name_entry_size = max_name_size + 3;
4690 compile_block.name_table = (uschar *)re + sizeof(real_pcre);
4691 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4692 compile_block.start_code = codestart;
4693 compile_block.req_varyopt = 0;
4694
4695 /* Set up a starting, non-extracting bracket, then compile the expression. On
4696 error, *errorptr will be set non-NULL, so we don't need to look at the result
4697 of the function here. */
4698
4699 ptr = (const uschar *)pattern;
4700 code = (uschar *)codestart;
4701 *code = OP_BRA;
4702 bracount = 0;
4703 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4704 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4705 re->top_bracket = bracount;
4706 re->top_backref = compile_block.top_backref;
4707
4708 /* If not reached end of pattern on success, there's an excess bracket. */
4709
4710 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
4711
4712 /* Fill in the terminating state and check for disastrous overflow, but
4713 if debugging, leave the test till after things are printed out. */
4714
4715 *code++ = OP_END;
4716
4717 #ifndef DEBUG
4718 if (code - codestart > length) *errorptr = ERR23;
4719 #endif
4720
4721 /* Give an error if there's back reference to a non-existent capturing
4722 subpattern. */
4723
4724 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
4725
4726 /* Failed to compile, or error while post-processing */
4727
4728 if (*errorptr != NULL)
4729 {
4730 (pcre_free)(re);
4731 PCRE_ERROR_RETURN:
4732 *erroroffset = ptr - (const uschar *)pattern;
4733 return NULL;
4734 }
4735
4736 /* If the anchored option was not passed, set the flag if we can determine that
4737 the pattern is anchored by virtue of ^ characters or \A or anything else (such
4738 as starting with .* when DOTALL is set).
4739
4740 Otherwise, if we know what the first character has to be, save it, because that
4741 speeds up unanchored matches no end. If not, see if we can set the
4742 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4743 start with ^. and also when all branches start with .* for non-DOTALL matches.
4744 */
4745
4746 if ((options & PCRE_ANCHORED) == 0)
4747 {
4748 int temp_options = options;
4749 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4750 re->options |= PCRE_ANCHORED;
4751 else
4752 {
4753 if (firstbyte < 0)
4754 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4755 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
4756 {
4757 int ch = firstbyte & 255;
4758 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4759 compile_block.fcc[ch] == ch)? ch : firstbyte;
4760 re->options |= PCRE_FIRSTSET;
4761 }
4762 else if (is_startline(codestart, 0, compile_block.backref_map))
4763 re->options |= PCRE_STARTLINE;
4764 }
4765 }
4766
4767 /* For an anchored pattern, we use the "required byte" only if it follows a
4768 variable length item in the regex. Remove the caseless flag for non-caseable
4769 chars. */
4770
4771 if (reqbyte >= 0 &&
4772 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4773 {
4774 int ch = reqbyte & 255;
4775 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4776 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
4777 re->options |= PCRE_REQCHSET;
4778 }
4779
4780 /* Print out the compiled data for debugging */
4781
4782 #ifdef DEBUG
4783
4784 printf("Length = %d top_bracket = %d top_backref = %d\n",
4785 length, re->top_bracket, re->top_backref);
4786
4787 if (re->options != 0)
4788 {
4789 printf("%s%s%s%s%s%s%s%s%s\n",
4790 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
4791 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
4792 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
4793 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
4794 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
4795 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
4796 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
4797 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
4798 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
4799 }
4800
4801 if ((re->options & PCRE_FIRSTSET) != 0)
4802 {
4803 int ch = re->first_byte & 255;
4804 char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4805 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
4806 else printf("First char = \\x%02x%s\n", ch, caseless);
4807 }
4808
4809 if ((re->options & PCRE_REQCHSET) != 0)
4810 {
4811 int ch = re->req_byte & 255;
4812 char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4813 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
4814 else printf("Req char = \\x%02x%s\n", ch, caseless);
4815 }
4816
4817 print_internals(re, stdout);
4818
4819 /* This check is done here in the debugging case so that the code that
4820 was compiled can be seen. */
4821
4822 if (code - codestart > length)
4823 {
4824 *errorptr = ERR23;
4825 (pcre_free)(re);
4826 *erroroffset = ptr - (uschar *)pattern;
4827 return NULL;
4828 }
4829 #endif
4830
4831 return (pcre *)re;
4832 }
4833
4834
4835
4836 /*************************************************
4837 * Match a back-reference *
4838 *************************************************/
4839
4840 /* If a back reference hasn't been set, the length that is passed is greater
4841 than the number of characters left in the string, so the match fails.
4842
4843 Arguments:
4844 offset index into the offset vector
4845 eptr points into the subject
4846 length length to be matched
4847 md points to match data block
4848 ims the ims flags
4849
4850 Returns: TRUE if matched
4851 */
4852
4853 static BOOL
4854 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
4855 unsigned long int ims)
4856 {
4857 const uschar *p = md->start_subject + md->offset_vector[offset];
4858
4859 #ifdef DEBUG
4860 if (eptr >= md->end_subject)
4861 printf("matching subject <null>");
4862 else
4863 {
4864 printf("matching subject ");
4865 pchars(eptr, length, TRUE, md);
4866 }
4867 printf(" against backref ");
4868 pchars(p, length, FALSE, md);
4869 printf("\n");
4870 #endif
4871
4872 /* Always fail if not enough characters left */
4873
4874 if (length > md->end_subject - eptr) return FALSE;
4875
4876 /* Separate the caselesss case for speed */
4877
4878 if ((ims & PCRE_CASELESS) != 0)
4879 {
4880 while (length-- > 0)
4881 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
4882 }
4883 else
4884 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
4885
4886 return TRUE;
4887 }
4888
4889
4890 #ifdef SUPPORT_UTF8
4891 /*************************************************
4892 * Match character against an XCLASS *
4893 *************************************************/
4894
4895 /* This function is called from within the XCLASS code below, to match a
4896 character against an extended class which might match values > 255.
4897
4898 Arguments:
4899 c the character
4900 data points to the flag byte of the XCLASS data
4901
4902 Returns: TRUE if character matches, else FALSE
4903 */
4904
4905 static BOOL
4906 match_xclass(int c, const uschar *data)
4907 {
4908 int t;
4909 BOOL negated = (*data & XCL_NOT) != 0;
4910
4911 /* Character values < 256 are matched against a bitmap, if one is present. If
4912 not, we still carry on, because there may be ranges that start below 256 in the
4913 additional data. */
4914
4915 if (c < 256)
4916 {
4917 if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
4918 return !negated; /* char found */
4919 }
4920
4921 /* Now match against the list of large chars or ranges that end with a large
4922 char. First skip the bit map if present. */
4923
4924 if ((*data++ & XCL_MAP) != 0) data += 32;
4925
4926 while ((t = *data++) != XCL_END)
4927 {
4928 int x, y;
4929 GETCHARINC(x, data);
4930 if (t == XCL_SINGLE)
4931 {
4932 if (c == x) return !negated;
4933 }
4934 else
4935 {
4936 GETCHARINC(y, data);
4937 if (c >= x && c <= y) return !negated;
4938 }
4939 }
4940
4941 return negated; /* char was not found */
4942 }
4943 #endif
4944
4945
4946
4947
4948 /*************************************************
4949 * Match from current position *
4950 *************************************************/
4951
4952 /* On entry ecode points to the first opcode, and eptr to the first character
4953 in the subject string, while eptrb holds the value of eptr at the start of the
4954 last bracketed group - used for breaking infinite loops matching zero-length
4955 strings. This function is called recursively in many circumstances. Whenever it
4956 returns a negative (error) response, the outer incarnation must also return the
4957 same response.
4958
4959 Performance note: It might be tempting to extract commonly used fields from the
4960 md structure (e.g. utf8, end_subject) into individual variables to improve
4961 performance. Tests using gcc on a SPARC disproved this; in the first case, it
4962 made performance worse.
4963
4964 Arguments:
4965 eptr pointer in subject
4966 ecode position in code
4967 offset_top current top pointer
4968 md pointer to "static" info for the match
4969 ims current /i, /m, and /s options
4970 eptrb pointer to chain of blocks containing eptr at start of
4971 brackets - for testing for empty matches
4972 flags can contain
4973 match_condassert - this is an assertion condition
4974 match_isgroup - this is the start of a bracketed group
4975
4976 Returns: MATCH_MATCH if matched ) these values are >= 0
4977 MATCH_NOMATCH if failed to match )
4978 a negative PCRE_ERROR_xxx value if aborted by an error condition
4979 (e.g. stopped by recursion limit)
4980 */
4981
4982 static int
4983 match(register const uschar *eptr, register const uschar *ecode,
4984 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
4985 int flags)
4986 {
4987 unsigned long int original_ims = ims; /* Save for resetting on ')' */
4988 register int rrc;
4989 eptrblock newptrb;
4990
4991 if (md->match_call_count++ >= md->match_limit) return PCRE_ERROR_MATCHLIMIT;
4992
4993 /* At the start of a bracketed group, add the current subject pointer to the
4994 stack of such pointers, to be re-instated at the end of the group when we hit
4995 the closing ket. When match() is called in other circumstances, we don't add to
4996 the stack. */
4997
4998 if ((flags & match_isgroup) != 0)
4999 {
5000 newptrb.prev = eptrb;
5001 newptrb.saved_eptr = eptr;
5002 eptrb = &newptrb;
5003 }
5004
5005 /* Now start processing the operations. */
5006
5007 for (;;)
5008 {
5009 int op = (int)*ecode;
5010 int min, max, ctype;
5011 register int i;
5012 register int c;
5013 BOOL minimize = FALSE;
5014
5015 /* Opening capturing bracket. If there is space in the offset vector, save
5016 the current subject position in the working slot at the top of the vector. We
5017 mustn't change the current values of the data slot, because they may be set
5018 from a previous iteration of this group, and be referred to by a reference
5019 inside the group.
5020
5021 If the bracket fails to match, we need to restore this value and also the
5022 values of the final offsets, in case they were set by a previous iteration of
5023 the same bracket.
5024
5025 If there isn't enough space in the offset vector, treat this as if it were a
5026 non-capturing bracket. Don't worry about setting the flag for the error case
5027 here; that is handled in the code for KET. */
5028
5029 if (op > OP_BRA)
5030 {
5031 int offset;
5032 int number = op - OP_BRA;
5033
5034 /* For extended extraction brackets (large number), we have to fish out the
5035 number from a dummy opcode at the start. */
5036
5037 if (number > EXTRACT_BASIC_MAX)
5038 number = GET2(ecode, 2+LINK_SIZE);
5039 offset = number << 1;
5040
5041 #ifdef DEBUG
5042 printf("start bracket %d subject=", number);
5043 pchars(eptr, 16, TRUE, md);
5044 printf("\n");
5045 #endif
5046
5047 if (offset < md->offset_max)
5048 {
5049 int save_offset1 = md->offset_vector[offset];
5050 int save_offset2 = md->offset_vector[offset+1];
5051 int save_offset3 = md->offset_vector[md->offset_end - number];
5052 int save_capture_last = md->capture_last;
5053
5054 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
5055 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
5056
5057 do
5058 {
5059 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5060 eptrb, match_isgroup)) != MATCH_NOMATCH) return rrc;
5061 md->capture_last = save_capture_last;
5062 ecode += GET(ecode, 1);
5063 }
5064 while (*ecode == OP_ALT);
5065
5066 DPRINTF(("bracket %d failed\n", number));
5067
5068 md->offset_vector[offset] = save_offset1;
5069 md->offset_vector[offset+1] = save_offset2;
5070 md->offset_vector[md->offset_end - number] = save_offset3;
5071
5072 return MATCH_NOMATCH;
5073 }
5074
5075 /* Insufficient room for saving captured contents */
5076
5077 else op = OP_BRA;
5078 }
5079
5080 /* Other types of node can be handled by a switch */
5081
5082 switch(op)
5083 {
5084 case OP_BRA: /* Non-capturing bracket: optimized */
5085 DPRINTF(("start bracket 0\n"));
5086 do
5087 {
5088 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5089 match_isgroup)) != MATCH_NOMATCH) return rrc;
5090 ecode += GET(ecode, 1);
5091 }
5092 while (*ecode == OP_ALT);
5093 DPRINTF(("bracket 0 failed\n"));
5094 return MATCH_NOMATCH;
5095
5096 /* Conditional group: compilation checked that there are no more than
5097 two branches. If the condition is false, skipping the first branch takes us
5098 past the end if there is only one branch, but that's OK because that is
5099 exactly what going to the ket would do. */
5100
5101 case OP_COND:
5102 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
5103 {
5104 int offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
5105 BOOL condition = (offset == CREF_RECURSE * 2)?
5106 (md->recursive != NULL) :
5107 (offset < offset_top && md->offset_vector[offset] >= 0);
5108 return match(eptr, ecode + (condition?
5109 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
5110 offset_top, md, ims, eptrb, match_isgroup);
5111 }
5112
5113 /* The condition is an assertion. Call match() to evaluate it - setting
5114 the final argument TRUE causes it to stop at the end of an assertion. */
5115
5116 else
5117 {
5118 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5119 match_condassert | match_isgroup)) == MATCH_MATCH)
5120 {
5121 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
5122 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
5123 }
5124 else if (rrc != MATCH_NOMATCH) return rrc;
5125 else ecode += GET(ecode, 1);
5126 return match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5127 match_isgroup);
5128 }
5129 /* Control never reaches here */
5130
5131 /* Skip over conditional reference or large extraction number data if
5132 encountered. */
5133
5134 case OP_CREF:
5135 case OP_BRANUMBER:
5136 ecode += 3;
5137 break;
5138
5139 /* End of the pattern. If we are in a recursion, we should restore the
5140 offsets appropriately and continue from after the call. */
5141
5142 case OP_END:
5143 if (md->recursive != NULL && md->recursive->group_num == 0)
5144 {
5145 recursion_info *rec = md->recursive;
5146 DPRINTF(("Hit the end in a (?0) recursion\n"));
5147 md->recursive = rec->prev;
5148 memmove(md->offset_vector, rec->offset_save,
5149 rec->saved_max * sizeof(int));
5150 md->start_match = rec->save_start;
5151 ims = original_ims;
5152 ecode = rec->after_call;
5153 break;
5154 }
5155
5156 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
5157 string - backtracking will then try other alternatives, if any. */
5158
5159 if (md->notempty && eptr == md->start_match) return MATCH_NOMATCH;
5160 md->end_match_ptr = eptr; /* Record where we ended */
5161 md->end_offset_top = offset_top; /* and how many extracts were taken */
5162 return MATCH_MATCH;
5163
5164 /* Change option settings */
5165
5166 case OP_OPT:
5167 ims = ecode[1];
5168 ecode += 2;
5169 DPRINTF(("ims set to %02lx\n", ims));
5170 break;
5171
5172 /* Assertion brackets. Check the alternative branches in turn - the
5173 matching won't pass the KET for an assertion. If any one branch matches,
5174 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
5175 start of each branch to move the current point backwards, so the code at
5176 this level is identical to the lookahead case. */
5177
5178 case OP_ASSERT:
5179 case OP_ASSERTBACK:
5180 do
5181 {
5182 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5183 match_isgroup)) == MATCH_MATCH) break;
5184 if (rrc != MATCH_NOMATCH) return rrc;
5185 ecode += GET(ecode, 1);
5186 }
5187 while (*ecode == OP_ALT);
5188 if (*ecode == OP_KET) return MATCH_NOMATCH;
5189
5190 /* If checking an assertion for a condition, return MATCH_MATCH. */
5191
5192 if ((flags & match_condassert) != 0) return MATCH_MATCH;
5193
5194 /* Continue from after the assertion, updating the offsets high water
5195 mark, since extracts may have been taken during the assertion. */
5196
5197 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5198 ecode += 1 + LINK_SIZE;
5199 offset_top = md->end_offset_top;
5200 continue;
5201
5202 /* Negative assertion: all branches must fail to match */
5203
5204 case OP_ASSERT_NOT:
5205 case OP_ASSERTBACK_NOT:
5206 do
5207 {
5208 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5209 match_isgroup)) == MATCH_MATCH) return MATCH_NOMATCH;
5210 if (rrc != MATCH_NOMATCH) return rrc;
5211 ecode += GET(ecode,1);
5212 }
5213 while (*ecode == OP_ALT);
5214
5215 if ((flags & match_condassert) != 0) return MATCH_MATCH;
5216
5217 ecode += 1 + LINK_SIZE;
5218 continue;
5219
5220 /* Move the subject pointer back. This occurs only at the start of
5221 each branch of a lookbehind assertion. If we are too close to the start to
5222 move back, this match function fails. When working with UTF-8 we move
5223 back a number of characters, not bytes. */
5224
5225 case OP_REVERSE:
5226 #ifdef SUPPORT_UTF8
5227 if (md->utf8)
5228 {
5229 c = GET(ecode,1);
5230 for (i = 0; i < c; i++)
5231 {
5232 eptr--;
5233 if (eptr < md->start_subject) return MATCH_NOMATCH;
5234 BACKCHAR(eptr)
5235 }
5236 }
5237 else
5238 #endif
5239
5240 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
5241
5242 {
5243 eptr -= GET(ecode,1);
5244 if (eptr < md->start_subject) return MATCH_NOMATCH;
5245 }
5246
5247 /* Skip to next op code */
5248
5249 ecode += 1 + LINK_SIZE;
5250 break;
5251
5252 /* The callout item calls an external function, if one is provided, passing
5253 details of the match so far. This is mainly for debugging, though the
5254 function is able to force a failure. */
5255
5256 case OP_CALLOUT:
5257 if (pcre_callout != NULL)
5258 {
5259 pcre_callout_block cb;
5260 cb.version = 0; /* Version 0 of the callout block */
5261 cb.callout_number = ecode[1];
5262 cb.offset_vector = md->offset_vector;
5263 cb.subject = (const char *)md->start_subject;
5264 cb.subject_length = md->end_subject - md->start_subject;
5265 cb.start_match = md->start_match - md->start_subject;
5266 cb.current_position = eptr - md->start_subject;
5267 cb.capture_top = offset_top/2;
5268 cb.capture_last = md->capture_last;
5269 cb.callout_data = md->callout_data;
5270 if ((rrc = (*pcre_callout)(&cb)) > 0) return MATCH_NOMATCH;
5271 if (rrc < 0) return rrc;
5272 }
5273 ecode += 2;
5274 break;
5275
5276 /* Recursion either matches the current regex, or some subexpression. The
5277 offset data is the offset to the starting bracket from the start of the
5278 whole pattern. However, it is possible that a BRAZERO was inserted before
5279 this bracket after we took the offset - we just skip it if encountered.
5280
5281 If there are any capturing brackets started but not finished, we have to
5282 save their starting points and reinstate them after the recursion. However,
5283 we don't know how many such there are (offset_top records the completed
5284 total) so we just have to save all the potential data. There may be up to
5285 65535 such values, which is too large to put on the stack, but using malloc
5286 for small numbers seems expensive. As a compromise, the stack is used when
5287 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
5288 is used. If the malloc fails, PCRE_ERROR_NOMEMORY is returned at once; like
5289 any result other than MATCH_MATCH or MATCH_NOMATCH, the callers of match()
5290 seen in this function hand it straight back up rather than backtracking.
5291
5292 There are also other values that have to be saved. We use a chained
5293 sequence of blocks that actually live on the stack. Thanks to Robin Houston
5294 for the original version of this logic. */
5295
5296 case OP_RECURSE:
5297 {
5298 int stacksave[REC_STACK_SAVE_MAX];
5299 recursion_info new_recursive;
5300 const uschar *callpat = md->start_code + GET(ecode, 1);
5301
5302 if (*callpat == OP_BRAZERO) callpat++;
5303
5304 new_recursive.group_num = *callpat - OP_BRA;
5305
5306 /* For extended extraction brackets (large number), we have to fish out
5307 the number from a dummy opcode at the start. */
5308
5309 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
5310 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
5311
5312 /* Add to "recursing stack" */
5313
5314 new_recursive.prev = md->recursive;
5315 md->recursive = &new_recursive;
5316
5317 /* Find where to continue from afterwards */
5318
5319 ecode += 1 + LINK_SIZE;
5320 new_recursive.after_call = ecode;
5321
5322 /* Now save the offset data. */
5323
5324 new_recursive.saved_max = md->offset_end;
5325 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
5326 new_recursive.offset_save = stacksave;
5327 else
5328 {
5329 new_recursive.offset_save =
5330 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
5331 if (new_recursive.offset_save == NULL) return PCRE_ERROR_NOMEMORY;
5332 }
5333
5334 memcpy(new_recursive.offset_save, md->offset_vector,
5335 new_recursive.saved_max * sizeof(int));
5336 new_recursive.save_start = md->start_match;
5337 md->start_match = eptr;
5338
5339 /* OK, now we can do the recursion. For each top-level alternative we
5340 restore the offset and recursion data. */
5341
5342 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
5343 do
5344 {
5345 if ((rrc = match(eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
5346 eptrb, match_isgroup)) == MATCH_MATCH)
5347 {
5348 md->recursive = new_recursive.prev;
5349 if (new_recursive.offset_save != stacksave)
5350 (pcre_free)(new_recursive.offset_save);
5351 return MATCH_MATCH;
5352 }
5353 else if (rrc != MATCH_NOMATCH) return rrc;
5354
5355 md->recursive = &new_recursive;
5356 memcpy(md->offset_vector, new_recursive.offset_save,
5357 new_recursive.saved_max * sizeof(int));
5358 callpat += GET(callpat, 1);
5359 }
5360 while (*callpat == OP_ALT);
5361
5362 DPRINTF(("Recursion didn't match\n"));
5363 md->recursive = new_recursive.prev;
5364 if (new_recursive.offset_save != stacksave)
5365 (pcre_free)(new_recursive.offset_save);
5366 return MATCH_NOMATCH;
5367 }
5368 /* Control never reaches here */
5369
5370 /* "Once" brackets are like assertion brackets except that after a match,
5371 the point in the subject string is not moved back. Thus there can never be
5372 a move back into the brackets. Friedl calls these "atomic" subpatterns.
5373 Check the alternative branches in turn - the matching won't pass the KET
5374 for this kind of subpattern. If any one branch matches, we carry on as at
5375 the end of a normal bracket, leaving the subject pointer. */
5376
5377 case OP_ONCE:
5378 {
5379 const uschar *prev = ecode;
5380 const uschar *saved_eptr = eptr;
5381
5382 do
5383 {
5384 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5385 eptrb, match_isgroup)) == MATCH_MATCH) break;
5386 if (rrc != MATCH_NOMATCH) return rrc;
5387 ecode += GET(ecode,1);
5388 }
5389 while (*ecode == OP_ALT);
5390
5391 /* If hit the end of the group (which could be repeated), fail */
5392
5393 if (*ecode != OP_ONCE && *ecode != OP_ALT) return MATCH_NOMATCH;
5394
5395 /* Continue as from after the assertion, updating the offsets high water
5396 mark, since extracts may have been taken. */
5397
5398 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5399
5400 offset_top = md->end_offset_top;
5401 eptr = md->end_match_ptr;
5402
5403 /* For a non-repeating ket, just continue at this level. This also
5404 happens for a repeating ket if no characters were matched in the group.
5405 This is the forcible breaking of infinite loops as implemented in Perl
5406 5.005. If there is an options reset, it will get obeyed in the normal
5407 course of events. */
5408
5409 if (*ecode == OP_KET || eptr == saved_eptr)
5410 {
5411 ecode += 1+LINK_SIZE;
5412 break;
5413 }
5414
5415 /* The repeating kets try the rest of the pattern or restart from the
5416 preceding bracket, in the appropriate order. We need to reset any options
5417 that changed within the bracket before re-running it, so check the next
5418 opcode. */
5419
5420 if (ecode[1+LINK_SIZE] == OP_OPT)
5421 {
5422 ims = (ims & ~PCRE_IMS) | ecode[4];
5423 DPRINTF(("ims set to %02lx at group repeat\n", ims));
5424 }
5425
5426 if (*ecode == OP_KETRMIN)
5427 {
5428 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5429 eptrb, 0)) != MATCH_NOMATCH) return rrc;
5430 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5431 match_isgroup)) != MATCH_NOMATCH) return rrc;
5432 }
5433 else /* OP_KETRMAX */
5434 {
5435 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5436 match_isgroup)) != MATCH_NOMATCH) return rrc;
5437 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5438 0)) != MATCH_NOMATCH) return rrc;
5439 }
5440 }
5441 return MATCH_NOMATCH;
5442
5443 /* An alternation is the end of a branch; scan along to find the end of the
5444 bracketed group and go to there. */
5445
5446 case OP_ALT:
5447 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5448 break;
5449
5450 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
5451 that it may occur zero times. It may repeat infinitely, or not at all -
5452 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
5453 repeat limits are compiled as a number of copies, with the optional ones
5454 preceded by BRAZERO or BRAMINZERO. */
5455
5456 case OP_BRAZERO:
5457 {
5458 const uschar *next = ecode+1;
5459 if ((rrc = match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
5460 != MATCH_NOMATCH) return rrc;
5461 do next += GET(next,1); while (*next == OP_ALT);
5462 ecode = next + 1+LINK_SIZE;
5463 }
5464 break;
5465
5466 case OP_BRAMINZERO:
5467 {
5468 const uschar *next = ecode+1;
5469 do next += GET(next,1); while (*next == OP_ALT);
5470 if ((rrc = match(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5471 match_isgroup)) != MATCH_NOMATCH) return rrc;
5472 ecode++;
5473 }
5474 break;
5475
5476 /* End of a group, repeated or non-repeating. If we are at the end of
5477 an assertion "group", stop matching and return MATCH_MATCH, but record the
5478 current high water mark for use by positive assertions. Do this also
5479 for the "once" (atomic) groups. */
5480
5481 case OP_KET:
5482 case OP_KETRMIN:
5483 case OP_KETRMAX:
5484 {
5485 const uschar *prev = ecode - GET(ecode, 1);
5486 const uschar *saved_eptr = eptrb->saved_eptr;
5487
5488 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
5489
5490 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
5491 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
5492 *prev == OP_ONCE)
5493 {
5494 md->end_match_ptr = eptr; /* For ONCE */
5495 md->end_offset_top = offset_top;
5496 return MATCH_MATCH;
5497 }
5498
5499 /* In all other cases except a conditional group we have to check the
5500 group number back at the start and if necessary complete handling an
5501 extraction by setting the offsets and bumping the high water mark. */
5502
5503 if (*prev != OP_COND)
5504 {
5505 int offset;
5506 int number = *prev - OP_BRA;
5507
5508 /* For extended extraction brackets (large number), we have to fish out
5509 the number from a dummy opcode at the start. */
5510
5511 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
5512 offset = number << 1;
5513
5514 #ifdef DEBUG
5515 printf("end bracket %d", number);
5516 printf("\n");
5517 #endif
5518
5519 /* Test for a numbered group. This includes groups called as a result
5520 of recursion. Note that whole-pattern recursion is coded as a recurse
5521 into group 0, so it won't be picked up here. Instead, we catch it when
5522 the OP_END is reached. */
5523
5524 if (number > 0)
5525 {
5526 md->capture_last = number;
5527 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
5528 {
5529 md->offset_vector[offset] =
5530 md->offset_vector[md->offset_end - number];
5531 md->offset_vector[offset+1] = eptr - md->start_subject;
5532 if (offset_top <= offset) offset_top = offset + 2;
5533 }
5534
5535 /* Handle a recursively called group. Restore the offsets
5536 appropriately and continue from after the call. */
5537
5538 if (md->recursive != NULL && md->recursive->group_num == number)
5539 {
5540 recursion_info *rec = md->recursive;
5541 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
5542 md->recursive = rec->prev;
5543 md->start_match = rec->save_start;
5544 memcpy(md->offset_vector, rec->offset_save,
5545 rec->saved_max * sizeof(int));
5546 ecode = rec->after_call;
5547 ims = original_ims;
5548 break;
5549 }
5550 }
5551 }
5552
5553 /* Reset the value of the ims flags, in case they got changed during
5554 the group. */
5555
5556 ims = original_ims;
5557 DPRINTF(("ims reset to %02lx\n", ims));
5558
5559 /* For a non-repeating ket, just continue at this level. This also
5560 happens for a repeating ket if no characters were matched in the group.
5561 This is the forcible breaking of infinite loops as implemented in Perl
5562 5.005. If there is an options reset, it will get obeyed in the normal
5563 course of events. */
5564
5565 if (*ecode == OP_KET || eptr == saved_eptr)
5566 {
5567 ecode += 1 + LINK_SIZE;
5568 break;
5569 }
5570
5571 /* The repeating kets try the rest of the pattern or restart from the
5572 preceding bracket, in the appropriate order. */
5573
5574 if (*ecode == OP_KETRMIN)
5575 {
5576 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5577 0)) != MATCH_NOMATCH) return rrc;
5578 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5579 match_isgroup)) != MATCH_NOMATCH) return rrc;
5580 }
5581 else /* OP_KETRMAX */
5582 {
5583 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5584 match_isgroup)) != MATCH_NOMATCH) return rrc;
5585 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5586 0)) != MATCH_NOMATCH) return rrc;
5587 }
5588 }
5589 return MATCH_NOMATCH;
5590
5591 /* Start of subject unless notbol, or after internal newline if multiline */
5592
5593 case OP_CIRC:
5594 if (md->notbol && eptr == md->start_subject) return MATCH_NOMATCH;
5595 if ((ims & PCRE_MULTILINE) != 0)
5596 {
5597 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
5598 return MATCH_NOMATCH;
5599 ecode++;
5600 break;
5601 }
5602 /* ... else fall through */
5603
5604 /* Start of subject assertion */
5605
5606 case OP_SOD:
5607 if (eptr != md->start_subject) return MATCH_NOMATCH;
5608 ecode++;
5609 break;
5610
5611 /* Start of match assertion */
5612
5613 case OP_SOM:
5614 if (eptr != md->start_subject + md->start_offset) return MATCH_NOMATCH;
5615 ecode++;
5616 break;
5617
5618 /* Assert before internal newline if multiline, or before a terminating
5619 newline unless endonly is set, else end of subject unless noteol is set. */
5620
5621 case OP_DOLL:
5622 if ((ims & PCRE_MULTILINE) != 0)
5623 {
5624 if (eptr < md->end_subject)
5625 { if (*eptr != NEWLINE) return MATCH_NOMATCH; }
5626 else
5627 { if (md->noteol) return MATCH_NOMATCH; }
5628 ecode++;
5629 break;
5630 }
5631 else
5632 {
5633 if (md->noteol) return MATCH_NOMATCH;
5634 if (!md->endonly)
5635 {
5636 if (eptr < md->end_subject - 1 ||
5637 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
5638 return MATCH_NOMATCH;
5639 ecode++;
5640 break;
5641 }
5642 }
5643 /* ... else fall through */
5644
5645 /* End of subject assertion (\z) */
5646
5647 case OP_EOD:
5648 if (eptr < md->end_subject) return MATCH_NOMATCH;
5649 ecode++;
5650 break;
5651
5652 /* End of subject or ending \n assertion (\Z) */
5653
5654 case OP_EODN:
5655 if (eptr < md->end_subject - 1 ||
5656 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return MATCH_NOMATCH;
5657 ecode++;
5658 break;
5659
5660 /* Word boundary assertions */
5661
5662 case OP_NOT_WORD_BOUNDARY:
5663 case OP_WORD_BOUNDARY:
5664 {
5665 BOOL prev_is_word, cur_is_word;
5666
5667 /* Find out if the previous and current characters are "word" characters.
5668 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
5669 be "non-word" characters. */
5670
5671 #ifdef SUPPORT_UTF8
5672 if (md->utf8)
5673 {
5674 if (eptr == md->start_subject) prev_is_word = FALSE; else
5675 {
5676 const uschar *lastptr = eptr - 1;
5677 while((*lastptr & 0xc0) == 0x80) lastptr--;
5678 GETCHAR(c, lastptr);
5679 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5680 }
5681 if (eptr >= md->end_subject) cur_is_word = FALSE; else
5682 {
5683 GETCHAR(c, eptr);
5684 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5685 }
5686 }
5687 else
5688 #endif
5689
5690 /* More streamlined when not in UTF-8 mode */
5691
5692 {
5693 prev_is_word = (eptr != md->start_subject) &&
5694 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
5695 cur_is_word = (eptr < md->end_subject) &&
5696 ((md->ctypes[*eptr] & ctype_word) != 0);
5697 }
5698
5699 /* Now see if the situation is what we want */
5700
5701 if ((*ecode++ == OP_WORD_BOUNDARY)?
5702 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
5703 return MATCH_NOMATCH;
5704 }
5705 break;
5706
5707 /* Match a single character type; inline for speed */
5708
5709 case OP_ANY:
5710 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
5711 return MATCH_NOMATCH;
5712 if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5713 #ifdef SUPPORT_UTF8
5714 if (md->utf8)
5715 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5716 #endif
5717 ecode++;
5718 break;
5719
5720 /* Match a single byte, even in UTF-8 mode. This opcode really does match
5721 any byte, even newline, independent of the setting of PCRE_DOTALL. */
5722
5723 case OP_ANYBYTE:
5724 if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5725 ecode++;
5726 break;
5727
5728 case OP_NOT_DIGIT:
5729 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5730 GETCHARINCTEST(c, eptr);
5731 if (
5732 #ifdef SUPPORT_UTF8
5733 c < 256 &&
5734 #endif
5735 (md->ctypes[c] & ctype_digit) != 0
5736 )
5737 return MATCH_NOMATCH;
5738 ecode++;
5739 break;
5740
5741 case OP_DIGIT:
5742 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5743 GETCHARINCTEST(c, eptr);
5744 if (
5745 #ifdef SUPPORT_UTF8
5746 c >= 256 ||
5747 #endif
5748 (md->ctypes[c] & ctype_digit) == 0
5749 )
5750 return MATCH_NOMATCH;
5751 ecode++;
5752 break;
5753
5754 case OP_NOT_WHITESPACE:
5755 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5756 GETCHARINCTEST(c, eptr);
5757 if (
5758 #ifdef SUPPORT_UTF8
5759 c < 256 &&
5760 #endif
5761 (md->ctypes[c] & ctype_space) != 0
5762 )
5763 return MATCH_NOMATCH;
5764 ecode++;
5765 break;
5766
5767 case OP_WHITESPACE:
5768 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5769 GETCHARINCTEST(c, eptr);
5770 if (
5771 #ifdef SUPPORT_UTF8
5772 c >= 256 ||
5773 #endif
5774 (md->ctypes[c] & ctype_space) == 0
5775 )
5776 return MATCH_NOMATCH;
5777 ecode++;
5778 break;
5779
5780 case OP_NOT_WORDCHAR:
5781 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5782 GETCHARINCTEST(c, eptr);
5783 if (
5784 #ifdef SUPPORT_UTF8
5785 c < 256 &&
5786 #endif
5787 (md->ctypes[c] & ctype_word) != 0
5788 )
5789 return MATCH_NOMATCH;
5790 ecode++;
5791 break;
5792
5793 case OP_WORDCHAR:
5794 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5795 GETCHARINCTEST(c, eptr);
5796 if (
5797 #ifdef SUPPORT_UTF8
5798 c >= 256 ||
5799 #endif
5800 (md->ctypes[c] & ctype_word) == 0
5801 )
5802 return MATCH_NOMATCH;
5803 ecode++;
5804 break;
5805
5806 /* Match a back reference, possibly repeatedly. Look past the end of the
5807 item to see if there is repeat information following. The code is similar
5808 to that for character classes, but repeated for efficiency. Then obey
5809 similar code to character type repeats - written out again for speed.
5810 However, if the referenced string is the empty string, always treat
5811 it as matched, any number of times (otherwise there could be infinite
5812 loops). */
5813
5814 case OP_REF:
5815 {
5816 int length;
5817 int offset = GET2(ecode, 1) << 1; /* Doubled ref number */
5818 ecode += 3; /* Advance past item */
5819
5820 /* If the reference is unset, set the length to be longer than the amount
5821 of subject left; this ensures that every attempt at a match fails. We
5822 can't just fail here, because of the possibility of quantifiers with zero
5823 minima. */
5824
5825 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
5826 md->end_subject - eptr + 1 :
5827 md->offset_vector[offset+1] - md->offset_vector[offset];
5828
5829 /* Set up for repetition, or handle the non-repeated case */
5830
5831 switch (*ecode)
5832 {
5833 case OP_CRSTAR:
5834 case OP_CRMINSTAR:
5835 case OP_CRPLUS:
5836 case OP_CRMINPLUS:
5837 case OP_CRQUERY:
5838 case OP_CRMINQUERY:
5839 c = *ecode++ - OP_CRSTAR;
5840 minimize = (c & 1) != 0;
5841 min = rep_min[c]; /* Pick up values from tables; */
5842 max = rep_max[c]; /* zero for max => infinity */
5843 if (max == 0) max = INT_MAX;
5844 break;
5845
5846 case OP_CRRANGE:
5847 case OP_CRMINRANGE:
5848 minimize = (*ecode == OP_CRMINRANGE);
5849 min = GET2(ecode, 1);
5850 max = GET2(ecode, 3);
5851 if (max == 0) max = INT_MAX;
5852 ecode += 5;
5853 break;
5854
5855 default: /* No repeat follows */
5856 if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5857 eptr += length;
5858 continue; /* With the main loop */
5859 }
5860
5861 /* If the length of the reference is zero, just continue with the
5862 main loop. */
5863
5864 if (length == 0) continue;
5865
5866 /* First, ensure the minimum number of matches are present. We get back
5867 the length of the reference string explicitly rather than passing the
5868 address of eptr, so that eptr can be a register variable. */
5869
5870 for (i = 1; i <= min; i++)
5871 {
5872 if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5873 eptr += length;
5874 }
5875
5876 /* If min = max, continue at the same level without recursion.
5877 They are not both allowed to be zero. */
5878
5879 if (min == max) continue;
5880
5881 /* If minimizing, keep trying and advancing the pointer */
5882
5883 if (minimize)
5884 {
5885 for (i = min;; i++)
5886 {
5887 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5888 MATCH_NOMATCH) return rrc;
5889 if (i >= max || !match_ref(offset, eptr, length, md, ims))
5890 return MATCH_NOMATCH;
5891 eptr += length;
5892 }
5893 /* Control never gets here */
5894 }
5895
5896 /* If maximizing, find the longest string and work backwards */
5897
5898 else
5899 {
5900 const uschar *pp = eptr;
5901 for (i = min; i < max; i++)
5902 {
5903 if (!match_ref(offset, eptr, length, md, ims)) break;
5904 eptr += length;
5905 }
5906 while (eptr >= pp)
5907 {
5908 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5909 MATCH_NOMATCH) return rrc;
5910 eptr -= length;
5911 }
5912 return MATCH_NOMATCH;
5913 }
5914 }
5915 /* Control never gets here */
5916
5917
5918
5919 /* Match a bit-mapped character class, possibly repeatedly. This op code is
5920 used when all the characters in the class have values in the range 0-255.
5921 The only difference between OP_CLASS and OP_NCLASS occurs when a data
5922 character outside the range is encountered.
5923
5924 First, look past the end of the item to see if there is repeat information
5925 following. Then obey similar code to character type repeats - written out
5926 again for speed. */
5927
5928 case OP_NCLASS:
5929 case OP_CLASS:
5930 {
5931 const uschar *data = ecode + 1; /* Save for matching */
5932 ecode += 33; /* Advance past the item */
5933
5934 switch (*ecode)
5935 {
5936 case OP_CRSTAR:
5937 case OP_CRMINSTAR:
5938 case OP_CRPLUS:
5939 case OP_CRMINPLUS:
5940 case OP_CRQUERY:
5941 case OP_CRMINQUERY:
5942 c = *ecode++ - OP_CRSTAR;
5943 minimize = (c & 1) != 0;
5944 min = rep_min[c]; /* Pick up values from tables; */
5945 max = rep_max[c]; /* zero for max => infinity */
5946 if (max == 0) max = INT_MAX;
5947 break;
5948
5949 case OP_CRRANGE:
5950 case OP_CRMINRANGE:
5951 minimize = (*ecode == OP_CRMINRANGE);
5952 min = GET2(ecode, 1);
5953 max = GET2(ecode, 3);
5954 if (max == 0) max = INT_MAX;
5955 ecode += 5;
5956 break;
5957
5958 default: /* No repeat follows */
5959 min = max = 1;
5960 break;
5961 }
5962
5963 /* First, ensure the minimum number of matches are present. */
5964
5965 #ifdef SUPPORT_UTF8
5966 /* UTF-8 mode */
5967 if (md->utf8)
5968 {
5969 for (i = 1; i <= min; i++)
5970 {
5971 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5972 GETCHARINC(c, eptr);
5973 if (c > 255)
5974 {
5975 if (op == OP_CLASS) return MATCH_NOMATCH;
5976 }
5977 else
5978 {
5979 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5980 }
5981 }
5982 }
5983 else
5984 #endif
5985 /* Not UTF-8 mode */
5986 {
5987 for (i = 1; i <= min; i++)
5988 {
5989 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5990 c = *eptr++;
5991 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5992 }
5993 }
5994
5995 /* If max == min we can continue with the main loop without the
5996 need to recurse. */
5997
5998 if (min == max) continue;
5999
6000 /* If minimizing, keep testing the rest of the expression and advancing
6001 the pointer while it matches the class. */
6002
6003 if (minimize)
6004 {
6005 #ifdef SUPPORT_UTF8
6006 /* UTF-8 mode */
6007 if (md->utf8)
6008 {
6009 for (i = min;; i++)
6010 {
6011 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6012 MATCH_NOMATCH) return rrc;
6013 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6014 GETCHARINC(c, eptr);
6015 if (c > 255)
6016 {
6017 if (op == OP_CLASS) return MATCH_NOMATCH;
6018 }
6019 else
6020 {
6021 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
6022 }
6023 }
6024 }
6025 else
6026 #endif
6027 /* Not UTF-8 mode */
6028 {
6029 for (i = min;; i++)
6030 {
6031 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6032 MATCH_NOMATCH) return rrc;
6033 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6034 c = *eptr++;
6035 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
6036 }
6037 }
6038 /* Control never gets here */
6039 }
6040
6041 /* If maximizing, find the longest possible run, then work backwards. */
6042
6043 else
6044 {
6045 const uschar *pp = eptr;
6046
6047 #ifdef SUPPORT_UTF8
6048 /* UTF-8 mode */
6049 if (md->utf8)
6050 {
6051 for (i = min; i < max; i++)
6052 {
6053 int len = 1;
6054 if (eptr >= md->end_subject) break;
6055 GETCHARLEN(c, eptr, len);
6056 if (c > 255)
6057 {
6058 if (op == OP_CLASS) break;
6059 }
6060 else
6061 {
6062 if ((data[c/8] & (1 << (c&7))) == 0) break;
6063 }
6064 eptr += len;
6065 }
6066 for (;;)
6067 {
6068 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6069 MATCH_NOMATCH) return rrc;
6070 if (eptr-- == pp) break; /* Stop if tried at original pos */
6071 BACKCHAR(eptr);
6072 }
6073 }
6074 else
6075 #endif
6076 /* Not UTF-8 mode */
6077 {
6078 for (i = min; i < max; i++)
6079 {
6080 if (eptr >= md->end_subject) break;
6081 c = *eptr;
6082 if ((data[c/8] & (1 << (c&7))) == 0) break;
6083 eptr++;
6084 }
6085 while (eptr >= pp)
6086 {
6087 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6088 MATCH_NOMATCH) return rrc;
6089 }
6090 }
6091
6092 return MATCH_NOMATCH;
6093 }
6094 }
6095 /* Control never gets here */
6096
6097
6098 /* Match an extended character class. This opcode is encountered only
6099 in UTF-8 mode, because that's the only time it is compiled. */
6100
6101 #ifdef SUPPORT_UTF8
6102 case OP_XCLASS:
6103 {
6104 const uschar *data = ecode + 1 + LINK_SIZE; /* Save for matching */
6105 ecode += GET(ecode, 1); /* Advance past the item */
6106
6107 switch (*ecode)
6108 {
6109 case OP_CRSTAR:
6110 case OP_CRMINSTAR:
6111 case OP_CRPLUS:
6112 case OP_CRMINPLUS:
6113 case OP_CRQUERY:
6114 case OP_CRMINQUERY:
6115 c = *ecode++ - OP_CRSTAR;
6116 minimize = (c & 1) != 0;
6117 min = rep_min[c]; /* Pick up values from tables; */
6118 max = rep_max[c]; /* zero for max => infinity */
6119 if (max == 0) max = INT_MAX;
6120 break;
6121
6122 case OP_CRRANGE:
6123 case OP_CRMINRANGE:
6124 minimize = (*ecode == OP_CRMINRANGE);
6125 min = GET2(ecode, 1);
6126 max = GET2(ecode, 3);
6127 if (max == 0) max = INT_MAX;
6128 ecode += 5;
6129 break;
6130
6131 default: /* No repeat follows */
6132 min = max = 1;
6133 break;
6134 }
6135
6136 /* First, ensure the minimum number of matches are present. */
6137
6138 for (i = 1; i <= min; i++)
6139 {
6140 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6141 GETCHARINC(c, eptr);
6142 if (!match_xclass(c, data)) return MATCH_NOMATCH;
6143 }
6144
6145 /* If max == min we can continue with the main loop without the
6146 need to recurse. */
6147
6148 if (min == max) continue;
6149
6150 /* If minimizing, keep testing the rest of the expression and advancing
6151 the pointer while it matches the class. */
6152
6153 if (minimize)
6154 {
6155 for (i = min;; i++)
6156 {
6157 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6158 MATCH_NOMATCH) return rrc;
6159 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6160 GETCHARINC(c, eptr);
6161 if (!match_xclass(c, data)) return MATCH_NOMATCH;
6162 }
6163 /* Control never gets here */
6164 }
6165
6166 /* If maximizing, find the longest possible run, then work backwards. */
6167
6168 else
6169 {
6170 const uschar *pp = eptr;
6171 for (i = min; i < max; i++)
6172 {
6173 int len = 1;
6174 if (eptr >= md->end_subject) break;
6175 GETCHARLEN(c, eptr, len);
6176 if (!match_xclass(c, data)) break;
6177 eptr += len;
6178 }
6179 for(;;)
6180 {
6181 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6182 MATCH_NOMATCH) return rrc;
6183 if (eptr-- == pp) break; /* Stop if tried at original pos */
6184 BACKCHAR(eptr)
6185 }
6186 return MATCH_NOMATCH;
6187 }
6188
6189 /* Control never gets here */
6190 }
6191 #endif /* End of XCLASS */
6192
6193 /* Match a run of characters */
6194
6195 case OP_CHARS:
6196 {
6197 register int length = ecode[1];
6198 ecode += 2;
6199
6200 #ifdef DEBUG /* Sigh. Some compilers never learn. */
6201 if (eptr >= md->end_subject)
6202 printf("matching subject <null> against pattern ");
6203 else
6204 {
6205 printf("matching subject ");
6206 pchars(eptr, length, TRUE, md);
6207 printf(" against pattern ");
6208 }
6209 pchars(ecode, length, FALSE, md);
6210 printf("\n");
6211 #endif
6212
6213 if (length > md->end_subject - eptr) return MATCH_NOMATCH;
6214 if ((ims & PCRE_CASELESS) != 0)
6215 {
6216 while (length-- > 0)
6217 if (md->lcc[*ecode++] != md->lcc[*eptr++])
6218 return MATCH_NOMATCH;
6219 }
6220 else
6221 {
6222 while (length-- > 0) if (*ecode++ != *eptr++) return MATCH_NOMATCH;
6223 }
6224 }
6225 break;
6226
6227 /* Match a single character repeatedly; different opcodes share code. */
6228
6229 case OP_EXACT:
6230 min = max = GET2(ecode, 1);
6231 ecode += 3;
6232 goto REPEATCHAR;
6233
6234 case OP_UPTO:
6235 case OP_MINUPTO:
6236 min = 0;
6237 max = GET2(ecode, 1);
6238 minimize = *ecode == OP_MINUPTO;
6239 ecode += 3;
6240 goto REPEATCHAR;
6241
6242 case OP_STAR:
6243 case OP_MINSTAR:
6244 case OP_PLUS:
6245 case OP_MINPLUS:
6246 case OP_QUERY:
6247 case OP_MINQUERY:
6248 c = *ecode++ - OP_STAR;
6249 minimize = (c & 1) != 0;
6250 min = rep_min[c]; /* Pick up values from tables; */
6251 max = rep_max[c]; /* zero for max => infinity */
6252 if (max == 0) max = INT_MAX;
6253
6254 /* Common code for all repeated single-character matches. We can give
6255 up quickly if there are fewer than the minimum number of characters left in
6256 the subject. */
6257
6258 REPEATCHAR:
6259 #ifdef SUPPORT_UTF8
6260 if (md->utf8)
6261 {
6262 int len = 1;
6263 const uschar *charptr = ecode;
6264 GETCHARLEN(c, ecode, len);
6265 if (min * len > md->end_subject - eptr) return MATCH_NOMATCH;
6266 ecode += len;
6267
6268 /* Handle multibyte character matching specially here. There is no
6269 support for any kind of casing for multibyte characters. */
6270
6271 if (len > 1)
6272 {
6273 for (i = 1; i <= min; i++)
6274 {
6275 if (memcmp(eptr, charptr, len) != 0) return MATCH_NOMATCH;
6276 eptr += len;
6277 }
6278
6279 if (min == max) continue;
6280
6281 if (minimize)
6282 {
6283 for (i = min;; i++)
6284 {
6285 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6286 MATCH_NOMATCH) return rrc;
6287 if (i >= max ||
6288 eptr >= md->end_subject ||
6289 memcmp(eptr, charptr, len) != 0)
6290 return MATCH_NOMATCH;
6291 eptr += len;
6292 }
6293 /* Control never gets here */
6294 }
6295 else
6296 {
6297 const uschar *pp = eptr;
6298 for (i = min; i < max; i++)
6299 {
6300 if (eptr > md->end_subject - len ||
6301 memcmp(eptr, charptr, len) != 0)
6302 break;
6303 eptr += len;
6304 }
6305 while (eptr >= pp)
6306 {
6307 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6308 MATCH_NOMATCH) return rrc;
6309 eptr -= len;
6310 }
6311 return MATCH_NOMATCH;
6312 }
6313 /* Control never gets here */
6314 }
6315
6316 /* If the length of a UTF-8 character is 1, we fall through here, and
6317 obey the code as for non-UTF-8 characters below, though in this case the
6318 value of c will always be < 128. */
6319 }
6320 else
6321 #endif
6322
6323 /* When not in UTF-8 mode, load a single-byte character. */
6324 {
6325 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6326 c = *ecode++;
6327 }
6328
6329 /* The value of c at this point is always less than 256, though we may or
6330 may not be in UTF-8 mode. The code is duplicated for the caseless and
6331 caseful cases, for speed, since matching characters is likely to be quite
6332 common. First, ensure the minimum number of matches are present. If min =
6333 max, continue at the same level without recursing. Otherwise, if
6334 minimizing, keep trying the rest of the expression and advancing one
6335 matching character if failing, up to the maximum. Alternatively, if
6336 maximizing, find the maximum number of characters and work backwards. */
6337
6338 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
6339 max, eptr));
6340
6341 if ((ims & PCRE_CASELESS) != 0)
6342 {
6343 c = md->lcc[c];
6344 for (i = 1; i <= min; i++)
6345 if (c != md->lcc[*eptr++]) return MATCH_NOMATCH;
6346 if (min == max) continue;
6347 if (minimize)
6348 {
6349 for (i = min;; i++)
6350 {
6351 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6352 MATCH_NOMATCH) return rrc;
6353 if (i >= max || eptr >= md->end_subject ||
6354 c != md->lcc[*eptr++])
6355 return MATCH_NOMATCH;
6356 }
6357 /* Control never gets here */
6358 }
6359 else
6360 {
6361 const uschar *pp = eptr;
6362 for (i = min; i < max; i++)
6363 {
6364 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
6365 eptr++;
6366 }
6367 while (eptr >= pp)
6368 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6369 MATCH_NOMATCH) return rrc;
6370 return MATCH_NOMATCH;
6371 }
6372 /* Control never gets here */
6373 }
6374
6375 /* Caseful comparisons (single-byte characters only; multibyte UTF-8 characters are handled entirely in the UTF-8 block above) */
6376
6377 else
6378 {
6379 for (i = 1; i <= min; i++) if (c != *eptr++) return MATCH_NOMATCH;
6380 if (min == max) continue;
6381 if (minimize)
6382 {
6383 for (i = min;; i++)
6384 {
6385 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6386 MATCH_NOMATCH) return rrc;
6387 if (i >= max || eptr >= md->end_subject || c != *eptr++)
6388 return MATCH_NOMATCH;
6389 }
6390 /* Control never gets here */
6391 }
6392 else
6393 {
6394 const uschar *pp = eptr;
6395 for (i = min; i < max; i++)
6396 {
6397 if (eptr >= md->end_subject || c != *eptr) break;
6398 eptr++;
6399 }
6400 while (eptr >= pp)
6401 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6402 MATCH_NOMATCH) return rrc;
6403 return MATCH_NOMATCH;
6404 }
6405 }
6406 /* Control never gets here */
6407
6408 /* Match a negated single one-byte character. The subject character we are
6409 checking it against can be multibyte. */
6410
6411 case OP_NOT:
6412 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6413 ecode++;
6414 GETCHARINCTEST(c, eptr);
6415 if ((ims & PCRE_CASELESS) != 0)
6416 {
6417 #ifdef SUPPORT_UTF8
6418 if (c < 256)
6419 #endif
6420 c = md->lcc[c];
6421 if (md->lcc[*ecode++] == c) return MATCH_NOMATCH;
6422 }
6423 else
6424 {
6425 if (*ecode++ == c) return MATCH_NOMATCH;
6426 }
6427 break;
6428
6429 /* Match a negated single one-byte character repeatedly. This is almost a
6430 repeat of the code for a repeated single character, but I haven't found a
6431 nice way of commoning these up that doesn't require a test of the
6432 positive/negative option for each character match. Maybe that wouldn't add
6433 very much to the time taken, but character matching *is* what this is all
6434 about... */
6435
6436 case OP_NOTEXACT:
6437 min = max = GET2(ecode, 1);
6438 ecode += 3;
6439 goto REPEATNOTCHAR;
6440
6441 case OP_NOTUPTO:
6442 case OP_NOTMINUPTO:
6443 min = 0;
6444 max = GET2(ecode, 1);
6445 minimize = *ecode == OP_NOTMINUPTO;
6446 ecode += 3;
6447 goto REPEATNOTCHAR;
6448
6449 case OP_NOTSTAR:
6450 case OP_NOTMINSTAR:
6451 case OP_NOTPLUS:
6452 case OP_NOTMINPLUS:
6453 case OP_NOTQUERY:
6454 case OP_NOTMINQUERY:
6455 c = *ecode++ - OP_NOTSTAR;
6456 minimize = (c & 1) != 0;
6457 min = rep_min[c]; /* Pick up values from tables; */
6458 max = rep_max[c]; /* zero for max => infinity */
6459 if (max == 0) max = INT_MAX;
6460
6461 /* Common code for all repeated single-character (less than 256) matches.
6462 We can give up quickly if there are fewer than the minimum number of
6463 characters left in the subject. */
6464
6465 REPEATNOTCHAR:
6466 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6467 c = *ecode++;
6468
6469 /* The code is duplicated for the caseless and caseful cases, for speed,
6470 since matching characters is likely to be quite common. First, ensure the
6471 minimum number of matches are present. If min = max, continue at the same
6472 level without recursing. Otherwise, if minimizing, keep trying the rest of
6473 the expression and advancing one matching character if failing, up to the
6474 maximum. Alternatively, if maximizing, find the maximum number of
6475 characters and work backwards. */
6476
6477 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
6478 max, eptr));
6479
6480 if ((ims & PCRE_CASELESS) != 0)
6481 {
6482 c = md->lcc[c];
6483
6484 #ifdef SUPPORT_UTF8
6485 /* UTF-8 mode */
6486 if (md->utf8)
6487 {
6488 register int d;
6489 for (i = 1; i <= min; i++)
6490 {
6491 GETCHARINC(d, eptr);
6492 if (d < 256) d = md->lcc[d];
6493 if (c == d) return MATCH_NOMATCH;
6494 }
6495 }
6496 else
6497 #endif
6498
6499 /* Not UTF-8 mode */
6500 {
6501 for (i = 1; i <= min; i++)
6502 if (c == md->lcc[*eptr++]) return MATCH_NOMATCH;
6503 }
6504
6505 if (min == max) continue;
6506
6507 if (minimize)
6508 {
6509 #ifdef SUPPORT_UTF8
6510 /* UTF-8 mode */
6511 if (md->utf8)
6512 {
6513 register int d;
6514 for (i = min;; i++)
6515 {
6516 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6517 MATCH_NOMATCH) return rrc;
6518 GETCHARINC(d, eptr);
6519 if (d < 256) d = md->lcc[d];
6520 if (i >= max || eptr >= md->end_subject || c == d)
6521 return MATCH_NOMATCH;
6522 }
6523 }
6524 else
6525 #endif
6526 /* Not UTF-8 mode */
6527 {
6528 for (i = min;; i++)
6529 {
6530 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6531 MATCH_NOMATCH) return rrc;
6532 if (i >= max || eptr >= md->end_subject || c == md->lcc[*eptr++])
6533 return MATCH_NOMATCH;
6534 }
6535 }
6536 /* Control never gets here */
6537 }
6538
6539 /* Maximize case */
6540
6541 else
6542 {
6543 const uschar *pp = eptr;
6544
6545 #ifdef SUPPORT_UTF8
6546 /* UTF-8 mode */
6547 if (md->utf8)
6548 {
6549 register int d;
6550 for (i = min; i < max; i++)
6551 {
6552 int len = 1;
6553 if (eptr >= md->end_subject) break;
6554 GETCHARLEN(d, eptr, len);
6555 if (d < 256) d = md->lcc[d];
6556 if (c == d) break;
6557 eptr += len;
6558 }
6559 for(;;)
6560 {
6561 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6562 MATCH_NOMATCH) return rrc;
6563 if (eptr-- == pp) break; /* Stop if tried at original pos */
6564 BACKCHAR(eptr);
6565 }
6566 }
6567 else
6568 #endif
6569 /* Not UTF-8 mode */
6570 {
6571 for (i = min; i < max; i++)
6572 {
6573 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
6574 eptr++;
6575 }
6576 while (eptr >= pp)
6577 {
6578 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6579 MATCH_NOMATCH) return rrc;
6580 eptr--;
6581 }
6582 }
6583
6584 return MATCH_NOMATCH;
6585 }
6586 /* Control never gets here */
6587 }
6588
6589 /* Caseful comparisons */
6590
6591 else
6592 {
6593 #ifdef SUPPORT_UTF8
6594 /* UTF-8 mode */
6595 if (md->utf8)
6596 {
6597 register int d;
6598 for (i = 1; i <= min; i++)
6599 {
6600 GETCHARINC(d, eptr);
6601 if (c == d) return MATCH_NOMATCH;
6602 }
6603 }
6604 else
6605 #endif
6606 /* Not UTF-8 mode */
6607 {
6608 for (i = 1; i <= min; i++)
6609 if (c == *eptr++) return MATCH_NOMATCH;
6610 }
6611
6612 if (min == max) continue;
6613
6614 if (minimize)
6615 {
6616 #ifdef SUPPORT_UTF8
6617 /* UTF-8 mode */
6618 if (md->utf8)
6619 {
6620 register int d;
6621 for (i = min;; i++)
6622 {
6623 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6624 MATCH_NOMATCH) return rrc;
6625 GETCHARINC(d, eptr);
6626 if (i >= max || eptr >= md->end_subject || c == d)
6627 return MATCH_NOMATCH;
6628 }
6629 }
6630 else
6631 #endif
6632 /* Not UTF-8 mode */
6633 {
6634 for (i = min;; i++)
6635 {
6636 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6637 MATCH_NOMATCH) return rrc;
6638 if (i >= max || eptr >= md->end_subject || c == *eptr++)
6639 return MATCH_NOMATCH;
6640 }
6641 }
6642 /* Control never gets here */
6643 }
6644
6645 /* Maximize case */
6646
6647 else
6648 {
6649 const uschar *pp = eptr;
6650
6651 #ifdef SUPPORT_UTF8
6652 /* UTF-8 mode */
6653 if (md->utf8)
6654 {
6655 register int d;
6656 for (i = min; i < max; i++)
6657 {
6658 int len = 1;
6659 if (eptr >= md->end_subject) break;
6660 GETCHARLEN(d, eptr, len);
6661 if (c == d) break;
6662 eptr += len;
6663 }
6664 for(;;)
6665 {
6666 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6667 MATCH_NOMATCH) return rrc;
6668 if (eptr-- == pp) break; /* Stop if tried at original pos */
6669 BACKCHAR(eptr);
6670 }
6671 }
6672 else
6673 #endif
6674 /* Not UTF-8 mode */
6675 {
6676 for (i = min; i < max; i++)
6677 {
6678 if (eptr >= md->end_subject || c == *eptr) break;
6679 eptr++;
6680 }
6681 while (eptr >= pp)
6682 {
6683 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6684 MATCH_NOMATCH) return rrc;
6685 eptr--;
6686 }
6687 }
6688
6689 return MATCH_NOMATCH;
6690 }
6691 }
6692 /* Control never gets here */
6693
6694 /* Match a single character type repeatedly; several different opcodes
6695 share code. This is very similar to the code for single characters, but we
6696 repeat it in the interests of efficiency. */
6697
6698 case OP_TYPEEXACT:
6699 min = max = GET2(ecode, 1);
6700 minimize = TRUE;
6701 ecode += 3;
6702 goto REPEATTYPE;
6703
6704 case OP_TYPEUPTO:
6705 case OP_TYPEMINUPTO:
6706 min = 0;
6707 max = GET2(ecode, 1);
6708 minimize = *ecode == OP_TYPEMINUPTO;
6709 ecode += 3;
6710 goto REPEATTYPE;
6711
6712 case OP_TYPESTAR:
6713 case OP_TYPEMINSTAR:
6714 case OP_TYPEPLUS:
6715 case OP_TYPEMINPLUS:
6716 case OP_TYPEQUERY:
6717 case OP_TYPEMINQUERY:
6718 c = *ecode++ - OP_TYPESTAR;
6719 minimize = (c & 1) != 0;
6720 min = rep_min[c]; /* Pick up values from tables; */
6721 max = rep_max[c]; /* zero for max => infinity */
6722 if (max == 0) max = INT_MAX;
6723
6724 /* Common code for all repeated single character type matches. Note that
6725 in UTF-8 mode, '.' matches a character of any length, but for the other
6726 character types, the valid characters are all one-byte long. */
6727
6728 REPEATTYPE:
6729 ctype = *ecode++; /* Code for the character type */
6730
6731 /* First, ensure the minimum number of matches are present. Use inline
6732 code for maximizing the speed, and do the type test once at the start
6733 (i.e. keep it out of the loop). Also we can test that there are at least