/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 87 - (show annotations) (download)
Sat Feb 24 21:41:21 2007 UTC (7 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 68288 byte(s)
Load pcre-6.5 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2006 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a DFA algorithm. This is NOT Perl-
43 compatible, but it has advantages in certain applications. */
44
45
46 #include "pcre_internal.h"
47
48
49 /* For use to indent debugging output */
50
51 #define SP " "
52
53
54
55 /*************************************************
56 * Code parameters and static tables *
57 *************************************************/
58
59 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
60 into others, under special conditions. A gap of 10 between the blocks should be
61 enough. */
62
63 #define OP_PROP_EXTRA (EXTRACT_BASIC_MAX+1)
64 #define OP_EXTUNI_EXTRA (EXTRACT_BASIC_MAX+11)
65
66
67 /* This table identifies those opcodes that are followed immediately by a
68 character that is to be tested in some way. This makes is possible to
69 centralize the loading of these characters. In the case of Type * etc, the
70 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
71 small value. */
72
73 static uschar coptable[] = {
74 0, /* End */
75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
76 0, 0, /* Any, Anybyte */
77 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
78 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
79 1, /* Char */
80 1, /* Charnc */
81 1, /* not */
82 /* Positive single-char repeats */
83 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
84 3, 3, 3, /* upto, minupto, exact */
85 /* Negative single-char repeats - only for chars < 256 */
86 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
87 3, 3, 3, /* NOT upto, minupto, exact */
88 /* Positive type repeats */
89 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
90 3, 3, 3, /* Type upto, minupto, exact */
91 /* Character class & ref repeats */
92 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
93 0, 0, /* CRRANGE, CRMINRANGE */
94 0, /* CLASS */
95 0, /* NCLASS */
96 0, /* XCLASS - variable length */
97 0, /* REF */
98 0, /* RECURSE */
99 0, /* CALLOUT */
100 0, /* Alt */
101 0, /* Ket */
102 0, /* KetRmax */
103 0, /* KetRmin */
104 0, /* Assert */
105 0, /* Assert not */
106 0, /* Assert behind */
107 0, /* Assert behind not */
108 0, /* Reverse */
109 0, /* Once */
110 0, /* COND */
111 0, /* CREF */
112 0, 0, /* BRAZERO, BRAMINZERO */
113 0, /* BRANUMBER */
114 0 /* BRA */
115 };
116
117 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
118 and \w */
119
120 static uschar toptable1[] = {
121 0, 0, 0, 0, 0,
122 ctype_digit, ctype_digit,
123 ctype_space, ctype_space,
124 ctype_word, ctype_word,
125 0 /* OP_ANY */
126 };
127
128 static uschar toptable2[] = {
129 0, 0, 0, 0, 0,
130 ctype_digit, 0,
131 ctype_space, 0,
132 ctype_word, 0,
133 1 /* OP_ANY */
134 };
135
136
137 /* Structure for holding data about a particular state, which is in effect the
138 current data for an active path through the match tree. It must consist
139 entirely of ints because the working vector we are passed, and which we put
140 these structures in, is a vector of ints. */
141
142 typedef struct stateblock {
143 int offset; /* Offset to opcode */
144 int count; /* Count for repeats */
145 int ims; /* ims flag bits */
146 int data; /* Some use extra data */
147 } stateblock;
148
149 #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
150
151
152 #ifdef DEBUG
153 /*************************************************
154 * Print character string *
155 *************************************************/
156
157 /* Character string printing function for debugging.
158
159 Arguments:
160 p points to string
161 length number of bytes
162 f where to print
163
164 Returns: nothing
165 */
166
167 static void
168 pchars(unsigned char *p, int length, FILE *f)
169 {
170 int c;
171 while (length-- > 0)
172 {
173 if (isprint(c = *(p++)))
174 fprintf(f, "%c", c);
175 else
176 fprintf(f, "\\x%02x", c);
177 }
178 }
179 #endif
180
181
182
183 /*************************************************
184 * Execute a Regular Expression - DFA engine *
185 *************************************************/
186
187 /* This internal function applies a compiled pattern to a subject string,
188 starting at a given point, using a DFA engine. This function is called from the
189 external one, possibly multiple times if the pattern is not anchored. The
190 function calls itself recursively for some kinds of subpattern.
191
192 Arguments:
193 md the match_data block with fixed information
194 this_start_code the opening bracket of this subexpression's code
195 current_subject where we currently are in the subject string
196 start_offset start offset in the subject string
197 offsets vector to contain the matching string offsets
198 offsetcount size of same
199 workspace vector of workspace
200 wscount size of same
201 ims the current ims flags
202 rlevel function call recursion level
203 recursing regex recursive call level
204
205 Returns: > 0 =>
206 = 0 =>
207 -1 => failed to match
208 < -1 => some kind of unexpected problem
209
210 The following macros are used for adding states to the two state vectors (one
211 for the current character, one for the following character). */
212
213 #define ADD_ACTIVE(x,y) \
214 if (active_count++ < wscount) \
215 { \
216 next_active_state->offset = (x); \
217 next_active_state->count = (y); \
218 next_active_state->ims = ims; \
219 next_active_state++; \
220 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
221 } \
222 else return PCRE_ERROR_DFA_WSSIZE
223
224 #define ADD_ACTIVE_DATA(x,y,z) \
225 if (active_count++ < wscount) \
226 { \
227 next_active_state->offset = (x); \
228 next_active_state->count = (y); \
229 next_active_state->ims = ims; \
230 next_active_state->data = (z); \
231 next_active_state++; \
232 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
233 } \
234 else return PCRE_ERROR_DFA_WSSIZE
235
236 #define ADD_NEW(x,y) \
237 if (new_count++ < wscount) \
238 { \
239 next_new_state->offset = (x); \
240 next_new_state->count = (y); \
241 next_new_state->ims = ims; \
242 next_new_state++; \
243 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
244 } \
245 else return PCRE_ERROR_DFA_WSSIZE
246
247 #define ADD_NEW_DATA(x,y,z) \
248 if (new_count++ < wscount) \
249 { \
250 next_new_state->offset = (x); \
251 next_new_state->count = (y); \
252 next_new_state->ims = ims; \
253 next_new_state->data = (z); \
254 next_new_state++; \
255 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
256 } \
257 else return PCRE_ERROR_DFA_WSSIZE
258
259 /* And now, here is the code */
260
261 static int
262 internal_dfa_exec(
263 dfa_match_data *md,
264 const uschar *this_start_code,
265 const uschar *current_subject,
266 int start_offset,
267 int *offsets,
268 int offsetcount,
269 int *workspace,
270 int wscount,
271 int ims,
272 int rlevel,
273 int recursing)
274 {
275 stateblock *active_states, *new_states, *temp_states;
276 stateblock *next_active_state, *next_new_state;
277
278 const uschar *ctypes, *lcc, *fcc;
279 const uschar *ptr;
280 const uschar *end_code;
281
282 int active_count, new_count, match_count;
283
284 /* Some fields in the md block are frequently referenced, so we load them into
285 independent variables in the hope that this will perform better. */
286
287 const uschar *start_subject = md->start_subject;
288 const uschar *end_subject = md->end_subject;
289 const uschar *start_code = md->start_code;
290
291 #ifdef SUPPORT_UTF8
292 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
293 #endif
294
295 rlevel++;
296 offsetcount &= (-2);
297
298 wscount -= 2;
299 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
300 (2 * INTS_PER_STATEBLOCK);
301
302 DPRINTF(("\n%.*s---------------------\n"
303 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
304 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
305
306 ctypes = md->tables + ctypes_offset;
307 lcc = md->tables + lcc_offset;
308 fcc = md->tables + fcc_offset;
309
310 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
311
312 active_states = (stateblock *)(workspace + 2);
313 next_new_state = new_states = active_states + wscount;
314 new_count = 0;
315
316 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
317 the alternative states onto the list, and find out where the end is. This
318 makes is possible to use this function recursively, when we want to stop at a
319 matching internal ket rather than at the end.
320
321 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
322 a backward assertion. In that case, we have to find out the maximum amount to
323 move back, and set up each alternative appropriately. */
324
325 if (this_start_code[1+LINK_SIZE] == OP_REVERSE)
326 {
327 int max_back = 0;
328 int gone_back;
329
330 end_code = this_start_code;
331 do
332 {
333 int back = GET(end_code, 2+LINK_SIZE);
334 if (back > max_back) max_back = back;
335 end_code += GET(end_code, 1);
336 }
337 while (*end_code == OP_ALT);
338
339 /* If we can't go back the amount required for the longest lookbehind
340 pattern, go back as far as we can; some alternatives may still be viable. */
341
342 #ifdef SUPPORT_UTF8
343 /* In character mode we have to step back character by character */
344
345 if (utf8)
346 {
347 for (gone_back = 0; gone_back < max_back; gone_back++)
348 {
349 if (current_subject <= start_subject) break;
350 current_subject--;
351 while (current_subject > start_subject &&
352 (*current_subject & 0xc0) == 0x80)
353 current_subject--;
354 }
355 }
356 else
357 #endif
358
359 /* In byte-mode we can do this quickly. */
360
361 {
362 gone_back = (current_subject - max_back < start_subject)?
363 current_subject - start_subject : max_back;
364 current_subject -= gone_back;
365 }
366
367 /* Now we can process the individual branches. */
368
369 end_code = this_start_code;
370 do
371 {
372 int back = GET(end_code, 2+LINK_SIZE);
373 if (back <= gone_back)
374 {
375 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
376 ADD_NEW_DATA(-bstate, 0, gone_back - back);
377 }
378 end_code += GET(end_code, 1);
379 }
380 while (*end_code == OP_ALT);
381 }
382
383 /* This is the code for a "normal" subpattern (not a backward assertion). The
384 start of a whole pattern is always one of these. If we are at the top level,
385 we may be asked to restart matching from the same point that we reached for a
386 previous partial match. We still have to scan through the top-level branches to
387 find the end state. */
388
389 else
390 {
391 end_code = this_start_code;
392
393 /* Restarting */
394
395 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
396 {
397 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
398 new_count = workspace[1];
399 if (!workspace[0])
400 memcpy(new_states, active_states, new_count * sizeof(stateblock));
401 }
402
403 /* Not restarting */
404
405 else
406 {
407 do
408 {
409 ADD_NEW(end_code - start_code + 1 + LINK_SIZE, 0);
410 end_code += GET(end_code, 1);
411 }
412 while (*end_code == OP_ALT);
413 }
414 }
415
416 workspace[0] = 0; /* Bit indicating which vector is current */
417
418 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
419
420 /* Loop for scanning the subject */
421
422 ptr = current_subject;
423 for (;;)
424 {
425 int i, j;
426 int c, d, clen, dlen;
427
428 /* Make the new state list into the active state list and empty the
429 new state list. */
430
431 temp_states = active_states;
432 active_states = new_states;
433 new_states = temp_states;
434 active_count = new_count;
435 new_count = 0;
436
437 workspace[0] ^= 1; /* Remember for the restarting feature */
438 workspace[1] = active_count;
439
440 #ifdef DEBUG
441 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
442 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
443 printf("\"\n");
444
445 printf("%.*sActive states: ", rlevel*2-2, SP);
446 for (i = 0; i < active_count; i++)
447 printf("%d/%d ", active_states[i].offset, active_states[i].count);
448 printf("\n");
449 #endif
450
451 /* Set the pointers for adding new states */
452
453 next_active_state = active_states + active_count;
454 next_new_state = new_states;
455
456 /* Load the current character from the subject outside the loop, as many
457 different states may want to look at it, and we assume that at least one
458 will. */
459
460 if (ptr < end_subject)
461 {
462 clen = 1;
463 #ifdef SUPPORT_UTF8
464 if (utf8) { GETCHARLEN(c, ptr, clen); } else
465 #endif /* SUPPORT_UTF8 */
466 c = *ptr;
467 }
468 else
469 {
470 clen = 0; /* At end subject */
471 c = -1;
472 }
473
474 /* Scan up the active states and act on each one. The result of an action
475 may be to add more states to the currently active list (e.g. on hitting a
476 parenthesis) or it may be to put states on the new list, for considering
477 when we move the character pointer on. */
478
479 for (i = 0; i < active_count; i++)
480 {
481 stateblock *current_state = active_states + i;
482 const uschar *code;
483 int state_offset = current_state->offset;
484 int count, codevalue;
485 int chartype, script;
486
487 #ifdef DEBUG
488 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
489 if (c < 0) printf("-1\n");
490 else if (c > 32 && c < 127) printf("'%c'\n", c);
491 else printf("0x%02x\n", c);
492 #endif
493
494 /* This variable is referred to implicity in the ADD_xxx macros. */
495
496 ims = current_state->ims;
497
498 /* A negative offset is a special case meaning "hold off going to this
499 (negated) state until the number of characters in the data field have
500 been skipped". */
501
502 if (state_offset < 0)
503 {
504 if (current_state->data > 0)
505 {
506 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
507 ADD_NEW_DATA(state_offset, current_state->count,
508 current_state->data - 1);
509 continue;
510 }
511 else
512 {
513 current_state->offset = state_offset = -state_offset;
514 }
515 }
516
517 /* Check for a duplicate state with the same count, and skip if found. */
518
519 for (j = 0; j < i; j++)
520 {
521 if (active_states[j].offset == state_offset &&
522 active_states[j].count == current_state->count)
523 {
524 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
525 goto NEXT_ACTIVE_STATE;
526 }
527 }
528
529 /* The state offset is the offset to the opcode */
530
531 code = start_code + state_offset;
532 codevalue = *code;
533 if (codevalue >= OP_BRA) codevalue = OP_BRA; /* All brackets are equal */
534
535 /* If this opcode is followed by an inline character, load it. It is
536 tempting to test for the presence of a subject character here, but that
537 is wrong, because sometimes zero repetitions of the subject are
538 permitted.
539
540 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
541 argument that is not a data character - but is always one byte long.
542 Unfortunately, we have to take special action to deal with \P, \p, and
543 \X in this case. To keep the other cases fast, convert these ones to new
544 opcodes. */
545
546 if (coptable[codevalue] > 0)
547 {
548 dlen = 1;
549 #ifdef SUPPORT_UTF8
550 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
551 #endif /* SUPPORT_UTF8 */
552 d = code[coptable[codevalue]];
553 if (codevalue >= OP_TYPESTAR)
554 {
555 if (d == OP_ANYBYTE) return PCRE_ERROR_DFA_UITEM;
556 if (d >= OP_NOTPROP)
557 codevalue += (d == OP_EXTUNI)? OP_EXTUNI_EXTRA : OP_PROP_EXTRA;
558 }
559 }
560 else
561 {
562 dlen = 0; /* Not strictly necessary, but compilers moan */
563 d = -1; /* if these variables are not set. */
564 }
565
566
567 /* Now process the individual opcodes */
568
569 switch (codevalue)
570 {
571
572 /* ========================================================================== */
573 /* Reached a closing bracket. If not at the end of the pattern, carry
574 on with the next opcode. Otherwise, unless we have an empty string and
575 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
576 matches so we always have the longest first. */
577
578 case OP_KET:
579 case OP_KETRMIN:
580 case OP_KETRMAX:
581 if (code != end_code)
582 {
583 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
584 if (codevalue != OP_KET)
585 {
586 ADD_ACTIVE(state_offset - GET(code, 1), 0);
587 }
588 }
589 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
590 {
591 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
592 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
593 match_count = 0;
594 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
595 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
596 if (offsetcount >= 2)
597 {
598 offsets[0] = current_subject - start_subject;
599 offsets[1] = ptr - start_subject;
600 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
601 offsets[1] - offsets[0], current_subject));
602 }
603 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
604 {
605 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
606 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
607 match_count, rlevel*2-2, SP));
608 return match_count;
609 }
610 }
611 break;
612
613 /* ========================================================================== */
614 /* These opcodes add to the current list of states without looking
615 at the current character. */
616
617 /*-----------------------------------------------------------------*/
618 case OP_ALT:
619 do { code += GET(code, 1); } while (*code == OP_ALT);
620 ADD_ACTIVE(code - start_code, 0);
621 break;
622
623 /*-----------------------------------------------------------------*/
624 case OP_BRA:
625 do
626 {
627 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
628 code += GET(code, 1);
629 }
630 while (*code == OP_ALT);
631 break;
632
633 /*-----------------------------------------------------------------*/
634 case OP_BRAZERO:
635 case OP_BRAMINZERO:
636 ADD_ACTIVE(state_offset + 1, 0);
637 code += 1 + GET(code, 2);
638 while (*code == OP_ALT) code += GET(code, 1);
639 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
640 break;
641
642 /*-----------------------------------------------------------------*/
643 case OP_BRANUMBER:
644 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
645 break;
646
647 /*-----------------------------------------------------------------*/
648 case OP_CIRC:
649 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
650 ((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE))
651 { ADD_ACTIVE(state_offset + 1, 0); }
652 break;
653
654 /*-----------------------------------------------------------------*/
655 case OP_EOD:
656 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
657 break;
658
659 /*-----------------------------------------------------------------*/
660 case OP_OPT:
661 ims = code[1];
662 ADD_ACTIVE(state_offset + 2, 0);
663 break;
664
665 /*-----------------------------------------------------------------*/
666 case OP_SOD:
667 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
668 break;
669
670 /*-----------------------------------------------------------------*/
671 case OP_SOM:
672 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
673 break;
674
675
676 /* ========================================================================== */
677 /* These opcodes inspect the next subject character, and sometimes
678 the previous one as well, but do not have an argument. The variable
679 clen contains the length of the current character and is zero if we are
680 at the end of the subject. */
681
682 /*-----------------------------------------------------------------*/
683 case OP_ANY:
684 if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0))
685 { ADD_NEW(state_offset + 1, 0); }
686 break;
687
688 /*-----------------------------------------------------------------*/
689 case OP_EODN:
690 if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject))
691 { ADD_ACTIVE(state_offset + 1, 0); }
692 break;
693
694 /*-----------------------------------------------------------------*/
695 case OP_DOLL:
696 if ((md->moptions & PCRE_NOTEOL) == 0)
697 {
698 if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject ||
699 (ims & PCRE_MULTILINE) != 0)))
700 { ADD_ACTIVE(state_offset + 1, 0); }
701 }
702 else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0)
703 { ADD_ACTIVE(state_offset + 1, 0); }
704 break;
705
706 /*-----------------------------------------------------------------*/
707
708 case OP_DIGIT:
709 case OP_WHITESPACE:
710 case OP_WORDCHAR:
711 if (clen > 0 && c < 256 &&
712 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
713 { ADD_NEW(state_offset + 1, 0); }
714 break;
715
716 /*-----------------------------------------------------------------*/
717 case OP_NOT_DIGIT:
718 case OP_NOT_WHITESPACE:
719 case OP_NOT_WORDCHAR:
720 if (clen > 0 && (c >= 256 ||
721 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
722 { ADD_NEW(state_offset + 1, 0); }
723 break;
724
725 /*-----------------------------------------------------------------*/
726 case OP_WORD_BOUNDARY:
727 case OP_NOT_WORD_BOUNDARY:
728 {
729 int left_word, right_word;
730
731 if (ptr > start_subject)
732 {
733 const uschar *temp = ptr - 1;
734 #ifdef SUPPORT_UTF8
735 if (utf8) BACKCHAR(temp);
736 #endif
737 GETCHARTEST(d, temp);
738 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
739 }
740 else left_word = 0;
741
742 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
743 else right_word = 0;
744
745 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
746 { ADD_ACTIVE(state_offset + 1, 0); }
747 }
748 break;
749
750
751 #ifdef SUPPORT_UCP
752
753 /*-----------------------------------------------------------------*/
754 /* Check the next character by Unicode property. We will get here only
755 if the support is in the binary; otherwise a compile-time error occurs.
756 */
757
758 case OP_PROP:
759 case OP_NOTPROP:
760 if (clen > 0)
761 {
762 BOOL OK;
763 int category = _pcre_ucp_findprop(c, &chartype, &script);
764 switch(code[1])
765 {
766 case PT_ANY:
767 OK = TRUE;
768 break;
769
770 case PT_LAMP:
771 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
772 break;
773
774 case PT_GC:
775 OK = category == code[2];
776 break;
777
778 case PT_PC:
779 OK = chartype == code[2];
780 break;
781
782 case PT_SC:
783 OK = script == code[2];
784 break;
785
786 /* Should never occur, but keep compilers from grumbling. */
787
788 default:
789 OK = codevalue != OP_PROP;
790 break;
791 }
792
793 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
794 }
795 break;
796 #endif
797
798
799
800 /* ========================================================================== */
801 /* These opcodes likewise inspect the subject character, but have an
802 argument that is not a data character. It is one of these opcodes:
803 OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
804 OP_NOT_WORDCHAR. The value is loaded into d. */
805
806 case OP_TYPEPLUS:
807 case OP_TYPEMINPLUS:
808 count = current_state->count; /* Already matched */
809 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
810 if (clen > 0)
811 {
812 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
813 (c < 256 &&
814 (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
815 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
816 {
817 count++;
818 ADD_NEW(state_offset, count);
819 }
820 }
821 break;
822
823 /*-----------------------------------------------------------------*/
824 case OP_TYPEQUERY:
825 case OP_TYPEMINQUERY:
826 ADD_ACTIVE(state_offset + 2, 0);
827 if (clen > 0)
828 {
829 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
830 (c < 256 &&
831 (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
832 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
833 {
834 ADD_NEW(state_offset + 2, 0);
835 }
836 }
837 break;
838
839 /*-----------------------------------------------------------------*/
840 case OP_TYPESTAR:
841 case OP_TYPEMINSTAR:
842 ADD_ACTIVE(state_offset + 2, 0);
843 if (clen > 0)
844 {
845 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
846 (c < 256 &&
847 (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
848 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
849 {
850 ADD_NEW(state_offset, 0);
851 }
852 }
853 break;
854
855 /*-----------------------------------------------------------------*/
856 case OP_TYPEEXACT:
857 case OP_TYPEUPTO:
858 case OP_TYPEMINUPTO:
859 if (codevalue != OP_TYPEEXACT)
860 { ADD_ACTIVE(state_offset + 4, 0); }
861 count = current_state->count; /* Number already matched */
862 if (clen > 0)
863 {
864 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
865 (c < 256 &&
866 (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
867 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
868 {
869 if (++count >= GET2(code, 1))
870 { ADD_NEW(state_offset + 4, 0); }
871 else
872 { ADD_NEW(state_offset, count); }
873 }
874 }
875 break;
876
877 /* ========================================================================== */
878 /* These are virtual opcodes that are used when something like
879 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, or OP_EXTUNI as its argument. It
880 keeps the code above fast for the other cases. The argument is in the
881 d variable. */
882
883 case OP_PROP_EXTRA + OP_TYPEPLUS:
884 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
885 count = current_state->count; /* Already matched */
886 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
887 if (clen > 0)
888 {
889 BOOL OK;
890 int category = _pcre_ucp_findprop(c, &chartype, &script);
891 switch(code[2])
892 {
893 case PT_ANY:
894 OK = TRUE;
895 break;
896
897 case PT_LAMP:
898 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
899 break;
900
901 case PT_GC:
902 OK = category == code[3];
903 break;
904
905 case PT_PC:
906 OK = chartype == code[3];
907 break;
908
909 case PT_SC:
910 OK = script == code[3];
911 break;
912
913 /* Should never occur, but keep compilers from grumbling. */
914
915 default:
916 OK = codevalue != OP_PROP;
917 break;
918 }
919
920 if (OK == (d == OP_PROP)) { count++; ADD_NEW(state_offset, count); }
921 }
922 break;
923
924 /*-----------------------------------------------------------------*/
925 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
926 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
927 count = current_state->count; /* Already matched */
928 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
929 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
930 {
931 const uschar *nptr = ptr + clen;
932 int ncount = 0;
933 while (nptr < end_subject)
934 {
935 int nd;
936 int ndlen = 1;
937 GETCHARLEN(nd, nptr, ndlen);
938 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
939 ncount++;
940 nptr += ndlen;
941 }
942 count++;
943 ADD_NEW_DATA(-state_offset, count, ncount);
944 }
945 break;
946
947 /*-----------------------------------------------------------------*/
948 case OP_PROP_EXTRA + OP_TYPEQUERY:
949 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
950 count = 4;
951 goto QS1;
952
953 case OP_PROP_EXTRA + OP_TYPESTAR:
954 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
955 count = 0;
956
957 QS1:
958
959 ADD_ACTIVE(state_offset + 4, 0);
960 if (clen > 0)
961 {
962 BOOL OK;
963 int category = _pcre_ucp_findprop(c, &chartype, &script);
964 switch(code[2])
965 {
966 case PT_ANY:
967 OK = TRUE;
968 break;
969
970 case PT_LAMP:
971 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
972 break;
973
974 case PT_GC:
975 OK = category == code[3];
976 break;
977
978 case PT_PC:
979 OK = chartype == code[3];
980 break;
981
982 case PT_SC:
983 OK = script == code[3];
984 break;
985
986 /* Should never occur, but keep compilers from grumbling. */
987
988 default:
989 OK = codevalue != OP_PROP;
990 break;
991 }
992
993 if (OK == (d == OP_PROP)) { ADD_NEW(state_offset + count, 0); }
994 }
995 break;
996
997 /*-----------------------------------------------------------------*/
998 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
999 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1000 count = 2;
1001 goto QS2;
1002
1003 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1004 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1005 count = 0;
1006
1007 QS2:
1008
1009 ADD_ACTIVE(state_offset + 2, 0);
1010 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1011 {
1012 const uschar *nptr = ptr + clen;
1013 int ncount = 0;
1014 while (nptr < end_subject)
1015 {
1016 int nd;
1017 int ndlen = 1;
1018 GETCHARLEN(nd, nptr, ndlen);
1019 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1020 ncount++;
1021 nptr += ndlen;
1022 }
1023 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1024 }
1025 break;
1026
1027 /*-----------------------------------------------------------------*/
1028 case OP_PROP_EXTRA + OP_TYPEEXACT:
1029 case OP_PROP_EXTRA + OP_TYPEUPTO:
1030 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1031 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1032 { ADD_ACTIVE(state_offset + 6, 0); }
1033 count = current_state->count; /* Number already matched */
1034 if (clen > 0)
1035 {
1036 BOOL OK;
1037 int category = _pcre_ucp_findprop(c, &chartype, &script);
1038 switch(code[4])
1039 {
1040 case PT_ANY:
1041 OK = TRUE;
1042 break;
1043
1044 case PT_LAMP:
1045 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1046 break;
1047
1048 case PT_GC:
1049 OK = category == code[5];
1050 break;
1051
1052 case PT_PC:
1053 OK = chartype == code[5];
1054 break;
1055
1056 case PT_SC:
1057 OK = script == code[5];
1058 break;
1059
1060 /* Should never occur, but keep compilers from grumbling. */
1061
1062 default:
1063 OK = codevalue != OP_PROP;
1064 break;
1065 }
1066
1067 if (OK == (d == OP_PROP))
1068 {
1069 if (++count >= GET2(code, 1))
1070 { ADD_NEW(state_offset + 6, 0); }
1071 else
1072 { ADD_NEW(state_offset, count); }
1073 }
1074 }
1075 break;
1076
1077 /*-----------------------------------------------------------------*/
1078 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1079 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1080 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1081 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1082 { ADD_ACTIVE(state_offset + 4, 0); }
1083 count = current_state->count; /* Number already matched */
1084 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1085 {
1086 const uschar *nptr = ptr + clen;
1087 int ncount = 0;
1088 while (nptr < end_subject)
1089 {
1090 int nd;
1091 int ndlen = 1;
1092 GETCHARLEN(nd, nptr, ndlen);
1093 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1094 ncount++;
1095 nptr += ndlen;
1096 }
1097 if (++count >= GET2(code, 1))
1098 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1099 else
1100 { ADD_NEW_DATA(-state_offset, count, ncount); }
1101 }
1102 break;
1103
1104 /* ========================================================================== */
1105 /* These opcodes are followed by a character that is usually compared
1106 to the current subject character; it is loaded into d. We still get
1107 here even if there is no subject character, because in some cases zero
1108 repetitions are permitted. */
1109
1110 /*-----------------------------------------------------------------*/
1111 case OP_CHAR:
1112 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1113 break;
1114
1115 /*-----------------------------------------------------------------*/
1116 case OP_CHARNC:
1117 if (clen == 0) break;
1118
1119 #ifdef SUPPORT_UTF8
1120 if (utf8)
1121 {
1122 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1123 {
1124 int othercase;
1125 if (c < 128) othercase = fcc[c]; else
1126
1127 /* If we have Unicode property support, we can use it to test the
1128 other case of the character. */
1129
1130 #ifdef SUPPORT_UCP
1131 othercase = _pcre_ucp_othercase(c);
1132 #else
1133 othercase = -1;
1134 #endif
1135
1136 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1137 }
1138 }
1139 else
1140 #endif /* SUPPORT_UTF8 */
1141
1142 /* Non-UTF-8 mode */
1143 {
1144 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1145 }
1146 break;
1147
1148
1149 #ifdef SUPPORT_UCP
1150 /*-----------------------------------------------------------------*/
1151 /* This is a tricky one because it can match more than one character.
1152 Find out how many characters to skip, and then set up a negative state
1153 to wait for them to pass before continuing. */
1154
1155 case OP_EXTUNI:
1156 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1157 {
1158 const uschar *nptr = ptr + clen;
1159 int ncount = 0;
1160 while (nptr < end_subject)
1161 {
1162 int nclen = 1;
1163 GETCHARLEN(c, nptr, nclen);
1164 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1165 ncount++;
1166 nptr += nclen;
1167 }
1168 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1169 }
1170 break;
1171 #endif
1172
1173 /*-----------------------------------------------------------------*/
1174 /* Match a negated single character. This is only used for one-byte
1175 characters, that is, we know that d < 256. The character we are
1176 checking (c) can be multibyte. */
1177
1178 case OP_NOT:
1179 if (clen > 0)
1180 {
1181 int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1182 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1183 }
1184 break;
1185
1186 /*-----------------------------------------------------------------*/
1187 case OP_PLUS:
1188 case OP_MINPLUS:
1189 case OP_NOTPLUS:
1190 case OP_NOTMINPLUS:
1191 count = current_state->count; /* Already matched */
1192 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1193 if (clen > 0)
1194 {
1195 int otherd = -1;
1196 if ((ims & PCRE_CASELESS) != 0)
1197 {
1198 #ifdef SUPPORT_UTF8
1199 if (utf8 && d >= 128)
1200 {
1201 #ifdef SUPPORT_UCP
1202 otherd = _pcre_ucp_othercase(d);
1203 #endif /* SUPPORT_UCP */
1204 }
1205 else
1206 #endif /* SUPPORT_UTF8 */
1207 otherd = fcc[d];
1208 }
1209 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1210 { count++; ADD_NEW(state_offset, count); }
1211 }
1212 break;
1213
1214 /*-----------------------------------------------------------------*/
1215 case OP_QUERY:
1216 case OP_MINQUERY:
1217 case OP_NOTQUERY:
1218 case OP_NOTMINQUERY:
1219 ADD_ACTIVE(state_offset + dlen + 1, 0);
1220 if (clen > 0)
1221 {
1222 int otherd = -1;
1223 if ((ims && PCRE_CASELESS) != 0)
1224 {
1225 #ifdef SUPPORT_UTF8
1226 if (utf8 && d >= 128)
1227 {
1228 #ifdef SUPPORT_UCP
1229 otherd = _pcre_ucp_othercase(d);
1230 #endif /* SUPPORT_UCP */
1231 }
1232 else
1233 #endif /* SUPPORT_UTF8 */
1234 otherd = fcc[d];
1235 }
1236 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1237 { ADD_NEW(state_offset + dlen + 1, 0); }
1238 }
1239 break;
1240
1241 /*-----------------------------------------------------------------*/
1242 case OP_STAR:
1243 case OP_MINSTAR:
1244 case OP_NOTSTAR:
1245 case OP_NOTMINSTAR:
1246 ADD_ACTIVE(state_offset + dlen + 1, 0);
1247 if (clen > 0)
1248 {
1249 int otherd = -1;
1250 if ((ims && PCRE_CASELESS) != 0)
1251 {
1252 #ifdef SUPPORT_UTF8
1253 if (utf8 && d >= 128)
1254 {
1255 #ifdef SUPPORT_UCP
1256 otherd = _pcre_ucp_othercase(d);
1257 #endif /* SUPPORT_UCP */
1258 }
1259 else
1260 #endif /* SUPPORT_UTF8 */
1261 otherd = fcc[d];
1262 }
1263 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1264 { ADD_NEW(state_offset, 0); }
1265 }
1266 break;
1267
1268 /*-----------------------------------------------------------------*/
1269 case OP_EXACT:
1270 case OP_UPTO:
1271 case OP_MINUPTO:
1272 case OP_NOTEXACT:
1273 case OP_NOTUPTO:
1274 case OP_NOTMINUPTO:
1275 if (codevalue != OP_EXACT && codevalue != OP_NOTEXACT)
1276 { ADD_ACTIVE(state_offset + dlen + 3, 0); }
1277 count = current_state->count; /* Number already matched */
1278 if (clen > 0)
1279 {
1280 int otherd = -1;
1281 if ((ims & PCRE_CASELESS) != 0)
1282 {
1283 #ifdef SUPPORT_UTF8
1284 if (utf8 && d >= 128)
1285 {
1286 #ifdef SUPPORT_UCP
1287 otherd = _pcre_ucp_othercase(d);
1288 #endif /* SUPPORT_UCP */
1289 }
1290 else
1291 #endif /* SUPPORT_UTF8 */
1292 otherd = fcc[d];
1293 }
1294 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1295 {
1296 if (++count >= GET2(code, 1))
1297 { ADD_NEW(state_offset + dlen + 3, 0); }
1298 else
1299 { ADD_NEW(state_offset, count); }
1300 }
1301 }
1302 break;
1303
1304
1305 /* ========================================================================== */
1306 /* These are the class-handling opcodes */
1307
1308 case OP_CLASS:
1309 case OP_NCLASS:
1310 case OP_XCLASS:
1311 {
1312 BOOL isinclass = FALSE;
1313 int next_state_offset;
1314 const uschar *ecode;
1315
1316 /* For a simple class, there is always just a 32-byte table, and we
1317 can set isinclass from it. */
1318
1319 if (codevalue != OP_XCLASS)
1320 {
1321 ecode = code + 33;
1322 if (clen > 0)
1323 {
1324 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1325 ((code[1 + c/8] & (1 << (c&7))) != 0);
1326 }
1327 }
1328
1329 /* An extended class may have a table or a list of single characters,
1330 ranges, or both, and it may be positive or negative. There's a
1331 function that sorts all this out. */
1332
1333 else
1334 {
1335 ecode = code + GET(code, 1);
1336 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1337 }
1338
1339 /* At this point, isinclass is set for all kinds of class, and ecode
1340 points to the byte after the end of the class. If there is a
1341 quantifier, this is where it will be. */
1342
1343 next_state_offset = ecode - start_code;
1344
1345 switch (*ecode)
1346 {
1347 case OP_CRSTAR:
1348 case OP_CRMINSTAR:
1349 ADD_ACTIVE(next_state_offset + 1, 0);
1350 if (isinclass) { ADD_NEW(state_offset, 0); }
1351 break;
1352
1353 case OP_CRPLUS:
1354 case OP_CRMINPLUS:
1355 count = current_state->count; /* Already matched */
1356 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1357 if (isinclass) { count++; ADD_NEW(state_offset, count); }
1358 break;
1359
1360 case OP_CRQUERY:
1361 case OP_CRMINQUERY:
1362 ADD_ACTIVE(next_state_offset + 1, 0);
1363 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1364 break;
1365
1366 case OP_CRRANGE:
1367 case OP_CRMINRANGE:
1368 count = current_state->count; /* Already matched */
1369 if (count >= GET2(ecode, 1))
1370 { ADD_ACTIVE(next_state_offset + 5, 0); }
1371 if (isinclass)
1372 {
1373 if (++count >= GET2(ecode, 3))
1374 { ADD_NEW(next_state_offset + 5, 0); }
1375 else
1376 { ADD_NEW(state_offset, count); }
1377 }
1378 break;
1379
1380 default:
1381 if (isinclass) { ADD_NEW(next_state_offset, 0); }
1382 break;
1383 }
1384 }
1385 break;
1386
1387 /* ========================================================================== */
1388 /* These are the opcodes for fancy brackets of various kinds. We have
1389 to use recursion in order to handle them. */
1390
1391 case OP_ASSERT:
1392 case OP_ASSERT_NOT:
1393 case OP_ASSERTBACK:
1394 case OP_ASSERTBACK_NOT:
1395 {
1396 int rc;
1397 int local_offsets[2];
1398 int local_workspace[1000];
1399 const uschar *endasscode = code + GET(code, 1);
1400
1401 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1402
1403 rc = internal_dfa_exec(
1404 md, /* static match data */
1405 code, /* this subexpression's code */
1406 ptr, /* where we currently are */
1407 ptr - start_subject, /* start offset */
1408 local_offsets, /* offset vector */
1409 sizeof(local_offsets)/sizeof(int), /* size of same */
1410 local_workspace, /* workspace vector */
1411 sizeof(local_workspace)/sizeof(int), /* size of same */
1412 ims, /* the current ims flags */
1413 rlevel, /* function recursion level */
1414 recursing); /* pass on regex recursion */
1415
1416 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1417 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1418 }
1419 break;
1420
1421 /*-----------------------------------------------------------------*/
1422 case OP_COND:
1423 {
1424 int local_offsets[1000];
1425 int local_workspace[1000];
1426 int condcode = code[LINK_SIZE+1];
1427
1428 /* The only supported version of OP_CREF is for the value 0xffff, which
1429 means "test if in a recursion". */
1430
1431 if (condcode == OP_CREF)
1432 {
1433 int value = GET2(code, LINK_SIZE+2);
1434 if (value != 0xffff) return PCRE_ERROR_DFA_UCOND;
1435 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1436 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1437 }
1438
1439 /* Otherwise, the condition is an assertion */
1440
1441 else
1442 {
1443 int rc;
1444 const uschar *asscode = code + LINK_SIZE + 1;
1445 const uschar *endasscode = asscode + GET(asscode, 1);
1446
1447 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1448
1449 rc = internal_dfa_exec(
1450 md, /* fixed match data */
1451 asscode, /* this subexpression's code */
1452 ptr, /* where we currently are */
1453 ptr - start_subject, /* start offset */
1454 local_offsets, /* offset vector */
1455 sizeof(local_offsets)/sizeof(int), /* size of same */
1456 local_workspace, /* workspace vector */
1457 sizeof(local_workspace)/sizeof(int), /* size of same */
1458 ims, /* the current ims flags */
1459 rlevel, /* function recursion level */
1460 recursing); /* pass on regex recursion */
1461
1462 if ((rc >= 0) ==
1463 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1464 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1465 else
1466 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1467 }
1468 }
1469 break;
1470
1471 /*-----------------------------------------------------------------*/
1472 case OP_RECURSE:
1473 {
1474 int local_offsets[1000];
1475 int local_workspace[1000];
1476 int rc;
1477
1478 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1479 recursing + 1));
1480
1481 rc = internal_dfa_exec(
1482 md, /* fixed match data */
1483 start_code + GET(code, 1), /* this subexpression's code */
1484 ptr, /* where we currently are */
1485 ptr - start_subject, /* start offset */
1486 local_offsets, /* offset vector */
1487 sizeof(local_offsets)/sizeof(int), /* size of same */
1488 local_workspace, /* workspace vector */
1489 sizeof(local_workspace)/sizeof(int), /* size of same */
1490 ims, /* the current ims flags */
1491 rlevel, /* function recursion level */
1492 recursing + 1); /* regex recurse level */
1493
1494 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1495 recursing + 1, rc));
1496
1497 /* Ran out of internal offsets */
1498
1499 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1500
1501 /* For each successful matched substring, set up the next state with a
1502 count of characters to skip before trying it. Note that the count is in
1503 characters, not bytes. */
1504
1505 if (rc > 0)
1506 {
1507 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1508 {
1509 const uschar *p = start_subject + local_offsets[rc];
1510 const uschar *pp = start_subject + local_offsets[rc+1];
1511 int charcount = local_offsets[rc+1] - local_offsets[rc];
1512 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1513 if (charcount > 0)
1514 {
1515 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1516 }
1517 else
1518 {
1519 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1520 }
1521 }
1522 }
1523 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1524 }
1525 break;
1526
1527 /*-----------------------------------------------------------------*/
1528 case OP_ONCE:
1529 {
1530 int local_offsets[2];
1531 int local_workspace[1000];
1532
1533 int rc = internal_dfa_exec(
1534 md, /* fixed match data */
1535 code, /* this subexpression's code */
1536 ptr, /* where we currently are */
1537 ptr - start_subject, /* start offset */
1538 local_offsets, /* offset vector */
1539 sizeof(local_offsets)/sizeof(int), /* size of same */
1540 local_workspace, /* workspace vector */
1541 sizeof(local_workspace)/sizeof(int), /* size of same */
1542 ims, /* the current ims flags */
1543 rlevel, /* function recursion level */
1544 recursing); /* pass on regex recursion */
1545
1546 if (rc >= 0)
1547 {
1548 const uschar *end_subpattern = code;
1549 int charcount = local_offsets[1] - local_offsets[0];
1550 int next_state_offset, repeat_state_offset;
1551
1552 do { end_subpattern += GET(end_subpattern, 1); }
1553 while (*end_subpattern == OP_ALT);
1554 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1555
1556 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1557 arrange for the repeat state also to be added to the relevant list.
1558 Calculate the offset, or set -1 for no repeat. */
1559
1560 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1561 *end_subpattern == OP_KETRMIN)?
1562 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1563
1564 /* If we have matched an empty string, add the next state at the
1565 current character pointer. This is important so that the duplicate
1566 checking kicks in, which is what breaks infinite loops that match an
1567 empty string. */
1568
1569 if (charcount == 0)
1570 {
1571 ADD_ACTIVE(next_state_offset, 0);
1572 }
1573
1574 /* Optimization: if there are no more active states, and there
1575 are no new states yet set up, then skip over the subject string
1576 right here, to save looping. Otherwise, set up the new state to swing
1577 into action when the end of the substring is reached. */
1578
1579 else if (i + 1 >= active_count && new_count == 0)
1580 {
1581 ptr += charcount;
1582 clen = 0;
1583 ADD_NEW(next_state_offset, 0);
1584
1585 /* If we are adding a repeat state at the new character position,
1586 we must fudge things so that it is the only current state.
1587 Otherwise, it might be a duplicate of one we processed before, and
1588 that would cause it to be skipped. */
1589
1590 if (repeat_state_offset >= 0)
1591 {
1592 next_active_state = active_states;
1593 active_count = 0;
1594 i = -1;
1595 ADD_ACTIVE(repeat_state_offset, 0);
1596 }
1597 }
1598 else
1599 {
1600 const uschar *p = start_subject + local_offsets[0];
1601 const uschar *pp = start_subject + local_offsets[1];
1602 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1603 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1604 if (repeat_state_offset >= 0)
1605 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1606 }
1607
1608 }
1609 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1610 }
1611 break;
1612
1613
1614 /* ========================================================================== */
1615 /* Handle callouts */
1616
1617 case OP_CALLOUT:
1618 if (pcre_callout != NULL)
1619 {
1620 int rrc;
1621 pcre_callout_block cb;
1622 cb.version = 1; /* Version 1 of the callout block */
1623 cb.callout_number = code[1];
1624 cb.offset_vector = offsets;
1625 cb.subject = (PCRE_SPTR)start_subject;
1626 cb.subject_length = end_subject - start_subject;
1627 cb.start_match = current_subject - start_subject;
1628 cb.current_position = ptr - start_subject;
1629 cb.pattern_position = GET(code, 2);
1630 cb.next_item_length = GET(code, 2 + LINK_SIZE);
1631 cb.capture_top = 1;
1632 cb.capture_last = -1;
1633 cb.callout_data = md->callout_data;
1634 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
1635 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
1636 }
1637 break;
1638
1639
1640 /* ========================================================================== */
1641 default: /* Unsupported opcode */
1642 return PCRE_ERROR_DFA_UITEM;
1643 }
1644
1645 NEXT_ACTIVE_STATE: continue;
1646
1647 } /* End of loop scanning active states */
1648
1649 /* We have finished the processing at the current subject character. If no
1650 new states have been set for the next character, we have found all the
1651 matches that we are going to find. If we are at the top level and partial
1652 matching has been requested, check for appropriate conditions. */
1653
1654 if (new_count <= 0)
1655 {
1656 if (match_count < 0 && /* No matches found */
1657 rlevel == 1 && /* Top level match function */
1658 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
1659 ptr >= end_subject && /* Reached end of subject */
1660 ptr > current_subject) /* Matched non-empty string */
1661 {
1662 if (offsetcount >= 2)
1663 {
1664 offsets[0] = current_subject - start_subject;
1665 offsets[1] = end_subject - start_subject;
1666 }
1667 match_count = PCRE_ERROR_PARTIAL;
1668 }
1669
1670 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
1671 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
1672 rlevel*2-2, SP));
1673 return match_count;
1674 }
1675
1676 /* One or more states are active for the next character. */
1677
1678 ptr += clen; /* Advance to next subject character */
1679 } /* Loop to move along the subject string */
1680
1681 /* Control never gets here, but we must keep the compiler happy. */
1682
1683 DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n"
1684 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP));
1685 return PCRE_ERROR_NOMATCH;
1686 }
1687
1688
1689
1690
1691 /*************************************************
1692 * Execute a Regular Expression - DFA engine *
1693 *************************************************/
1694
1695 /* This external function applies a compiled re to a subject string using a DFA
1696 engine. This function calls the internal function multiple times if the pattern
1697 is not anchored.
1698
1699 Arguments:
1700 argument_re points to the compiled expression
1701 extra_data points to extra data or is NULL (not currently used)
1702 subject points to the subject string
1703 length length of subject string (may contain binary zeros)
1704 start_offset where to start in the subject string
1705 options option bits
1706 offsets vector of match offsets
1707 offsetcount size of same
1708 workspace workspace vector
1709 wscount size of same
1710
1711 Returns: > 0 => number of match offset pairs placed in offsets
1712 = 0 => offsets overflowed; longest matches are present
1713 -1 => failed to match
1714 < -1 => some kind of unexpected problem
1715 */
1716
1717 PCRE_DATA_SCOPE int
1718 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
1719 const char *subject, int length, int start_offset, int options, int *offsets,
1720 int offsetcount, int *workspace, int wscount)
1721 {
1722 real_pcre *re = (real_pcre *)argument_re;
1723 dfa_match_data match_block;
1724 BOOL utf8, anchored, startline, firstline;
1725 const uschar *current_subject, *end_subject, *lcc;
1726
1727 pcre_study_data internal_study;
1728 const pcre_study_data *study = NULL;
1729 real_pcre internal_re;
1730
1731 const uschar *req_byte_ptr;
1732 const uschar *start_bits = NULL;
1733 BOOL first_byte_caseless = FALSE;
1734 BOOL req_byte_caseless = FALSE;
1735 int first_byte = -1;
1736 int req_byte = -1;
1737 int req_byte2 = -1;
1738
1739 /* Plausibility checks */
1740
1741 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
1742 if (re == NULL || subject == NULL || workspace == NULL ||
1743 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
1744 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
1745 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
1746
1747 /* We need to find the pointer to any study data before we test for byte
1748 flipping, so we scan the extra_data block first. This may set two fields in the
1749 match block, so we must initialize them beforehand. However, the other fields
1750 in the match block must not be set until after the byte flipping. */
1751
1752 match_block.tables = re->tables;
1753 match_block.callout_data = NULL;
1754
1755 if (extra_data != NULL)
1756 {
1757 unsigned int flags = extra_data->flags;
1758 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
1759 study = (const pcre_study_data *)extra_data->study_data;
1760 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
1761 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
1762 return PCRE_ERROR_DFA_UMLIMIT;
1763 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
1764 match_block.callout_data = extra_data->callout_data;
1765 if ((flags & PCRE_EXTRA_TABLES) != 0)
1766 match_block.tables = extra_data->tables;
1767 }
1768
1769 /* Check that the first field in the block is the magic number. If it is not,
1770 test for a regex that was compiled on a host of opposite endianness. If this is
1771 the case, flipped values are put in internal_re and internal_study if there was
1772 study data too. */
1773
1774 if (re->magic_number != MAGIC_NUMBER)
1775 {
1776 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
1777 if (re == NULL) return PCRE_ERROR_BADMAGIC;
1778 if (study != NULL) study = &internal_study;
1779 }
1780
1781 /* Set some local values */
1782
1783 current_subject = (const unsigned char *)subject + start_offset;
1784 end_subject = (const unsigned char *)subject + length;
1785 req_byte_ptr = current_subject - 1;
1786
1787 utf8 = (re->options & PCRE_UTF8) != 0;
1788
1789 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
1790 (re->options & PCRE_ANCHORED) != 0;
1791
1792 /* The remaining fixed data for passing around. */
1793
1794 match_block.start_code = (const uschar *)argument_re +
1795 re->name_table_offset + re->name_count * re->name_entry_size;
1796 match_block.start_subject = (const unsigned char *)subject;
1797 match_block.end_subject = end_subject;
1798 match_block.moptions = options;
1799 match_block.poptions = re->options;
1800
1801 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
1802 back the character offset. */
1803
1804 #ifdef SUPPORT_UTF8
1805 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
1806 {
1807 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
1808 return PCRE_ERROR_BADUTF8;
1809 if (start_offset > 0 && start_offset < length)
1810 {
1811 int tb = ((uschar *)subject)[start_offset];
1812 if (tb > 127)
1813 {
1814 tb &= 0xc0;
1815 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
1816 }
1817 }
1818 }
1819 #endif
1820
1821 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
1822 is a feature that makes it possible to save compiled regex and re-use them
1823 in other programs later. */
1824
1825 if (match_block.tables == NULL) match_block.tables = _pcre_default_tables;
1826
1827 /* The lower casing table and the "must be at the start of a line" flag are
1828 used in a loop when finding where to start. */
1829
1830 lcc = match_block.tables + lcc_offset;
1831 startline = (re->options & PCRE_STARTLINE) != 0;
1832 firstline = (re->options & PCRE_FIRSTLINE) != 0;
1833
1834 /* Set up the first character to match, if available. The first_byte value is
1835 never set for an anchored regular expression, but the anchoring may be forced
1836 at run time, so we have to test for anchoring. The first char may be unset for
1837 an unanchored pattern, of course. If there's no first char and the pattern was
1838 studied, there may be a bitmap of possible first characters. */
1839
1840 if (!anchored)
1841 {
1842 if ((re->options & PCRE_FIRSTSET) != 0)
1843 {
1844 first_byte = re->first_byte & 255;
1845 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
1846 first_byte = lcc[first_byte];
1847 }
1848 else
1849 {
1850 if (startline && study != NULL &&
1851 (study->options & PCRE_STUDY_MAPPED) != 0)
1852 start_bits = study->start_bits;
1853 }
1854 }
1855
1856 /* For anchored or unanchored matches, there may be a "last known required
1857 character" set. */
1858
1859 if ((re->options & PCRE_REQCHSET) != 0)
1860 {
1861 req_byte = re->req_byte & 255;
1862 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
1863 req_byte2 = (match_block.tables + fcc_offset)[req_byte]; /* case flipped */
1864 }
1865
1866 /* Call the main matching function, looping for a non-anchored regex after a
1867 failed match. Unless restarting, optimize by moving to the first match
1868 character if possible, when not anchored. Then unless wanting a partial match,
1869 check for a required later character. */
1870
1871 for (;;)
1872 {
1873 int rc;
1874
1875 if ((options & PCRE_DFA_RESTART) == 0)
1876 {
1877 const uschar *save_end_subject = end_subject;
1878
1879 /* Advance to a unique first char if possible. If firstline is TRUE, the
1880 start of the match is constrained to the first line of a multiline string.
1881 Implement this by temporarily adjusting end_subject so that we stop
1882 scanning at a newline. If the match fails at the newline, later code breaks
1883 this loop. */
1884
1885 if (firstline)
1886 {
1887 const uschar *t = current_subject;
1888 while (t < save_end_subject && *t != '\n') t++;
1889 end_subject = t;
1890 }
1891
1892 if (first_byte >= 0)
1893 {
1894 if (first_byte_caseless)
1895 while (current_subject < end_subject &&
1896 lcc[*current_subject] != first_byte)
1897 current_subject++;
1898 else
1899 while (current_subject < end_subject && *current_subject != first_byte)
1900 current_subject++;
1901 }
1902
1903 /* Or to just after \n for a multiline match if possible */
1904
1905 else if (startline)
1906 {
1907 if (current_subject > match_block.start_subject + start_offset)
1908 {
1909 while (current_subject < end_subject && current_subject[-1] != NEWLINE)
1910 current_subject++;
1911 }
1912 }
1913
1914 /* Or to a non-unique first char after study */
1915
1916 else if (start_bits != NULL)
1917 {
1918 while (current_subject < end_subject)
1919 {
1920 register unsigned int c = *current_subject;
1921 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
1922 else break;
1923 }
1924 }
1925
1926 /* Restore fudged end_subject */
1927
1928 end_subject = save_end_subject;
1929 }
1930
1931 /* If req_byte is set, we know that that character must appear in the subject
1932 for the match to succeed. If the first character is set, req_byte must be
1933 later in the subject; otherwise the test starts at the match point. This
1934 optimization can save a huge amount of work in patterns with nested unlimited
1935 repeats that aren't going to match. Writing separate code for cased/caseless
1936 versions makes it go faster, as does using an autoincrement and backing off
1937 on a match.
1938
1939 HOWEVER: when the subject string is very, very long, searching to its end can
1940 take a long time, and give bad performance on quite ordinary patterns. This
1941 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
1942 don't do this when the string is sufficiently long.
1943
1944 ALSO: this processing is disabled when partial matching is requested.
1945 */
1946
1947 if (req_byte >= 0 &&
1948 end_subject - current_subject < REQ_BYTE_MAX &&
1949 (options & PCRE_PARTIAL) == 0)
1950 {
1951 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
1952
1953 /* We don't need to repeat the search if we haven't yet reached the
1954 place we found it at last time. */
1955
1956 if (p > req_byte_ptr)
1957 {
1958 if (req_byte_caseless)
1959 {
1960 while (p < end_subject)
1961 {
1962 register int pp = *p++;
1963 if (pp == req_byte || pp == req_byte2) { p--; break; }
1964 }
1965 }
1966 else
1967 {
1968 while (p < end_subject)
1969 {
1970 if (*p++ == req_byte) { p--; break; }
1971 }
1972 }
1973
1974 /* If we can't find the required character, break the matching loop,
1975 which will cause a return or PCRE_ERROR_NOMATCH. */
1976
1977 if (p >= end_subject) break;
1978
1979 /* If we have found the required character, save the point where we
1980 found it, so that we don't search again next time round the loop if
1981 the start hasn't passed this character yet. */
1982
1983 req_byte_ptr = p;
1984 }
1985 }
1986
1987 /* OK, now we can do the business */
1988
1989 rc = internal_dfa_exec(
1990 &match_block, /* fixed match data */
1991 match_block.start_code, /* this subexpression's code */
1992 current_subject, /* where we currently are */
1993 start_offset, /* start offset in subject */
1994 offsets, /* offset vector */
1995 offsetcount, /* size of same */
1996 workspace, /* workspace vector */
1997 wscount, /* size of same */
1998 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
1999 0, /* function recurse level */
2000 0); /* regex recurse level */
2001
2002 /* Anything other than "no match" means we are done, always; otherwise, carry
2003 on only if not anchored. */
2004
2005 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2006
2007 /* Advance to the next subject character unless we are at the end of a line
2008 and firstline is set. */
2009
2010 if (firstline && *current_subject == NEWLINE) break;
2011 current_subject++;
2012
2013 #ifdef SUPPORT_UTF8
2014 if (utf8)
2015 {
2016 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2017 current_subject++;
2018 }
2019 #endif
2020
2021 if (current_subject > end_subject) break;
2022 }
2023
2024 return PCRE_ERROR_NOMATCH;
2025 }
2026
2027 /* End of pcre_dfa_exec.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12