/[pcre]/code/tags/pcre-6.0/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/tags/pcre-6.0/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 78 - (show annotations) (download)
Sat Feb 24 21:40:47 2007 UTC (7 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 66541 byte(s)
Tag code/trunk as code/tags/pcre-6.0.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2005 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a DFA algorithm. This is NOT Perl-
43 compatible, but it has advantages in certain applications. */
44
45
46 #include "pcre_internal.h"
47
48
49 /* For use to indent debugging output */
50
51 #define SP " "
52
53
54
55 /*************************************************
56 * Code parameters and static tables *
57 *************************************************/
58
59 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
60 into others, under special conditions. A gap of 10 between the blocks should be
61 enough. */
62
63 #define OP_PROP_EXTRA (EXTRACT_BASIC_MAX+1)
64 #define OP_EXTUNI_EXTRA (EXTRACT_BASIC_MAX+11)
65
66
67 /* This table identifies those opcodes that are followed immediately by a
68 character that is to be tested in some way. This makes is possible to
69 centralize the loading of these characters. In the case of Type * etc, the
70 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
71 small value. */
72
73 static uschar coptable[] = {
74 0, /* End */
75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
76 0, 0, /* Any, Anybyte */
77 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
78 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
79 1, /* Char */
80 1, /* Charnc */
81 1, /* not */
82 /* Positive single-char repeats */
83 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
84 3, 3, 3, /* upto, minupto, exact */
85 /* Negative single-char repeats - only for chars < 256 */
86 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
87 3, 3, 3, /* NOT upto, minupto, exact */
88 /* Positive type repeats */
89 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
90 3, 3, 3, /* Type upto, minupto, exact */
91 /* Character class & ref repeats */
92 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
93 0, 0, /* CRRANGE, CRMINRANGE */
94 0, /* CLASS */
95 0, /* NCLASS */
96 0, /* XCLASS - variable length */
97 0, /* REF */
98 0, /* RECURSE */
99 0, /* CALLOUT */
100 0, /* Alt */
101 0, /* Ket */
102 0, /* KetRmax */
103 0, /* KetRmin */
104 0, /* Assert */
105 0, /* Assert not */
106 0, /* Assert behind */
107 0, /* Assert behind not */
108 0, /* Reverse */
109 0, /* Once */
110 0, /* COND */
111 0, /* CREF */
112 0, 0, /* BRAZERO, BRAMINZERO */
113 0, /* BRANUMBER */
114 0 /* BRA */
115 };
116
117 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
118 and \w */
119
120 static uschar toptable1[] = {
121 0, 0, 0, 0, 0,
122 ctype_digit, ctype_digit,
123 ctype_space, ctype_space,
124 ctype_word, ctype_word,
125 0 /* OP_ANY */
126 };
127
128 static uschar toptable2[] = {
129 0, 0, 0, 0, 0,
130 ctype_digit, 0,
131 ctype_space, 0,
132 ctype_word, 0,
133 1 /* OP_ANY */
134 };
135
136
137 /* Structure for holding data about a particular state, which is in effect the
138 current data for an active path through the match tree. It must consist
139 entirely of ints because the working vector we are passed, and which we put
140 these structures in, is a vector of ints. */
141
142 typedef struct stateblock {
143 int offset; /* Offset to opcode */
144 int count; /* Count for repeats */
145 int ims; /* ims flag bits */
146 int data; /* Some use extra data */
147 } stateblock;
148
149 #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
150
151
152 #ifdef DEBUG
153 /*************************************************
154 * Print character string *
155 *************************************************/
156
157 /* Character string printing function for debugging.
158
159 Arguments:
160 p points to string
161 length number of bytes
162 f where to print
163
164 Returns: nothing
165 */
166
167 static void
168 pchars(unsigned char *p, int length, FILE *f)
169 {
170 int c;
171 while (length-- > 0)
172 {
173 if (isprint(c = *(p++)))
174 fprintf(f, "%c", c);
175 else
176 fprintf(f, "\\x%02x", c);
177 }
178 }
179 #endif
180
181
182
183 /*************************************************
184 * Execute a Regular Expression - DFA engine *
185 *************************************************/
186
187 /* This internal function applies a compiled pattern to a subject string,
188 starting at a given point, using a DFA engine. This function is called from the
189 external one, possibly multiple times if the pattern is not anchored. The
190 function calls itself recursively for some kinds of subpattern.
191
192 Arguments:
193 md the match_data block with fixed information
194 this_start_code the opening bracket of this subexpression's code
195 current_subject where we currently are in the subject string
196 start_offset start offset in the subject string
197 offsets vector to contain the matching string offsets
198 offsetcount size of same
199 workspace vector of workspace
200 wscount size of same
201 ims the current ims flags
202 rlevel function call recursion level
203 recursing regex recursive call level
204
205 Returns: > 0 =>
206 = 0 =>
207 -1 => failed to match
208 < -1 => some kind of unexpected problem
209
210 The following macros are used for adding states to the two state vectors (one
211 for the current character, one for the following character). */
212
213 #define ADD_ACTIVE(x,y) \
214 if (active_count++ < wscount) \
215 { \
216 next_active_state->offset = (x); \
217 next_active_state->count = (y); \
218 next_active_state->ims = ims; \
219 next_active_state++; \
220 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
221 } \
222 else return PCRE_ERROR_DFA_WSSIZE
223
224 #define ADD_ACTIVE_DATA(x,y,z) \
225 if (active_count++ < wscount) \
226 { \
227 next_active_state->offset = (x); \
228 next_active_state->count = (y); \
229 next_active_state->ims = ims; \
230 next_active_state->data = (z); \
231 next_active_state++; \
232 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
233 } \
234 else return PCRE_ERROR_DFA_WSSIZE
235
236 #define ADD_NEW(x,y) \
237 if (new_count++ < wscount) \
238 { \
239 next_new_state->offset = (x); \
240 next_new_state->count = (y); \
241 next_new_state->ims = ims; \
242 next_new_state++; \
243 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
244 } \
245 else return PCRE_ERROR_DFA_WSSIZE
246
247 #define ADD_NEW_DATA(x,y,z) \
248 if (new_count++ < wscount) \
249 { \
250 next_new_state->offset = (x); \
251 next_new_state->count = (y); \
252 next_new_state->ims = ims; \
253 next_new_state->data = (z); \
254 next_new_state++; \
255 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
256 } \
257 else return PCRE_ERROR_DFA_WSSIZE
258
259 /* And now, here is the code */
260
261 static int
262 internal_dfa_exec(
263 dfa_match_data *md,
264 const uschar *this_start_code,
265 const uschar *current_subject,
266 int start_offset,
267 int *offsets,
268 int offsetcount,
269 int *workspace,
270 int wscount,
271 int ims,
272 int rlevel,
273 int recursing)
274 {
275 stateblock *active_states, *new_states, *temp_states;
276 stateblock *next_active_state, *next_new_state;
277
278 const uschar *ctypes, *lcc, *fcc;
279 const uschar *ptr;
280 const uschar *end_code;
281
282 int active_count, new_count, match_count;
283
284 /* Some fields in the md block are frequently referenced, so we load them into
285 independent variables in the hope that this will perform better. */
286
287 const uschar *start_subject = md->start_subject;
288 const uschar *end_subject = md->end_subject;
289 const uschar *start_code = md->start_code;
290
291 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
292
293 rlevel++;
294 offsetcount &= (-2);
295
296 wscount -= 2;
297 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
298 (2 * INTS_PER_STATEBLOCK);
299
300 DPRINTF(("\n%.*s---------------------\n"
301 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
302 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
303
304 ctypes = md->tables + ctypes_offset;
305 lcc = md->tables + lcc_offset;
306 fcc = md->tables + fcc_offset;
307
308 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
309
310 active_states = (stateblock *)(workspace + 2);
311 next_new_state = new_states = active_states + wscount;
312 new_count = 0;
313
314 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
315 the alternative states onto the list, and find out where the end is. This
316 makes is possible to use this function recursively, when we want to stop at a
317 matching internal ket rather than at the end.
318
319 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
320 a backward assertion. In that case, we have to find out the maximum amount to
321 move back, and set up each alternative appropriately. */
322
323 if (this_start_code[1+LINK_SIZE] == OP_REVERSE)
324 {
325 int max_back = 0;
326 int gone_back;
327
328 end_code = this_start_code;
329 do
330 {
331 int back = GET(end_code, 2+LINK_SIZE);
332 if (back > max_back) max_back = back;
333 end_code += GET(end_code, 1);
334 }
335 while (*end_code == OP_ALT);
336
337 /* If we can't go back the amount required for the longest lookbehind
338 pattern, go back as far as we can; some alternatives may still be viable. */
339
340 #ifdef SUPPORT_UTF8
341 /* In character mode we have to step back character by character */
342
343 if (utf8)
344 {
345 for (gone_back = 0; gone_back < max_back; gone_back++)
346 {
347 if (current_subject <= start_subject) break;
348 current_subject--;
349 while (current_subject > start_subject &&
350 (*current_subject & 0xc0) == 0x80)
351 current_subject--;
352 }
353 }
354 else
355 #endif
356
357 /* In byte-mode we can do this quickly. */
358
359 {
360 gone_back = (current_subject - max_back < start_subject)?
361 current_subject - start_subject : max_back;
362 current_subject -= gone_back;
363 }
364
365 /* Now we can process the individual branches. */
366
367 end_code = this_start_code;
368 do
369 {
370 int back = GET(end_code, 2+LINK_SIZE);
371 if (back <= gone_back)
372 {
373 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
374 ADD_NEW_DATA(-bstate, 0, gone_back - back);
375 }
376 end_code += GET(end_code, 1);
377 }
378 while (*end_code == OP_ALT);
379 }
380
381 /* This is the code for a "normal" subpattern (not a backward assertion). The
382 start of a whole pattern is always one of these. If we are at the top level,
383 we may be asked to restart matching from the same point that we reached for a
384 previous partial match. We still have to scan through the top-level branches to
385 find the end state. */
386
387 else
388 {
389 end_code = this_start_code;
390
391 /* Restarting */
392
393 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
394 {
395 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
396 new_count = workspace[1];
397 if (!workspace[0])
398 memcpy(new_states, active_states, new_count * sizeof(stateblock));
399 }
400
401 /* Not restarting */
402
403 else
404 {
405 do
406 {
407 ADD_NEW(end_code - start_code + 1 + LINK_SIZE, 0);
408 end_code += GET(end_code, 1);
409 }
410 while (*end_code == OP_ALT);
411 }
412 }
413
414 workspace[0] = 0; /* Bit indicating which vector is current */
415
416 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
417
418 /* Loop for scanning the subject */
419
420 ptr = current_subject;
421 for (;;)
422 {
423 int i, j;
424 int c, d, clen, dlen;
425
426 /* Make the new state list into the active state list and empty the
427 new state list. */
428
429 temp_states = active_states;
430 active_states = new_states;
431 new_states = temp_states;
432 active_count = new_count;
433 new_count = 0;
434
435 workspace[0] ^= 1; /* Remember for the restarting feature */
436 workspace[1] = active_count;
437
438 #ifdef DEBUG
439 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
440 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
441 printf("\"\n");
442
443 printf("%.*sActive states: ", rlevel*2-2, SP);
444 for (i = 0; i < active_count; i++)
445 printf("%d/%d ", active_states[i].offset, active_states[i].count);
446 printf("\n");
447 #endif
448
449 /* Set the pointers for adding new states */
450
451 next_active_state = active_states + active_count;
452 next_new_state = new_states;
453
454 /* Load the current character from the subject outside the loop, as many
455 different states may want to look at it, and we assume that at least one
456 will. */
457
458 if (ptr < end_subject)
459 {
460 clen = 1;
461 #ifdef SUPPORT_UTF8
462 if (utf8) { GETCHARLEN(c, ptr, clen); } else
463 #endif /* SUPPORT_UTF8 */
464 c = *ptr;
465 }
466 else
467 {
468 clen = 0; /* At end subject */
469 c = -1;
470 }
471
472 /* Scan up the active states and act on each one. The result of an action
473 may be to add more states to the currently active list (e.g. on hitting a
474 parenthesis) or it may be to put states on the new list, for considering
475 when we move the character pointer on. */
476
477 for (i = 0; i < active_count; i++)
478 {
479 stateblock *current_state = active_states + i;
480 const uschar *code;
481 int state_offset = current_state->offset;
482 int count, codevalue;
483 int chartype, othercase;
484
485 #ifdef DEBUG
486 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
487 if (c < 0) printf("-1\n");
488 else if (c > 32 && c < 127) printf("'%c'\n", c);
489 else printf("0x%02x\n", c);
490 #endif
491
492 /* This variable is referred to implicity in the ADD_xxx macros. */
493
494 ims = current_state->ims;
495
496 /* A negative offset is a special case meaning "hold off going to this
497 (negated) state until the number of characters in the data field have
498 been skipped". */
499
500 if (state_offset < 0)
501 {
502 if (current_state->data > 0)
503 {
504 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
505 ADD_NEW_DATA(state_offset, current_state->count,
506 current_state->data - 1);
507 continue;
508 }
509 else
510 {
511 current_state->offset = state_offset = -state_offset;
512 }
513 }
514
515 /* Check for a duplicate state with the same count, and skip if found. */
516
517 for (j = 0; j < i; j++)
518 {
519 if (active_states[j].offset == state_offset &&
520 active_states[j].count == current_state->count)
521 {
522 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
523 goto NEXT_ACTIVE_STATE;
524 }
525 }
526
527 /* The state offset is the offset to the opcode */
528
529 code = start_code + state_offset;
530 codevalue = *code;
531 if (codevalue >= OP_BRA) codevalue = OP_BRA; /* All brackets are equal */
532
533 /* If this opcode is followed by an inline character, load it. It is
534 tempting to test for the presence of a subject character here, but that
535 is wrong, because sometimes zero repetitions of the subject are
536 permitted.
537
538 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
539 argument that is not a data character - but is always one byte long.
540 Unfortunately, we have to take special action to deal with \P, \p, and
541 \X in this case. To keep the other cases fast, convert these ones to new
542 opcodes. */
543
544 if (coptable[codevalue] > 0)
545 {
546 dlen = 1;
547 #ifdef SUPPORT_UTF8
548 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
549 #endif /* SUPPORT_UTF8 */
550 d = code[coptable[codevalue]];
551 if (codevalue >= OP_TYPESTAR)
552 {
553 if (d == OP_ANYBYTE) return PCRE_ERROR_DFA_UITEM;
554 if (d >= OP_NOTPROP)
555 codevalue += (d == OP_EXTUNI)? OP_EXTUNI_EXTRA : OP_PROP_EXTRA;
556 }
557 }
558 else
559 {
560 dlen = 0; /* Not strictly necessary, but compilers moan */
561 d = -1; /* if these variables are not set. */
562 }
563
564
565 /* Now process the individual opcodes */
566
567 switch (codevalue)
568 {
569
570 /* ========================================================================== */
571 /* Reached a closing bracket. If not at the end of the pattern, carry
572 on with the next opcode. Otherwise, unless we have an empty string and
573 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
574 matches so we always have the longest first. */
575
576 case OP_KET:
577 case OP_KETRMIN:
578 case OP_KETRMAX:
579 if (code != end_code)
580 {
581 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
582 if (codevalue != OP_KET)
583 {
584 ADD_ACTIVE(state_offset - GET(code, 1), 0);
585 }
586 }
587 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
588 {
589 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
590 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
591 match_count = 0;
592 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
593 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
594 if (offsetcount >= 2)
595 {
596 offsets[0] = current_subject - start_subject;
597 offsets[1] = ptr - start_subject;
598 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
599 offsets[1] - offsets[0], current_subject));
600 }
601 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
602 {
603 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
604 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
605 match_count, rlevel*2-2, SP));
606 return match_count;
607 }
608 }
609 break;
610
611 /* ========================================================================== */
612 /* These opcodes add to the current list of states without looking
613 at the current character. */
614
615 /*-----------------------------------------------------------------*/
616 case OP_ALT:
617 do { code += GET(code, 1); } while (*code == OP_ALT);
618 ADD_ACTIVE(code - start_code, 0);
619 break;
620
621 /*-----------------------------------------------------------------*/
622 case OP_BRA:
623 do
624 {
625 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
626 code += GET(code, 1);
627 }
628 while (*code == OP_ALT);
629 break;
630
631 /*-----------------------------------------------------------------*/
632 case OP_BRAZERO:
633 case OP_BRAMINZERO:
634 ADD_ACTIVE(state_offset + 1, 0);
635 code += 1 + GET(code, 2);
636 while (*code == OP_ALT) code += GET(code, 1);
637 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
638 break;
639
640 /*-----------------------------------------------------------------*/
641 case OP_BRANUMBER:
642 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
643 break;
644
645 /*-----------------------------------------------------------------*/
646 case OP_CIRC:
647 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
648 ((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE))
649 { ADD_ACTIVE(state_offset + 1, 0); }
650 break;
651
652 /*-----------------------------------------------------------------*/
653 case OP_EOD:
654 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
655 break;
656
657 /*-----------------------------------------------------------------*/
658 case OP_OPT:
659 ims = code[1];
660 ADD_ACTIVE(state_offset + 2, 0);
661 break;
662
663 /*-----------------------------------------------------------------*/
664 case OP_SOD:
665 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
666 break;
667
668 /*-----------------------------------------------------------------*/
669 case OP_SOM:
670 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
671 break;
672
673
674 /* ========================================================================== */
675 /* These opcodes inspect the next subject character, and sometimes
676 the previous one as well, but do not have an argument. The variable
677 clen contains the length of the current character and is zero if we are
678 at the end of the subject. */
679
680 /*-----------------------------------------------------------------*/
681 case OP_ANY:
682 if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0))
683 { ADD_NEW(state_offset + 1, 0); }
684 break;
685
686 /*-----------------------------------------------------------------*/
687 case OP_EODN:
688 if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject))
689 { ADD_ACTIVE(state_offset + 1, 0); }
690 break;
691
692 /*-----------------------------------------------------------------*/
693 case OP_DOLL:
694 if ((md->moptions & PCRE_NOTEOL) == 0)
695 {
696 if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject ||
697 (ims & PCRE_MULTILINE) != 0)))
698 { ADD_ACTIVE(state_offset + 1, 0); }
699 }
700 else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0)
701 { ADD_ACTIVE(state_offset + 1, 0); }
702 break;
703
704 /*-----------------------------------------------------------------*/
705
706 case OP_DIGIT:
707 case OP_WHITESPACE:
708 case OP_WORDCHAR:
709 if (clen > 0 && c < 256 &&
710 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
711 { ADD_NEW(state_offset + 1, 0); }
712 break;
713
714 /*-----------------------------------------------------------------*/
715 case OP_NOT_DIGIT:
716 case OP_NOT_WHITESPACE:
717 case OP_NOT_WORDCHAR:
718 if (clen > 0 && (c >= 256 ||
719 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
720 { ADD_NEW(state_offset + 1, 0); }
721 break;
722
723 /*-----------------------------------------------------------------*/
724 case OP_WORD_BOUNDARY:
725 case OP_NOT_WORD_BOUNDARY:
726 {
727 int left_word, right_word;
728
729 if (ptr > start_subject)
730 {
731 const uschar *temp = ptr - 1;
732 #ifdef SUPPORT_UTF8
733 if (utf8) BACKCHAR(temp);
734 #endif
735 GETCHARTEST(d, temp);
736 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
737 }
738 else left_word = 0;
739
740 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
741 else right_word = 0;
742
743 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
744 { ADD_ACTIVE(state_offset + 1, 0); }
745 }
746 break;
747
748
749 #ifdef SUPPORT_UCP
750
751 /*-----------------------------------------------------------------*/
752 /* Check the next character by Unicode property. We will get here only
753 if the support is in the binary; otherwise a compile-time error occurs.
754 */
755
756 case OP_PROP:
757 case OP_NOTPROP:
758 if (clen > 0)
759 {
760 int rqdtype, category;
761 category = ucp_findchar(c, &chartype, &othercase);
762 rqdtype = code[1];
763 if (rqdtype >= 128)
764 {
765 if ((rqdtype - 128 == category) == (codevalue == OP_PROP))
766 { ADD_NEW(state_offset + 2, 0); }
767 }
768 else
769 {
770 if ((rqdtype == chartype) == (codevalue == OP_PROP))
771 { ADD_NEW(state_offset + 2, 0); }
772 }
773 }
774 break;
775 #endif
776
777
778
779 /* ========================================================================== */
780 /* These opcodes likewise inspect the subject character, but have an
781 argument that is not a data character. It is one of these opcodes:
782 OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
783 OP_NOT_WORDCHAR. The value is loaded into d. */
784
785 case OP_TYPEPLUS:
786 case OP_TYPEMINPLUS:
787 count = current_state->count; /* Already matched */
788 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
789 if (clen > 0)
790 {
791 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
792 (c < 256 &&
793 (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
794 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
795 {
796 count++;
797 ADD_NEW(state_offset, count);
798 }
799 }
800 break;
801
802 /*-----------------------------------------------------------------*/
803 case OP_TYPEQUERY:
804 case OP_TYPEMINQUERY:
805 ADD_ACTIVE(state_offset + 2, 0);
806 if (clen > 0)
807 {
808 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
809 (c < 256 &&
810 (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
811 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
812 {
813 ADD_NEW(state_offset + 2, 0);
814 }
815 }
816 break;
817
818 /*-----------------------------------------------------------------*/
819 case OP_TYPESTAR:
820 case OP_TYPEMINSTAR:
821 ADD_ACTIVE(state_offset + 2, 0);
822 if (clen > 0)
823 {
824 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
825 (c < 256 &&
826 (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
827 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
828 {
829 ADD_NEW(state_offset, 0);
830 }
831 }
832 break;
833
834 /*-----------------------------------------------------------------*/
835 case OP_TYPEEXACT:
836 case OP_TYPEUPTO:
837 case OP_TYPEMINUPTO:
838 if (codevalue != OP_TYPEEXACT)
839 { ADD_ACTIVE(state_offset + 4, 0); }
840 count = current_state->count; /* Number already matched */
841 if (clen > 0)
842 {
843 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
844 (c < 256 &&
845 (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
846 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
847 {
848 if (++count >= GET2(code, 1))
849 { ADD_NEW(state_offset + 4, 0); }
850 else
851 { ADD_NEW(state_offset, count); }
852 }
853 }
854 break;
855
856 /* ========================================================================== */
857 /* These are virtual opcodes that are used when something like
858 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, or OP_EXTUNI as its argument. It
859 keeps the code above fast for the other cases. The argument is in the
860 d variable. */
861
862 case OP_PROP_EXTRA + OP_TYPEPLUS:
863 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
864 count = current_state->count; /* Already matched */
865 if (count > 0) { ADD_ACTIVE(state_offset + 3, 0); }
866 if (clen > 0)
867 {
868 int category = ucp_findchar(c, &chartype, &othercase);
869 int rqdtype = code[2];
870 if ((d == OP_PROP) ==
871 (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))
872 { count++; ADD_NEW(state_offset, count); }
873 }
874 break;
875
876 /*-----------------------------------------------------------------*/
877 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
878 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
879 count = current_state->count; /* Already matched */
880 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
881 if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M)
882 {
883 const uschar *nptr = ptr + clen;
884 int ncount = 0;
885 while (nptr < end_subject)
886 {
887 int nd;
888 int ndlen = 1;
889 GETCHARLEN(nd, nptr, ndlen);
890 if (ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;
891 ncount++;
892 nptr += ndlen;
893 }
894 count++;
895 ADD_NEW_DATA(-state_offset, count, ncount);
896 }
897 break;
898
899 /*-----------------------------------------------------------------*/
900 case OP_PROP_EXTRA + OP_TYPEQUERY:
901 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
902 count = 3;
903 goto QS1;
904
905 case OP_PROP_EXTRA + OP_TYPESTAR:
906 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
907 count = 0;
908
909 QS1:
910
911 ADD_ACTIVE(state_offset + 3, 0);
912 if (clen > 0)
913 {
914 int category = ucp_findchar(c, &chartype, &othercase);
915 int rqdtype = code[2];
916 if ((d == OP_PROP) ==
917 (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))
918 { ADD_NEW(state_offset + count, 0); }
919 }
920 break;
921
922 /*-----------------------------------------------------------------*/
923 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
924 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
925 count = 2;
926 goto QS2;
927
928 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
929 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
930 count = 0;
931
932 QS2:
933
934 ADD_ACTIVE(state_offset + 2, 0);
935 if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M)
936 {
937 const uschar *nptr = ptr + clen;
938 int ncount = 0;
939 while (nptr < end_subject)
940 {
941 int nd;
942 int ndlen = 1;
943 GETCHARLEN(nd, nptr, ndlen);
944 if (ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;
945 ncount++;
946 nptr += ndlen;
947 }
948 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
949 }
950 break;
951
952 /*-----------------------------------------------------------------*/
953 case OP_PROP_EXTRA + OP_TYPEEXACT:
954 case OP_PROP_EXTRA + OP_TYPEUPTO:
955 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
956 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
957 { ADD_ACTIVE(state_offset + 5, 0); }
958 count = current_state->count; /* Number already matched */
959 if (clen > 0)
960 {
961 int category = ucp_findchar(c, &chartype, &othercase);
962 int rqdtype = code[4];
963 if ((d == OP_PROP) ==
964 (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))
965 {
966 if (++count >= GET2(code, 1))
967 { ADD_NEW(state_offset + 5, 0); }
968 else
969 { ADD_NEW(state_offset, count); }
970 }
971 }
972 break;
973
974 /*-----------------------------------------------------------------*/
975 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
976 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
977 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
978 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
979 { ADD_ACTIVE(state_offset + 4, 0); }
980 count = current_state->count; /* Number already matched */
981 if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M)
982 {
983 const uschar *nptr = ptr + clen;
984 int ncount = 0;
985 while (nptr < end_subject)
986 {
987 int nd;
988 int ndlen = 1;
989 GETCHARLEN(nd, nptr, ndlen);
990 if (ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;
991 ncount++;
992 nptr += ndlen;
993 }
994 if (++count >= GET2(code, 1))
995 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
996 else
997 { ADD_NEW_DATA(-state_offset, count, ncount); }
998 }
999 break;
1000
1001 /* ========================================================================== */
1002 /* These opcodes are followed by a character that is usually compared
1003 to the current subject character; it is loaded into d. We still get
1004 here even if there is no subject character, because in some cases zero
1005 repetitions are permitted. */
1006
1007 /*-----------------------------------------------------------------*/
1008 case OP_CHAR:
1009 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1010 break;
1011
1012 /*-----------------------------------------------------------------*/
1013 case OP_CHARNC:
1014 if (clen == 0) break;
1015
1016 #ifdef SUPPORT_UTF8
1017 if (utf8)
1018 {
1019 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1020 {
1021 if (c < 128) othercase = fcc[c]; else
1022
1023 /* If we have Unicode property support, we can use it to test the
1024 other case of the character, if there is one. The result of
1025 ucp_findchar() is < 0 if the char isn't found, and othercase is
1026 returned as zero if there isn't another case. */
1027
1028 #ifdef SUPPORT_UCP
1029 if (ucp_findchar(c, &chartype, &othercase) < 0)
1030 #endif
1031 othercase = -1;
1032
1033 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1034 }
1035 }
1036 else
1037 #endif /* SUPPORT_UTF8 */
1038
1039 /* Non-UTF-8 mode */
1040 {
1041 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1042 }
1043 break;
1044
1045
1046 #ifdef SUPPORT_UCP
1047 /*-----------------------------------------------------------------*/
1048 /* This is a tricky one because it can match more than one character.
1049 Find out how many characters to skip, and then set up a negative state
1050 to wait for them to pass before continuing. */
1051
1052 case OP_EXTUNI:
1053 if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M)
1054 {
1055 const uschar *nptr = ptr + clen;
1056 int ncount = 0;
1057 while (nptr < end_subject)
1058 {
1059 int nclen = 1;
1060 GETCHARLEN(c, nptr, nclen);
1061 if (ucp_findchar(c, &chartype, &othercase) != ucp_M) break;
1062 ncount++;
1063 nptr += nclen;
1064 }
1065 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1066 }
1067 break;
1068 #endif
1069
1070 /*-----------------------------------------------------------------*/
1071 /* Match a negated single character. This is only used for one-byte
1072 characters, that is, we know that d < 256. The character we are
1073 checking (c) can be multibyte. */
1074
1075 case OP_NOT:
1076 if (clen > 0)
1077 {
1078 int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1079 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1080 }
1081 break;
1082
1083 /*-----------------------------------------------------------------*/
1084 case OP_PLUS:
1085 case OP_MINPLUS:
1086 case OP_NOTPLUS:
1087 case OP_NOTMINPLUS:
1088 count = current_state->count; /* Already matched */
1089 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1090 if (clen > 0)
1091 {
1092 int otherd = -1;
1093 if ((ims & PCRE_CASELESS) != 0)
1094 {
1095 #ifdef SUPPORT_UTF8
1096 if (utf8 && c >= 128)
1097 {
1098 #ifdef SUPPORT_UCP
1099 if (ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1;
1100 #endif /* SUPPORT_UCP */
1101 }
1102 else
1103 #endif /* SUPPORT_UTF8 */
1104 otherd = fcc[d];
1105 }
1106 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1107 { count++; ADD_NEW(state_offset, count); }
1108 }
1109 break;
1110
1111 /*-----------------------------------------------------------------*/
1112 case OP_QUERY:
1113 case OP_MINQUERY:
1114 case OP_NOTQUERY:
1115 case OP_NOTMINQUERY:
1116 ADD_ACTIVE(state_offset + dlen + 1, 0);
1117 if (clen > 0)
1118 {
1119 int otherd = -1;
1120 if ((ims && PCRE_CASELESS) != 0)
1121 {
1122 #ifdef SUPPORT_UTF8
1123 if (utf8 && c >= 128)
1124 {
1125 #ifdef SUPPORT_UCP
1126 if (ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1;
1127 #endif /* SUPPORT_UCP */
1128 }
1129 else
1130 #endif /* SUPPORT_UTF8 */
1131 otherd = fcc[d];
1132 }
1133 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1134 { ADD_NEW(state_offset + dlen + 1, 0); }
1135 }
1136 break;
1137
1138 /*-----------------------------------------------------------------*/
1139 case OP_STAR:
1140 case OP_MINSTAR:
1141 case OP_NOTSTAR:
1142 case OP_NOTMINSTAR:
1143 ADD_ACTIVE(state_offset + dlen + 1, 0);
1144 if (clen > 0)
1145 {
1146 int otherd = -1;
1147 if ((ims && PCRE_CASELESS) != 0)
1148 {
1149 #ifdef SUPPORT_UTF8
1150 if (utf8 && c >= 128)
1151 {
1152 #ifdef SUPPORT_UCP
1153 if (ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1;
1154 #endif /* SUPPORT_UCP */
1155 }
1156 else
1157 #endif /* SUPPORT_UTF8 */
1158 otherd = fcc[d];
1159 }
1160 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1161 { ADD_NEW(state_offset, 0); }
1162 }
1163 break;
1164
1165 /*-----------------------------------------------------------------*/
1166 case OP_EXACT:
1167 case OP_UPTO:
1168 case OP_MINUPTO:
1169 case OP_NOTEXACT:
1170 case OP_NOTUPTO:
1171 case OP_NOTMINUPTO:
1172 if (codevalue != OP_EXACT && codevalue != OP_NOTEXACT)
1173 { ADD_ACTIVE(state_offset + dlen + 3, 0); }
1174 count = current_state->count; /* Number already matched */
1175 if (clen > 0)
1176 {
1177 int otherd = -1;
1178 if ((ims & PCRE_CASELESS) != 0)
1179 {
1180 #ifdef SUPPORT_UTF8
1181 if (utf8 && c >= 128)
1182 {
1183 #ifdef SUPPORT_UCP
1184 if (ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1;
1185 #endif /* SUPPORT_UCP */
1186 }
1187 else
1188 #endif /* SUPPORT_UTF8 */
1189 otherd = fcc[d];
1190 }
1191 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1192 {
1193 if (++count >= GET2(code, 1))
1194 { ADD_NEW(state_offset + dlen + 3, 0); }
1195 else
1196 { ADD_NEW(state_offset, count); }
1197 }
1198 }
1199 break;
1200
1201
1202 /* ========================================================================== */
1203 /* These are the class-handling opcodes */
1204
1205 case OP_CLASS:
1206 case OP_NCLASS:
1207 case OP_XCLASS:
1208 {
1209 BOOL isinclass = FALSE;
1210 int next_state_offset;
1211 const uschar *ecode;
1212
1213 /* For a simple class, there is always just a 32-byte table, and we
1214 can set isinclass from it. */
1215
1216 if (codevalue != OP_XCLASS)
1217 {
1218 ecode = code + 33;
1219 if (clen > 0)
1220 {
1221 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1222 ((code[1 + c/8] & (1 << (c&7))) != 0);
1223 }
1224 }
1225
1226 /* An extended class may have a table or a list of single characters,
1227 ranges, or both, and it may be positive or negative. There's a
1228 function that sorts all this out. */
1229
1230 else
1231 {
1232 ecode = code + GET(code, 1);
1233 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1234 }
1235
1236 /* At this point, isinclass is set for all kinds of class, and ecode
1237 points to the byte after the end of the class. If there is a
1238 quantifier, this is where it will be. */
1239
1240 next_state_offset = ecode - start_code;
1241
1242 switch (*ecode)
1243 {
1244 case OP_CRSTAR:
1245 case OP_CRMINSTAR:
1246 ADD_ACTIVE(next_state_offset + 1, 0);
1247 if (isinclass) { ADD_NEW(state_offset, 0); }
1248 break;
1249
1250 case OP_CRPLUS:
1251 case OP_CRMINPLUS:
1252 count = current_state->count; /* Already matched */
1253 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1254 if (isinclass) { count++; ADD_NEW(state_offset, count); }
1255 break;
1256
1257 case OP_CRQUERY:
1258 case OP_CRMINQUERY:
1259 ADD_ACTIVE(next_state_offset + 1, 0);
1260 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1261 break;
1262
1263 case OP_CRRANGE:
1264 case OP_CRMINRANGE:
1265 count = current_state->count; /* Already matched */
1266 if (count >= GET2(ecode, 1))
1267 { ADD_ACTIVE(next_state_offset + 5, 0); }
1268 if (isinclass)
1269 {
1270 if (++count >= GET2(ecode, 3))
1271 { ADD_NEW(next_state_offset + 5, 0); }
1272 else
1273 { ADD_NEW(state_offset, count); }
1274 }
1275 break;
1276
1277 default:
1278 if (isinclass) { ADD_NEW(next_state_offset, 0); }
1279 break;
1280 }
1281 }
1282 break;
1283
1284 /* ========================================================================== */
1285 /* These are the opcodes for fancy brackets of various kinds. We have
1286 to use recursion in order to handle them. */
1287
1288 case OP_ASSERT:
1289 case OP_ASSERT_NOT:
1290 case OP_ASSERTBACK:
1291 case OP_ASSERTBACK_NOT:
1292 {
1293 int rc;
1294 int local_offsets[2];
1295 int local_workspace[1000];
1296 const uschar *endasscode = code + GET(code, 1);
1297
1298 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1299
1300 rc = internal_dfa_exec(
1301 md, /* static match data */
1302 code, /* this subexpression's code */
1303 ptr, /* where we currently are */
1304 ptr - start_subject, /* start offset */
1305 local_offsets, /* offset vector */
1306 sizeof(local_offsets)/sizeof(int), /* size of same */
1307 local_workspace, /* workspace vector */
1308 sizeof(local_workspace)/sizeof(int), /* size of same */
1309 ims, /* the current ims flags */
1310 rlevel, /* function recursion level */
1311 recursing); /* pass on regex recursion */
1312
1313 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1314 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1315 }
1316 break;
1317
1318 /*-----------------------------------------------------------------*/
1319 case OP_COND:
1320 {
1321 int local_offsets[1000];
1322 int local_workspace[1000];
1323 int condcode = code[LINK_SIZE+1];
1324
1325 /* The only supported version of OP_CREF is for the value 0xffff, which
1326 means "test if in a recursion". */
1327
1328 if (condcode == OP_CREF)
1329 {
1330 int value = GET2(code, LINK_SIZE+2);
1331 if (value != 0xffff) return PCRE_ERROR_DFA_UCOND;
1332 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1333 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1334 }
1335
1336 /* Otherwise, the condition is an assertion */
1337
1338 else
1339 {
1340 int rc;
1341 const uschar *asscode = code + LINK_SIZE + 1;
1342 const uschar *endasscode = asscode + GET(asscode, 1);
1343
1344 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1345
1346 rc = internal_dfa_exec(
1347 md, /* fixed match data */
1348 asscode, /* this subexpression's code */
1349 ptr, /* where we currently are */
1350 ptr - start_subject, /* start offset */
1351 local_offsets, /* offset vector */
1352 sizeof(local_offsets)/sizeof(int), /* size of same */
1353 local_workspace, /* workspace vector */
1354 sizeof(local_workspace)/sizeof(int), /* size of same */
1355 ims, /* the current ims flags */
1356 rlevel, /* function recursion level */
1357 recursing); /* pass on regex recursion */
1358
1359 if ((rc >= 0) ==
1360 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1361 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1362 else
1363 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1364 }
1365 }
1366 break;
1367
1368 /*-----------------------------------------------------------------*/
1369 case OP_RECURSE:
1370 {
1371 int local_offsets[1000];
1372 int local_workspace[1000];
1373 int rc;
1374
1375 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1376 recursing + 1));
1377
1378 rc = internal_dfa_exec(
1379 md, /* fixed match data */
1380 start_code + GET(code, 1), /* this subexpression's code */
1381 ptr, /* where we currently are */
1382 ptr - start_subject, /* start offset */
1383 local_offsets, /* offset vector */
1384 sizeof(local_offsets)/sizeof(int), /* size of same */
1385 local_workspace, /* workspace vector */
1386 sizeof(local_workspace)/sizeof(int), /* size of same */
1387 ims, /* the current ims flags */
1388 rlevel, /* function recursion level */
1389 recursing + 1); /* regex recurse level */
1390
1391 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1392 recursing + 1, rc));
1393
1394 /* Ran out of internal offsets */
1395
1396 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1397
1398 /* For each successful matched substring, set up the next state with a
1399 count of characters to skip before trying it. Note that the count is in
1400 characters, not bytes. */
1401
1402 if (rc > 0)
1403 {
1404 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1405 {
1406 const uschar *p = start_subject + local_offsets[rc];
1407 const uschar *pp = start_subject + local_offsets[rc+1];
1408 int charcount = local_offsets[rc+1] - local_offsets[rc];
1409 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1410 if (charcount > 0)
1411 {
1412 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1413 }
1414 else
1415 {
1416 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1417 }
1418 }
1419 }
1420 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1421 }
1422 break;
1423
1424 /*-----------------------------------------------------------------*/
1425 case OP_ONCE:
1426 {
1427 const uschar *endcode;
1428 int local_offsets[2];
1429 int local_workspace[1000];
1430
1431 int rc = internal_dfa_exec(
1432 md, /* fixed match data */
1433 code, /* this subexpression's code */
1434 ptr, /* where we currently are */
1435 ptr - start_subject, /* start offset */
1436 local_offsets, /* offset vector */
1437 sizeof(local_offsets)/sizeof(int), /* size of same */
1438 local_workspace, /* workspace vector */
1439 sizeof(local_workspace)/sizeof(int), /* size of same */
1440 ims, /* the current ims flags */
1441 rlevel, /* function recursion level */
1442 recursing); /* pass on regex recursion */
1443
1444 if (rc >= 0)
1445 {
1446 const uschar *end_subpattern = code;
1447 int charcount = local_offsets[1] - local_offsets[0];
1448 int next_state_offset, repeat_state_offset;
1449 BOOL is_repeated;
1450
1451 do { end_subpattern += GET(end_subpattern, 1); }
1452 while (*end_subpattern == OP_ALT);
1453 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1454
1455 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1456 arrange for the repeat state also to be added to the relevant list.
1457 Calculate the offset, or set -1 for no repeat. */
1458
1459 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1460 *end_subpattern == OP_KETRMIN)?
1461 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1462
1463 /* If we have matched an empty string, add the next state at the
1464 current character pointer. This is important so that the duplicate
1465 checking kicks in, which is what breaks infinite loops that match an
1466 empty string. */
1467
1468 if (charcount == 0)
1469 {
1470 ADD_ACTIVE(next_state_offset, 0);
1471 }
1472
1473 /* Optimization: if there are no more active states, and there
1474 are no new states yet set up, then skip over the subject string
1475 right here, to save looping. Otherwise, set up the new state to swing
1476 into action when the end of the substring is reached. */
1477
1478 else if (i + 1 >= active_count && new_count == 0)
1479 {
1480 ptr += charcount;
1481 clen = 0;
1482 ADD_NEW(next_state_offset, 0);
1483
1484 /* If we are adding a repeat state at the new character position,
1485 we must fudge things so that it is the only current state.
1486 Otherwise, it might be a duplicate of one we processed before, and
1487 that would cause it to be skipped. */
1488
1489 if (repeat_state_offset >= 0)
1490 {
1491 next_active_state = active_states;
1492 active_count = 0;
1493 i = -1;
1494 ADD_ACTIVE(repeat_state_offset, 0);
1495 }
1496 }
1497 else
1498 {
1499 const uschar *p = start_subject + local_offsets[0];
1500 const uschar *pp = start_subject + local_offsets[1];
1501 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1502 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1503 if (repeat_state_offset >= 0)
1504 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1505 }
1506
1507 }
1508 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1509 }
1510 break;
1511
1512
1513 /* ========================================================================== */
1514 /* Handle callouts */
1515
1516 case OP_CALLOUT:
1517 if (pcre_callout != NULL)
1518 {
1519 int rrc;
1520 pcre_callout_block cb;
1521 cb.version = 1; /* Version 1 of the callout block */
1522 cb.callout_number = code[1];
1523 cb.offset_vector = offsets;
1524 cb.subject = (char *)start_subject;
1525 cb.subject_length = end_subject - start_subject;
1526 cb.start_match = current_subject - start_subject;
1527 cb.current_position = ptr - start_subject;
1528 cb.pattern_position = GET(code, 2);
1529 cb.next_item_length = GET(code, 2 + LINK_SIZE);
1530 cb.capture_top = 1;
1531 cb.capture_last = -1;
1532 cb.callout_data = md->callout_data;
1533 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
1534 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
1535 }
1536 break;
1537
1538
1539 /* ========================================================================== */
1540 default: /* Unsupported opcode */
1541 return PCRE_ERROR_DFA_UITEM;
1542 }
1543
1544 NEXT_ACTIVE_STATE: continue;
1545
1546 } /* End of loop scanning active states */
1547
1548 /* We have finished the processing at the current subject character. If no
1549 new states have been set for the next character, we have found all the
1550 matches that we are going to find. If we are at the top level and partial
1551 matching has been requested, check for appropriate conditions. */
1552
1553 if (new_count <= 0)
1554 {
1555 if (match_count < 0 && /* No matches found */
1556 rlevel == 1 && /* Top level match function */
1557 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
1558 ptr >= end_subject && /* Reached end of subject */
1559 ptr > current_subject) /* Matched non-empty string */
1560 {
1561 if (offsetcount >= 2)
1562 {
1563 offsets[0] = current_subject - start_subject;
1564 offsets[1] = end_subject - start_subject;
1565 }
1566 match_count = PCRE_ERROR_PARTIAL;
1567 }
1568
1569 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
1570 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
1571 rlevel*2-2, SP));
1572 return match_count;
1573 }
1574
1575 /* One or more states are active for the next character. */
1576
1577 ptr += clen; /* Advance to next subject character */
1578 } /* Loop to move along the subject string */
1579
1580 /* Control never gets here, but we must keep the compiler happy. */
1581
1582 DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n"
1583 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP));
1584 return PCRE_ERROR_NOMATCH;
1585 }
1586
1587
1588
1589
1590 /*************************************************
1591 * Execute a Regular Expression - DFA engine *
1592 *************************************************/
1593
1594 /* This external function applies a compiled re to a subject string using a DFA
1595 engine. This function calls the internal function multiple times if the pattern
1596 is not anchored.
1597
1598 Arguments:
1599 argument_re points to the compiled expression
1600 extra_data points to extra data or is NULL (not currently used)
1601 subject points to the subject string
1602 length length of subject string (may contain binary zeros)
1603 start_offset where to start in the subject string
1604 options option bits
1605 offsets vector of match offsets
1606 offsetcount size of same
1607 workspace workspace vector
1608 wscount size of same
1609
1610 Returns: > 0 => number of match offset pairs placed in offsets
1611 = 0 => offsets overflowed; longest matches are present
1612 -1 => failed to match
1613 < -1 => some kind of unexpected problem
1614 */
1615
1616 EXPORT int
1617 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
1618 const char *subject, int length, int start_offset, int options, int *offsets,
1619 int offsetcount, int *workspace, int wscount)
1620 {
1621 real_pcre *re = (real_pcre *)argument_re;
1622 dfa_match_data match_block;
1623 BOOL utf8, anchored, startline, firstline;
1624 const uschar *current_subject, *end_subject, *lcc;
1625
1626 pcre_study_data internal_study;
1627 const pcre_study_data *study = NULL;
1628 real_pcre internal_re;
1629
1630 const uschar *req_byte_ptr;
1631 const uschar *start_bits = NULL;
1632 BOOL first_byte_caseless = FALSE;
1633 BOOL req_byte_caseless = FALSE;
1634 int first_byte = -1;
1635 int req_byte = -1;
1636 int req_byte2 = -1;
1637
1638 /* Plausibility checks */
1639
1640 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
1641 if (re == NULL || subject == NULL || workspace == NULL ||
1642 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
1643 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
1644 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
1645
1646 /* We need to find the pointer to any study data before we test for byte
1647 flipping, so we scan the extra_data block first. This may set two fields in the
1648 match block, so we must initialize them beforehand. However, the other fields
1649 in the match block must not be set until after the byte flipping. */
1650
1651 match_block.tables = re->tables;
1652 match_block.callout_data = NULL;
1653
1654 if (extra_data != NULL)
1655 {
1656 unsigned int flags = extra_data->flags;
1657 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
1658 study = (const pcre_study_data *)extra_data->study_data;
1659 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
1660 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
1661 match_block.callout_data = extra_data->callout_data;
1662 if ((flags & PCRE_EXTRA_TABLES) != 0)
1663 match_block.tables = extra_data->tables;
1664 }
1665
1666 /* Check that the first field in the block is the magic number. If it is not,
1667 test for a regex that was compiled on a host of opposite endianness. If this is
1668 the case, flipped values are put in internal_re and internal_study if there was
1669 study data too. */
1670
1671 if (re->magic_number != MAGIC_NUMBER)
1672 {
1673 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
1674 if (re == NULL) return PCRE_ERROR_BADMAGIC;
1675 if (study != NULL) study = &internal_study;
1676 }
1677
1678 /* Set some local values */
1679
1680 current_subject = (const unsigned char *)subject + start_offset;
1681 end_subject = (const unsigned char *)subject + length;
1682 req_byte_ptr = current_subject - 1;
1683
1684 utf8 = (re->options & PCRE_UTF8) != 0;
1685 anchored = (options & PCRE_ANCHORED) != 0 || (re->options & PCRE_ANCHORED) != 0;
1686
1687 /* The remaining fixed data for passing around. */
1688
1689 match_block.start_code = (const uschar *)argument_re +
1690 re->name_table_offset + re->name_count * re->name_entry_size;
1691 match_block.start_subject = (const unsigned char *)subject;
1692 match_block.end_subject = end_subject;
1693 match_block.moptions = options;
1694 match_block.poptions = re->options;
1695
1696 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
1697 back the character offset. */
1698
1699 #ifdef SUPPORT_UTF8
1700 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
1701 {
1702 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
1703 return PCRE_ERROR_BADUTF8;
1704 if (start_offset > 0 && start_offset < length)
1705 {
1706 int tb = ((uschar *)subject)[start_offset];
1707 if (tb > 127)
1708 {
1709 tb &= 0xc0;
1710 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
1711 }
1712 }
1713 }
1714 #endif
1715
1716 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
1717 is a feature that makes it possible to save compiled regex and re-use them
1718 in other programs later. */
1719
1720 if (match_block.tables == NULL) match_block.tables = _pcre_default_tables;
1721
1722 /* The lower casing table and the "must be at the start of a line" flag are
1723 used in a loop when finding where to start. */
1724
1725 lcc = match_block.tables + lcc_offset;
1726 startline = (re->options & PCRE_STARTLINE) != 0;
1727 firstline = (re->options & PCRE_FIRSTLINE) != 0;
1728
1729 /* Set up the first character to match, if available. The first_byte value is
1730 never set for an anchored regular expression, but the anchoring may be forced
1731 at run time, so we have to test for anchoring. The first char may be unset for
1732 an unanchored pattern, of course. If there's no first char and the pattern was
1733 studied, there may be a bitmap of possible first characters. */
1734
1735 if (!anchored)
1736 {
1737 if ((re->options & PCRE_FIRSTSET) != 0)
1738 {
1739 first_byte = re->first_byte & 255;
1740 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
1741 first_byte = lcc[first_byte];
1742 }
1743 else
1744 {
1745 if (startline && study != NULL &&
1746 (study->options & PCRE_STUDY_MAPPED) != 0)
1747 start_bits = study->start_bits;
1748 }
1749 }
1750
1751 /* For anchored or unanchored matches, there may be a "last known required
1752 character" set. */
1753
1754 if ((re->options & PCRE_REQCHSET) != 0)
1755 {
1756 req_byte = re->req_byte & 255;
1757 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
1758 req_byte2 = (match_block.tables + fcc_offset)[req_byte]; /* case flipped */
1759 }
1760
1761 /* Call the main matching function, looping for a non-anchored regex after a
1762 failed match. Unless restarting, optimize by moving to the first match
1763 character if possible, when not anchored. Then unless wanting a partial match,
1764 check for a required later character. */
1765
1766 for (;;)
1767 {
1768 int rc;
1769
1770 if ((options & PCRE_DFA_RESTART) == 0)
1771 {
1772 const uschar *save_end_subject = end_subject;
1773
1774 /* Advance to a unique first char if possible. If firstline is TRUE, the
1775 start of the match is constrained to the first line of a multiline string.
1776 Implement this by temporarily adjusting end_subject so that we stop scanning
1777 at a newline. If the match fails at the newline, later code breaks this loop.
1778 */
1779
1780 if (firstline)
1781 {
1782 const uschar *t = current_subject;
1783 while (t < save_end_subject && *t != '\n') t++;
1784 end_subject = t;
1785 }
1786
1787 if (first_byte >= 0)
1788 {
1789 if (first_byte_caseless)
1790 while (current_subject < end_subject &&
1791 lcc[*current_subject] != first_byte)
1792 current_subject++;
1793 else
1794 while (current_subject < end_subject && *current_subject != first_byte)
1795 current_subject++;
1796 }
1797
1798 /* Or to just after \n for a multiline match if possible */
1799
1800 else if (startline)
1801 {
1802 if (current_subject > match_block.start_subject + start_offset)
1803 {
1804 while (current_subject < end_subject && current_subject[-1] != NEWLINE)
1805 current_subject++;
1806 }
1807 }
1808
1809 /* Or to a non-unique first char after study */
1810
1811 else if (start_bits != NULL)
1812 {
1813 while (current_subject < end_subject)
1814 {
1815 register unsigned int c = *current_subject;
1816 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
1817 else break;
1818 }
1819 }
1820
1821 /* Restore fudged end_subject */
1822
1823 end_subject = save_end_subject;
1824 }
1825
1826 /* If req_byte is set, we know that that character must appear in the subject
1827 for the match to succeed. If the first character is set, req_byte must be
1828 later in the subject; otherwise the test starts at the match point. This
1829 optimization can save a huge amount of work in patterns with nested unlimited
1830 repeats that aren't going to match. Writing separate code for cased/caseless
1831 versions makes it go faster, as does using an autoincrement and backing off
1832 on a match.
1833
1834 HOWEVER: when the subject string is very, very long, searching to its end can
1835 take a long time, and give bad performance on quite ordinary patterns. This
1836 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
1837 don't do this when the string is sufficiently long.
1838
1839 ALSO: this processing is disabled when partial matching is requested.
1840 */
1841
1842 if (req_byte >= 0 &&
1843 end_subject - current_subject < REQ_BYTE_MAX &&
1844 (options & PCRE_PARTIAL) == 0)
1845 {
1846 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
1847
1848 /* We don't need to repeat the search if we haven't yet reached the
1849 place we found it at last time. */
1850
1851 if (p > req_byte_ptr)
1852 {
1853 if (req_byte_caseless)
1854 {
1855 while (p < end_subject)
1856 {
1857 register int pp = *p++;
1858 if (pp == req_byte || pp == req_byte2) { p--; break; }
1859 }
1860 }
1861 else
1862 {
1863 while (p < end_subject)
1864 {
1865 if (*p++ == req_byte) { p--; break; }
1866 }
1867 }
1868
1869 /* If we can't find the required character, break the matching loop,
1870 which will cause a return or PCRE_ERROR_NOMATCH. */
1871
1872 if (p >= end_subject) break;
1873
1874 /* If we have found the required character, save the point where we
1875 found it, so that we don't search again next time round the loop if
1876 the start hasn't passed this character yet. */
1877
1878 req_byte_ptr = p;
1879 }
1880 }
1881
1882 /* OK, now we can do the business */
1883
1884 rc = internal_dfa_exec(
1885 &match_block, /* fixed match data */
1886 match_block.start_code, /* this subexpression's code */
1887 current_subject, /* where we currently are */
1888 start_offset, /* start offset in subject */
1889 offsets, /* offset vector */
1890 offsetcount, /* size of same */
1891 workspace, /* workspace vector */
1892 wscount, /* size of same */
1893 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
1894 0, /* function recurse level */
1895 0); /* regex recurse level */
1896
1897 /* Anything other than "no match" means we are done, always; otherwise, carry
1898 on only if not anchored. */
1899
1900 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
1901
1902 /* Advance to the next subject character unless we are at the end of a line
1903 and firstline is set. */
1904
1905 if (firstline && *current_subject == NEWLINE) break;
1906 current_subject++;
1907
1908 #ifdef SUPPORT_UTF8
1909 if (utf8)
1910 {
1911 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
1912 current_subject++;
1913 }
1914 #endif
1915
1916 if (current_subject > end_subject) break;
1917 }
1918
1919 return PCRE_ERROR_NOMATCH;
1920 }
1921
1922 /* End of pcre_dfa_exec.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12