/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 85 - (show annotations) (download)
Sat Feb 24 21:41:13 2007 UTC (7 years, 7 months ago) by nigel
File MIME type: text/plain
File size: 66595 byte(s)
Load pcre-6.4 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2005 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a DFA algorithm. This is NOT Perl-
43 compatible, but it has advantages in certain applications. */
44
45
46 #include "pcre_internal.h"
47
48
49 /* For use to indent debugging output */
50
51 #define SP " "
52
53
54
55 /*************************************************
56 * Code parameters and static tables *
57 *************************************************/
58
59 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
60 into others, under special conditions. A gap of 10 between the blocks should be
61 enough. */
62
63 #define OP_PROP_EXTRA (EXTRACT_BASIC_MAX+1)
64 #define OP_EXTUNI_EXTRA (EXTRACT_BASIC_MAX+11)
65
66
67 /* This table identifies those opcodes that are followed immediately by a
68 character that is to be tested in some way. This makes is possible to
69 centralize the loading of these characters. In the case of Type * etc, the
70 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
71 small value. */
72
73 static uschar coptable[] = {
74 0, /* End */
75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
76 0, 0, /* Any, Anybyte */
77 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
78 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
79 1, /* Char */
80 1, /* Charnc */
81 1, /* not */
82 /* Positive single-char repeats */
83 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
84 3, 3, 3, /* upto, minupto, exact */
85 /* Negative single-char repeats - only for chars < 256 */
86 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
87 3, 3, 3, /* NOT upto, minupto, exact */
88 /* Positive type repeats */
89 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
90 3, 3, 3, /* Type upto, minupto, exact */
91 /* Character class & ref repeats */
92 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
93 0, 0, /* CRRANGE, CRMINRANGE */
94 0, /* CLASS */
95 0, /* NCLASS */
96 0, /* XCLASS - variable length */
97 0, /* REF */
98 0, /* RECURSE */
99 0, /* CALLOUT */
100 0, /* Alt */
101 0, /* Ket */
102 0, /* KetRmax */
103 0, /* KetRmin */
104 0, /* Assert */
105 0, /* Assert not */
106 0, /* Assert behind */
107 0, /* Assert behind not */
108 0, /* Reverse */
109 0, /* Once */
110 0, /* COND */
111 0, /* CREF */
112 0, 0, /* BRAZERO, BRAMINZERO */
113 0, /* BRANUMBER */
114 0 /* BRA */
115 };
116
117 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
118 and \w */
119
120 static uschar toptable1[] = {
121 0, 0, 0, 0, 0,
122 ctype_digit, ctype_digit,
123 ctype_space, ctype_space,
124 ctype_word, ctype_word,
125 0 /* OP_ANY */
126 };
127
128 static uschar toptable2[] = {
129 0, 0, 0, 0, 0,
130 ctype_digit, 0,
131 ctype_space, 0,
132 ctype_word, 0,
133 1 /* OP_ANY */
134 };
135
136
137 /* Structure for holding data about a particular state, which is in effect the
138 current data for an active path through the match tree. It must consist
139 entirely of ints because the working vector we are passed, and which we put
140 these structures in, is a vector of ints. */
141
142 typedef struct stateblock {
143 int offset; /* Offset to opcode */
144 int count; /* Count for repeats */
145 int ims; /* ims flag bits */
146 int data; /* Some use extra data */
147 } stateblock;
148
149 #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
150
151
152 #ifdef DEBUG
153 /*************************************************
154 * Print character string *
155 *************************************************/
156
157 /* Character string printing function for debugging.
158
159 Arguments:
160 p points to string
161 length number of bytes
162 f where to print
163
164 Returns: nothing
165 */
166
167 static void
168 pchars(unsigned char *p, int length, FILE *f)
169 {
170 int c;
171 while (length-- > 0)
172 {
173 if (isprint(c = *(p++)))
174 fprintf(f, "%c", c);
175 else
176 fprintf(f, "\\x%02x", c);
177 }
178 }
179 #endif
180
181
182
183 /*************************************************
184 * Execute a Regular Expression - DFA engine *
185 *************************************************/
186
187 /* This internal function applies a compiled pattern to a subject string,
188 starting at a given point, using a DFA engine. This function is called from the
189 external one, possibly multiple times if the pattern is not anchored. The
190 function calls itself recursively for some kinds of subpattern.
191
192 Arguments:
193 md the match_data block with fixed information
194 this_start_code the opening bracket of this subexpression's code
195 current_subject where we currently are in the subject string
196 start_offset start offset in the subject string
197 offsets vector to contain the matching string offsets
198 offsetcount size of same
199 workspace vector of workspace
200 wscount size of same
201 ims the current ims flags
202 rlevel function call recursion level
203 recursing regex recursive call level
204
205 Returns: > 0 =>
206 = 0 =>
207 -1 => failed to match
208 < -1 => some kind of unexpected problem
209
210 The following macros are used for adding states to the two state vectors (one
211 for the current character, one for the following character). */
212
213 #define ADD_ACTIVE(x,y) \
214 if (active_count++ < wscount) \
215 { \
216 next_active_state->offset = (x); \
217 next_active_state->count = (y); \
218 next_active_state->ims = ims; \
219 next_active_state++; \
220 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
221 } \
222 else return PCRE_ERROR_DFA_WSSIZE
223
224 #define ADD_ACTIVE_DATA(x,y,z) \
225 if (active_count++ < wscount) \
226 { \
227 next_active_state->offset = (x); \
228 next_active_state->count = (y); \
229 next_active_state->ims = ims; \
230 next_active_state->data = (z); \
231 next_active_state++; \
232 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
233 } \
234 else return PCRE_ERROR_DFA_WSSIZE
235
236 #define ADD_NEW(x,y) \
237 if (new_count++ < wscount) \
238 { \
239 next_new_state->offset = (x); \
240 next_new_state->count = (y); \
241 next_new_state->ims = ims; \
242 next_new_state++; \
243 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
244 } \
245 else return PCRE_ERROR_DFA_WSSIZE
246
247 #define ADD_NEW_DATA(x,y,z) \
248 if (new_count++ < wscount) \
249 { \
250 next_new_state->offset = (x); \
251 next_new_state->count = (y); \
252 next_new_state->ims = ims; \
253 next_new_state->data = (z); \
254 next_new_state++; \
255 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
256 } \
257 else return PCRE_ERROR_DFA_WSSIZE
258
259 /* And now, here is the code */
260
261 static int
262 internal_dfa_exec(
263 dfa_match_data *md,
264 const uschar *this_start_code,
265 const uschar *current_subject,
266 int start_offset,
267 int *offsets,
268 int offsetcount,
269 int *workspace,
270 int wscount,
271 int ims,
272 int rlevel,
273 int recursing)
274 {
275 stateblock *active_states, *new_states, *temp_states;
276 stateblock *next_active_state, *next_new_state;
277
278 const uschar *ctypes, *lcc, *fcc;
279 const uschar *ptr;
280 const uschar *end_code;
281
282 int active_count, new_count, match_count;
283
284 /* Some fields in the md block are frequently referenced, so we load them into
285 independent variables in the hope that this will perform better. */
286
287 const uschar *start_subject = md->start_subject;
288 const uschar *end_subject = md->end_subject;
289 const uschar *start_code = md->start_code;
290
291 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
292
293 rlevel++;
294 offsetcount &= (-2);
295
296 wscount -= 2;
297 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
298 (2 * INTS_PER_STATEBLOCK);
299
300 DPRINTF(("\n%.*s---------------------\n"
301 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
302 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
303
304 ctypes = md->tables + ctypes_offset;
305 lcc = md->tables + lcc_offset;
306 fcc = md->tables + fcc_offset;
307
308 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
309
310 active_states = (stateblock *)(workspace + 2);
311 next_new_state = new_states = active_states + wscount;
312 new_count = 0;
313
314 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
315 the alternative states onto the list, and find out where the end is. This
316 makes is possible to use this function recursively, when we want to stop at a
317 matching internal ket rather than at the end.
318
319 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
320 a backward assertion. In that case, we have to find out the maximum amount to
321 move back, and set up each alternative appropriately. */
322
323 if (this_start_code[1+LINK_SIZE] == OP_REVERSE)
324 {
325 int max_back = 0;
326 int gone_back;
327
328 end_code = this_start_code;
329 do
330 {
331 int back = GET(end_code, 2+LINK_SIZE);
332 if (back > max_back) max_back = back;
333 end_code += GET(end_code, 1);
334 }
335 while (*end_code == OP_ALT);
336
337 /* If we can't go back the amount required for the longest lookbehind
338 pattern, go back as far as we can; some alternatives may still be viable. */
339
340 #ifdef SUPPORT_UTF8
341 /* In character mode we have to step back character by character */
342
343 if (utf8)
344 {
345 for (gone_back = 0; gone_back < max_back; gone_back++)
346 {
347 if (current_subject <= start_subject) break;
348 current_subject--;
349 while (current_subject > start_subject &&
350 (*current_subject & 0xc0) == 0x80)
351 current_subject--;
352 }
353 }
354 else
355 #endif
356
357 /* In byte-mode we can do this quickly. */
358
359 {
360 gone_back = (current_subject - max_back < start_subject)?
361 current_subject - start_subject : max_back;
362 current_subject -= gone_back;
363 }
364
365 /* Now we can process the individual branches. */
366
367 end_code = this_start_code;
368 do
369 {
370 int back = GET(end_code, 2+LINK_SIZE);
371 if (back <= gone_back)
372 {
373 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
374 ADD_NEW_DATA(-bstate, 0, gone_back - back);
375 }
376 end_code += GET(end_code, 1);
377 }
378 while (*end_code == OP_ALT);
379 }
380
381 /* This is the code for a "normal" subpattern (not a backward assertion). The
382 start of a whole pattern is always one of these. If we are at the top level,
383 we may be asked to restart matching from the same point that we reached for a
384 previous partial match. We still have to scan through the top-level branches to
385 find the end state. */
386
387 else
388 {
389 end_code = this_start_code;
390
391 /* Restarting */
392
393 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
394 {
395 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
396 new_count = workspace[1];
397 if (!workspace[0])
398 memcpy(new_states, active_states, new_count * sizeof(stateblock));
399 }
400
401 /* Not restarting */
402
403 else
404 {
405 do
406 {
407 ADD_NEW(end_code - start_code + 1 + LINK_SIZE, 0);
408 end_code += GET(end_code, 1);
409 }
410 while (*end_code == OP_ALT);
411 }
412 }
413
414 workspace[0] = 0; /* Bit indicating which vector is current */
415
416 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
417
418 /* Loop for scanning the subject */
419
420 ptr = current_subject;
421 for (;;)
422 {
423 int i, j;
424 int c, d, clen, dlen;
425
426 /* Make the new state list into the active state list and empty the
427 new state list. */
428
429 temp_states = active_states;
430 active_states = new_states;
431 new_states = temp_states;
432 active_count = new_count;
433 new_count = 0;
434
435 workspace[0] ^= 1; /* Remember for the restarting feature */
436 workspace[1] = active_count;
437
438 #ifdef DEBUG
439 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
440 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
441 printf("\"\n");
442
443 printf("%.*sActive states: ", rlevel*2-2, SP);
444 for (i = 0; i < active_count; i++)
445 printf("%d/%d ", active_states[i].offset, active_states[i].count);
446 printf("\n");
447 #endif
448
449 /* Set the pointers for adding new states */
450
451 next_active_state = active_states + active_count;
452 next_new_state = new_states;
453
454 /* Load the current character from the subject outside the loop, as many
455 different states may want to look at it, and we assume that at least one
456 will. */
457
458 if (ptr < end_subject)
459 {
460 clen = 1;
461 #ifdef SUPPORT_UTF8
462 if (utf8) { GETCHARLEN(c, ptr, clen); } else
463 #endif /* SUPPORT_UTF8 */
464 c = *ptr;
465 }
466 else
467 {
468 clen = 0; /* At end subject */
469 c = -1;
470 }
471
472 /* Scan up the active states and act on each one. The result of an action
473 may be to add more states to the currently active list (e.g. on hitting a
474 parenthesis) or it may be to put states on the new list, for considering
475 when we move the character pointer on. */
476
477 for (i = 0; i < active_count; i++)
478 {
479 stateblock *current_state = active_states + i;
480 const uschar *code;
481 int state_offset = current_state->offset;
482 int count, codevalue;
483 int chartype, othercase;
484
485 #ifdef DEBUG
486 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
487 if (c < 0) printf("-1\n");
488 else if (c > 32 && c < 127) printf("'%c'\n", c);
489 else printf("0x%02x\n", c);
490 #endif
491
492 /* This variable is referred to implicity in the ADD_xxx macros. */
493
494 ims = current_state->ims;
495
496 /* A negative offset is a special case meaning "hold off going to this
497 (negated) state until the number of characters in the data field have
498 been skipped". */
499
500 if (state_offset < 0)
501 {
502 if (current_state->data > 0)
503 {
504 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
505 ADD_NEW_DATA(state_offset, current_state->count,
506 current_state->data - 1);
507 continue;
508 }
509 else
510 {
511 current_state->offset = state_offset = -state_offset;
512 }
513 }
514
515 /* Check for a duplicate state with the same count, and skip if found. */
516
517 for (j = 0; j < i; j++)
518 {
519 if (active_states[j].offset == state_offset &&
520 active_states[j].count == current_state->count)
521 {
522 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
523 goto NEXT_ACTIVE_STATE;
524 }
525 }
526
527 /* The state offset is the offset to the opcode */
528
529 code = start_code + state_offset;
530 codevalue = *code;
531 if (codevalue >= OP_BRA) codevalue = OP_BRA; /* All brackets are equal */
532
533 /* If this opcode is followed by an inline character, load it. It is
534 tempting to test for the presence of a subject character here, but that
535 is wrong, because sometimes zero repetitions of the subject are
536 permitted.
537
538 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
539 argument that is not a data character - but is always one byte long.
540 Unfortunately, we have to take special action to deal with \P, \p, and
541 \X in this case. To keep the other cases fast, convert these ones to new
542 opcodes. */
543
544 if (coptable[codevalue] > 0)
545 {
546 dlen = 1;
547 #ifdef SUPPORT_UTF8
548 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
549 #endif /* SUPPORT_UTF8 */
550 d = code[coptable[codevalue]];
551 if (codevalue >= OP_TYPESTAR)
552 {
553 if (d == OP_ANYBYTE) return PCRE_ERROR_DFA_UITEM;
554 if (d >= OP_NOTPROP)
555 codevalue += (d == OP_EXTUNI)? OP_EXTUNI_EXTRA : OP_PROP_EXTRA;
556 }
557 }
558 else
559 {
560 dlen = 0; /* Not strictly necessary, but compilers moan */
561 d = -1; /* if these variables are not set. */
562 }
563
564
565 /* Now process the individual opcodes */
566
567 switch (codevalue)
568 {
569
570 /* ========================================================================== */
571 /* Reached a closing bracket. If not at the end of the pattern, carry
572 on with the next opcode. Otherwise, unless we have an empty string and
573 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
574 matches so we always have the longest first. */
575
576 case OP_KET:
577 case OP_KETRMIN:
578 case OP_KETRMAX:
579 if (code != end_code)
580 {
581 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
582 if (codevalue != OP_KET)
583 {
584 ADD_ACTIVE(state_offset - GET(code, 1), 0);
585 }
586 }
587 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
588 {
589 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
590 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
591 match_count = 0;
592 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
593 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
594 if (offsetcount >= 2)
595 {
596 offsets[0] = current_subject - start_subject;
597 offsets[1] = ptr - start_subject;
598 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
599 offsets[1] - offsets[0], current_subject));
600 }
601 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
602 {
603 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
604 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
605 match_count, rlevel*2-2, SP));
606 return match_count;
607 }
608 }
609 break;
610
611 /* ========================================================================== */
612 /* These opcodes add to the current list of states without looking
613 at the current character. */
614
615 /*-----------------------------------------------------------------*/
616 case OP_ALT:
617 do { code += GET(code, 1); } while (*code == OP_ALT);
618 ADD_ACTIVE(code - start_code, 0);
619 break;
620
621 /*-----------------------------------------------------------------*/
622 case OP_BRA:
623 do
624 {
625 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
626 code += GET(code, 1);
627 }
628 while (*code == OP_ALT);
629 break;
630
631 /*-----------------------------------------------------------------*/
632 case OP_BRAZERO:
633 case OP_BRAMINZERO:
634 ADD_ACTIVE(state_offset + 1, 0);
635 code += 1 + GET(code, 2);
636 while (*code == OP_ALT) code += GET(code, 1);
637 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
638 break;
639
640 /*-----------------------------------------------------------------*/
641 case OP_BRANUMBER:
642 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
643 break;
644
645 /*-----------------------------------------------------------------*/
646 case OP_CIRC:
647 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
648 ((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE))
649 { ADD_ACTIVE(state_offset + 1, 0); }
650 break;
651
652 /*-----------------------------------------------------------------*/
653 case OP_EOD:
654 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
655 break;
656
657 /*-----------------------------------------------------------------*/
658 case OP_OPT:
659 ims = code[1];
660 ADD_ACTIVE(state_offset + 2, 0);
661 break;
662
663 /*-----------------------------------------------------------------*/
664 case OP_SOD:
665 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
666 break;
667
668 /*-----------------------------------------------------------------*/
669 case OP_SOM:
670 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
671 break;
672
673
674 /* ========================================================================== */
675 /* These opcodes inspect the next subject character, and sometimes
676 the previous one as well, but do not have an argument. The variable
677 clen contains the length of the current character and is zero if we are
678 at the end of the subject. */
679
680 /*-----------------------------------------------------------------*/
681 case OP_ANY:
682 if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0))
683 { ADD_NEW(state_offset + 1, 0); }
684 break;
685
686 /*-----------------------------------------------------------------*/
687 case OP_EODN:
688 if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject))
689 { ADD_ACTIVE(state_offset + 1, 0); }
690 break;
691
692 /*-----------------------------------------------------------------*/
693 case OP_DOLL:
694 if ((md->moptions & PCRE_NOTEOL) == 0)
695 {
696 if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject ||
697 (ims & PCRE_MULTILINE) != 0)))
698 { ADD_ACTIVE(state_offset + 1, 0); }
699 }
700 else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0)
701 { ADD_ACTIVE(state_offset + 1, 0); }
702 break;
703
704 /*-----------------------------------------------------------------*/
705
706 case OP_DIGIT:
707 case OP_WHITESPACE:
708 case OP_WORDCHAR:
709 if (clen > 0 && c < 256 &&
710 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
711 { ADD_NEW(state_offset + 1, 0); }
712 break;
713
714 /*-----------------------------------------------------------------*/
715 case OP_NOT_DIGIT:
716 case OP_NOT_WHITESPACE:
717 case OP_NOT_WORDCHAR:
718 if (clen > 0 && (c >= 256 ||
719 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
720 { ADD_NEW(state_offset + 1, 0); }
721 break;
722
723 /*-----------------------------------------------------------------*/
724 case OP_WORD_BOUNDARY:
725 case OP_NOT_WORD_BOUNDARY:
726 {
727 int left_word, right_word;
728
729 if (ptr > start_subject)
730 {
731 const uschar *temp = ptr - 1;
732 #ifdef SUPPORT_UTF8
733 if (utf8) BACKCHAR(temp);
734 #endif
735 GETCHARTEST(d, temp);
736 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
737 }
738 else left_word = 0;
739
740 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
741 else right_word = 0;
742
743 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
744 { ADD_ACTIVE(state_offset + 1, 0); }
745 }
746 break;
747
748
749 #ifdef SUPPORT_UCP
750
751 /*-----------------------------------------------------------------*/
752 /* Check the next character by Unicode property. We will get here only
753 if the support is in the binary; otherwise a compile-time error occurs.
754 */
755
756 case OP_PROP:
757 case OP_NOTPROP:
758 if (clen > 0)
759 {
760 int rqdtype, category;
761 category = _pcre_ucp_findchar(c, &chartype, &othercase);
762 rqdtype = code[1];
763 if (rqdtype >= 128)
764 {
765 if ((rqdtype - 128 == category) == (codevalue == OP_PROP))
766 { ADD_NEW(state_offset + 2, 0); }
767 }
768 else
769 {
770 if ((rqdtype == chartype) == (codevalue == OP_PROP))
771 { ADD_NEW(state_offset + 2, 0); }
772 }
773 }
774 break;
775 #endif
776
777
778
779 /* ========================================================================== */
780 /* These opcodes likewise inspect the subject character, but have an
781 argument that is not a data character. It is one of these opcodes:
782 OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
783 OP_NOT_WORDCHAR. The value is loaded into d. */
784
785 case OP_TYPEPLUS:
786 case OP_TYPEMINPLUS:
787 count = current_state->count; /* Already matched */
788 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
789 if (clen > 0)
790 {
791 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
792 (c < 256 &&
793 (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
794 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
795 {
796 count++;
797 ADD_NEW(state_offset, count);
798 }
799 }
800 break;
801
802 /*-----------------------------------------------------------------*/
803 case OP_TYPEQUERY:
804 case OP_TYPEMINQUERY:
805 ADD_ACTIVE(state_offset + 2, 0);
806 if (clen > 0)
807 {
808 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
809 (c < 256 &&
810 (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
811 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
812 {
813 ADD_NEW(state_offset + 2, 0);
814 }
815 }
816 break;
817
818 /*-----------------------------------------------------------------*/
819 case OP_TYPESTAR:
820 case OP_TYPEMINSTAR:
821 ADD_ACTIVE(state_offset + 2, 0);
822 if (clen > 0)
823 {
824 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
825 (c < 256 &&
826 (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
827 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
828 {
829 ADD_NEW(state_offset, 0);
830 }
831 }
832 break;
833
834 /*-----------------------------------------------------------------*/
835 case OP_TYPEEXACT:
836 case OP_TYPEUPTO:
837 case OP_TYPEMINUPTO:
838 if (codevalue != OP_TYPEEXACT)
839 { ADD_ACTIVE(state_offset + 4, 0); }
840 count = current_state->count; /* Number already matched */
841 if (clen > 0)
842 {
843 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
844 (c < 256 &&
845 (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
846 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
847 {
848 if (++count >= GET2(code, 1))
849 { ADD_NEW(state_offset + 4, 0); }
850 else
851 { ADD_NEW(state_offset, count); }
852 }
853 }
854 break;
855
856 /* ========================================================================== */
857 /* These are virtual opcodes that are used when something like
858 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, or OP_EXTUNI as its argument. It
859 keeps the code above fast for the other cases. The argument is in the
860 d variable. */
861
862 case OP_PROP_EXTRA + OP_TYPEPLUS:
863 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
864 count = current_state->count; /* Already matched */
865 if (count > 0) { ADD_ACTIVE(state_offset + 3, 0); }
866 if (clen > 0)
867 {
868 int category = _pcre_ucp_findchar(c, &chartype, &othercase);
869 int rqdtype = code[2];
870 if ((d == OP_PROP) ==
871 (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))
872 { count++; ADD_NEW(state_offset, count); }
873 }
874 break;
875
876 /*-----------------------------------------------------------------*/
877 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
878 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
879 count = current_state->count; /* Already matched */
880 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
881 if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)
882 {
883 const uschar *nptr = ptr + clen;
884 int ncount = 0;
885 while (nptr < end_subject)
886 {
887 int nd;
888 int ndlen = 1;
889 GETCHARLEN(nd, nptr, ndlen);
890 if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;
891 ncount++;
892 nptr += ndlen;
893 }
894 count++;
895 ADD_NEW_DATA(-state_offset, count, ncount);
896 }
897 break;
898
899 /*-----------------------------------------------------------------*/
900 case OP_PROP_EXTRA + OP_TYPEQUERY:
901 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
902 count = 3;
903 goto QS1;
904
905 case OP_PROP_EXTRA + OP_TYPESTAR:
906 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
907 count = 0;
908
909 QS1:
910
911 ADD_ACTIVE(state_offset + 3, 0);
912 if (clen > 0)
913 {
914 int category = _pcre_ucp_findchar(c, &chartype, &othercase);
915 int rqdtype = code[2];
916 if ((d == OP_PROP) ==
917 (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))
918 { ADD_NEW(state_offset + count, 0); }
919 }
920 break;
921
922 /*-----------------------------------------------------------------*/
923 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
924 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
925 count = 2;
926 goto QS2;
927
928 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
929 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
930 count = 0;
931
932 QS2:
933
934 ADD_ACTIVE(state_offset + 2, 0);
935 if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)
936 {
937 const uschar *nptr = ptr + clen;
938 int ncount = 0;
939 while (nptr < end_subject)
940 {
941 int nd;
942 int ndlen = 1;
943 GETCHARLEN(nd, nptr, ndlen);
944 if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;
945 ncount++;
946 nptr += ndlen;
947 }
948 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
949 }
950 break;
951
952 /*-----------------------------------------------------------------*/
953 case OP_PROP_EXTRA + OP_TYPEEXACT:
954 case OP_PROP_EXTRA + OP_TYPEUPTO:
955 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
956 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
957 { ADD_ACTIVE(state_offset + 5, 0); }
958 count = current_state->count; /* Number already matched */
959 if (clen > 0)
960 {
961 int category = _pcre_ucp_findchar(c, &chartype, &othercase);
962 int rqdtype = code[4];
963 if ((d == OP_PROP) ==
964 (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))
965 {
966 if (++count >= GET2(code, 1))
967 { ADD_NEW(state_offset + 5, 0); }
968 else
969 { ADD_NEW(state_offset, count); }
970 }
971 }
972 break;
973
974 /*-----------------------------------------------------------------*/
975 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
976 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
977 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
978 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
979 { ADD_ACTIVE(state_offset + 4, 0); }
980 count = current_state->count; /* Number already matched */
981 if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)
982 {
983 const uschar *nptr = ptr + clen;
984 int ncount = 0;
985 while (nptr < end_subject)
986 {
987 int nd;
988 int ndlen = 1;
989 GETCHARLEN(nd, nptr, ndlen);
990 if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;
991 ncount++;
992 nptr += ndlen;
993 }
994 if (++count >= GET2(code, 1))
995 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
996 else
997 { ADD_NEW_DATA(-state_offset, count, ncount); }
998 }
999 break;
1000
1001 /* ========================================================================== */
1002 /* These opcodes are followed by a character that is usually compared
1003 to the current subject character; it is loaded into d. We still get
1004 here even if there is no subject character, because in some cases zero
1005 repetitions are permitted. */
1006
1007 /*-----------------------------------------------------------------*/
1008 case OP_CHAR:
1009 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1010 break;
1011
1012 /*-----------------------------------------------------------------*/
1013 case OP_CHARNC:
1014 if (clen == 0) break;
1015
1016 #ifdef SUPPORT_UTF8
1017 if (utf8)
1018 {
1019 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1020 {
1021 if (c < 128) othercase = fcc[c]; else
1022
1023 /* If we have Unicode property support, we can use it to test the
1024 other case of the character, if there is one. The result of
1025 _pcre_ucp_findchar() is < 0 if the char isn't found, and othercase is
1026 returned as zero if there isn't another case. */
1027
1028 #ifdef SUPPORT_UCP
1029 if (_pcre_ucp_findchar(c, &chartype, &othercase) < 0)
1030 #endif
1031 othercase = -1;
1032
1033 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1034 }
1035 }
1036 else
1037 #endif /* SUPPORT_UTF8 */
1038
1039 /* Non-UTF-8 mode */
1040 {
1041 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1042 }
1043 break;
1044
1045
1046 #ifdef SUPPORT_UCP
1047 /*-----------------------------------------------------------------*/
1048 /* This is a tricky one because it can match more than one character.
1049 Find out how many characters to skip, and then set up a negative state
1050 to wait for them to pass before continuing. */
1051
1052 case OP_EXTUNI:
1053 if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)
1054 {
1055 const uschar *nptr = ptr + clen;
1056 int ncount = 0;
1057 while (nptr < end_subject)
1058 {
1059 int nclen = 1;
1060 GETCHARLEN(c, nptr, nclen);
1061 if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) break;
1062 ncount++;
1063 nptr += nclen;
1064 }
1065 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1066 }
1067 break;
1068 #endif
1069
1070 /*-----------------------------------------------------------------*/
1071 /* Match a negated single character. This is only used for one-byte
1072 characters, that is, we know that d < 256. The character we are
1073 checking (c) can be multibyte. */
1074
1075 case OP_NOT:
1076 if (clen > 0)
1077 {
1078 int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1079 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1080 }
1081 break;
1082
1083 /*-----------------------------------------------------------------*/
1084 case OP_PLUS:
1085 case OP_MINPLUS:
1086 case OP_NOTPLUS:
1087 case OP_NOTMINPLUS:
1088 count = current_state->count; /* Already matched */
1089 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1090 if (clen > 0)
1091 {
1092 int otherd = -1;
1093 if ((ims & PCRE_CASELESS) != 0)
1094 {
1095 #ifdef SUPPORT_UTF8
1096 if (utf8 && c >= 128)
1097 {
1098 #ifdef SUPPORT_UCP
1099 if (_pcre_ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1;
1100 #endif /* SUPPORT_UCP */
1101 }
1102 else
1103 #endif /* SUPPORT_UTF8 */
1104 otherd = fcc[d];
1105 }
1106 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1107 { count++; ADD_NEW(state_offset, count); }
1108 }
1109 break;
1110
1111 /*-----------------------------------------------------------------*/
1112 case OP_QUERY:
1113 case OP_MINQUERY:
1114 case OP_NOTQUERY:
1115 case OP_NOTMINQUERY:
1116 ADD_ACTIVE(state_offset + dlen + 1, 0);
1117 if (clen > 0)
1118 {
1119 int otherd = -1;
1120 if ((ims && PCRE_CASELESS) != 0)
1121 {
1122 #ifdef SUPPORT_UTF8
1123 if (utf8 && c >= 128)
1124 {
1125 #ifdef SUPPORT_UCP
1126 if (_pcre_ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1;
1127 #endif /* SUPPORT_UCP */
1128 }
1129 else
1130 #endif /* SUPPORT_UTF8 */
1131 otherd = fcc[d];
1132 }
1133 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1134 { ADD_NEW(state_offset + dlen + 1, 0); }
1135 }
1136 break;
1137
1138 /*-----------------------------------------------------------------*/
1139 case OP_STAR:
1140 case OP_MINSTAR:
1141 case OP_NOTSTAR:
1142 case OP_NOTMINSTAR:
1143 ADD_ACTIVE(state_offset + dlen + 1, 0);
1144 if (clen > 0)
1145 {
1146 int otherd = -1;
1147 if ((ims && PCRE_CASELESS) != 0)
1148 {
1149 #ifdef SUPPORT_UTF8
1150 if (utf8 && c >= 128)
1151 {
1152 #ifdef SUPPORT_UCP
1153 if (_pcre_ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1;
1154 #endif /* SUPPORT_UCP */
1155 }
1156 else
1157 #endif /* SUPPORT_UTF8 */
1158 otherd = fcc[d];
1159 }
1160 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1161 { ADD_NEW(state_offset, 0); }
1162 }
1163 break;
1164
1165 /*-----------------------------------------------------------------*/
1166 case OP_EXACT:
1167 case OP_UPTO:
1168 case OP_MINUPTO:
1169 case OP_NOTEXACT:
1170 case OP_NOTUPTO:
1171 case OP_NOTMINUPTO:
1172 if (codevalue != OP_EXACT && codevalue != OP_NOTEXACT)
1173 { ADD_ACTIVE(state_offset + dlen + 3, 0); }
1174 count = current_state->count; /* Number already matched */
1175 if (clen > 0)
1176 {
1177 int otherd = -1;
1178 if ((ims & PCRE_CASELESS) != 0)
1179 {
1180 #ifdef SUPPORT_UTF8
1181 if (utf8 && c >= 128)
1182 {
1183 #ifdef SUPPORT_UCP
1184 if (_pcre_ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1;
1185 #endif /* SUPPORT_UCP */
1186 }
1187 else
1188 #endif /* SUPPORT_UTF8 */
1189 otherd = fcc[d];
1190 }
1191 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1192 {
1193 if (++count >= GET2(code, 1))
1194 { ADD_NEW(state_offset + dlen + 3, 0); }
1195 else
1196 { ADD_NEW(state_offset, count); }
1197 }
1198 }
1199 break;
1200
1201
1202 /* ========================================================================== */
1203 /* These are the class-handling opcodes */
1204
1205 case OP_CLASS:
1206 case OP_NCLASS:
1207 case OP_XCLASS:
1208 {
1209 BOOL isinclass = FALSE;
1210 int next_state_offset;
1211 const uschar *ecode;
1212
1213 /* For a simple class, there is always just a 32-byte table, and we
1214 can set isinclass from it. */
1215
1216 if (codevalue != OP_XCLASS)
1217 {
1218 ecode = code + 33;
1219 if (clen > 0)
1220 {
1221 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1222 ((code[1 + c/8] & (1 << (c&7))) != 0);
1223 }
1224 }
1225
1226 /* An extended class may have a table or a list of single characters,
1227 ranges, or both, and it may be positive or negative. There's a
1228 function that sorts all this out. */
1229
1230 else
1231 {
1232 ecode = code + GET(code, 1);
1233 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1234 }
1235
1236 /* At this point, isinclass is set for all kinds of class, and ecode
1237 points to the byte after the end of the class. If there is a
1238 quantifier, this is where it will be. */
1239
1240 next_state_offset = ecode - start_code;
1241
1242 switch (*ecode)
1243 {
1244 case OP_CRSTAR:
1245 case OP_CRMINSTAR:
1246 ADD_ACTIVE(next_state_offset + 1, 0);
1247 if (isinclass) { ADD_NEW(state_offset, 0); }
1248 break;
1249
1250 case OP_CRPLUS:
1251 case OP_CRMINPLUS:
1252 count = current_state->count; /* Already matched */
1253 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1254 if (isinclass) { count++; ADD_NEW(state_offset, count); }
1255 break;
1256
1257 case OP_CRQUERY:
1258 case OP_CRMINQUERY:
1259 ADD_ACTIVE(next_state_offset + 1, 0);
1260 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1261 break;
1262
1263 case OP_CRRANGE:
1264 case OP_CRMINRANGE:
1265 count = current_state->count; /* Already matched */
1266 if (count >= GET2(ecode, 1))
1267 { ADD_ACTIVE(next_state_offset + 5, 0); }
1268 if (isinclass)
1269 {
1270 if (++count >= GET2(ecode, 3))
1271 { ADD_NEW(next_state_offset + 5, 0); }
1272 else
1273 { ADD_NEW(state_offset, count); }
1274 }
1275 break;
1276
1277 default:
1278 if (isinclass) { ADD_NEW(next_state_offset, 0); }
1279 break;
1280 }
1281 }
1282 break;
1283
1284 /* ========================================================================== */
1285 /* These are the opcodes for fancy brackets of various kinds. We have
1286 to use recursion in order to handle them. */
1287
1288 case OP_ASSERT:
1289 case OP_ASSERT_NOT:
1290 case OP_ASSERTBACK:
1291 case OP_ASSERTBACK_NOT:
1292 {
1293 int rc;
1294 int local_offsets[2];
1295 int local_workspace[1000];
1296 const uschar *endasscode = code + GET(code, 1);
1297
1298 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1299
1300 rc = internal_dfa_exec(
1301 md, /* static match data */
1302 code, /* this subexpression's code */
1303 ptr, /* where we currently are */
1304 ptr - start_subject, /* start offset */
1305 local_offsets, /* offset vector */
1306 sizeof(local_offsets)/sizeof(int), /* size of same */
1307 local_workspace, /* workspace vector */
1308 sizeof(local_workspace)/sizeof(int), /* size of same */
1309 ims, /* the current ims flags */
1310 rlevel, /* function recursion level */
1311 recursing); /* pass on regex recursion */
1312
1313 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1314 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1315 }
1316 break;
1317
1318 /*-----------------------------------------------------------------*/
1319 case OP_COND:
1320 {
1321 int local_offsets[1000];
1322 int local_workspace[1000];
1323 int condcode = code[LINK_SIZE+1];
1324
1325 /* The only supported version of OP_CREF is for the value 0xffff, which
1326 means "test if in a recursion". */
1327
1328 if (condcode == OP_CREF)
1329 {
1330 int value = GET2(code, LINK_SIZE+2);
1331 if (value != 0xffff) return PCRE_ERROR_DFA_UCOND;
1332 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1333 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1334 }
1335
1336 /* Otherwise, the condition is an assertion */
1337
1338 else
1339 {
1340 int rc;
1341 const uschar *asscode = code + LINK_SIZE + 1;
1342 const uschar *endasscode = asscode + GET(asscode, 1);
1343
1344 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1345
1346 rc = internal_dfa_exec(
1347 md, /* fixed match data */
1348 asscode, /* this subexpression's code */
1349 ptr, /* where we currently are */
1350 ptr - start_subject, /* start offset */
1351 local_offsets, /* offset vector */
1352 sizeof(local_offsets)/sizeof(int), /* size of same */
1353 local_workspace, /* workspace vector */
1354 sizeof(local_workspace)/sizeof(int), /* size of same */
1355 ims, /* the current ims flags */
1356 rlevel, /* function recursion level */
1357 recursing); /* pass on regex recursion */
1358
1359 if ((rc >= 0) ==
1360 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1361 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1362 else
1363 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1364 }
1365 }
1366 break;
1367
1368 /*-----------------------------------------------------------------*/
1369 case OP_RECURSE:
1370 {
1371 int local_offsets[1000];
1372 int local_workspace[1000];
1373 int rc;
1374
1375 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1376 recursing + 1));
1377
1378 rc = internal_dfa_exec(
1379 md, /* fixed match data */
1380 start_code + GET(code, 1), /* this subexpression's code */
1381 ptr, /* where we currently are */
1382 ptr - start_subject, /* start offset */
1383 local_offsets, /* offset vector */
1384 sizeof(local_offsets)/sizeof(int), /* size of same */
1385 local_workspace, /* workspace vector */
1386 sizeof(local_workspace)/sizeof(int), /* size of same */
1387 ims, /* the current ims flags */
1388 rlevel, /* function recursion level */
1389 recursing + 1); /* regex recurse level */
1390
1391 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1392 recursing + 1, rc));
1393
1394 /* Ran out of internal offsets */
1395
1396 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1397
1398 /* For each successful matched substring, set up the next state with a
1399 count of characters to skip before trying it. Note that the count is in
1400 characters, not bytes. */
1401
1402 if (rc > 0)
1403 {
1404 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1405 {
1406 const uschar *p = start_subject + local_offsets[rc];
1407 const uschar *pp = start_subject + local_offsets[rc+1];
1408 int charcount = local_offsets[rc+1] - local_offsets[rc];
1409 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1410 if (charcount > 0)
1411 {
1412 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1413 }
1414 else
1415 {
1416 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1417 }
1418 }
1419 }
1420 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1421 }
1422 break;
1423
1424 /*-----------------------------------------------------------------*/
1425 case OP_ONCE:
1426 {
1427 int local_offsets[2];
1428 int local_workspace[1000];
1429
1430 int rc = internal_dfa_exec(
1431 md, /* fixed match data */
1432 code, /* this subexpression's code */
1433 ptr, /* where we currently are */
1434 ptr - start_subject, /* start offset */
1435 local_offsets, /* offset vector */
1436 sizeof(local_offsets)/sizeof(int), /* size of same */
1437 local_workspace, /* workspace vector */
1438 sizeof(local_workspace)/sizeof(int), /* size of same */
1439 ims, /* the current ims flags */
1440 rlevel, /* function recursion level */
1441 recursing); /* pass on regex recursion */
1442
1443 if (rc >= 0)
1444 {
1445 const uschar *end_subpattern = code;
1446 int charcount = local_offsets[1] - local_offsets[0];
1447 int next_state_offset, repeat_state_offset;
1448
1449 do { end_subpattern += GET(end_subpattern, 1); }
1450 while (*end_subpattern == OP_ALT);
1451 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1452
1453 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1454 arrange for the repeat state also to be added to the relevant list.
1455 Calculate the offset, or set -1 for no repeat. */
1456
1457 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1458 *end_subpattern == OP_KETRMIN)?
1459 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1460
1461 /* If we have matched an empty string, add the next state at the
1462 current character pointer. This is important so that the duplicate
1463 checking kicks in, which is what breaks infinite loops that match an
1464 empty string. */
1465
1466 if (charcount == 0)
1467 {
1468 ADD_ACTIVE(next_state_offset, 0);
1469 }
1470
1471 /* Optimization: if there are no more active states, and there
1472 are no new states yet set up, then skip over the subject string
1473 right here, to save looping. Otherwise, set up the new state to swing
1474 into action when the end of the substring is reached. */
1475
1476 else if (i + 1 >= active_count && new_count == 0)
1477 {
1478 ptr += charcount;
1479 clen = 0;
1480 ADD_NEW(next_state_offset, 0);
1481
1482 /* If we are adding a repeat state at the new character position,
1483 we must fudge things so that it is the only current state.
1484 Otherwise, it might be a duplicate of one we processed before, and
1485 that would cause it to be skipped. */
1486
1487 if (repeat_state_offset >= 0)
1488 {
1489 next_active_state = active_states;
1490 active_count = 0;
1491 i = -1;
1492 ADD_ACTIVE(repeat_state_offset, 0);
1493 }
1494 }
1495 else
1496 {
1497 const uschar *p = start_subject + local_offsets[0];
1498 const uschar *pp = start_subject + local_offsets[1];
1499 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1500 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1501 if (repeat_state_offset >= 0)
1502 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1503 }
1504
1505 }
1506 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1507 }
1508 break;
1509
1510
1511 /* ========================================================================== */
1512 /* Handle callouts */
1513
1514 case OP_CALLOUT:
1515 if (pcre_callout != NULL)
1516 {
1517 int rrc;
1518 pcre_callout_block cb;
1519 cb.version = 1; /* Version 1 of the callout block */
1520 cb.callout_number = code[1];
1521 cb.offset_vector = offsets;
1522 cb.subject = (char *)start_subject;
1523 cb.subject_length = end_subject - start_subject;
1524 cb.start_match = current_subject - start_subject;
1525 cb.current_position = ptr - start_subject;
1526 cb.pattern_position = GET(code, 2);
1527 cb.next_item_length = GET(code, 2 + LINK_SIZE);
1528 cb.capture_top = 1;
1529 cb.capture_last = -1;
1530 cb.callout_data = md->callout_data;
1531 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
1532 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
1533 }
1534 break;
1535
1536
1537 /* ========================================================================== */
1538 default: /* Unsupported opcode */
1539 return PCRE_ERROR_DFA_UITEM;
1540 }
1541
1542 NEXT_ACTIVE_STATE: continue;
1543
1544 } /* End of loop scanning active states */
1545
1546 /* We have finished the processing at the current subject character. If no
1547 new states have been set for the next character, we have found all the
1548 matches that we are going to find. If we are at the top level and partial
1549 matching has been requested, check for appropriate conditions. */
1550
1551 if (new_count <= 0)
1552 {
1553 if (match_count < 0 && /* No matches found */
1554 rlevel == 1 && /* Top level match function */
1555 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
1556 ptr >= end_subject && /* Reached end of subject */
1557 ptr > current_subject) /* Matched non-empty string */
1558 {
1559 if (offsetcount >= 2)
1560 {
1561 offsets[0] = current_subject - start_subject;
1562 offsets[1] = end_subject - start_subject;
1563 }
1564 match_count = PCRE_ERROR_PARTIAL;
1565 }
1566
1567 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
1568 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
1569 rlevel*2-2, SP));
1570 return match_count;
1571 }
1572
1573 /* One or more states are active for the next character. */
1574
1575 ptr += clen; /* Advance to next subject character */
1576 } /* Loop to move along the subject string */
1577
1578 /* Control never gets here, but we must keep the compiler happy. */
1579
1580 DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n"
1581 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP));
1582 return PCRE_ERROR_NOMATCH;
1583 }
1584
1585
1586
1587
1588 /*************************************************
1589 * Execute a Regular Expression - DFA engine *
1590 *************************************************/
1591
1592 /* This external function applies a compiled re to a subject string using a DFA
1593 engine. This function calls the internal function multiple times if the pattern
1594 is not anchored.
1595
1596 Arguments:
1597 argument_re points to the compiled expression
1598 extra_data points to extra data or is NULL (not currently used)
1599 subject points to the subject string
1600 length length of subject string (may contain binary zeros)
1601 start_offset where to start in the subject string
1602 options option bits
1603 offsets vector of match offsets
1604 offsetcount size of same
1605 workspace workspace vector
1606 wscount size of same
1607
1608 Returns: > 0 => number of match offset pairs placed in offsets
1609 = 0 => offsets overflowed; longest matches are present
1610 -1 => failed to match
1611 < -1 => some kind of unexpected problem
1612 */
1613
1614 PCRE_EXPORT int
1615 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
1616 const char *subject, int length, int start_offset, int options, int *offsets,
1617 int offsetcount, int *workspace, int wscount)
1618 {
1619 real_pcre *re = (real_pcre *)argument_re;
1620 dfa_match_data match_block;
1621 BOOL utf8, anchored, startline, firstline;
1622 const uschar *current_subject, *end_subject, *lcc;
1623
1624 pcre_study_data internal_study;
1625 const pcre_study_data *study = NULL;
1626 real_pcre internal_re;
1627
1628 const uschar *req_byte_ptr;
1629 const uschar *start_bits = NULL;
1630 BOOL first_byte_caseless = FALSE;
1631 BOOL req_byte_caseless = FALSE;
1632 int first_byte = -1;
1633 int req_byte = -1;
1634 int req_byte2 = -1;
1635
1636 /* Plausibility checks */
1637
1638 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
1639 if (re == NULL || subject == NULL || workspace == NULL ||
1640 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
1641 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
1642 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
1643
1644 /* We need to find the pointer to any study data before we test for byte
1645 flipping, so we scan the extra_data block first. This may set two fields in the
1646 match block, so we must initialize them beforehand. However, the other fields
1647 in the match block must not be set until after the byte flipping. */
1648
1649 match_block.tables = re->tables;
1650 match_block.callout_data = NULL;
1651
1652 if (extra_data != NULL)
1653 {
1654 unsigned int flags = extra_data->flags;
1655 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
1656 study = (const pcre_study_data *)extra_data->study_data;
1657 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
1658 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
1659 match_block.callout_data = extra_data->callout_data;
1660 if ((flags & PCRE_EXTRA_TABLES) != 0)
1661 match_block.tables = extra_data->tables;
1662 }
1663
1664 /* Check that the first field in the block is the magic number. If it is not,
1665 test for a regex that was compiled on a host of opposite endianness. If this is
1666 the case, flipped values are put in internal_re and internal_study if there was
1667 study data too. */
1668
1669 if (re->magic_number != MAGIC_NUMBER)
1670 {
1671 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
1672 if (re == NULL) return PCRE_ERROR_BADMAGIC;
1673 if (study != NULL) study = &internal_study;
1674 }
1675
1676 /* Set some local values */
1677
1678 current_subject = (const unsigned char *)subject + start_offset;
1679 end_subject = (const unsigned char *)subject + length;
1680 req_byte_ptr = current_subject - 1;
1681
1682 utf8 = (re->options & PCRE_UTF8) != 0;
1683 anchored = (options & PCRE_ANCHORED) != 0 || (re->options & PCRE_ANCHORED) != 0;
1684
1685 /* The remaining fixed data for passing around. */
1686
1687 match_block.start_code = (const uschar *)argument_re +
1688 re->name_table_offset + re->name_count * re->name_entry_size;
1689 match_block.start_subject = (const unsigned char *)subject;
1690 match_block.end_subject = end_subject;
1691 match_block.moptions = options;
1692 match_block.poptions = re->options;
1693
1694 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
1695 back the character offset. */
1696
1697 #ifdef SUPPORT_UTF8
1698 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
1699 {
1700 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
1701 return PCRE_ERROR_BADUTF8;
1702 if (start_offset > 0 && start_offset < length)
1703 {
1704 int tb = ((uschar *)subject)[start_offset];
1705 if (tb > 127)
1706 {
1707 tb &= 0xc0;
1708 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
1709 }
1710 }
1711 }
1712 #endif
1713
1714 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
1715 is a feature that makes it possible to save compiled regex and re-use them
1716 in other programs later. */
1717
1718 if (match_block.tables == NULL) match_block.tables = _pcre_default_tables;
1719
1720 /* The lower casing table and the "must be at the start of a line" flag are
1721 used in a loop when finding where to start. */
1722
1723 lcc = match_block.tables + lcc_offset;
1724 startline = (re->options & PCRE_STARTLINE) != 0;
1725 firstline = (re->options & PCRE_FIRSTLINE) != 0;
1726
1727 /* Set up the first character to match, if available. The first_byte value is
1728 never set for an anchored regular expression, but the anchoring may be forced
1729 at run time, so we have to test for anchoring. The first char may be unset for
1730 an unanchored pattern, of course. If there's no first char and the pattern was
1731 studied, there may be a bitmap of possible first characters. */
1732
1733 if (!anchored)
1734 {
1735 if ((re->options & PCRE_FIRSTSET) != 0)
1736 {
1737 first_byte = re->first_byte & 255;
1738 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
1739 first_byte = lcc[first_byte];
1740 }
1741 else
1742 {
1743 if (startline && study != NULL &&
1744 (study->options & PCRE_STUDY_MAPPED) != 0)
1745 start_bits = study->start_bits;
1746 }
1747 }
1748
1749 /* For anchored or unanchored matches, there may be a "last known required
1750 character" set. */
1751
1752 if ((re->options & PCRE_REQCHSET) != 0)
1753 {
1754 req_byte = re->req_byte & 255;
1755 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
1756 req_byte2 = (match_block.tables + fcc_offset)[req_byte]; /* case flipped */
1757 }
1758
1759 /* Call the main matching function, looping for a non-anchored regex after a
1760 failed match. Unless restarting, optimize by moving to the first match
1761 character if possible, when not anchored. Then unless wanting a partial match,
1762 check for a required later character. */
1763
1764 for (;;)
1765 {
1766 int rc;
1767
1768 if ((options & PCRE_DFA_RESTART) == 0)
1769 {
1770 const uschar *save_end_subject = end_subject;
1771
1772 /* Advance to a unique first char if possible. If firstline is TRUE, the
1773 start of the match is constrained to the first line of a multiline string.
1774 Implement this by temporarily adjusting end_subject so that we stop scanning
1775 at a newline. If the match fails at the newline, later code breaks this loop.
1776 */
1777
1778 if (firstline)
1779 {
1780 const uschar *t = current_subject;
1781 while (t < save_end_subject && *t != '\n') t++;
1782 end_subject = t;
1783 }
1784
1785 if (first_byte >= 0)
1786 {
1787 if (first_byte_caseless)
1788 while (current_subject < end_subject &&
1789 lcc[*current_subject] != first_byte)
1790 current_subject++;
1791 else
1792 while (current_subject < end_subject && *current_subject != first_byte)
1793 current_subject++;
1794 }
1795
1796 /* Or to just after \n for a multiline match if possible */
1797
1798 else if (startline)
1799 {
1800 if (current_subject > match_block.start_subject + start_offset)
1801 {
1802 while (current_subject < end_subject && current_subject[-1] != NEWLINE)
1803 current_subject++;
1804 }
1805 }
1806
1807 /* Or to a non-unique first char after study */
1808
1809 else if (start_bits != NULL)
1810 {
1811 while (current_subject < end_subject)
1812 {
1813 register unsigned int c = *current_subject;
1814 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
1815 else break;
1816 }
1817 }
1818
1819 /* Restore fudged end_subject */
1820
1821 end_subject = save_end_subject;
1822 }
1823
1824 /* If req_byte is set, we know that that character must appear in the subject
1825 for the match to succeed. If the first character is set, req_byte must be
1826 later in the subject; otherwise the test starts at the match point. This
1827 optimization can save a huge amount of work in patterns with nested unlimited
1828 repeats that aren't going to match. Writing separate code for cased/caseless
1829 versions makes it go faster, as does using an autoincrement and backing off
1830 on a match.
1831
1832 HOWEVER: when the subject string is very, very long, searching to its end can
1833 take a long time, and give bad performance on quite ordinary patterns. This
1834 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
1835 don't do this when the string is sufficiently long.
1836
1837 ALSO: this processing is disabled when partial matching is requested.
1838 */
1839
1840 if (req_byte >= 0 &&
1841 end_subject - current_subject < REQ_BYTE_MAX &&
1842 (options & PCRE_PARTIAL) == 0)
1843 {
1844 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
1845
1846 /* We don't need to repeat the search if we haven't yet reached the
1847 place we found it at last time. */
1848
1849 if (p > req_byte_ptr)
1850 {
1851 if (req_byte_caseless)
1852 {
1853 while (p < end_subject)
1854 {
1855 register int pp = *p++;
1856 if (pp == req_byte || pp == req_byte2) { p--; break; }
1857 }
1858 }
1859 else
1860 {
1861 while (p < end_subject)
1862 {
1863 if (*p++ == req_byte) { p--; break; }
1864 }
1865 }
1866
1867 /* If we can't find the required character, break the matching loop,
1868 which will cause a return or PCRE_ERROR_NOMATCH. */
1869
1870 if (p >= end_subject) break;
1871
1872 /* If we have found the required character, save the point where we
1873 found it, so that we don't search again next time round the loop if
1874 the start hasn't passed this character yet. */
1875
1876 req_byte_ptr = p;
1877 }
1878 }
1879
1880 /* OK, now we can do the business */
1881
1882 rc = internal_dfa_exec(
1883 &match_block, /* fixed match data */
1884 match_block.start_code, /* this subexpression's code */
1885 current_subject, /* where we currently are */
1886 start_offset, /* start offset in subject */
1887 offsets, /* offset vector */
1888 offsetcount, /* size of same */
1889 workspace, /* workspace vector */
1890 wscount, /* size of same */
1891 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
1892 0, /* function recurse level */
1893 0); /* regex recurse level */
1894
1895 /* Anything other than "no match" means we are done, always; otherwise, carry
1896 on only if not anchored. */
1897
1898 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
1899
1900 /* Advance to the next subject character unless we are at the end of a line
1901 and firstline is set. */
1902
1903 if (firstline && *current_subject == NEWLINE) break;
1904 current_subject++;
1905
1906 #ifdef SUPPORT_UTF8
1907 if (utf8)
1908 {
1909 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
1910 current_subject++;
1911 }
1912 #endif
1913
1914 if (current_subject > end_subject) break;
1915 }
1916
1917 return PCRE_ERROR_NOMATCH;
1918 }
1919
1920 /* End of pcre_dfa_exec.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12