/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 151 - (show annotations) (download)
Tue Apr 17 15:07:29 2007 UTC (7 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 81541 byte(s)
Tidies: added some casts and some missing #ifdefs.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46
47 #define NLBLOCK md /* Block containing newline information */
48 #define PSSTART start_subject /* Field containing processed string start */
49 #define PSEND end_subject /* Field containing processed string end */
50
51 #include "pcre_internal.h"
52
53
54 /* For use to indent debugging output */
55
56 #define SP " "
57
58
59
60 /*************************************************
61 * Code parameters and static tables *
62 *************************************************/
63
64 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
65 into others, under special conditions. A gap of 20 between the blocks should be
66 enough. */
67
68 #define OP_PROP_EXTRA 100
69 #define OP_EXTUNI_EXTRA 120
70 #define OP_ANYNL_EXTRA 140
71
72
73 /* This table identifies those opcodes that are followed immediately by a
74 character that is to be tested in some way. This makes it possible to
75 centralize the loading of these characters. In the case of Type * etc, the
76 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
77 small value. */
78
79 static uschar coptable[] = {
80 0, /* End */
81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
82 0, 0, /* Any, Anybyte */
83 0, 0, 0, 0, /* NOTPROP, PROP, EXTUNI, ANYNL */
84 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
85 1, /* Char */
86 1, /* Charnc */
87 1, /* not */
88 /* Positive single-char repeats */
89 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
90 3, 3, 3, /* upto, minupto, exact */
91 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
92 /* Negative single-char repeats - only for chars < 256 */
93 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
94 3, 3, 3, /* NOT upto, minupto, exact */
95 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
96 /* Positive type repeats */
97 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
98 3, 3, 3, /* Type upto, minupto, exact */
99 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
100 /* Character class & ref repeats */
101 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
102 0, 0, /* CRRANGE, CRMINRANGE */
103 0, /* CLASS */
104 0, /* NCLASS */
105 0, /* XCLASS - variable length */
106 0, /* REF */
107 0, /* RECURSE */
108 0, /* CALLOUT */
109 0, /* Alt */
110 0, /* Ket */
111 0, /* KetRmax */
112 0, /* KetRmin */
113 0, /* Assert */
114 0, /* Assert not */
115 0, /* Assert behind */
116 0, /* Assert behind not */
117 0, /* Reverse */
118 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
119 0, 0, 0, /* SBRA, SCBRA, SCOND */
120 0, /* CREF */
121 0, /* RREF */
122 0, /* DEF */
123 0, 0 /* BRAZERO, BRAMINZERO */
124 };
125
126 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
127 and \w */
128
129 static uschar toptable1[] = {
130 0, 0, 0, 0, 0,
131 ctype_digit, ctype_digit,
132 ctype_space, ctype_space,
133 ctype_word, ctype_word,
134 0 /* OP_ANY */
135 };
136
137 static uschar toptable2[] = {
138 0, 0, 0, 0, 0,
139 ctype_digit, 0,
140 ctype_space, 0,
141 ctype_word, 0,
142 1 /* OP_ANY */
143 };
144
145
146 /* Structure for holding data about a particular state, which is in effect the
147 current data for an active path through the match tree. It must consist
148 entirely of ints because the working vector we are passed, and which we put
149 these structures in, is a vector of ints. */
150
typedef struct stateblock stateblock;

/* One entry in a state vector. Every member is an int, so that an array of
these can be overlaid on the int workspace vector supplied by the caller. */

struct stateblock {
  int offset;                     /* Offset to opcode; a negative value is a
                                     deferred state (see the matching loop) */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
};

/* The number of workspace ints that a single stateblock occupies. */

#define INTS_PER_STATEBLOCK (sizeof(stateblock) / sizeof(int))
159
160
161 #ifdef DEBUG
162 /*************************************************
163 * Print character string *
164 *************************************************/
165
166 /* Character string printing function for debugging.
167
168 Arguments:
169 p points to string
170 length number of bytes
171 f where to print
172
173 Returns: nothing
174 */
175
/* Debugging aid: write a byte string to a stream. Bytes for which isprint()
is true are written as themselves; every other byte is written as a \xhh
escape (two lower-case hex digits).

Arguments:
  p           points to the bytes to print; they are not modified, so the
                pointer is const-qualified and callers need not cast
                constness away
  length      number of bytes; nothing is written when it is <= 0
  f           where to print

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
const unsigned char *end;

if (length <= 0) return;

for (end = p + length; p < end; p++)
  {
  int c = *p;                     /* 0..255, always a valid isprint() arg */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
188 #endif
189
190
191
192 /*************************************************
193 * Execute a Regular Expression - DFA engine *
194 *************************************************/
195
196 /* This internal function applies a compiled pattern to a subject string,
197 starting at a given point, using a DFA engine. This function is called from the
198 external one, possibly multiple times if the pattern is not anchored. The
199 function calls itself recursively for some kinds of subpattern.
200
201 Arguments:
202 md the match_data block with fixed information
203 this_start_code the opening bracket of this subexpression's code
204 current_subject where we currently are in the subject string
205 start_offset start offset in the subject string
206 offsets vector to contain the matching string offsets
207 offsetcount size of same
208 workspace vector of workspace
209 wscount size of same
210 ims the current ims flags
211 rlevel function call recursion level
212 recursing regex recursive call level
213
214 Returns: > 0 => number of match offset pairs placed in offsets
215 = 0 => offsets overflowed; longest matches are present
216 -1 => failed to match
217 < -1 => some kind of unexpected problem
218
219 The following macros are used for adding states to the two state vectors (one
220 for the current character, one for the following character). */
221
/* Append a state with opcode offset x and repeat count y, tagged with the
current ims value, to the end of the active state list. If the list already
holds wscount entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE.
The data field of the new entry is not written. The macro uses the enclosing
function's active_count, wscount, next_active_state and ims (plus rlevel in
the debugging output). */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
232
/* As ADD_ACTIVE, but also stores z in the data field of the new entry:
append a state (offset x, count y, current ims, data z) to the active state
list, or make the enclosing function return PCRE_ERROR_DFA_WSSIZE if the list
already holds wscount entries. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
244
/* Append a state with opcode offset x and repeat count y, tagged with the
current ims value, to the end of the new state list (the one that is examined
after the subject pointer has moved on). If that list already holds wscount
entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The data field
of the new entry is not written. The macro uses the enclosing function's
new_count, wscount, next_new_state and ims (plus rlevel in the debugging
output). */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
255
/* As ADD_NEW, but also stores z in the data field of the new entry: append a
state (offset x, count y, current ims, data z) to the new state list, or make
the enclosing function return PCRE_ERROR_DFA_WSSIZE if the list already holds
wscount entries. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
267
268 /* And now, here is the code */
269
270 static int
271 internal_dfa_exec(
272 dfa_match_data *md,
273 const uschar *this_start_code,
274 const uschar *current_subject,
275 int start_offset,
276 int *offsets,
277 int offsetcount,
278 int *workspace,
279 int wscount,
280 int ims,
281 int rlevel,
282 int recursing)
283 {
284 stateblock *active_states, *new_states, *temp_states;
285 stateblock *next_active_state, *next_new_state;
286
287 const uschar *ctypes, *lcc, *fcc;
288 const uschar *ptr;
289 const uschar *end_code, *first_op;
290
291 int active_count, new_count, match_count;
292
293 /* Some fields in the md block are frequently referenced, so we load them into
294 independent variables in the hope that this will perform better. */
295
296 const uschar *start_subject = md->start_subject;
297 const uschar *end_subject = md->end_subject;
298 const uschar *start_code = md->start_code;
299
300 #ifdef SUPPORT_UTF8
301 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
302 #else
303 BOOL utf8 = FALSE;
304 #endif
305
306 rlevel++;
307 offsetcount &= (-2);
308
309 wscount -= 2;
310 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
311 (2 * INTS_PER_STATEBLOCK);
312
313 DPRINTF(("\n%.*s---------------------\n"
314 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
315 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
316
317 ctypes = md->tables + ctypes_offset;
318 lcc = md->tables + lcc_offset;
319 fcc = md->tables + fcc_offset;
320
321 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
322
323 active_states = (stateblock *)(workspace + 2);
324 next_new_state = new_states = active_states + wscount;
325 new_count = 0;
326
327 first_op = this_start_code + 1 + LINK_SIZE +
328 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
329
330 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
331 the alternative states onto the list, and find out where the end is. This
332 makes it possible to use this function recursively, when we want to stop at a
333 matching internal ket rather than at the end.
334
335 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
336 a backward assertion. In that case, we have to find out the maximum amount to
337 move back, and set up each alternative appropriately. */
338
339 if (*first_op == OP_REVERSE)
340 {
341 int max_back = 0;
342 int gone_back;
343
344 end_code = this_start_code;
345 do
346 {
347 int back = GET(end_code, 2+LINK_SIZE);
348 if (back > max_back) max_back = back;
349 end_code += GET(end_code, 1);
350 }
351 while (*end_code == OP_ALT);
352
353 /* If we can't go back the amount required for the longest lookbehind
354 pattern, go back as far as we can; some alternatives may still be viable. */
355
356 #ifdef SUPPORT_UTF8
357 /* In character mode we have to step back character by character */
358
359 if (utf8)
360 {
361 for (gone_back = 0; gone_back < max_back; gone_back++)
362 {
363 if (current_subject <= start_subject) break;
364 current_subject--;
365 while (current_subject > start_subject &&
366 (*current_subject & 0xc0) == 0x80)
367 current_subject--;
368 }
369 }
370 else
371 #endif
372
373 /* In byte-mode we can do this quickly. */
374
375 {
376 gone_back = (current_subject - max_back < start_subject)?
377 current_subject - start_subject : max_back;
378 current_subject -= gone_back;
379 }
380
381 /* Now we can process the individual branches. */
382
383 end_code = this_start_code;
384 do
385 {
386 int back = GET(end_code, 2+LINK_SIZE);
387 if (back <= gone_back)
388 {
389 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
390 ADD_NEW_DATA(-bstate, 0, gone_back - back);
391 }
392 end_code += GET(end_code, 1);
393 }
394 while (*end_code == OP_ALT);
395 }
396
397 /* This is the code for a "normal" subpattern (not a backward assertion). The
398 start of a whole pattern is always one of these. If we are at the top level,
399 we may be asked to restart matching from the same point that we reached for a
400 previous partial match. We still have to scan through the top-level branches to
401 find the end state. */
402
403 else
404 {
405 end_code = this_start_code;
406
407 /* Restarting */
408
409 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
410 {
411 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
412 new_count = workspace[1];
413 if (!workspace[0])
414 memcpy(new_states, active_states, new_count * sizeof(stateblock));
415 }
416
417 /* Not restarting */
418
419 else
420 {
421 int length = 1 + LINK_SIZE +
422 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
423 do
424 {
425 ADD_NEW(end_code - start_code + length, 0);
426 end_code += GET(end_code, 1);
427 length = 1 + LINK_SIZE;
428 }
429 while (*end_code == OP_ALT);
430 }
431 }
432
433 workspace[0] = 0; /* Bit indicating which vector is current */
434
435 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
436
437 /* Loop for scanning the subject */
438
439 ptr = current_subject;
440 for (;;)
441 {
442 int i, j;
443 int clen, dlen;
444 unsigned int c, d;
445
446 /* Make the new state list into the active state list and empty the
447 new state list. */
448
449 temp_states = active_states;
450 active_states = new_states;
451 new_states = temp_states;
452 active_count = new_count;
453 new_count = 0;
454
455 workspace[0] ^= 1; /* Remember for the restarting feature */
456 workspace[1] = active_count;
457
458 #ifdef DEBUG
459 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
460 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
461 printf("\"\n");
462
463 printf("%.*sActive states: ", rlevel*2-2, SP);
464 for (i = 0; i < active_count; i++)
465 printf("%d/%d ", active_states[i].offset, active_states[i].count);
466 printf("\n");
467 #endif
468
469 /* Set the pointers for adding new states */
470
471 next_active_state = active_states + active_count;
472 next_new_state = new_states;
473
474 /* Load the current character from the subject outside the loop, as many
475 different states may want to look at it, and we assume that at least one
476 will. */
477
478 if (ptr < end_subject)
479 {
480 clen = 1; /* Number of bytes in the character */
481 #ifdef SUPPORT_UTF8
482 if (utf8) { GETCHARLEN(c, ptr, clen); } else
483 #endif /* SUPPORT_UTF8 */
484 c = *ptr;
485 }
486 else
487 {
488 clen = 0; /* This indicates the end of the subject */
489 c = NOTACHAR; /* This value should never actually be used */
490 }
491
492 /* Scan up the active states and act on each one. The result of an action
493 may be to add more states to the currently active list (e.g. on hitting a
494 parenthesis) or it may be to put states on the new list, for considering
495 when we move the character pointer on. */
496
497 for (i = 0; i < active_count; i++)
498 {
499 stateblock *current_state = active_states + i;
500 const uschar *code;
501 int state_offset = current_state->offset;
502 int count, codevalue;
503 #ifdef SUPPORT_UCP
504 int chartype, script;
505 #endif
506
507 #ifdef DEBUG
508 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
509 if (clen == 0) printf("EOL\n");
510 else if (c > 32 && c < 127) printf("'%c'\n", c);
511 else printf("0x%02x\n", c);
512 #endif
513
514 /* This variable is referred to implicitly in the ADD_xxx macros. */
515
516 ims = current_state->ims;
517
518 /* A negative offset is a special case meaning "hold off going to this
519 (negated) state until the number of characters in the data field have
520 been skipped". */
521
522 if (state_offset < 0)
523 {
524 if (current_state->data > 0)
525 {
526 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
527 ADD_NEW_DATA(state_offset, current_state->count,
528 current_state->data - 1);
529 continue;
530 }
531 else
532 {
533 current_state->offset = state_offset = -state_offset;
534 }
535 }
536
537 /* Check for a duplicate state with the same count, and skip if found. */
538
539 for (j = 0; j < i; j++)
540 {
541 if (active_states[j].offset == state_offset &&
542 active_states[j].count == current_state->count)
543 {
544 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
545 goto NEXT_ACTIVE_STATE;
546 }
547 }
548
549 /* The state offset is the offset to the opcode */
550
551 code = start_code + state_offset;
552 codevalue = *code;
553
554 /* If this opcode is followed by an inline character, load it. It is
555 tempting to test for the presence of a subject character here, but that
556 is wrong, because sometimes zero repetitions of the subject are
557 permitted.
558
559 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
560 argument that is not a data character - but is always one byte long.
561 Unfortunately, we have to take special action to deal with \P, \p, and
562 \X in this case. To keep the other cases fast, convert these ones to new
563 opcodes. */
564
565 if (coptable[codevalue] > 0)
566 {
567 dlen = 1;
568 #ifdef SUPPORT_UTF8
569 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
570 #endif /* SUPPORT_UTF8 */
571 d = code[coptable[codevalue]];
572 if (codevalue >= OP_TYPESTAR)
573 {
574 switch(d)
575 {
576 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
577 case OP_NOTPROP:
578 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
579 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
580 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
581 default: break;
582 }
583 }
584 }
585 else
586 {
587 dlen = 0; /* Not strictly necessary, but compilers moan */
588 d = NOTACHAR; /* if these variables are not set. */
589 }
590
591
592 /* Now process the individual opcodes */
593
594 switch (codevalue)
595 {
596
597 /* ========================================================================== */
598 /* Reached a closing bracket. If not at the end of the pattern, carry
599 on with the next opcode. Otherwise, unless we have an empty string and
600 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
601 matches so we always have the longest first. */
602
603 case OP_KET:
604 case OP_KETRMIN:
605 case OP_KETRMAX:
606 if (code != end_code)
607 {
608 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
609 if (codevalue != OP_KET)
610 {
611 ADD_ACTIVE(state_offset - GET(code, 1), 0);
612 }
613 }
614 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
615 {
616 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
617 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
618 match_count = 0;
619 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
620 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
621 if (offsetcount >= 2)
622 {
623 offsets[0] = current_subject - start_subject;
624 offsets[1] = ptr - start_subject;
625 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
626 offsets[1] - offsets[0], current_subject));
627 }
628 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
629 {
630 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
631 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
632 match_count, rlevel*2-2, SP));
633 return match_count;
634 }
635 }
636 break;
637
638 /* ========================================================================== */
639 /* These opcodes add to the current list of states without looking
640 at the current character. */
641
642 /*-----------------------------------------------------------------*/
643 case OP_ALT:
644 do { code += GET(code, 1); } while (*code == OP_ALT);
645 ADD_ACTIVE(code - start_code, 0);
646 break;
647
648 /*-----------------------------------------------------------------*/
649 case OP_BRA:
650 case OP_SBRA:
651 do
652 {
653 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
654 code += GET(code, 1);
655 }
656 while (*code == OP_ALT);
657 break;
658
659 /*-----------------------------------------------------------------*/
660 case OP_CBRA:
661 case OP_SCBRA:
662 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
663 code += GET(code, 1);
664 while (*code == OP_ALT)
665 {
666 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
667 code += GET(code, 1);
668 }
669 break;
670
671 /*-----------------------------------------------------------------*/
672 case OP_BRAZERO:
673 case OP_BRAMINZERO:
674 ADD_ACTIVE(state_offset + 1, 0);
675 code += 1 + GET(code, 2);
676 while (*code == OP_ALT) code += GET(code, 1);
677 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
678 break;
679
680 /*-----------------------------------------------------------------*/
681 case OP_CIRC:
682 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
683 ((ims & PCRE_MULTILINE) != 0 &&
684 ptr != end_subject &&
685 WAS_NEWLINE(ptr)))
686 { ADD_ACTIVE(state_offset + 1, 0); }
687 break;
688
689 /*-----------------------------------------------------------------*/
690 case OP_EOD:
691 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
692 break;
693
694 /*-----------------------------------------------------------------*/
695 case OP_OPT:
696 ims = code[1];
697 ADD_ACTIVE(state_offset + 2, 0);
698 break;
699
700 /*-----------------------------------------------------------------*/
701 case OP_SOD:
702 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
703 break;
704
705 /*-----------------------------------------------------------------*/
706 case OP_SOM:
707 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
708 break;
709
710
711 /* ========================================================================== */
712 /* These opcodes inspect the next subject character, and sometimes
713 the previous one as well, but do not have an argument. The variable
714 clen contains the length of the current character and is zero if we are
715 at the end of the subject. */
716
717 /*-----------------------------------------------------------------*/
718 case OP_ANY:
719 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
720 { ADD_NEW(state_offset + 1, 0); }
721 break;
722
723 /*-----------------------------------------------------------------*/
724 case OP_EODN:
725 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
726 { ADD_ACTIVE(state_offset + 1, 0); }
727 break;
728
729 /*-----------------------------------------------------------------*/
730 case OP_DOLL:
731 if ((md->moptions & PCRE_NOTEOL) == 0)
732 {
733 if (clen == 0 ||
734 (IS_NEWLINE(ptr) &&
735 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
736 ))
737 { ADD_ACTIVE(state_offset + 1, 0); }
738 }
739 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
740 { ADD_ACTIVE(state_offset + 1, 0); }
741 break;
742
743 /*-----------------------------------------------------------------*/
744
745 case OP_DIGIT:
746 case OP_WHITESPACE:
747 case OP_WORDCHAR:
748 if (clen > 0 && c < 256 &&
749 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
750 { ADD_NEW(state_offset + 1, 0); }
751 break;
752
753 /*-----------------------------------------------------------------*/
754 case OP_NOT_DIGIT:
755 case OP_NOT_WHITESPACE:
756 case OP_NOT_WORDCHAR:
757 if (clen > 0 && (c >= 256 ||
758 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
759 { ADD_NEW(state_offset + 1, 0); }
760 break;
761
762 /*-----------------------------------------------------------------*/
763 case OP_WORD_BOUNDARY:
764 case OP_NOT_WORD_BOUNDARY:
765 {
766 int left_word, right_word;
767
768 if (ptr > start_subject)
769 {
770 const uschar *temp = ptr - 1;
771 #ifdef SUPPORT_UTF8
772 if (utf8) BACKCHAR(temp);
773 #endif
774 GETCHARTEST(d, temp);
775 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
776 }
777 else left_word = 0;
778
779 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
780 else right_word = 0;
781
782 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
783 { ADD_ACTIVE(state_offset + 1, 0); }
784 }
785 break;
786
787
788 /*-----------------------------------------------------------------*/
789 /* Check the next character by Unicode property. We will get here only
790 if the support is in the binary; otherwise a compile-time error occurs.
791 */
792
793 #ifdef SUPPORT_UCP
794 case OP_PROP:
795 case OP_NOTPROP:
796 if (clen > 0)
797 {
798 BOOL OK;
799 int category = _pcre_ucp_findprop(c, &chartype, &script);
800 switch(code[1])
801 {
802 case PT_ANY:
803 OK = TRUE;
804 break;
805
806 case PT_LAMP:
807 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
808 break;
809
810 case PT_GC:
811 OK = category == code[2];
812 break;
813
814 case PT_PC:
815 OK = chartype == code[2];
816 break;
817
818 case PT_SC:
819 OK = script == code[2];
820 break;
821
822 /* Should never occur, but keep compilers from grumbling. */
823
824 default:
825 OK = codevalue != OP_PROP;
826 break;
827 }
828
829 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
830 }
831 break;
832 #endif
833
834
835
836 /* ========================================================================== */
837 /* These opcodes likewise inspect the subject character, but have an
838 argument that is not a data character. It is one of these opcodes:
839 OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
840 OP_NOT_WORDCHAR. The value is loaded into d. */
841
842 case OP_TYPEPLUS:
843 case OP_TYPEMINPLUS:
844 case OP_TYPEPOSPLUS:
845 count = current_state->count; /* Already matched */
846 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
847 if (clen > 0)
848 {
849 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
850 (c < 256 &&
851 (d != OP_ANY ||
852 (ims & PCRE_DOTALL) != 0 ||
853 !IS_NEWLINE(ptr)
854 ) &&
855 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
856 {
857 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
858 {
859 active_count--; /* Remove non-match possibility */
860 next_active_state--;
861 }
862 count++;
863 ADD_NEW(state_offset, count);
864 }
865 }
866 break;
867
868 /*-----------------------------------------------------------------*/
869 case OP_TYPEQUERY:
870 case OP_TYPEMINQUERY:
871 case OP_TYPEPOSQUERY:
872 ADD_ACTIVE(state_offset + 2, 0);
873 if (clen > 0)
874 {
875 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
876 (c < 256 &&
877 (d != OP_ANY ||
878 (ims & PCRE_DOTALL) != 0 ||
879 !IS_NEWLINE(ptr)
880 ) &&
881 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
882 {
883 if (codevalue == OP_TYPEPOSQUERY)
884 {
885 active_count--; /* Remove non-match possibility */
886 next_active_state--;
887 }
888 ADD_NEW(state_offset + 2, 0);
889 }
890 }
891 break;
892
893 /*-----------------------------------------------------------------*/
894 case OP_TYPESTAR:
895 case OP_TYPEMINSTAR:
896 case OP_TYPEPOSSTAR:
897 ADD_ACTIVE(state_offset + 2, 0);
898 if (clen > 0)
899 {
900 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
901 (c < 256 &&
902 (d != OP_ANY ||
903 (ims & PCRE_DOTALL) != 0 ||
904 !IS_NEWLINE(ptr)
905 ) &&
906 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
907 {
908 if (codevalue == OP_TYPEPOSSTAR)
909 {
910 active_count--; /* Remove non-match possibility */
911 next_active_state--;
912 }
913 ADD_NEW(state_offset, 0);
914 }
915 }
916 break;
917
918 /*-----------------------------------------------------------------*/
919 case OP_TYPEEXACT:
920 count = current_state->count; /* Number already matched */
921 if (clen > 0)
922 {
923 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
924 (c < 256 &&
925 (d != OP_ANY ||
926 (ims & PCRE_DOTALL) != 0 ||
927 !IS_NEWLINE(ptr)
928 ) &&
929 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
930 {
931 if (++count >= GET2(code, 1))
932 { ADD_NEW(state_offset + 4, 0); }
933 else
934 { ADD_NEW(state_offset, count); }
935 }
936 }
937 break;
938
939 /*-----------------------------------------------------------------*/
940 case OP_TYPEUPTO:
941 case OP_TYPEMINUPTO:
942 case OP_TYPEPOSUPTO:
943 ADD_ACTIVE(state_offset + 4, 0);
944 count = current_state->count; /* Number already matched */
945 if (clen > 0)
946 {
947 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
948 (c < 256 &&
949 (d != OP_ANY ||
950 (ims & PCRE_DOTALL) != 0 ||
951 !IS_NEWLINE(ptr)
952 ) &&
953 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
954 {
955 if (codevalue == OP_TYPEPOSUPTO)
956 {
957 active_count--; /* Remove non-match possibility */
958 next_active_state--;
959 }
960 if (++count >= GET2(code, 1))
961 { ADD_NEW(state_offset + 4, 0); }
962 else
963 { ADD_NEW(state_offset, count); }
964 }
965 }
966 break;
967
968 /* ========================================================================== */
969 /* These are virtual opcodes that are used when something like
970 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
971 argument. It keeps the code above fast for the other cases. The argument
972 is in the d variable. */
973
974 #ifdef SUPPORT_UCP
975 case OP_PROP_EXTRA + OP_TYPEPLUS:
976 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
977 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
978 count = current_state->count; /* Already matched */
979 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
980 if (clen > 0)
981 {
982 BOOL OK;
983 int category = _pcre_ucp_findprop(c, &chartype, &script);
984 switch(code[2])
985 {
986 case PT_ANY:
987 OK = TRUE;
988 break;
989
990 case PT_LAMP:
991 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
992 break;
993
994 case PT_GC:
995 OK = category == code[3];
996 break;
997
998 case PT_PC:
999 OK = chartype == code[3];
1000 break;
1001
1002 case PT_SC:
1003 OK = script == code[3];
1004 break;
1005
1006 /* Should never occur, but keep compilers from grumbling. */
1007
1008 default:
1009 OK = codevalue != OP_PROP;
1010 break;
1011 }
1012
1013 if (OK == (d == OP_PROP))
1014 {
1015 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1016 {
1017 active_count--; /* Remove non-match possibility */
1018 next_active_state--;
1019 }
1020 count++;
1021 ADD_NEW(state_offset, count);
1022 }
1023 }
1024 break;
1025
1026 /*-----------------------------------------------------------------*/
1027 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1028 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1029 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1030 count = current_state->count; /* Already matched */
1031 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1032 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1033 {
1034 const uschar *nptr = ptr + clen;
1035 int ncount = 0;
1036 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1037 {
1038 active_count--; /* Remove non-match possibility */
1039 next_active_state--;
1040 }
1041 while (nptr < end_subject)
1042 {
1043 int nd;
1044 int ndlen = 1;
1045 GETCHARLEN(nd, nptr, ndlen);
1046 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1047 ncount++;
1048 nptr += ndlen;
1049 }
1050 count++;
1051 ADD_NEW_DATA(-state_offset, count, ncount);
1052 }
1053 break;
1054 #endif
1055
1056 /*-----------------------------------------------------------------*/
1057 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1058 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1059 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1060 count = current_state->count; /* Already matched */
1061 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1062 if (clen > 0)
1063 {
1064 int ncount = 0;
1065 switch (c)
1066 {
1067 case 0x000d:
1068 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1069 /* Fall through */
1070 case 0x000a:
1071 case 0x000b:
1072 case 0x000c:
1073 case 0x0085:
1074 case 0x2028:
1075 case 0x2029:
1076 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1077 {
1078 active_count--; /* Remove non-match possibility */
1079 next_active_state--;
1080 }
1081 count++;
1082 ADD_NEW_DATA(-state_offset, count, ncount);
1083 break;
1084 default:
1085 break;
1086 }
1087 }
1088 break;
1089
1090 /*-----------------------------------------------------------------*/
1091 #ifdef SUPPORT_UCP
1092 case OP_PROP_EXTRA + OP_TYPEQUERY:
1093 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1094 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1095 count = 4;
1096 goto QS1;
1097
1098 case OP_PROP_EXTRA + OP_TYPESTAR:
1099 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1100 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1101 count = 0;
1102
1103 QS1:
1104
1105 ADD_ACTIVE(state_offset + 4, 0);
1106 if (clen > 0)
1107 {
1108 BOOL OK;
1109 int category = _pcre_ucp_findprop(c, &chartype, &script);
1110 switch(code[2])
1111 {
1112 case PT_ANY:
1113 OK = TRUE;
1114 break;
1115
1116 case PT_LAMP:
1117 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1118 break;
1119
1120 case PT_GC:
1121 OK = category == code[3];
1122 break;
1123
1124 case PT_PC:
1125 OK = chartype == code[3];
1126 break;
1127
1128 case PT_SC:
1129 OK = script == code[3];
1130 break;
1131
1132 /* Should never occur, but keep compilers from grumbling. */
1133
1134 default:
1135 OK = codevalue != OP_PROP;
1136 break;
1137 }
1138
1139 if (OK == (d == OP_PROP))
1140 {
1141 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1142 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1143 {
1144 active_count--; /* Remove non-match possibility */
1145 next_active_state--;
1146 }
1147 ADD_NEW(state_offset + count, 0);
1148 }
1149 }
1150 break;
1151
1152 /*-----------------------------------------------------------------*/
1153 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1154 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1155 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1156 count = 2;
1157 goto QS2;
1158
1159 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1160 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1161 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1162 count = 0;
1163
1164 QS2:
1165
1166 ADD_ACTIVE(state_offset + 2, 0);
1167 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1168 {
1169 const uschar *nptr = ptr + clen;
1170 int ncount = 0;
1171 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1172 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1173 {
1174 active_count--; /* Remove non-match possibility */
1175 next_active_state--;
1176 }
1177 while (nptr < end_subject)
1178 {
1179 int nd;
1180 int ndlen = 1;
1181 GETCHARLEN(nd, nptr, ndlen);
1182 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1183 ncount++;
1184 nptr += ndlen;
1185 }
1186 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1187 }
1188 break;
1189 #endif
1190
1191 /*-----------------------------------------------------------------*/
1192 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1193 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1194 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1195 count = 2;
1196 goto QS3;
1197
1198 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1199 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1200 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1201 count = 0;
1202
1203 QS3:
1204 ADD_ACTIVE(state_offset + 2, 0);
1205 if (clen > 0)
1206 {
1207 int ncount = 0;
1208 switch (c)
1209 {
1210 case 0x000d:
1211 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1212 /* Fall through */
1213 case 0x000a:
1214 case 0x000b:
1215 case 0x000c:
1216 case 0x0085:
1217 case 0x2028:
1218 case 0x2029:
1219 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1220 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1221 {
1222 active_count--; /* Remove non-match possibility */
1223 next_active_state--;
1224 }
1225 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1226 break;
1227 default:
1228 break;
1229 }
1230 }
1231 break;
1232
1233 /*-----------------------------------------------------------------*/
1234 #ifdef SUPPORT_UCP
1235 case OP_PROP_EXTRA + OP_TYPEEXACT:
1236 case OP_PROP_EXTRA + OP_TYPEUPTO:
1237 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1238 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1239 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1240 { ADD_ACTIVE(state_offset + 6, 0); }
1241 count = current_state->count; /* Number already matched */
1242 if (clen > 0)
1243 {
1244 BOOL OK;
1245 int category = _pcre_ucp_findprop(c, &chartype, &script);
1246 switch(code[4])
1247 {
1248 case PT_ANY:
1249 OK = TRUE;
1250 break;
1251
1252 case PT_LAMP:
1253 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1254 break;
1255
1256 case PT_GC:
1257 OK = category == code[5];
1258 break;
1259
1260 case PT_PC:
1261 OK = chartype == code[5];
1262 break;
1263
1264 case PT_SC:
1265 OK = script == code[5];
1266 break;
1267
1268 /* Should never occur, but keep compilers from grumbling. */
1269
1270 default:
1271 OK = codevalue != OP_PROP;
1272 break;
1273 }
1274
1275 if (OK == (d == OP_PROP))
1276 {
1277 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1278 {
1279 active_count--; /* Remove non-match possibility */
1280 next_active_state--;
1281 }
1282 if (++count >= GET2(code, 1))
1283 { ADD_NEW(state_offset + 6, 0); }
1284 else
1285 { ADD_NEW(state_offset, count); }
1286 }
1287 }
1288 break;
1289
1290 /*-----------------------------------------------------------------*/
1291 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1292 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1293 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1294 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1295 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1296 { ADD_ACTIVE(state_offset + 4, 0); }
1297 count = current_state->count; /* Number already matched */
1298 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1299 {
1300 const uschar *nptr = ptr + clen;
1301 int ncount = 0;
1302 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1303 {
1304 active_count--; /* Remove non-match possibility */
1305 next_active_state--;
1306 }
1307 while (nptr < end_subject)
1308 {
1309 int nd;
1310 int ndlen = 1;
1311 GETCHARLEN(nd, nptr, ndlen);
1312 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1313 ncount++;
1314 nptr += ndlen;
1315 }
1316 if (++count >= GET2(code, 1))
1317 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1318 else
1319 { ADD_NEW_DATA(-state_offset, count, ncount); }
1320 }
1321 break;
1322 #endif
1323
1324 /*-----------------------------------------------------------------*/
1325 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1326 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1327 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1328 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1329 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1330 { ADD_ACTIVE(state_offset + 4, 0); }
1331 count = current_state->count; /* Number already matched */
1332 if (clen > 0)
1333 {
1334 int ncount = 0;
1335 switch (c)
1336 {
1337 case 0x000d:
1338 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1339 /* Fall through */
1340 case 0x000a:
1341 case 0x000b:
1342 case 0x000c:
1343 case 0x0085:
1344 case 0x2028:
1345 case 0x2029:
1346 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1347 {
1348 active_count--; /* Remove non-match possibility */
1349 next_active_state--;
1350 }
1351 if (++count >= GET2(code, 1))
1352 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1353 else
1354 { ADD_NEW_DATA(-state_offset, count, ncount); }
1355 break;
1356 default:
1357 break;
1358 }
1359 }
1360 break;
1361
1362 /* ========================================================================== */
1363 /* These opcodes are followed by a character that is usually compared
1364 to the current subject character; it is loaded into d. We still get
1365 here even if there is no subject character, because in some cases zero
1366 repetitions are permitted. */
1367
1368 /*-----------------------------------------------------------------*/
1369 case OP_CHAR:
1370 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1371 break;
1372
1373 /*-----------------------------------------------------------------*/
1374 case OP_CHARNC:
1375 if (clen == 0) break;
1376
1377 #ifdef SUPPORT_UTF8
1378 if (utf8)
1379 {
1380 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1381 {
1382 unsigned int othercase;
1383 if (c < 128) othercase = fcc[c]; else
1384
1385 /* If we have Unicode property support, we can use it to test the
1386 other case of the character. */
1387
1388 #ifdef SUPPORT_UCP
1389 othercase = _pcre_ucp_othercase(c);
1390 #else
1391 othercase = NOTACHAR;
1392 #endif
1393
1394 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1395 }
1396 }
1397 else
1398 #endif /* SUPPORT_UTF8 */
1399
1400 /* Non-UTF-8 mode */
1401 {
1402 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1403 }
1404 break;
1405
1406
1407 #ifdef SUPPORT_UCP
1408 /*-----------------------------------------------------------------*/
1409 /* This is a tricky one because it can match more than one character.
1410 Find out how many characters to skip, and then set up a negative state
1411 to wait for them to pass before continuing. */
1412
1413 case OP_EXTUNI:
1414 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1415 {
1416 const uschar *nptr = ptr + clen;
1417 int ncount = 0;
1418 while (nptr < end_subject)
1419 {
1420 int nclen = 1;
1421 GETCHARLEN(c, nptr, nclen);
1422 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1423 ncount++;
1424 nptr += nclen;
1425 }
1426 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1427 }
1428 break;
1429 #endif
1430
1431 /*-----------------------------------------------------------------*/
1432 /* This is tricky, like EXTUNI, because it too can match more than one
1433 character (when CR is followed by LF). In this case, set up a negative
1434 state to wait for one character to pass before continuing. */
1435
1436 case OP_ANYNL:
1437 if (clen > 0) switch(c)
1438 {
1439 case 0x000a:
1440 case 0x000b:
1441 case 0x000c:
1442 case 0x0085:
1443 case 0x2028:
1444 case 0x2029:
1445 ADD_NEW(state_offset + 1, 0);
1446 break;
1447 case 0x000d:
1448 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1449 {
1450 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1451 }
1452 else
1453 {
1454 ADD_NEW(state_offset + 1, 0);
1455 }
1456 break;
1457 }
1458 break;
1459
1460 /*-----------------------------------------------------------------*/
1461 /* Match a negated single character. This is only used for one-byte
1462 characters, that is, we know that d < 256. The character we are
1463 checking (c) can be multibyte. */
1464
1465 case OP_NOT:
1466 if (clen > 0)
1467 {
1468 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1469 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1470 }
1471 break;
1472
1473 /*-----------------------------------------------------------------*/
1474 case OP_PLUS:
1475 case OP_MINPLUS:
1476 case OP_POSPLUS:
1477 case OP_NOTPLUS:
1478 case OP_NOTMINPLUS:
1479 case OP_NOTPOSPLUS:
1480 count = current_state->count; /* Already matched */
1481 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1482 if (clen > 0)
1483 {
1484 unsigned int otherd = NOTACHAR;
1485 if ((ims & PCRE_CASELESS) != 0)
1486 {
1487 #ifdef SUPPORT_UTF8
1488 if (utf8 && d >= 128)
1489 {
1490 #ifdef SUPPORT_UCP
1491 otherd = _pcre_ucp_othercase(d);
1492 #endif /* SUPPORT_UCP */
1493 }
1494 else
1495 #endif /* SUPPORT_UTF8 */
1496 otherd = fcc[d];
1497 }
1498 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1499 {
1500 if (count > 0 &&
1501 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1502 {
1503 active_count--; /* Remove non-match possibility */
1504 next_active_state--;
1505 }
1506 count++;
1507 ADD_NEW(state_offset, count);
1508 }
1509 }
1510 break;
1511
1512 /*-----------------------------------------------------------------*/
1513 case OP_QUERY:
1514 case OP_MINQUERY:
1515 case OP_POSQUERY:
1516 case OP_NOTQUERY:
1517 case OP_NOTMINQUERY:
1518 case OP_NOTPOSQUERY:
1519 ADD_ACTIVE(state_offset + dlen + 1, 0);
1520 if (clen > 0)
1521 {
1522 unsigned int otherd = NOTACHAR;
1523 if ((ims & PCRE_CASELESS) != 0)
1524 {
1525 #ifdef SUPPORT_UTF8
1526 if (utf8 && d >= 128)
1527 {
1528 #ifdef SUPPORT_UCP
1529 otherd = _pcre_ucp_othercase(d);
1530 #endif /* SUPPORT_UCP */
1531 }
1532 else
1533 #endif /* SUPPORT_UTF8 */
1534 otherd = fcc[d];
1535 }
1536 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1537 {
1538 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1539 {
1540 active_count--; /* Remove non-match possibility */
1541 next_active_state--;
1542 }
1543 ADD_NEW(state_offset + dlen + 1, 0);
1544 }
1545 }
1546 break;
1547
1548 /*-----------------------------------------------------------------*/
1549 case OP_STAR:
1550 case OP_MINSTAR:
1551 case OP_POSSTAR:
1552 case OP_NOTSTAR:
1553 case OP_NOTMINSTAR:
1554 case OP_NOTPOSSTAR:
1555 ADD_ACTIVE(state_offset + dlen + 1, 0);
1556 if (clen > 0)
1557 {
1558 unsigned int otherd = NOTACHAR;
1559 if ((ims & PCRE_CASELESS) != 0)
1560 {
1561 #ifdef SUPPORT_UTF8
1562 if (utf8 && d >= 128)
1563 {
1564 #ifdef SUPPORT_UCP
1565 otherd = _pcre_ucp_othercase(d);
1566 #endif /* SUPPORT_UCP */
1567 }
1568 else
1569 #endif /* SUPPORT_UTF8 */
1570 otherd = fcc[d];
1571 }
1572 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1573 {
1574 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1575 {
1576 active_count--; /* Remove non-match possibility */
1577 next_active_state--;
1578 }
1579 ADD_NEW(state_offset, 0);
1580 }
1581 }
1582 break;
1583
1584 /*-----------------------------------------------------------------*/
1585 case OP_EXACT:
1586 case OP_NOTEXACT:
1587 count = current_state->count; /* Number already matched */
1588 if (clen > 0)
1589 {
1590 unsigned int otherd = NOTACHAR;
1591 if ((ims & PCRE_CASELESS) != 0)
1592 {
1593 #ifdef SUPPORT_UTF8
1594 if (utf8 && d >= 128)
1595 {
1596 #ifdef SUPPORT_UCP
1597 otherd = _pcre_ucp_othercase(d);
1598 #endif /* SUPPORT_UCP */
1599 }
1600 else
1601 #endif /* SUPPORT_UTF8 */
1602 otherd = fcc[d];
1603 }
1604 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1605 {
1606 if (++count >= GET2(code, 1))
1607 { ADD_NEW(state_offset + dlen + 3, 0); }
1608 else
1609 { ADD_NEW(state_offset, count); }
1610 }
1611 }
1612 break;
1613
1614 /*-----------------------------------------------------------------*/
1615 case OP_UPTO:
1616 case OP_MINUPTO:
1617 case OP_POSUPTO:
1618 case OP_NOTUPTO:
1619 case OP_NOTMINUPTO:
1620 case OP_NOTPOSUPTO:
1621 ADD_ACTIVE(state_offset + dlen + 3, 0);
1622 count = current_state->count; /* Number already matched */
1623 if (clen > 0)
1624 {
1625 unsigned int otherd = NOTACHAR;
1626 if ((ims & PCRE_CASELESS) != 0)
1627 {
1628 #ifdef SUPPORT_UTF8
1629 if (utf8 && d >= 128)
1630 {
1631 #ifdef SUPPORT_UCP
1632 otherd = _pcre_ucp_othercase(d);
1633 #endif /* SUPPORT_UCP */
1634 }
1635 else
1636 #endif /* SUPPORT_UTF8 */
1637 otherd = fcc[d];
1638 }
1639 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1640 {
1641 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
1642 {
1643 active_count--; /* Remove non-match possibility */
1644 next_active_state--;
1645 }
1646 if (++count >= GET2(code, 1))
1647 { ADD_NEW(state_offset + dlen + 3, 0); }
1648 else
1649 { ADD_NEW(state_offset, count); }
1650 }
1651 }
1652 break;
1653
1654
1655 /* ========================================================================== */
1656 /* These are the class-handling opcodes */
1657
1658 case OP_CLASS:
1659 case OP_NCLASS:
1660 case OP_XCLASS:
1661 {
1662 BOOL isinclass = FALSE;
1663 int next_state_offset;
1664 const uschar *ecode;
1665
1666 /* For a simple class, there is always just a 32-byte table, and we
1667 can set isinclass from it. */
1668
1669 if (codevalue != OP_XCLASS)
1670 {
1671 ecode = code + 33;
1672 if (clen > 0)
1673 {
1674 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1675 ((code[1 + c/8] & (1 << (c&7))) != 0);
1676 }
1677 }
1678
1679 /* An extended class may have a table or a list of single characters,
1680 ranges, or both, and it may be positive or negative. There's a
1681 function that sorts all this out. */
1682
1683 else
1684 {
1685 ecode = code + GET(code, 1);
1686 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1687 }
1688
1689 /* At this point, isinclass is set for all kinds of class, and ecode
1690 points to the byte after the end of the class. If there is a
1691 quantifier, this is where it will be. */
1692
1693 next_state_offset = ecode - start_code;
1694
1695 switch (*ecode)
1696 {
1697 case OP_CRSTAR:
1698 case OP_CRMINSTAR:
1699 ADD_ACTIVE(next_state_offset + 1, 0);
1700 if (isinclass) { ADD_NEW(state_offset, 0); }
1701 break;
1702
1703 case OP_CRPLUS:
1704 case OP_CRMINPLUS:
1705 count = current_state->count; /* Already matched */
1706 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1707 if (isinclass) { count++; ADD_NEW(state_offset, count); }
1708 break;
1709
1710 case OP_CRQUERY:
1711 case OP_CRMINQUERY:
1712 ADD_ACTIVE(next_state_offset + 1, 0);
1713 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1714 break;
1715
1716 case OP_CRRANGE:
1717 case OP_CRMINRANGE:
1718 count = current_state->count; /* Already matched */
1719 if (count >= GET2(ecode, 1))
1720 { ADD_ACTIVE(next_state_offset + 5, 0); }
1721 if (isinclass)
1722 {
1723 int max = GET2(ecode, 3);
1724 if (++count >= max && max != 0) /* Max 0 => no limit */
1725 { ADD_NEW(next_state_offset + 5, 0); }
1726 else
1727 { ADD_NEW(state_offset, count); }
1728 }
1729 break;
1730
1731 default:
1732 if (isinclass) { ADD_NEW(next_state_offset, 0); }
1733 break;
1734 }
1735 }
1736 break;
1737
1738 /* ========================================================================== */
1739 /* These are the opcodes for fancy brackets of various kinds. We have
1740 to use recursion in order to handle them. */
1741
1742 case OP_ASSERT:
1743 case OP_ASSERT_NOT:
1744 case OP_ASSERTBACK:
1745 case OP_ASSERTBACK_NOT:
1746 {
1747 int rc;
1748 int local_offsets[2];
1749 int local_workspace[1000];
1750 const uschar *endasscode = code + GET(code, 1);
1751
1752 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1753
1754 rc = internal_dfa_exec(
1755 md, /* static match data */
1756 code, /* this subexpression's code */
1757 ptr, /* where we currently are */
1758 ptr - start_subject, /* start offset */
1759 local_offsets, /* offset vector */
1760 sizeof(local_offsets)/sizeof(int), /* size of same */
1761 local_workspace, /* workspace vector */
1762 sizeof(local_workspace)/sizeof(int), /* size of same */
1763 ims, /* the current ims flags */
1764 rlevel, /* function recursion level */
1765 recursing); /* pass on regex recursion */
1766
1767 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1768 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1769 }
1770 break;
1771
1772 /*-----------------------------------------------------------------*/
1773 case OP_COND:
1774 case OP_SCOND:
1775 {
1776 int local_offsets[1000];
1777 int local_workspace[1000];
1778 int condcode = code[LINK_SIZE+1];
1779
1780 /* Back reference conditions are not supported */
1781
1782 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
1783
1784 /* The DEFINE condition is always false */
1785
1786 if (condcode == OP_DEF)
1787 {
1788 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
1789 }
1790
1791 /* The only supported version of OP_RREF is for the value RREF_ANY,
1792 which means "test if in any recursion". We can't test for specifically
1793 recursed groups. */
1794
1795 else if (condcode == OP_RREF)
1796 {
1797 int value = GET2(code, LINK_SIZE+2);
1798 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
1799 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1800 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1801 }
1802
1803 /* Otherwise, the condition is an assertion */
1804
1805 else
1806 {
1807 int rc;
1808 const uschar *asscode = code + LINK_SIZE + 1;
1809 const uschar *endasscode = asscode + GET(asscode, 1);
1810
1811 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1812
1813 rc = internal_dfa_exec(
1814 md, /* fixed match data */
1815 asscode, /* this subexpression's code */
1816 ptr, /* where we currently are */
1817 ptr - start_subject, /* start offset */
1818 local_offsets, /* offset vector */
1819 sizeof(local_offsets)/sizeof(int), /* size of same */
1820 local_workspace, /* workspace vector */
1821 sizeof(local_workspace)/sizeof(int), /* size of same */
1822 ims, /* the current ims flags */
1823 rlevel, /* function recursion level */
1824 recursing); /* pass on regex recursion */
1825
1826 if ((rc >= 0) ==
1827 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1828 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1829 else
1830 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1831 }
1832 }
1833 break;
1834
1835 /*-----------------------------------------------------------------*/
1836 case OP_RECURSE:
1837 {
1838 int local_offsets[1000];
1839 int local_workspace[1000];
1840 int rc;
1841
1842 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1843 recursing + 1));
1844
1845 rc = internal_dfa_exec(
1846 md, /* fixed match data */
1847 start_code + GET(code, 1), /* this subexpression's code */
1848 ptr, /* where we currently are */
1849 ptr - start_subject, /* start offset */
1850 local_offsets, /* offset vector */
1851 sizeof(local_offsets)/sizeof(int), /* size of same */
1852 local_workspace, /* workspace vector */
1853 sizeof(local_workspace)/sizeof(int), /* size of same */
1854 ims, /* the current ims flags */
1855 rlevel, /* function recursion level */
1856 recursing + 1); /* regex recurse level */
1857
1858 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1859 recursing + 1, rc));
1860
1861 /* Ran out of internal offsets */
1862
1863 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1864
1865 /* For each successfully matched substring, set up the next state with a
1866 count of characters to skip before trying it. Note that the count is in
1867 characters, not bytes. */
1868
1869 if (rc > 0)
1870 {
1871 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1872 {
1873 const uschar *p = start_subject + local_offsets[rc];
1874 const uschar *pp = start_subject + local_offsets[rc+1];
1875 int charcount = local_offsets[rc+1] - local_offsets[rc];
1876 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1877 if (charcount > 0)
1878 {
1879 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1880 }
1881 else
1882 {
1883 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1884 }
1885 }
1886 }
1887 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1888 }
1889 break;
1890
1891 /*-----------------------------------------------------------------*/
1892 case OP_ONCE:
1893 {
1894 int local_offsets[2];
1895 int local_workspace[1000];
1896
1897 int rc = internal_dfa_exec(
1898 md, /* fixed match data */
1899 code, /* this subexpression's code */
1900 ptr, /* where we currently are */
1901 ptr - start_subject, /* start offset */
1902 local_offsets, /* offset vector */
1903 sizeof(local_offsets)/sizeof(int), /* size of same */
1904 local_workspace, /* workspace vector */
1905 sizeof(local_workspace)/sizeof(int), /* size of same */
1906 ims, /* the current ims flags */
1907 rlevel, /* function recursion level */
1908 recursing); /* pass on regex recursion */
1909
1910 if (rc >= 0)
1911 {
1912 const uschar *end_subpattern = code;
1913 int charcount = local_offsets[1] - local_offsets[0];
1914 int next_state_offset, repeat_state_offset;
1915
1916 do { end_subpattern += GET(end_subpattern, 1); }
1917 while (*end_subpattern == OP_ALT);
1918 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1919
1920 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1921 arrange for the repeat state also to be added to the relevant list.
1922 Calculate the offset, or set -1 for no repeat. */
1923
1924 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1925 *end_subpattern == OP_KETRMIN)?
1926 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1927
1928 /* If we have matched an empty string, add the next state at the
1929 current character pointer. This is important so that the duplicate
1930 checking kicks in, which is what breaks infinite loops that match an
1931 empty string. */
1932
1933 if (charcount == 0)
1934 {
1935 ADD_ACTIVE(next_state_offset, 0);
1936 }
1937
1938 /* Optimization: if there are no more active states, and there
1939 are no new states yet set up, then skip over the subject string
1940 right here, to save looping. Otherwise, set up the new state to swing
1941 into action when the end of the substring is reached. */
1942
1943 else if (i + 1 >= active_count && new_count == 0)
1944 {
1945 ptr += charcount;
1946 clen = 0;
1947 ADD_NEW(next_state_offset, 0);
1948
1949 /* If we are adding a repeat state at the new character position,
1950 we must fudge things so that it is the only current state.
1951 Otherwise, it might be a duplicate of one we processed before, and
1952 that would cause it to be skipped. */
1953
1954 if (repeat_state_offset >= 0)
1955 {
1956 next_active_state = active_states;
1957 active_count = 0;
1958 i = -1;
1959 ADD_ACTIVE(repeat_state_offset, 0);
1960 }
1961 }
1962 else
1963 {
1964 const uschar *p = start_subject + local_offsets[0];
1965 const uschar *pp = start_subject + local_offsets[1];
1966 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1967 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1968 if (repeat_state_offset >= 0)
1969 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1970 }
1971
1972 }
1973 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1974 }
1975 break;
1976
1977
1978 /* ========================================================================== */
1979 /* Handle callouts */
1980
1981 case OP_CALLOUT:
1982 if (pcre_callout != NULL)
1983 {
1984 int rrc;
1985 pcre_callout_block cb;
1986 cb.version = 1; /* Version 1 of the callout block */
1987 cb.callout_number = code[1];
1988 cb.offset_vector = offsets;
1989 cb.subject = (PCRE_SPTR)start_subject;
1990 cb.subject_length = end_subject - start_subject;
1991 cb.start_match = current_subject - start_subject;
1992 cb.current_position = ptr - start_subject;
1993 cb.pattern_position = GET(code, 2);
1994 cb.next_item_length = GET(code, 2 + LINK_SIZE);
1995 cb.capture_top = 1;
1996 cb.capture_last = -1;
1997 cb.callout_data = md->callout_data;
1998 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
1999 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2000 }
2001 break;
2002
2003
2004 /* ========================================================================== */
2005 default: /* Unsupported opcode */
2006 return PCRE_ERROR_DFA_UITEM;
2007 }
2008
2009 NEXT_ACTIVE_STATE: continue;
2010
2011 } /* End of loop scanning active states */
2012
2013 /* We have finished the processing at the current subject character. If no
2014 new states have been set for the next character, we have found all the
2015 matches that we are going to find. If we are at the top level and partial
2016 matching has been requested, check for appropriate conditions. */
2017
2018 if (new_count <= 0)
2019 {
2020 if (match_count < 0 && /* No matches found */
2021 rlevel == 1 && /* Top level match function */
2022 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2023 ptr >= end_subject && /* Reached end of subject */
2024 ptr > current_subject) /* Matched non-empty string */
2025 {
2026 if (offsetcount >= 2)
2027 {
2028 offsets[0] = current_subject - start_subject;
2029 offsets[1] = end_subject - start_subject;
2030 }
2031 match_count = PCRE_ERROR_PARTIAL;
2032 }
2033
2034 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2035 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2036 rlevel*2-2, SP));
2037 break; /* In effect, "return", but see the comment below */
2038 }
2039
2040 /* One or more states are active for the next character. */
2041
2042 ptr += clen; /* Advance to next subject character */
2043 } /* Loop to move along the subject string */
2044
2045 /* Control gets here from "break" a few lines above. We do it this way because
2046 if we use "return" above, we have compiler trouble. Some compilers warn if
2047 there's nothing here because they think the function doesn't return a value. On
2048 the other hand, if we put a dummy statement here, some more clever compilers
2049 complain that it can't be reached. Sigh. */
2050
2051 return match_count;
2052 }
2053
2054
2055
2056
2057 /*************************************************
2058 * Execute a Regular Expression - DFA engine *
2059 *************************************************/
2060
2061 /* This external function applies a compiled re to a subject string using a DFA
2062 engine. This function calls the internal function multiple times if the pattern
2063 is not anchored.
2064
2065 Arguments:
2066 argument_re points to the compiled expression
2067 extra_data points to extra data or is NULL
2068 subject points to the subject string
2069 length length of subject string (may contain binary zeros)
2070 start_offset where to start in the subject string
2071 options option bits
2072 offsets vector of match offsets
2073 offsetcount size of same
2074 workspace workspace vector
2075 wscount size of same
2076
2077 Returns: > 0 => number of match offset pairs placed in offsets
2078 = 0 => offsets overflowed; longest matches are present
2079 -1 => failed to match
2080 < -1 => some kind of unexpected problem
2081 */
2082
2083 PCRE_EXP_DEFN int
2084 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2085 const char *subject, int length, int start_offset, int options, int *offsets,
2086 int offsetcount, int *workspace, int wscount)
2087 {
2088 real_pcre *re = (real_pcre *)argument_re;
2089 dfa_match_data match_block;
2090 dfa_match_data *md = &match_block;
2091 BOOL utf8, anchored, startline, firstline;
2092 const uschar *current_subject, *end_subject, *lcc;
2093
2094 pcre_study_data internal_study;
2095 const pcre_study_data *study = NULL;
2096 real_pcre internal_re;
2097
2098 const uschar *req_byte_ptr;
2099 const uschar *start_bits = NULL;
2100 BOOL first_byte_caseless = FALSE;
2101 BOOL req_byte_caseless = FALSE;
2102 int first_byte = -1;
2103 int req_byte = -1;
2104 int req_byte2 = -1;
2105 int newline;
2106
2107 /* Plausibility checks */
2108
2109 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2110 if (re == NULL || subject == NULL || workspace == NULL ||
2111 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2112 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2113 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2114
2115 /* We need to find the pointer to any study data before we test for byte
2116 flipping, so we scan the extra_data block first. This may set two fields in the
2117 match block, so we must initialize them beforehand. However, the other fields
2118 in the match block must not be set until after the byte flipping. */
2119
2120 md->tables = re->tables;
2121 md->callout_data = NULL;
2122
2123 if (extra_data != NULL)
2124 {
2125 unsigned int flags = extra_data->flags;
2126 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2127 study = (const pcre_study_data *)extra_data->study_data;
2128 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2129 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2130 return PCRE_ERROR_DFA_UMLIMIT;
2131 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2132 md->callout_data = extra_data->callout_data;
2133 if ((flags & PCRE_EXTRA_TABLES) != 0)
2134 md->tables = extra_data->tables;
2135 }
2136
2137 /* Check that the first field in the block is the magic number. If it is not,
2138 test for a regex that was compiled on a host of opposite endianness. If this is
2139 the case, flipped values are put in internal_re and internal_study if there was
2140 study data too. */
2141
2142 if (re->magic_number != MAGIC_NUMBER)
2143 {
2144 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2145 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2146 if (study != NULL) study = &internal_study;
2147 }
2148
2149 /* Set some local values */
2150
2151 current_subject = (const unsigned char *)subject + start_offset;
2152 end_subject = (const unsigned char *)subject + length;
2153 req_byte_ptr = current_subject - 1;
2154
2155 #ifdef SUPPORT_UTF8
2156 utf8 = (re->options & PCRE_UTF8) != 0;
2157 #else
2158 utf8 = FALSE;
2159 #endif
2160
2161 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2162 (re->options & PCRE_ANCHORED) != 0;
2163
2164 /* The remaining fixed data for passing around. */
2165
2166 md->start_code = (const uschar *)argument_re +
2167 re->name_table_offset + re->name_count * re->name_entry_size;
2168 md->start_subject = (const unsigned char *)subject;
2169 md->end_subject = end_subject;
2170 md->moptions = options;
2171 md->poptions = re->options;
2172
2173 /* Handle different types of newline. The three bits give eight cases. If
2174 nothing is set at run time, whatever was used at compile time applies. */
2175
2176 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2177 PCRE_NEWLINE_BITS)
2178 {
2179 case 0: newline = NEWLINE; break; /* Compile-time default */
2180 case PCRE_NEWLINE_CR: newline = '\r'; break;
2181 case PCRE_NEWLINE_LF: newline = '\n'; break;
2182 case PCRE_NEWLINE_CR+
2183 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2184 case PCRE_NEWLINE_ANY: newline = -1; break;
2185 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2186 default: return PCRE_ERROR_BADNEWLINE;
2187 }
2188
2189 if (newline == -2)
2190 {
2191 md->nltype = NLTYPE_ANYCRLF;
2192 }
2193 else if (newline < 0)
2194 {
2195 md->nltype = NLTYPE_ANY;
2196 }
2197 else
2198 {
2199 md->nltype = NLTYPE_FIXED;
2200 if (newline > 255)
2201 {
2202 md->nllen = 2;
2203 md->nl[0] = (newline >> 8) & 255;
2204 md->nl[1] = newline & 255;
2205 }
2206 else
2207 {
2208 md->nllen = 1;
2209 md->nl[0] = newline;
2210 }
2211 }
2212
2213 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2214 back the character offset. */
2215
2216 #ifdef SUPPORT_UTF8
2217 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2218 {
2219 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2220 return PCRE_ERROR_BADUTF8;
2221 if (start_offset > 0 && start_offset < length)
2222 {
2223 int tb = ((uschar *)subject)[start_offset];
2224 if (tb > 127)
2225 {
2226 tb &= 0xc0;
2227 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2228 }
2229 }
2230 }
2231 #endif
2232
2233 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2234 is a feature that makes it possible to save compiled regex and re-use them
2235 in other programs later. */
2236
2237 if (md->tables == NULL) md->tables = _pcre_default_tables;
2238
2239 /* The lower casing table and the "must be at the start of a line" flag are
2240 used in a loop when finding where to start. */
2241
2242 lcc = md->tables + lcc_offset;
2243 startline = (re->options & PCRE_STARTLINE) != 0;
2244 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2245
2246 /* Set up the first character to match, if available. The first_byte value is
2247 never set for an anchored regular expression, but the anchoring may be forced
2248 at run time, so we have to test for anchoring. The first char may be unset for
2249 an unanchored pattern, of course. If there's no first char and the pattern was
2250 studied, there may be a bitmap of possible first characters. */
2251
2252 if (!anchored)
2253 {
2254 if ((re->options & PCRE_FIRSTSET) != 0)
2255 {
2256 first_byte = re->first_byte & 255;
2257 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2258 first_byte = lcc[first_byte];
2259 }
2260 else
2261 {
2262 if (startline && study != NULL &&
2263 (study->options & PCRE_STUDY_MAPPED) != 0)
2264 start_bits = study->start_bits;
2265 }
2266 }
2267
2268 /* For anchored or unanchored matches, there may be a "last known required
2269 character" set. */
2270
2271 if ((re->options & PCRE_REQCHSET) != 0)
2272 {
2273 req_byte = re->req_byte & 255;
2274 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2275 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2276 }
2277
2278 /* Call the main matching function, looping for a non-anchored regex after a
2279 failed match. Unless restarting, optimize by moving to the first match
2280 character if possible, when not anchored. Then unless wanting a partial match,
2281 check for a required later character. */
2282
2283 for (;;)
2284 {
2285 int rc;
2286
2287 if ((options & PCRE_DFA_RESTART) == 0)
2288 {
2289 const uschar *save_end_subject = end_subject;
2290
2291 /* Advance to a unique first char if possible. If firstline is TRUE, the
2292 start of the match is constrained to the first line of a multiline string.
2293 Implement this by temporarily adjusting end_subject so that we stop
2294 scanning at a newline. If the match fails at the newline, later code breaks
2295 this loop. */
2296
2297 if (firstline)
2298 {
2299 const uschar *t = current_subject;
2300 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2301 end_subject = t;
2302 }
2303
2304 if (first_byte >= 0)
2305 {
2306 if (first_byte_caseless)
2307 while (current_subject < end_subject &&
2308 lcc[*current_subject] != first_byte)
2309 current_subject++;
2310 else
2311 while (current_subject < end_subject && *current_subject != first_byte)
2312 current_subject++;
2313 }
2314
2315 /* Or to just after a linebreak for a multiline match if possible */
2316
2317 else if (startline)
2318 {
2319 if (current_subject > md->start_subject + start_offset)
2320 {
2321 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2322 current_subject++;
2323
2324 /* If we have just passed a CR and the newline option is ANY or
2325 ANYCRLF, and we are now at a LF, advance the match position by one more
2326 character. */
2327
2328 if (current_subject[-1] == '\r' &&
2329 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2330 current_subject < end_subject &&
2331 *current_subject == '\n')
2332 current_subject++;
2333 }
2334 }
2335
2336 /* Or to a non-unique first char after study */
2337
2338 else if (start_bits != NULL)
2339 {
2340 while (current_subject < end_subject)
2341 {
2342 register unsigned int c = *current_subject;
2343 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2344 else break;
2345 }
2346 }
2347
2348 /* Restore fudged end_subject */
2349
2350 end_subject = save_end_subject;
2351 }
2352
2353 /* If req_byte is set, we know that that character must appear in the subject
2354 for the match to succeed. If the first character is set, req_byte must be
2355 later in the subject; otherwise the test starts at the match point. This
2356 optimization can save a huge amount of work in patterns with nested unlimited
2357 repeats that aren't going to match. Writing separate code for cased/caseless
2358 versions makes it go faster, as does using an autoincrement and backing off
2359 on a match.
2360
2361 HOWEVER: when the subject string is very, very long, searching to its end can
2362 take a long time, and give bad performance on quite ordinary patterns. This
2363 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2364 don't do this when the string is sufficiently long.
2365
2366 ALSO: this processing is disabled when partial matching is requested.
2367 */
2368
2369 if (req_byte >= 0 &&
2370 end_subject - current_subject < REQ_BYTE_MAX &&
2371 (options & PCRE_PARTIAL) == 0)
2372 {
2373 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2374
2375 /* We don't need to repeat the search if we haven't yet reached the
2376 place we found it at last time. */
2377
2378 if (p > req_byte_ptr)
2379 {
2380 if (req_byte_caseless)
2381 {
2382 while (p < end_subject)
2383 {
2384 register int pp = *p++;
2385 if (pp == req_byte || pp == req_byte2) { p--; break; }
2386 }
2387 }
2388 else
2389 {
2390 while (p < end_subject)
2391 {
2392 if (*p++ == req_byte) { p--; break; }
2393 }
2394 }
2395
2396 /* If we can't find the required character, break the matching loop,
2397 which will cause a return or PCRE_ERROR_NOMATCH. */
2398
2399 if (p >= end_subject) break;
2400
2401 /* If we have found the required character, save the point where we
2402 found it, so that we don't search again next time round the loop if
2403 the start hasn't passed this character yet. */
2404
2405 req_byte_ptr = p;
2406 }
2407 }
2408
2409 /* OK, now we can do the business */
2410
2411 rc = internal_dfa_exec(
2412 md, /* fixed match data */
2413 md->start_code, /* this subexpression's code */
2414 current_subject, /* where we currently are */
2415 start_offset, /* start offset in subject */
2416 offsets, /* offset vector */
2417 offsetcount, /* size of same */
2418 workspace, /* workspace vector */
2419 wscount, /* size of same */
2420 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2421 0, /* function recurse level */
2422 0); /* regex recurse level */
2423
2424 /* Anything other than "no match" means we are done, always; otherwise, carry
2425 on only if not anchored. */
2426
2427 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2428
2429 /* Advance to the next subject character unless we are at the end of a line
2430 and firstline is set. */
2431
2432 if (firstline && IS_NEWLINE(current_subject)) break;
2433 current_subject++;
2434 if (utf8)
2435 {
2436 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2437 current_subject++;
2438 }
2439 if (current_subject > end_subject) break;
2440
2441 /* If we have just passed a CR and the newline option is CRLF or ANY or
2442 ANYCRLF, and we are now at a LF, advance the match position by one more
2443 character. */
2444
2445 if (current_subject[-1] == '\r' &&
2446 (md->nltype == NLTYPE_ANY ||
2447 md->nltype == NLTYPE_ANYCRLF ||
2448 md->nllen == 2) &&
2449 current_subject < end_subject &&
2450 *current_subject == '\n')
2451 current_subject++;
2452
2453 } /* "Bumpalong" loop */
2454
2455 return PCRE_ERROR_NOMATCH;
2456 }
2457
2458 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12