/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 145 - (show annotations) (download)
Wed Apr 4 14:06:52 2007 UTC (7 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 81201 byte(s)
Reworked all the WIN32 __declspec stuff in the hope of getting it right.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46
47 #define NLBLOCK md /* Block containing newline information */
48 #define PSSTART start_subject /* Field containing processed string start */
49 #define PSEND end_subject /* Field containing processed string end */
50
51 #include "pcre_internal.h"
52
53
54 /* For use to indent debugging output */
55
56 #define SP " "
57
58
59
60 /*************************************************
61 * Code parameters and static tables *
62 *************************************************/
63
64 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
65 into others, under special conditions. A gap of 20 between the blocks should be
66 enough. */
67
68 #define OP_PROP_EXTRA 100
69 #define OP_EXTUNI_EXTRA 120
70 #define OP_ANYNL_EXTRA 140
71
72
73 /* This table identifies those opcodes that are followed immediately by a
74 character that is to be tested in some way. This makes is possible to
75 centralize the loading of these characters. In the case of Type * etc, the
76 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
77 small value. */
78
79 static uschar coptable[] = {
80 0, /* End */
81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
82 0, 0, /* Any, Anybyte */
83 0, 0, 0, 0, /* NOTPROP, PROP, EXTUNI, ANYNL */
84 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
85 1, /* Char */
86 1, /* Charnc */
87 1, /* not */
88 /* Positive single-char repeats */
89 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
90 3, 3, 3, /* upto, minupto, exact */
91 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
92 /* Negative single-char repeats - only for chars < 256 */
93 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
94 3, 3, 3, /* NOT upto, minupto, exact */
95 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
96 /* Positive type repeats */
97 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
98 3, 3, 3, /* Type upto, minupto, exact */
99 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
100 /* Character class & ref repeats */
101 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
102 0, 0, /* CRRANGE, CRMINRANGE */
103 0, /* CLASS */
104 0, /* NCLASS */
105 0, /* XCLASS - variable length */
106 0, /* REF */
107 0, /* RECURSE */
108 0, /* CALLOUT */
109 0, /* Alt */
110 0, /* Ket */
111 0, /* KetRmax */
112 0, /* KetRmin */
113 0, /* Assert */
114 0, /* Assert not */
115 0, /* Assert behind */
116 0, /* Assert behind not */
117 0, /* Reverse */
118 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
119 0, 0, 0, /* SBRA, SCBRA, SCOND */
120 0, /* CREF */
121 0, /* RREF */
122 0, /* DEF */
123 0, 0 /* BRAZERO, BRAMINZERO */
124 };
125
126 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
127 and \w */
128
129 static uschar toptable1[] = {
130 0, 0, 0, 0, 0,
131 ctype_digit, ctype_digit,
132 ctype_space, ctype_space,
133 ctype_word, ctype_word,
134 0 /* OP_ANY */
135 };
136
137 static uschar toptable2[] = {
138 0, 0, 0, 0, 0,
139 ctype_digit, 0,
140 ctype_space, 0,
141 ctype_word, 0,
142 1 /* OP_ANY */
143 };
144
145
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. A sizeof quotient has type
size_t; the cast to int keeps the workspace-size arithmetic that mixes this
value with the int wscount in signed arithmetic, instead of silently converting
wscount to an unsigned type. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
159
160
161 #ifdef DEBUG
162 /*************************************************
163 * Print character string *
164 *************************************************/
165
/* Character string printing function for debugging. Printable bytes (as
judged by isprint()) are written unchanged; every other byte is written as a
backslash, 'x', and at least two lower-case hex digits.

Arguments:
  p           points to string (only read, never modified)
  length      number of bytes; nothing is printed if it is <= 0
  f           where to print

Returns:     nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;                   /* unsigned char value: safe for isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
188 #endif
189
190
191
192 /*************************************************
193 * Execute a Regular Expression - DFA engine *
194 *************************************************/
195
196 /* This internal function applies a compiled pattern to a subject string,
197 starting at a given point, using a DFA engine. This function is called from the
198 external one, possibly multiple times if the pattern is not anchored. The
199 function calls itself recursively for some kinds of subpattern.
200
201 Arguments:
202 md the match_data block with fixed information
203 this_start_code the opening bracket of this subexpression's code
204 current_subject where we currently are in the subject string
205 start_offset start offset in the subject string
206 offsets vector to contain the matching string offsets
207 offsetcount size of same
208 workspace vector of workspace
209 wscount size of same
210 ims the current ims flags
211 rlevel function call recursion level
212 recursing regex recursive call level
213
214 Returns: > 0 => number of match offset pairs placed in offsets
215 = 0 => offsets overflowed; longest matches are present
216 -1 => failed to match
217 < -1 => some kind of unexpected problem
218
219 The following macros are used for adding states to the two state vectors (one
220 for the current character, one for the following character). */
221
/* Append a state to the list for the current character. x is the opcode
offset and y the repeat count; the ims field is copied from the variable ims.
The data field is not written. If the list already holds wscount entries, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE.

The macro refers implicitly to active_count, wscount, next_active_state, ims
and (in the debugging output) rlevel in the scope where it is expanded. x and y
each appear twice in the expansion, so they should not have side effects. The
do/while(0) wrapper makes the expansion a single statement that needs the usual
trailing semicolon. */

#define ADD_ACTIVE(x,y) \
  do { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
  } while (0)
232
/* Append a state to the list for the current character, also setting its data
field. x is the opcode offset, y the repeat count and z the data value; the ims
field is copied from the variable ims. If the list already holds wscount
entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE.

The macro refers implicitly to active_count, wscount, next_active_state, ims
and (in the debugging output) rlevel in the scope where it is expanded. x, y
and z each appear twice in the expansion, so they should not have side effects.
The do/while(0) wrapper makes the expansion a single statement that needs the
usual trailing semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state->data   = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
  } while (0)
244
/* Append a state to the list for the following character. x is the opcode
offset and y the repeat count; the ims field is copied from the variable ims.
The data field is not written. If the list already holds wscount entries, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE.

The macro refers implicitly to new_count, wscount, next_new_state, ims and (in
the debugging output) rlevel in the scope where it is expanded. x and y each
appear twice in the expansion, so they should not have side effects. The
do/while(0) wrapper makes the expansion a single statement that needs the usual
trailing semicolon. */

#define ADD_NEW(x,y) \
  do { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
  } while (0)
255
/* Append a state to the list for the following character, also setting its
data field. x is the opcode offset, y the repeat count and z the data value;
the ims field is copied from the variable ims. If the list already holds
wscount entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE.

The macro refers implicitly to new_count, wscount, next_new_state, ims and (in
the debugging output) rlevel in the scope where it is expanded. x, y and z each
appear twice in the expansion, so they should not have side effects. The
do/while(0) wrapper makes the expansion a single statement that needs the usual
trailing semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state->data   = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
  } while (0)
267
268 /* And now, here is the code */
269
270 static int
271 internal_dfa_exec(
272 dfa_match_data *md,
273 const uschar *this_start_code,
274 const uschar *current_subject,
275 int start_offset,
276 int *offsets,
277 int offsetcount,
278 int *workspace,
279 int wscount,
280 int ims,
281 int rlevel,
282 int recursing)
283 {
284 stateblock *active_states, *new_states, *temp_states;
285 stateblock *next_active_state, *next_new_state;
286
287 const uschar *ctypes, *lcc, *fcc;
288 const uschar *ptr;
289 const uschar *end_code, *first_op;
290
291 int active_count, new_count, match_count;
292
293 /* Some fields in the md block are frequently referenced, so we load them into
294 independent variables in the hope that this will perform better. */
295
296 const uschar *start_subject = md->start_subject;
297 const uschar *end_subject = md->end_subject;
298 const uschar *start_code = md->start_code;
299
300 #ifdef SUPPORT_UTF8
301 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
302 #else
303 BOOL utf8 = FALSE;
304 #endif
305
306 rlevel++;
307 offsetcount &= (-2);
308
309 wscount -= 2;
310 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
311 (2 * INTS_PER_STATEBLOCK);
312
313 DPRINTF(("\n%.*s---------------------\n"
314 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
315 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
316
317 ctypes = md->tables + ctypes_offset;
318 lcc = md->tables + lcc_offset;
319 fcc = md->tables + fcc_offset;
320
321 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
322
323 active_states = (stateblock *)(workspace + 2);
324 next_new_state = new_states = active_states + wscount;
325 new_count = 0;
326
327 first_op = this_start_code + 1 + LINK_SIZE +
328 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
329
330 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
331 the alternative states onto the list, and find out where the end is. This
332 makes it possible to use this function recursively, when we want to stop at a
333 matching internal ket rather than at the end.
334
335 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
336 a backward assertion. In that case, we have to find out the maximum amount to
337 move back, and set up each alternative appropriately. */
338
339 if (*first_op == OP_REVERSE)
340 {
341 int max_back = 0;
342 int gone_back;
343
344 end_code = this_start_code;
345 do
346 {
347 int back = GET(end_code, 2+LINK_SIZE);
348 if (back > max_back) max_back = back;
349 end_code += GET(end_code, 1);
350 }
351 while (*end_code == OP_ALT);
352
353 /* If we can't go back the amount required for the longest lookbehind
354 pattern, go back as far as we can; some alternatives may still be viable. */
355
356 #ifdef SUPPORT_UTF8
357 /* In character mode we have to step back character by character */
358
359 if (utf8)
360 {
361 for (gone_back = 0; gone_back < max_back; gone_back++)
362 {
363 if (current_subject <= start_subject) break;
364 current_subject--;
365 while (current_subject > start_subject &&
366 (*current_subject & 0xc0) == 0x80)
367 current_subject--;
368 }
369 }
370 else
371 #endif
372
373 /* In byte-mode we can do this quickly. */
374
375 {
376 gone_back = (current_subject - max_back < start_subject)?
377 current_subject - start_subject : max_back;
378 current_subject -= gone_back;
379 }
380
381 /* Now we can process the individual branches. */
382
383 end_code = this_start_code;
384 do
385 {
386 int back = GET(end_code, 2+LINK_SIZE);
387 if (back <= gone_back)
388 {
389 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
390 ADD_NEW_DATA(-bstate, 0, gone_back - back);
391 }
392 end_code += GET(end_code, 1);
393 }
394 while (*end_code == OP_ALT);
395 }
396
397 /* This is the code for a "normal" subpattern (not a backward assertion). The
398 start of a whole pattern is always one of these. If we are at the top level,
399 we may be asked to restart matching from the same point that we reached for a
400 previous partial match. We still have to scan through the top-level branches to
401 find the end state. */
402
403 else
404 {
405 end_code = this_start_code;
406
407 /* Restarting */
408
409 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
410 {
411 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
412 new_count = workspace[1];
413 if (!workspace[0])
414 memcpy(new_states, active_states, new_count * sizeof(stateblock));
415 }
416
417 /* Not restarting */
418
419 else
420 {
421 int length = 1 + LINK_SIZE +
422 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
423 do
424 {
425 ADD_NEW(end_code - start_code + length, 0);
426 end_code += GET(end_code, 1);
427 length = 1 + LINK_SIZE;
428 }
429 while (*end_code == OP_ALT);
430 }
431 }
432
433 workspace[0] = 0; /* Bit indicating which vector is current */
434
435 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
436
437 /* Loop for scanning the subject */
438
439 ptr = current_subject;
440 for (;;)
441 {
442 int i, j;
443 int clen, dlen;
444 unsigned int c, d;
445
446 /* Make the new state list into the active state list and empty the
447 new state list. */
448
449 temp_states = active_states;
450 active_states = new_states;
451 new_states = temp_states;
452 active_count = new_count;
453 new_count = 0;
454
455 workspace[0] ^= 1; /* Remember for the restarting feature */
456 workspace[1] = active_count;
457
458 #ifdef DEBUG
459 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
460 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
461 printf("\"\n");
462
463 printf("%.*sActive states: ", rlevel*2-2, SP);
464 for (i = 0; i < active_count; i++)
465 printf("%d/%d ", active_states[i].offset, active_states[i].count);
466 printf("\n");
467 #endif
468
469 /* Set the pointers for adding new states */
470
471 next_active_state = active_states + active_count;
472 next_new_state = new_states;
473
474 /* Load the current character from the subject outside the loop, as many
475 different states may want to look at it, and we assume that at least one
476 will. */
477
478 if (ptr < end_subject)
479 {
480 clen = 1; /* Number of bytes in the character */
481 #ifdef SUPPORT_UTF8
482 if (utf8) { GETCHARLEN(c, ptr, clen); } else
483 #endif /* SUPPORT_UTF8 */
484 c = *ptr;
485 }
486 else
487 {
488 clen = 0; /* This indicates the end of the subject */
489 c = NOTACHAR; /* This value should never actually be used */
490 }
491
492 /* Scan up the active states and act on each one. The result of an action
493 may be to add more states to the currently active list (e.g. on hitting a
494 parenthesis) or it may be to put states on the new list, for considering
495 when we move the character pointer on. */
496
497 for (i = 0; i < active_count; i++)
498 {
499 stateblock *current_state = active_states + i;
500 const uschar *code;
501 int state_offset = current_state->offset;
502 int count, codevalue;
503 int chartype, script;
504
505 #ifdef DEBUG
506 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
507 if (clen == 0) printf("EOL\n");
508 else if (c > 32 && c < 127) printf("'%c'\n", c);
509 else printf("0x%02x\n", c);
510 #endif
511
512 /* This variable is referred to implicitly in the ADD_xxx macros. */
513
514 ims = current_state->ims;
515
516 /* A negative offset is a special case meaning "hold off going to this
517 (negated) state until the number of characters in the data field have
518 been skipped". */
519
520 if (state_offset < 0)
521 {
522 if (current_state->data > 0)
523 {
524 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
525 ADD_NEW_DATA(state_offset, current_state->count,
526 current_state->data - 1);
527 continue;
528 }
529 else
530 {
531 current_state->offset = state_offset = -state_offset;
532 }
533 }
534
535 /* Check for a duplicate state with the same count, and skip if found. */
536
537 for (j = 0; j < i; j++)
538 {
539 if (active_states[j].offset == state_offset &&
540 active_states[j].count == current_state->count)
541 {
542 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
543 goto NEXT_ACTIVE_STATE;
544 }
545 }
546
547 /* The state offset is the offset to the opcode */
548
549 code = start_code + state_offset;
550 codevalue = *code;
551
552 /* If this opcode is followed by an inline character, load it. It is
553 tempting to test for the presence of a subject character here, but that
554 is wrong, because sometimes zero repetitions of the subject are
555 permitted.
556
557 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
558 argument that is not a data character - but is always one byte long.
559 Unfortunately, we have to take special action to deal with \P, \p, and
560 \X in this case. To keep the other cases fast, convert these ones to new
561 opcodes. */
562
563 if (coptable[codevalue] > 0)
564 {
565 dlen = 1;
566 #ifdef SUPPORT_UTF8
567 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
568 #endif /* SUPPORT_UTF8 */
569 d = code[coptable[codevalue]];
570 if (codevalue >= OP_TYPESTAR)
571 {
572 switch(d)
573 {
574 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
575 case OP_NOTPROP:
576 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
577 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
578 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
579 default: break;
580 }
581 }
582 }
583 else
584 {
585 dlen = 0; /* Not strictly necessary, but compilers moan */
586 d = NOTACHAR; /* if these variables are not set. */
587 }
588
589
590 /* Now process the individual opcodes */
591
592 switch (codevalue)
593 {
594
595 /* ========================================================================== */
596 /* Reached a closing bracket. If not at the end of the pattern, carry
597 on with the next opcode. Otherwise, unless we have an empty string and
598 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
599 matches so we always have the longest first. */
600
601 case OP_KET:
602 case OP_KETRMIN:
603 case OP_KETRMAX:
604 if (code != end_code)
605 {
606 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
607 if (codevalue != OP_KET)
608 {
609 ADD_ACTIVE(state_offset - GET(code, 1), 0);
610 }
611 }
612 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
613 {
614 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
615 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
616 match_count = 0;
617 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
618 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
619 if (offsetcount >= 2)
620 {
621 offsets[0] = current_subject - start_subject;
622 offsets[1] = ptr - start_subject;
623 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
624 offsets[1] - offsets[0], current_subject));
625 }
626 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
627 {
628 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
629 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
630 match_count, rlevel*2-2, SP));
631 return match_count;
632 }
633 }
634 break;
635
636 /* ========================================================================== */
637 /* These opcodes add to the current list of states without looking
638 at the current character. */
639
640 /*-----------------------------------------------------------------*/
641 case OP_ALT:
642 do { code += GET(code, 1); } while (*code == OP_ALT);
643 ADD_ACTIVE(code - start_code, 0);
644 break;
645
646 /*-----------------------------------------------------------------*/
647 case OP_BRA:
648 case OP_SBRA:
649 do
650 {
651 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
652 code += GET(code, 1);
653 }
654 while (*code == OP_ALT);
655 break;
656
657 /*-----------------------------------------------------------------*/
658 case OP_CBRA:
659 case OP_SCBRA:
660 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
661 code += GET(code, 1);
662 while (*code == OP_ALT)
663 {
664 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
665 code += GET(code, 1);
666 }
667 break;
668
669 /*-----------------------------------------------------------------*/
670 case OP_BRAZERO:
671 case OP_BRAMINZERO:
672 ADD_ACTIVE(state_offset + 1, 0);
673 code += 1 + GET(code, 2);
674 while (*code == OP_ALT) code += GET(code, 1);
675 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
676 break;
677
678 /*-----------------------------------------------------------------*/
679 case OP_CIRC:
680 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
681 ((ims & PCRE_MULTILINE) != 0 &&
682 ptr != end_subject &&
683 WAS_NEWLINE(ptr)))
684 { ADD_ACTIVE(state_offset + 1, 0); }
685 break;
686
687 /*-----------------------------------------------------------------*/
688 case OP_EOD:
689 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
690 break;
691
692 /*-----------------------------------------------------------------*/
693 case OP_OPT:
694 ims = code[1];
695 ADD_ACTIVE(state_offset + 2, 0);
696 break;
697
698 /*-----------------------------------------------------------------*/
699 case OP_SOD:
700 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
701 break;
702
703 /*-----------------------------------------------------------------*/
704 case OP_SOM:
705 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
706 break;
707
708
709 /* ========================================================================== */
710 /* These opcodes inspect the next subject character, and sometimes
711 the previous one as well, but do not have an argument. The variable
712 clen contains the length of the current character and is zero if we are
713 at the end of the subject. */
714
715 /*-----------------------------------------------------------------*/
716 case OP_ANY:
717 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
718 { ADD_NEW(state_offset + 1, 0); }
719 break;
720
721 /*-----------------------------------------------------------------*/
722 case OP_EODN:
723 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
724 { ADD_ACTIVE(state_offset + 1, 0); }
725 break;
726
727 /*-----------------------------------------------------------------*/
728 case OP_DOLL:
729 if ((md->moptions & PCRE_NOTEOL) == 0)
730 {
731 if (clen == 0 ||
732 (IS_NEWLINE(ptr) &&
733 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
734 ))
735 { ADD_ACTIVE(state_offset + 1, 0); }
736 }
737 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
738 { ADD_ACTIVE(state_offset + 1, 0); }
739 break;
740
741 /*-----------------------------------------------------------------*/
742
743 case OP_DIGIT:
744 case OP_WHITESPACE:
745 case OP_WORDCHAR:
746 if (clen > 0 && c < 256 &&
747 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
748 { ADD_NEW(state_offset + 1, 0); }
749 break;
750
751 /*-----------------------------------------------------------------*/
752 case OP_NOT_DIGIT:
753 case OP_NOT_WHITESPACE:
754 case OP_NOT_WORDCHAR:
755 if (clen > 0 && (c >= 256 ||
756 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
757 { ADD_NEW(state_offset + 1, 0); }
758 break;
759
760 /*-----------------------------------------------------------------*/
761 case OP_WORD_BOUNDARY:
762 case OP_NOT_WORD_BOUNDARY:
763 {
764 int left_word, right_word;
765
766 if (ptr > start_subject)
767 {
768 const uschar *temp = ptr - 1;
769 #ifdef SUPPORT_UTF8
770 if (utf8) BACKCHAR(temp);
771 #endif
772 GETCHARTEST(d, temp);
773 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
774 }
775 else left_word = 0;
776
777 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
778 else right_word = 0;
779
780 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
781 { ADD_ACTIVE(state_offset + 1, 0); }
782 }
783 break;
784
785
786 #ifdef SUPPORT_UCP
787
788 /*-----------------------------------------------------------------*/
789 /* Check the next character by Unicode property. We will get here only
790 if the support is in the binary; otherwise a compile-time error occurs.
791 */
792
793 case OP_PROP:
794 case OP_NOTPROP:
795 if (clen > 0)
796 {
797 BOOL OK;
798 int category = _pcre_ucp_findprop(c, &chartype, &script);
799 switch(code[1])
800 {
801 case PT_ANY:
802 OK = TRUE;
803 break;
804
805 case PT_LAMP:
806 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
807 break;
808
809 case PT_GC:
810 OK = category == code[2];
811 break;
812
813 case PT_PC:
814 OK = chartype == code[2];
815 break;
816
817 case PT_SC:
818 OK = script == code[2];
819 break;
820
821 /* Should never occur, but keep compilers from grumbling. */
822
823 default:
824 OK = codevalue != OP_PROP;
825 break;
826 }
827
828 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
829 }
830 break;
831 #endif
832
833
834
835 /* ========================================================================== */
836 /* These opcodes likewise inspect the subject character, but have an
837 argument that is not a data character. It is one of these opcodes:
838 OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
839 OP_NOT_WORDCHAR. The value is loaded into d. */
840
841 case OP_TYPEPLUS:
842 case OP_TYPEMINPLUS:
843 case OP_TYPEPOSPLUS:
844 count = current_state->count; /* Already matched */
845 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
846 if (clen > 0)
847 {
848 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
849 (c < 256 &&
850 (d != OP_ANY ||
851 (ims & PCRE_DOTALL) != 0 ||
852 !IS_NEWLINE(ptr)
853 ) &&
854 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
855 {
856 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
857 {
858 active_count--; /* Remove non-match possibility */
859 next_active_state--;
860 }
861 count++;
862 ADD_NEW(state_offset, count);
863 }
864 }
865 break;
866
867 /*-----------------------------------------------------------------*/
868 case OP_TYPEQUERY:
869 case OP_TYPEMINQUERY:
870 case OP_TYPEPOSQUERY:
871 ADD_ACTIVE(state_offset + 2, 0);
872 if (clen > 0)
873 {
874 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
875 (c < 256 &&
876 (d != OP_ANY ||
877 (ims & PCRE_DOTALL) != 0 ||
878 !IS_NEWLINE(ptr)
879 ) &&
880 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
881 {
882 if (codevalue == OP_TYPEPOSQUERY)
883 {
884 active_count--; /* Remove non-match possibility */
885 next_active_state--;
886 }
887 ADD_NEW(state_offset + 2, 0);
888 }
889 }
890 break;
891
892 /*-----------------------------------------------------------------*/
893 case OP_TYPESTAR:
894 case OP_TYPEMINSTAR:
895 case OP_TYPEPOSSTAR:
896 ADD_ACTIVE(state_offset + 2, 0);
897 if (clen > 0)
898 {
899 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
900 (c < 256 &&
901 (d != OP_ANY ||
902 (ims & PCRE_DOTALL) != 0 ||
903 !IS_NEWLINE(ptr)
904 ) &&
905 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
906 {
907 if (codevalue == OP_TYPEPOSSTAR)
908 {
909 active_count--; /* Remove non-match possibility */
910 next_active_state--;
911 }
912 ADD_NEW(state_offset, 0);
913 }
914 }
915 break;
916
917 /*-----------------------------------------------------------------*/
918 case OP_TYPEEXACT:
919 count = current_state->count; /* Number already matched */
920 if (clen > 0)
921 {
922 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
923 (c < 256 &&
924 (d != OP_ANY ||
925 (ims & PCRE_DOTALL) != 0 ||
926 !IS_NEWLINE(ptr)
927 ) &&
928 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
929 {
930 if (++count >= GET2(code, 1))
931 { ADD_NEW(state_offset + 4, 0); }
932 else
933 { ADD_NEW(state_offset, count); }
934 }
935 }
936 break;
937
938 /*-----------------------------------------------------------------*/
939 case OP_TYPEUPTO:
940 case OP_TYPEMINUPTO:
941 case OP_TYPEPOSUPTO:
942 ADD_ACTIVE(state_offset + 4, 0);
943 count = current_state->count; /* Number already matched */
944 if (clen > 0)
945 {
946 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
947 (c < 256 &&
948 (d != OP_ANY ||
949 (ims & PCRE_DOTALL) != 0 ||
950 !IS_NEWLINE(ptr)
951 ) &&
952 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
953 {
954 if (codevalue == OP_TYPEPOSUPTO)
955 {
956 active_count--; /* Remove non-match possibility */
957 next_active_state--;
958 }
959 if (++count >= GET2(code, 1))
960 { ADD_NEW(state_offset + 4, 0); }
961 else
962 { ADD_NEW(state_offset, count); }
963 }
964 }
965 break;
966
967 /* ========================================================================== */
968 /* These are virtual opcodes that are used when something like
969 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
970 argument. It keeps the code above fast for the other cases. The argument
971 is in the d variable. */
972
973 case OP_PROP_EXTRA + OP_TYPEPLUS:
974 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
975 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
976 count = current_state->count; /* Already matched */
977 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
978 if (clen > 0)
979 {
980 BOOL OK;
981 int category = _pcre_ucp_findprop(c, &chartype, &script);
982 switch(code[2])
983 {
984 case PT_ANY:
985 OK = TRUE;
986 break;
987
988 case PT_LAMP:
989 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
990 break;
991
992 case PT_GC:
993 OK = category == code[3];
994 break;
995
996 case PT_PC:
997 OK = chartype == code[3];
998 break;
999
1000 case PT_SC:
1001 OK = script == code[3];
1002 break;
1003
1004 /* Should never occur, but keep compilers from grumbling. */
1005
1006 default:
1007 OK = codevalue != OP_PROP;
1008 break;
1009 }
1010
1011 if (OK == (d == OP_PROP))
1012 {
1013 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1014 {
1015 active_count--; /* Remove non-match possibility */
1016 next_active_state--;
1017 }
1018 count++;
1019 ADD_NEW(state_offset, count);
1020 }
1021 }
1022 break;
1023
1024 /*-----------------------------------------------------------------*/
1025 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1026 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1027 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1028 count = current_state->count; /* Already matched */
1029 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1030 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1031 {
1032 const uschar *nptr = ptr + clen;
1033 int ncount = 0;
1034 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1035 {
1036 active_count--; /* Remove non-match possibility */
1037 next_active_state--;
1038 }
1039 while (nptr < end_subject)
1040 {
1041 int nd;
1042 int ndlen = 1;
1043 GETCHARLEN(nd, nptr, ndlen);
1044 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1045 ncount++;
1046 nptr += ndlen;
1047 }
1048 count++;
1049 ADD_NEW_DATA(-state_offset, count, ncount);
1050 }
1051 break;
1052
1053 /*-----------------------------------------------------------------*/
1054 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1055 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1056 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1057 count = current_state->count; /* Already matched */
1058 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1059 if (clen > 0)
1060 {
1061 int ncount = 0;
1062 switch (c)
1063 {
1064 case 0x000d:
1065 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1066 /* Fall through */
1067 case 0x000a:
1068 case 0x000b:
1069 case 0x000c:
1070 case 0x0085:
1071 case 0x2028:
1072 case 0x2029:
1073 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1074 {
1075 active_count--; /* Remove non-match possibility */
1076 next_active_state--;
1077 }
1078 count++;
1079 ADD_NEW_DATA(-state_offset, count, ncount);
1080 break;
1081 default:
1082 break;
1083 }
1084 }
1085 break;
1086
1087 /*-----------------------------------------------------------------*/
1088 case OP_PROP_EXTRA + OP_TYPEQUERY:
1089 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1090 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1091 count = 4;
1092 goto QS1;
1093
1094 case OP_PROP_EXTRA + OP_TYPESTAR:
1095 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1096 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1097 count = 0;
1098
1099 QS1:
1100
1101 ADD_ACTIVE(state_offset + 4, 0);
1102 if (clen > 0)
1103 {
1104 BOOL OK;
1105 int category = _pcre_ucp_findprop(c, &chartype, &script);
1106 switch(code[2])
1107 {
1108 case PT_ANY:
1109 OK = TRUE;
1110 break;
1111
1112 case PT_LAMP:
1113 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1114 break;
1115
1116 case PT_GC:
1117 OK = category == code[3];
1118 break;
1119
1120 case PT_PC:
1121 OK = chartype == code[3];
1122 break;
1123
1124 case PT_SC:
1125 OK = script == code[3];
1126 break;
1127
1128 /* Should never occur, but keep compilers from grumbling. */
1129
1130 default:
1131 OK = codevalue != OP_PROP;
1132 break;
1133 }
1134
1135 if (OK == (d == OP_PROP))
1136 {
1137 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1138 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1139 {
1140 active_count--; /* Remove non-match possibility */
1141 next_active_state--;
1142 }
1143 ADD_NEW(state_offset + count, 0);
1144 }
1145 }
1146 break;
1147
1148 /*-----------------------------------------------------------------*/
1149 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1150 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1151 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1152 count = 2;
1153 goto QS2;
1154
1155 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1156 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1157 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1158 count = 0;
1159
1160 QS2:
1161
1162 ADD_ACTIVE(state_offset + 2, 0);
1163 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1164 {
1165 const uschar *nptr = ptr + clen;
1166 int ncount = 0;
1167 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1168 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1169 {
1170 active_count--; /* Remove non-match possibility */
1171 next_active_state--;
1172 }
1173 while (nptr < end_subject)
1174 {
1175 int nd;
1176 int ndlen = 1;
1177 GETCHARLEN(nd, nptr, ndlen);
1178 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1179 ncount++;
1180 nptr += ndlen;
1181 }
1182 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1183 }
1184 break;
1185
1186 /*-----------------------------------------------------------------*/
1187 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1188 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1189 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1190 count = 2;
1191 goto QS3;
1192
1193 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1194 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1195 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1196 count = 0;
1197
1198 QS3:
1199 ADD_ACTIVE(state_offset + 2, 0);
1200 if (clen > 0)
1201 {
1202 int ncount = 0;
1203 switch (c)
1204 {
1205 case 0x000d:
1206 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1207 /* Fall through */
1208 case 0x000a:
1209 case 0x000b:
1210 case 0x000c:
1211 case 0x0085:
1212 case 0x2028:
1213 case 0x2029:
1214 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1215 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1216 {
1217 active_count--; /* Remove non-match possibility */
1218 next_active_state--;
1219 }
1220 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1221 break;
1222 default:
1223 break;
1224 }
1225 }
1226 break;
1227
1228 /*-----------------------------------------------------------------*/
1229 case OP_PROP_EXTRA + OP_TYPEEXACT:
1230 case OP_PROP_EXTRA + OP_TYPEUPTO:
1231 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1232 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1233 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1234 { ADD_ACTIVE(state_offset + 6, 0); }
1235 count = current_state->count; /* Number already matched */
1236 if (clen > 0)
1237 {
1238 BOOL OK;
1239 int category = _pcre_ucp_findprop(c, &chartype, &script);
1240 switch(code[4])
1241 {
1242 case PT_ANY:
1243 OK = TRUE;
1244 break;
1245
1246 case PT_LAMP:
1247 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1248 break;
1249
1250 case PT_GC:
1251 OK = category == code[5];
1252 break;
1253
1254 case PT_PC:
1255 OK = chartype == code[5];
1256 break;
1257
1258 case PT_SC:
1259 OK = script == code[5];
1260 break;
1261
1262 /* Should never occur, but keep compilers from grumbling. */
1263
1264 default:
1265 OK = codevalue != OP_PROP;
1266 break;
1267 }
1268
1269 if (OK == (d == OP_PROP))
1270 {
1271 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1272 {
1273 active_count--; /* Remove non-match possibility */
1274 next_active_state--;
1275 }
1276 if (++count >= GET2(code, 1))
1277 { ADD_NEW(state_offset + 6, 0); }
1278 else
1279 { ADD_NEW(state_offset, count); }
1280 }
1281 }
1282 break;
1283
1284 /*-----------------------------------------------------------------*/
1285 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1286 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1287 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1288 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1289 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1290 { ADD_ACTIVE(state_offset + 4, 0); }
1291 count = current_state->count; /* Number already matched */
1292 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1293 {
1294 const uschar *nptr = ptr + clen;
1295 int ncount = 0;
1296 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1297 {
1298 active_count--; /* Remove non-match possibility */
1299 next_active_state--;
1300 }
1301 while (nptr < end_subject)
1302 {
1303 int nd;
1304 int ndlen = 1;
1305 GETCHARLEN(nd, nptr, ndlen);
1306 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1307 ncount++;
1308 nptr += ndlen;
1309 }
1310 if (++count >= GET2(code, 1))
1311 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1312 else
1313 { ADD_NEW_DATA(-state_offset, count, ncount); }
1314 }
1315 break;
1316
1317 /*-----------------------------------------------------------------*/
1318 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1319 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1320 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1321 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1322 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1323 { ADD_ACTIVE(state_offset + 4, 0); }
1324 count = current_state->count; /* Number already matched */
1325 if (clen > 0)
1326 {
1327 int ncount = 0;
1328 switch (c)
1329 {
1330 case 0x000d:
1331 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1332 /* Fall through */
1333 case 0x000a:
1334 case 0x000b:
1335 case 0x000c:
1336 case 0x0085:
1337 case 0x2028:
1338 case 0x2029:
1339 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1340 {
1341 active_count--; /* Remove non-match possibility */
1342 next_active_state--;
1343 }
1344 if (++count >= GET2(code, 1))
1345 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1346 else
1347 { ADD_NEW_DATA(-state_offset, count, ncount); }
1348 break;
1349 default:
1350 break;
1351 }
1352 }
1353 break;
1354
1355 /* ========================================================================== */
1356 /* These opcodes are followed by a character that is usually compared
1357 to the current subject character; it is loaded into d. We still get
1358 here even if there is no subject character, because in some cases zero
1359 repetitions are permitted. */
1360
1361 /*-----------------------------------------------------------------*/
1362 case OP_CHAR:
1363 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1364 break;
1365
1366 /*-----------------------------------------------------------------*/
1367 case OP_CHARNC:
1368 if (clen == 0) break;
1369
1370 #ifdef SUPPORT_UTF8
1371 if (utf8)
1372 {
1373 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1374 {
1375 unsigned int othercase;
1376 if (c < 128) othercase = fcc[c]; else
1377
1378 /* If we have Unicode property support, we can use it to test the
1379 other case of the character. */
1380
1381 #ifdef SUPPORT_UCP
1382 othercase = _pcre_ucp_othercase(c);
1383 #else
1384 othercase = NOTACHAR;
1385 #endif
1386
1387 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1388 }
1389 }
1390 else
1391 #endif /* SUPPORT_UTF8 */
1392
1393 /* Non-UTF-8 mode */
1394 {
1395 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1396 }
1397 break;
1398
1399
1400 #ifdef SUPPORT_UCP
1401 /*-----------------------------------------------------------------*/
1402 /* This is a tricky one because it can match more than one character.
1403 Find out how many characters to skip, and then set up a negative state
1404 to wait for them to pass before continuing. */
1405
1406 case OP_EXTUNI:
1407 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1408 {
1409 const uschar *nptr = ptr + clen;
1410 int ncount = 0;
1411 while (nptr < end_subject)
1412 {
1413 int nclen = 1;
1414 GETCHARLEN(c, nptr, nclen);
1415 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1416 ncount++;
1417 nptr += nclen;
1418 }
1419 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1420 }
1421 break;
1422 #endif
1423
1424 /*-----------------------------------------------------------------*/
1425 /* This is tricky, like EXTUNI, because it too can match more than one
1426 character (when CR is followed by LF). In this case, set up a negative
1427 state to wait for one character to pass before continuing. */
1428
1429 case OP_ANYNL:
1430 if (clen > 0) switch(c)
1431 {
1432 case 0x000a:
1433 case 0x000b:
1434 case 0x000c:
1435 case 0x0085:
1436 case 0x2028:
1437 case 0x2029:
1438 ADD_NEW(state_offset + 1, 0);
1439 break;
1440 case 0x000d:
1441 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1442 {
1443 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1444 }
1445 else
1446 {
1447 ADD_NEW(state_offset + 1, 0);
1448 }
1449 break;
1450 }
1451 break;
1452
1453 /*-----------------------------------------------------------------*/
1454 /* Match a negated single character. This is only used for one-byte
1455 characters, that is, we know that d < 256. The character we are
1456 checking (c) can be multibyte. */
1457
1458 case OP_NOT:
1459 if (clen > 0)
1460 {
1461 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1462 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1463 }
1464 break;
1465
1466 /*-----------------------------------------------------------------*/
1467 case OP_PLUS:
1468 case OP_MINPLUS:
1469 case OP_POSPLUS:
1470 case OP_NOTPLUS:
1471 case OP_NOTMINPLUS:
1472 case OP_NOTPOSPLUS:
1473 count = current_state->count; /* Already matched */
1474 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1475 if (clen > 0)
1476 {
1477 unsigned int otherd = NOTACHAR;
1478 if ((ims & PCRE_CASELESS) != 0)
1479 {
1480 #ifdef SUPPORT_UTF8
1481 if (utf8 && d >= 128)
1482 {
1483 #ifdef SUPPORT_UCP
1484 otherd = _pcre_ucp_othercase(d);
1485 #endif /* SUPPORT_UCP */
1486 }
1487 else
1488 #endif /* SUPPORT_UTF8 */
1489 otherd = fcc[d];
1490 }
1491 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1492 {
1493 if (count > 0 &&
1494 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1495 {
1496 active_count--; /* Remove non-match possibility */
1497 next_active_state--;
1498 }
1499 count++;
1500 ADD_NEW(state_offset, count);
1501 }
1502 }
1503 break;
1504
1505 /*-----------------------------------------------------------------*/
1506 case OP_QUERY:
1507 case OP_MINQUERY:
1508 case OP_POSQUERY:
1509 case OP_NOTQUERY:
1510 case OP_NOTMINQUERY:
1511 case OP_NOTPOSQUERY:
1512 ADD_ACTIVE(state_offset + dlen + 1, 0);
1513 if (clen > 0)
1514 {
1515 unsigned int otherd = NOTACHAR;
1516 if ((ims & PCRE_CASELESS) != 0)
1517 {
1518 #ifdef SUPPORT_UTF8
1519 if (utf8 && d >= 128)
1520 {
1521 #ifdef SUPPORT_UCP
1522 otherd = _pcre_ucp_othercase(d);
1523 #endif /* SUPPORT_UCP */
1524 }
1525 else
1526 #endif /* SUPPORT_UTF8 */
1527 otherd = fcc[d];
1528 }
1529 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1530 {
1531 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1532 {
1533 active_count--; /* Remove non-match possibility */
1534 next_active_state--;
1535 }
1536 ADD_NEW(state_offset + dlen + 1, 0);
1537 }
1538 }
1539 break;
1540
1541 /*-----------------------------------------------------------------*/
1542 case OP_STAR:
1543 case OP_MINSTAR:
1544 case OP_POSSTAR:
1545 case OP_NOTSTAR:
1546 case OP_NOTMINSTAR:
1547 case OP_NOTPOSSTAR:
1548 ADD_ACTIVE(state_offset + dlen + 1, 0);
1549 if (clen > 0)
1550 {
1551 unsigned int otherd = NOTACHAR;
1552 if ((ims & PCRE_CASELESS) != 0)
1553 {
1554 #ifdef SUPPORT_UTF8
1555 if (utf8 && d >= 128)
1556 {
1557 #ifdef SUPPORT_UCP
1558 otherd = _pcre_ucp_othercase(d);
1559 #endif /* SUPPORT_UCP */
1560 }
1561 else
1562 #endif /* SUPPORT_UTF8 */
1563 otherd = fcc[d];
1564 }
1565 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1566 {
1567 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1568 {
1569 active_count--; /* Remove non-match possibility */
1570 next_active_state--;
1571 }
1572 ADD_NEW(state_offset, 0);
1573 }
1574 }
1575 break;
1576
1577 /*-----------------------------------------------------------------*/
1578 case OP_EXACT:
1579 case OP_NOTEXACT:
1580 count = current_state->count; /* Number already matched */
1581 if (clen > 0)
1582 {
1583 unsigned int otherd = NOTACHAR;
1584 if ((ims & PCRE_CASELESS) != 0)
1585 {
1586 #ifdef SUPPORT_UTF8
1587 if (utf8 && d >= 128)
1588 {
1589 #ifdef SUPPORT_UCP
1590 otherd = _pcre_ucp_othercase(d);
1591 #endif /* SUPPORT_UCP */
1592 }
1593 else
1594 #endif /* SUPPORT_UTF8 */
1595 otherd = fcc[d];
1596 }
1597 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1598 {
1599 if (++count >= GET2(code, 1))
1600 { ADD_NEW(state_offset + dlen + 3, 0); }
1601 else
1602 { ADD_NEW(state_offset, count); }
1603 }
1604 }
1605 break;
1606
1607 /*-----------------------------------------------------------------*/
1608 case OP_UPTO:
1609 case OP_MINUPTO:
1610 case OP_POSUPTO:
1611 case OP_NOTUPTO:
1612 case OP_NOTMINUPTO:
1613 case OP_NOTPOSUPTO:
1614 ADD_ACTIVE(state_offset + dlen + 3, 0);
1615 count = current_state->count; /* Number already matched */
1616 if (clen > 0)
1617 {
1618 unsigned int otherd = NOTACHAR;
1619 if ((ims & PCRE_CASELESS) != 0)
1620 {
1621 #ifdef SUPPORT_UTF8
1622 if (utf8 && d >= 128)
1623 {
1624 #ifdef SUPPORT_UCP
1625 otherd = _pcre_ucp_othercase(d);
1626 #endif /* SUPPORT_UCP */
1627 }
1628 else
1629 #endif /* SUPPORT_UTF8 */
1630 otherd = fcc[d];
1631 }
1632 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1633 {
1634 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
1635 {
1636 active_count--; /* Remove non-match possibility */
1637 next_active_state--;
1638 }
1639 if (++count >= GET2(code, 1))
1640 { ADD_NEW(state_offset + dlen + 3, 0); }
1641 else
1642 { ADD_NEW(state_offset, count); }
1643 }
1644 }
1645 break;
1646
1647
1648 /* ========================================================================== */
1649 /* These are the class-handling opcodes */
1650
1651 case OP_CLASS:
1652 case OP_NCLASS:
1653 case OP_XCLASS:
1654 {
1655 BOOL isinclass = FALSE;
1656 int next_state_offset;
1657 const uschar *ecode;
1658
1659 /* For a simple class, there is always just a 32-byte table, and we
1660 can set isinclass from it. */
1661
1662 if (codevalue != OP_XCLASS)
1663 {
1664 ecode = code + 33;
1665 if (clen > 0)
1666 {
1667 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1668 ((code[1 + c/8] & (1 << (c&7))) != 0);
1669 }
1670 }
1671
1672 /* An extended class may have a table or a list of single characters,
1673 ranges, or both, and it may be positive or negative. There's a
1674 function that sorts all this out. */
1675
1676 else
1677 {
1678 ecode = code + GET(code, 1);
1679 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1680 }
1681
1682 /* At this point, isinclass is set for all kinds of class, and ecode
1683 points to the byte after the end of the class. If there is a
1684 quantifier, this is where it will be. */
1685
1686 next_state_offset = ecode - start_code;
1687
1688 switch (*ecode)
1689 {
1690 case OP_CRSTAR:
1691 case OP_CRMINSTAR:
1692 ADD_ACTIVE(next_state_offset + 1, 0);
1693 if (isinclass) { ADD_NEW(state_offset, 0); }
1694 break;
1695
1696 case OP_CRPLUS:
1697 case OP_CRMINPLUS:
1698 count = current_state->count; /* Already matched */
1699 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1700 if (isinclass) { count++; ADD_NEW(state_offset, count); }
1701 break;
1702
1703 case OP_CRQUERY:
1704 case OP_CRMINQUERY:
1705 ADD_ACTIVE(next_state_offset + 1, 0);
1706 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1707 break;
1708
1709 case OP_CRRANGE:
1710 case OP_CRMINRANGE:
1711 count = current_state->count; /* Already matched */
1712 if (count >= GET2(ecode, 1))
1713 { ADD_ACTIVE(next_state_offset + 5, 0); }
1714 if (isinclass)
1715 {
1716 int max = GET2(ecode, 3);
1717 if (++count >= max && max != 0) /* Max 0 => no limit */
1718 { ADD_NEW(next_state_offset + 5, 0); }
1719 else
1720 { ADD_NEW(state_offset, count); }
1721 }
1722 break;
1723
1724 default:
1725 if (isinclass) { ADD_NEW(next_state_offset, 0); }
1726 break;
1727 }
1728 }
1729 break;
1730
1731 /* ========================================================================== */
1732 /* These are the opcodes for fancy brackets of various kinds. We have
1733 to use recursion in order to handle them. */
1734
1735 case OP_ASSERT:
1736 case OP_ASSERT_NOT:
1737 case OP_ASSERTBACK:
1738 case OP_ASSERTBACK_NOT:
1739 {
1740 int rc;
1741 int local_offsets[2];
1742 int local_workspace[1000];
1743 const uschar *endasscode = code + GET(code, 1);
1744
1745 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1746
1747 rc = internal_dfa_exec(
1748 md, /* static match data */
1749 code, /* this subexpression's code */
1750 ptr, /* where we currently are */
1751 ptr - start_subject, /* start offset */
1752 local_offsets, /* offset vector */
1753 sizeof(local_offsets)/sizeof(int), /* size of same */
1754 local_workspace, /* workspace vector */
1755 sizeof(local_workspace)/sizeof(int), /* size of same */
1756 ims, /* the current ims flags */
1757 rlevel, /* function recursion level */
1758 recursing); /* pass on regex recursion */
1759
1760 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1761 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1762 }
1763 break;
1764
1765 /*-----------------------------------------------------------------*/
1766 case OP_COND:
1767 case OP_SCOND:
1768 {
1769 int local_offsets[1000];
1770 int local_workspace[1000];
1771 int condcode = code[LINK_SIZE+1];
1772
1773 /* Back reference conditions are not supported */
1774
1775 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
1776
1777 /* The DEFINE condition is always false */
1778
1779 if (condcode == OP_DEF)
1780 {
1781 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
1782 }
1783
1784 /* The only supported version of OP_RREF is for the value RREF_ANY,
1785 which means "test if in any recursion". We can't test for specifically
1786 recursed groups. */
1787
1788 else if (condcode == OP_RREF)
1789 {
1790 int value = GET2(code, LINK_SIZE+2);
1791 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
1792 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1793 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1794 }
1795
1796 /* Otherwise, the condition is an assertion */
1797
1798 else
1799 {
1800 int rc;
1801 const uschar *asscode = code + LINK_SIZE + 1;
1802 const uschar *endasscode = asscode + GET(asscode, 1);
1803
1804 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1805
1806 rc = internal_dfa_exec(
1807 md, /* fixed match data */
1808 asscode, /* this subexpression's code */
1809 ptr, /* where we currently are */
1810 ptr - start_subject, /* start offset */
1811 local_offsets, /* offset vector */
1812 sizeof(local_offsets)/sizeof(int), /* size of same */
1813 local_workspace, /* workspace vector */
1814 sizeof(local_workspace)/sizeof(int), /* size of same */
1815 ims, /* the current ims flags */
1816 rlevel, /* function recursion level */
1817 recursing); /* pass on regex recursion */
1818
1819 if ((rc >= 0) ==
1820 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1821 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1822 else
1823 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1824 }
1825 }
1826 break;
1827
1828 /*-----------------------------------------------------------------*/
1829 case OP_RECURSE:
1830 {
1831 int local_offsets[1000];
1832 int local_workspace[1000];
1833 int rc;
1834
1835 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1836 recursing + 1));
1837
1838 rc = internal_dfa_exec(
1839 md, /* fixed match data */
1840 start_code + GET(code, 1), /* this subexpression's code */
1841 ptr, /* where we currently are */
1842 ptr - start_subject, /* start offset */
1843 local_offsets, /* offset vector */
1844 sizeof(local_offsets)/sizeof(int), /* size of same */
1845 local_workspace, /* workspace vector */
1846 sizeof(local_workspace)/sizeof(int), /* size of same */
1847 ims, /* the current ims flags */
1848 rlevel, /* function recursion level */
1849 recursing + 1); /* regex recurse level */
1850
1851 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1852 recursing + 1, rc));
1853
1854 /* Ran out of internal offsets */
1855
1856 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1857
1858 /* For each successful matched substring, set up the next state with a
1859 count of characters to skip before trying it. Note that the count is in
1860 characters, not bytes. */
1861
1862 if (rc > 0)
1863 {
1864 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1865 {
1866 const uschar *p = start_subject + local_offsets[rc];
1867 const uschar *pp = start_subject + local_offsets[rc+1];
1868 int charcount = local_offsets[rc+1] - local_offsets[rc];
1869 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1870 if (charcount > 0)
1871 {
1872 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1873 }
1874 else
1875 {
1876 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1877 }
1878 }
1879 }
1880 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1881 }
1882 break;
1883
1884 /*-----------------------------------------------------------------*/
1885 case OP_ONCE:
1886 {
1887 int local_offsets[2];
1888 int local_workspace[1000];
1889
1890 int rc = internal_dfa_exec(
1891 md, /* fixed match data */
1892 code, /* this subexpression's code */
1893 ptr, /* where we currently are */
1894 ptr - start_subject, /* start offset */
1895 local_offsets, /* offset vector */
1896 sizeof(local_offsets)/sizeof(int), /* size of same */
1897 local_workspace, /* workspace vector */
1898 sizeof(local_workspace)/sizeof(int), /* size of same */
1899 ims, /* the current ims flags */
1900 rlevel, /* function recursion level */
1901 recursing); /* pass on regex recursion */
1902
1903 if (rc >= 0)
1904 {
1905 const uschar *end_subpattern = code;
1906 int charcount = local_offsets[1] - local_offsets[0];
1907 int next_state_offset, repeat_state_offset;
1908
1909 do { end_subpattern += GET(end_subpattern, 1); }
1910 while (*end_subpattern == OP_ALT);
1911 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1912
1913 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1914 arrange for the repeat state also to be added to the relevant list.
1915 Calculate the offset, or set -1 for no repeat. */
1916
1917 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1918 *end_subpattern == OP_KETRMIN)?
1919 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1920
1921 /* If we have matched an empty string, add the next state at the
1922 current character pointer. This is important so that the duplicate
1923 checking kicks in, which is what breaks infinite loops that match an
1924 empty string. */
1925
1926 if (charcount == 0)
1927 {
1928 ADD_ACTIVE(next_state_offset, 0);
1929 }
1930
1931 /* Optimization: if there are no more active states, and there
1932 are no new states yet set up, then skip over the subject string
1933 right here, to save looping. Otherwise, set up the new state to swing
1934 into action when the end of the substring is reached. */
1935
1936 else if (i + 1 >= active_count && new_count == 0)
1937 {
1938 ptr += charcount;
1939 clen = 0;
1940 ADD_NEW(next_state_offset, 0);
1941
1942 /* If we are adding a repeat state at the new character position,
1943 we must fudge things so that it is the only current state.
1944 Otherwise, it might be a duplicate of one we processed before, and
1945 that would cause it to be skipped. */
1946
1947 if (repeat_state_offset >= 0)
1948 {
1949 next_active_state = active_states;
1950 active_count = 0;
1951 i = -1;
1952 ADD_ACTIVE(repeat_state_offset, 0);
1953 }
1954 }
1955 else
1956 {
1957 const uschar *p = start_subject + local_offsets[0];
1958 const uschar *pp = start_subject + local_offsets[1];
1959 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1960 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1961 if (repeat_state_offset >= 0)
1962 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1963 }
1964
1965 }
1966 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1967 }
1968 break;
1969
1970
1971 /* ========================================================================== */
1972 /* Handle callouts */
1973
1974 case OP_CALLOUT:
1975 if (pcre_callout != NULL)
1976 {
1977 int rrc;
1978 pcre_callout_block cb;
1979 cb.version = 1; /* Version 1 of the callout block */
1980 cb.callout_number = code[1];
1981 cb.offset_vector = offsets;
1982 cb.subject = (PCRE_SPTR)start_subject;
1983 cb.subject_length = end_subject - start_subject;
1984 cb.start_match = current_subject - start_subject;
1985 cb.current_position = ptr - start_subject;
1986 cb.pattern_position = GET(code, 2);
1987 cb.next_item_length = GET(code, 2 + LINK_SIZE);
1988 cb.capture_top = 1;
1989 cb.capture_last = -1;
1990 cb.callout_data = md->callout_data;
1991 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
1992 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
1993 }
1994 break;
1995
1996
1997 /* ========================================================================== */
1998 default: /* Unsupported opcode */
1999 return PCRE_ERROR_DFA_UITEM;
2000 }
2001
2002 NEXT_ACTIVE_STATE: continue;
2003
2004 } /* End of loop scanning active states */
2005
2006 /* We have finished the processing at the current subject character. If no
2007 new states have been set for the next character, we have found all the
2008 matches that we are going to find. If we are at the top level and partial
2009 matching has been requested, check for appropriate conditions. */
2010
2011 if (new_count <= 0)
2012 {
2013 if (match_count < 0 && /* No matches found */
2014 rlevel == 1 && /* Top level match function */
2015 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2016 ptr >= end_subject && /* Reached end of subject */
2017 ptr > current_subject) /* Matched non-empty string */
2018 {
2019 if (offsetcount >= 2)
2020 {
2021 offsets[0] = current_subject - start_subject;
2022 offsets[1] = end_subject - start_subject;
2023 }
2024 match_count = PCRE_ERROR_PARTIAL;
2025 }
2026
2027 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2028 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2029 rlevel*2-2, SP));
2030 break; /* In effect, "return", but see the comment below */
2031 }
2032
2033 /* One or more states are active for the next character. */
2034
2035 ptr += clen; /* Advance to next subject character */
2036 } /* Loop to move along the subject string */
2037
2038 /* Control gets here from "break" a few lines above. We do it this way because
2039 if we use "return" above, we have compiler trouble. Some compilers warn if
2040 there's nothing here because they think the function doesn't return a value. On
2041 the other hand, if we put a dummy statement here, some more clever compilers
2042 complain that it can't be reached. Sigh. */
2043
2044 return match_count;
2045 }
2046
2047
2048
2049
2050 /*************************************************
2051 * Execute a Regular Expression - DFA engine *
2052 *************************************************/
2053
2054 /* This external function applies a compiled re to a subject string using a DFA
2055 engine. This function calls the internal function multiple times if the pattern
2056 is not anchored.
2057
2058 Arguments:
2059 argument_re points to the compiled expression
2060 extra_data points to extra data or is NULL
2061 subject points to the subject string
2062 length length of subject string (may contain binary zeros)
2063 start_offset where to start in the subject string
2064 options option bits
2065 offsets vector of match offsets
2066 offsetcount size of same
2067 workspace workspace vector
2068 wscount size of same
2069
2070 Returns: > 0 => number of match offset pairs placed in offsets
2071 = 0 => offsets overflowed; longest matches are present
2072 -1 => failed to match
2073 < -1 => some kind of unexpected problem
2074 */
2075
2076 PCRE_EXP_DEFN int
2077 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2078 const char *subject, int length, int start_offset, int options, int *offsets,
2079 int offsetcount, int *workspace, int wscount)
2080 {
2081 real_pcre *re = (real_pcre *)argument_re;
2082 dfa_match_data match_block;
2083 dfa_match_data *md = &match_block;
2084 BOOL utf8, anchored, startline, firstline;
2085 const uschar *current_subject, *end_subject, *lcc;
2086
2087 pcre_study_data internal_study;
2088 const pcre_study_data *study = NULL;
2089 real_pcre internal_re;
2090
2091 const uschar *req_byte_ptr;
2092 const uschar *start_bits = NULL;
2093 BOOL first_byte_caseless = FALSE;
2094 BOOL req_byte_caseless = FALSE;
2095 int first_byte = -1;
2096 int req_byte = -1;
2097 int req_byte2 = -1;
2098 int newline;
2099
2100 /* Plausibility checks */
2101
2102 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2103 if (re == NULL || subject == NULL || workspace == NULL ||
2104 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2105 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2106 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2107
2108 /* We need to find the pointer to any study data before we test for byte
2109 flipping, so we scan the extra_data block first. This may set two fields in the
2110 match block, so we must initialize them beforehand. However, the other fields
2111 in the match block must not be set until after the byte flipping. */
2112
2113 md->tables = re->tables;
2114 md->callout_data = NULL;
2115
2116 if (extra_data != NULL)
2117 {
2118 unsigned int flags = extra_data->flags;
2119 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2120 study = (const pcre_study_data *)extra_data->study_data;
2121 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2122 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2123 return PCRE_ERROR_DFA_UMLIMIT;
2124 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2125 md->callout_data = extra_data->callout_data;
2126 if ((flags & PCRE_EXTRA_TABLES) != 0)
2127 md->tables = extra_data->tables;
2128 }
2129
2130 /* Check that the first field in the block is the magic number. If it is not,
2131 test for a regex that was compiled on a host of opposite endianness. If this is
2132 the case, flipped values are put in internal_re and internal_study if there was
2133 study data too. */
2134
2135 if (re->magic_number != MAGIC_NUMBER)
2136 {
2137 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2138 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2139 if (study != NULL) study = &internal_study;
2140 }
2141
2142 /* Set some local values */
2143
2144 current_subject = (const unsigned char *)subject + start_offset;
2145 end_subject = (const unsigned char *)subject + length;
2146 req_byte_ptr = current_subject - 1;
2147
2148 #ifdef SUPPORT_UTF8
2149 utf8 = (re->options & PCRE_UTF8) != 0;
2150 #else
2151 utf8 = FALSE;
2152 #endif
2153
2154 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2155 (re->options & PCRE_ANCHORED) != 0;
2156
2157 /* The remaining fixed data for passing around. */
2158
2159 md->start_code = (const uschar *)argument_re +
2160 re->name_table_offset + re->name_count * re->name_entry_size;
2161 md->start_subject = (const unsigned char *)subject;
2162 md->end_subject = end_subject;
2163 md->moptions = options;
2164 md->poptions = re->options;
2165
2166 /* Handle different types of newline. The three bits give eight cases. If
2167 nothing is set at run time, whatever was used at compile time applies. */
2168
2169 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2170 PCRE_NEWLINE_BITS)
2171 {
2172 case 0: newline = NEWLINE; break; /* Compile-time default */
2173 case PCRE_NEWLINE_CR: newline = '\r'; break;
2174 case PCRE_NEWLINE_LF: newline = '\n'; break;
2175 case PCRE_NEWLINE_CR+
2176 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2177 case PCRE_NEWLINE_ANY: newline = -1; break;
2178 default: return PCRE_ERROR_BADNEWLINE;
2179 }
2180
2181 if (newline < 0)
2182 {
2183 md->nltype = NLTYPE_ANY;
2184 }
2185 else
2186 {
2187 md->nltype = NLTYPE_FIXED;
2188 if (newline > 255)
2189 {
2190 md->nllen = 2;
2191 md->nl[0] = (newline >> 8) & 255;
2192 md->nl[1] = newline & 255;
2193 }
2194 else
2195 {
2196 md->nllen = 1;
2197 md->nl[0] = newline;
2198 }
2199 }
2200
2201 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2202 back the character offset. */
2203
2204 #ifdef SUPPORT_UTF8
2205 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2206 {
2207 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2208 return PCRE_ERROR_BADUTF8;
2209 if (start_offset > 0 && start_offset < length)
2210 {
2211 int tb = ((uschar *)subject)[start_offset];
2212 if (tb > 127)
2213 {
2214 tb &= 0xc0;
2215 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2216 }
2217 }
2218 }
2219 #endif
2220
2221 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2222 is a feature that makes it possible to save compiled regex and re-use them
2223 in other programs later. */
2224
2225 if (md->tables == NULL) md->tables = _pcre_default_tables;
2226
2227 /* The lower casing table and the "must be at the start of a line" flag are
2228 used in a loop when finding where to start. */
2229
2230 lcc = md->tables + lcc_offset;
2231 startline = (re->options & PCRE_STARTLINE) != 0;
2232 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2233
2234 /* Set up the first character to match, if available. The first_byte value is
2235 never set for an anchored regular expression, but the anchoring may be forced
2236 at run time, so we have to test for anchoring. The first char may be unset for
2237 an unanchored pattern, of course. If there's no first char and the pattern was
2238 studied, there may be a bitmap of possible first characters. */
2239
2240 if (!anchored)
2241 {
2242 if ((re->options & PCRE_FIRSTSET) != 0)
2243 {
2244 first_byte = re->first_byte & 255;
2245 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2246 first_byte = lcc[first_byte];
2247 }
2248 else
2249 {
2250 if (startline && study != NULL &&
2251 (study->options & PCRE_STUDY_MAPPED) != 0)
2252 start_bits = study->start_bits;
2253 }
2254 }
2255
2256 /* For anchored or unanchored matches, there may be a "last known required
2257 character" set. */
2258
2259 if ((re->options & PCRE_REQCHSET) != 0)
2260 {
2261 req_byte = re->req_byte & 255;
2262 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2263 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2264 }
2265
2266 /* Call the main matching function, looping for a non-anchored regex after a
2267 failed match. Unless restarting, optimize by moving to the first match
2268 character if possible, when not anchored. Then unless wanting a partial match,
2269 check for a required later character. */
2270
2271 for (;;)
2272 {
2273 int rc;
2274
2275 if ((options & PCRE_DFA_RESTART) == 0)
2276 {
2277 const uschar *save_end_subject = end_subject;
2278
2279 /* Advance to a unique first char if possible. If firstline is TRUE, the
2280 start of the match is constrained to the first line of a multiline string.
2281 Implement this by temporarily adjusting end_subject so that we stop
2282 scanning at a newline. If the match fails at the newline, later code breaks
2283 this loop. */
2284
2285 if (firstline)
2286 {
2287 const uschar *t = current_subject;
2288 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2289 end_subject = t;
2290 }
2291
2292 if (first_byte >= 0)
2293 {
2294 if (first_byte_caseless)
2295 while (current_subject < end_subject &&
2296 lcc[*current_subject] != first_byte)
2297 current_subject++;
2298 else
2299 while (current_subject < end_subject && *current_subject != first_byte)
2300 current_subject++;
2301 }
2302
2303 /* Or to just after a linebreak for a multiline match if possible */
2304
2305 else if (startline)
2306 {
2307 if (current_subject > md->start_subject + start_offset)
2308 {
2309 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2310 current_subject++;
2311
2312 /* If we have just passed a CR and the newline option is ANY, and we
2313 are now at a LF, advance the match position by one more character. */
2314
2315 if (current_subject[-1] == '\r' &&
2316 md->nltype == NLTYPE_ANY &&
2317 current_subject < end_subject &&
2318 *current_subject == '\n')
2319 current_subject++;
2320 }
2321 }
2322
2323 /* Or to a non-unique first char after study */
2324
2325 else if (start_bits != NULL)
2326 {
2327 while (current_subject < end_subject)
2328 {
2329 register unsigned int c = *current_subject;
2330 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2331 else break;
2332 }
2333 }
2334
2335 /* Restore fudged end_subject */
2336
2337 end_subject = save_end_subject;
2338 }
2339
2340 /* If req_byte is set, we know that that character must appear in the subject
2341 for the match to succeed. If the first character is set, req_byte must be
2342 later in the subject; otherwise the test starts at the match point. This
2343 optimization can save a huge amount of work in patterns with nested unlimited
2344 repeats that aren't going to match. Writing separate code for cased/caseless
2345 versions makes it go faster, as does using an autoincrement and backing off
2346 on a match.
2347
2348 HOWEVER: when the subject string is very, very long, searching to its end can
2349 take a long time, and give bad performance on quite ordinary patterns. This
2350 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2351 don't do this when the string is sufficiently long.
2352
2353 ALSO: this processing is disabled when partial matching is requested.
2354 */
2355
2356 if (req_byte >= 0 &&
2357 end_subject - current_subject < REQ_BYTE_MAX &&
2358 (options & PCRE_PARTIAL) == 0)
2359 {
2360 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2361
2362 /* We don't need to repeat the search if we haven't yet reached the
2363 place we found it at last time. */
2364
2365 if (p > req_byte_ptr)
2366 {
2367 if (req_byte_caseless)
2368 {
2369 while (p < end_subject)
2370 {
2371 register int pp = *p++;
2372 if (pp == req_byte || pp == req_byte2) { p--; break; }
2373 }
2374 }
2375 else
2376 {
2377 while (p < end_subject)
2378 {
2379 if (*p++ == req_byte) { p--; break; }
2380 }
2381 }
2382
2383 /* If we can't find the required character, break the matching loop,
2384 which will cause a return or PCRE_ERROR_NOMATCH. */
2385
2386 if (p >= end_subject) break;
2387
2388 /* If we have found the required character, save the point where we
2389 found it, so that we don't search again next time round the loop if
2390 the start hasn't passed this character yet. */
2391
2392 req_byte_ptr = p;
2393 }
2394 }
2395
2396 /* OK, now we can do the business */
2397
2398 rc = internal_dfa_exec(
2399 md, /* fixed match data */
2400 md->start_code, /* this subexpression's code */
2401 current_subject, /* where we currently are */
2402 start_offset, /* start offset in subject */
2403 offsets, /* offset vector */
2404 offsetcount, /* size of same */
2405 workspace, /* workspace vector */
2406 wscount, /* size of same */
2407 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2408 0, /* function recurse level */
2409 0); /* regex recurse level */
2410
2411 /* Anything other than "no match" means we are done, always; otherwise, carry
2412 on only if not anchored. */
2413
2414 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2415
2416 /* Advance to the next subject character unless we are at the end of a line
2417 and firstline is set. */
2418
2419 if (firstline && IS_NEWLINE(current_subject)) break;
2420 current_subject++;
2421 if (utf8)
2422 {
2423 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2424 current_subject++;
2425 }
2426 if (current_subject > end_subject) break;
2427
2428 /* If we have just passed a CR and the newline option is CRLF or ANY, and we
2429 are now at a LF, advance the match position by one more character. */
2430
2431 if (current_subject[-1] == '\r' &&
2432 (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
2433 current_subject < end_subject &&
2434 *current_subject == '\n')
2435 current_subject++;
2436
2437 } /* "Bumpalong" loop */
2438
2439 return PCRE_ERROR_NOMATCH;
2440 }
2441
2442 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12