/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 172 - (show annotations) (download)
Tue Jun 5 10:40:13 2007 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 81716 byte(s)
Drastically reduce workspace used for alternatives in groups; also some 
trailing space removals for a test release.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46
/* Identifier aliases that must be defined before pcre_internal.h is included.
NOTE(review): presumably the shared newline/subject macros in that header
(e.g. IS_NEWLINE, WAS_NEWLINE) expand in terms of these names - confirm
against pcre_internal.h. */

#define NLBLOCK  md              /* Block containing newline information */
#define PSSTART  start_subject   /* Field containing processed string start */
#define PSEND    end_subject     /* Field containing processed string end */
50
51 #include "pcre_internal.h"
52
53
/* Padding string for indenting debugging output. It is printed through
"%.*s" with a precision of rlevel*2-2, so at most strlen(SP) characters of
indent can ever appear.
NOTE(review): confirm that this literal is long enough for the deepest
recursion level that is expected to be traced. */

#define SP " "
57
58
59
60 /*************************************************
61 * Code parameters and static tables *
62 *************************************************/
63
/* Offsets added to the OP_TYPESTAR family of opcodes to produce distinct
"virtual" opcodes when the repeated item is \p/\P, \X, or \R. Each block of
virtual opcodes starts 20 above the previous one; a gap of 20 between the
blocks should be enough. */

#define OP_PROP_EXTRA     100
#define OP_EXTUNI_EXTRA   (OP_PROP_EXTRA + 20)
#define OP_ANYNL_EXTRA    (OP_EXTUNI_EXTRA + 20)
71
72
73 /* This table identifies those opcodes that are followed immediately by a
74 character that is to be tested in some way. This makes is possible to
75 centralize the loading of these characters. In the case of Type * etc, the
76 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
77 small value. ***NOTE*** If the start of this table is modified, the two tables
78 that follow must also be modified. */
79
80 static uschar coptable[] = {
81 0, /* End */
82 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
83 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
84 0, 0, /* Any, Anybyte */
85 0, 0, 0, 0, /* NOTPROP, PROP, EXTUNI, ANYNL */
86 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
87 1, /* Char */
88 1, /* Charnc */
89 1, /* not */
90 /* Positive single-char repeats */
91 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
92 3, 3, 3, /* upto, minupto, exact */
93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
94 /* Negative single-char repeats - only for chars < 256 */
95 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
96 3, 3, 3, /* NOT upto, minupto, exact */
97 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
98 /* Positive type repeats */
99 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
100 3, 3, 3, /* Type upto, minupto, exact */
101 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
102 /* Character class & ref repeats */
103 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
104 0, 0, /* CRRANGE, CRMINRANGE */
105 0, /* CLASS */
106 0, /* NCLASS */
107 0, /* XCLASS - variable length */
108 0, /* REF */
109 0, /* RECURSE */
110 0, /* CALLOUT */
111 0, /* Alt */
112 0, /* Ket */
113 0, /* KetRmax */
114 0, /* KetRmin */
115 0, /* Assert */
116 0, /* Assert not */
117 0, /* Assert behind */
118 0, /* Assert behind not */
119 0, /* Reverse */
120 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
121 0, 0, 0, /* SBRA, SCBRA, SCOND */
122 0, /* CREF */
123 0, /* RREF */
124 0, /* DEF */
125 0, 0 /* BRAZERO, BRAMINZERO */
126 };
127
128 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
129 and \w */
130
131 static uschar toptable1[] = {
132 0, 0, 0, 0, 0, 0,
133 ctype_digit, ctype_digit,
134 ctype_space, ctype_space,
135 ctype_word, ctype_word,
136 0 /* OP_ANY */
137 };
138
139 static uschar toptable2[] = {
140 0, 0, 0, 0, 0, 0,
141 ctype_digit, 0,
142 ctype_space, 0,
143 ctype_word, 0,
144 1 /* OP_ANY */
145 };
146
147
148 /* Structure for holding data about a particular state, which is in effect the
149 current data for an active path through the match tree. It must consist
150 entirely of ints because the working vector we are passed, and which we put
151 these structures in, is a vector of ints. */
152
/* One entry in a state list: the data for one active path through the match
tree. Every member must be an int, because these blocks are laid out inside
the caller-supplied workspace, which is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The value is cast to int so
that workspace-size arithmetic on the (signed) wscount is carried out in
signed arithmetic; a bare sizeof quotient has type size_t and would silently
convert the other operand to unsigned. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
161
162
163 #ifdef DEBUG
164 /*************************************************
165 * Print character string *
166 *************************************************/
167
168 /* Character string printing function for debugging.
169
170 Arguments:
171 p points to string
172 length number of bytes
173 f where to print
174
175 Returns: nothing
176 */
177
/* Write "length" bytes starting at p to the stream f. Bytes for which
isprint() is true are written as themselves; every other byte is written as
a \x escape with two lower-case hex digits. Nothing is written when length
is zero or negative. */

static void
pchars(unsigned char *p, int length, FILE *f)
{
const unsigned char *stop;

if (length <= 0) return;

for (stop = p + length; p < stop; p++)
  {
  int ch = *p;
  if (!isprint(ch))
    fprintf(f, "\\x%02x", ch);
  else
    fprintf(f, "%c", ch);
  }
}
190 #endif
191
192
193
194 /*************************************************
195 * Execute a Regular Expression - DFA engine *
196 *************************************************/
197
198 /* This internal function applies a compiled pattern to a subject string,
199 starting at a given point, using a DFA engine. This function is called from the
200 external one, possibly multiple times if the pattern is not anchored. The
201 function calls itself recursively for some kinds of subpattern.
202
203 Arguments:
204 md the match_data block with fixed information
205 this_start_code the opening bracket of this subexpression's code
206 current_subject where we currently are in the subject string
207 start_offset start offset in the subject string
208 offsets vector to contain the matching string offsets
209 offsetcount size of same
210 workspace vector of workspace
211 wscount size of same
212 ims the current ims flags
213 rlevel function call recursion level
214 recursing regex recursive call level
215
216 Returns: > 0 =>
217 = 0 =>
218 -1 => failed to match
219 < -1 => some kind of unexpected problem
220
221 The following macros are used for adding states to the two state vectors (one
222 for the current character, one for the following character). */
223
/* ADD_ACTIVE(x,y): append a state with code offset x and repeat count y to
the list of states for the current character. The macro works on the
enclosing function's locals active_count, wscount, next_active_state, ims and
rlevel. The new state takes the current value of ims; its data field is left
untouched. If the list already holds wscount entries, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. Invoke it as a statement, followed by a
semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
234
/* ADD_ACTIVE_DATA(x,y,z): as ADD_ACTIVE, but the data field of the new state
is also set, to z. The macro works on the enclosing function's locals
active_count, wscount, next_active_state, ims and rlevel. If the list already
holds wscount entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE.
Invoke it as a statement, followed by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
246
/* ADD_NEW(x,y): append a state with code offset x and repeat count y to the
list of states for the following character. The macro works on the enclosing
function's locals new_count, wscount, next_new_state, ims and rlevel. The new
state takes the current value of ims; its data field is left untouched. If
the list already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. Invoke it as a statement, followed by a semicolon. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
257
/* ADD_NEW_DATA(x,y,z): as ADD_NEW, but the data field of the new state is
also set, to z. The macro works on the enclosing function's locals new_count,
wscount, next_new_state, ims and rlevel. If the list already holds wscount
entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. Invoke it as a
statement, followed by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
269
270 /* And now, here is the code */
271
272 static int
273 internal_dfa_exec(
274 dfa_match_data *md,
275 const uschar *this_start_code,
276 const uschar *current_subject,
277 int start_offset,
278 int *offsets,
279 int offsetcount,
280 int *workspace,
281 int wscount,
282 int ims,
283 int rlevel,
284 int recursing)
285 {
286 stateblock *active_states, *new_states, *temp_states;
287 stateblock *next_active_state, *next_new_state;
288
289 const uschar *ctypes, *lcc, *fcc;
290 const uschar *ptr;
291 const uschar *end_code, *first_op;
292
293 int active_count, new_count, match_count;
294
295 /* Some fields in the md block are frequently referenced, so we load them into
296 independent variables in the hope that this will perform better. */
297
298 const uschar *start_subject = md->start_subject;
299 const uschar *end_subject = md->end_subject;
300 const uschar *start_code = md->start_code;
301
302 #ifdef SUPPORT_UTF8
303 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
304 #else
305 BOOL utf8 = FALSE;
306 #endif
307
308 rlevel++;
309 offsetcount &= (-2);
310
311 wscount -= 2;
312 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
313 (2 * INTS_PER_STATEBLOCK);
314
315 DPRINTF(("\n%.*s---------------------\n"
316 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
317 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
318
319 ctypes = md->tables + ctypes_offset;
320 lcc = md->tables + lcc_offset;
321 fcc = md->tables + fcc_offset;
322
323 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
324
325 active_states = (stateblock *)(workspace + 2);
326 next_new_state = new_states = active_states + wscount;
327 new_count = 0;
328
329 first_op = this_start_code + 1 + LINK_SIZE +
330 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
331
332 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
333 the alternative states onto the list, and find out where the end is. This
334 makes it possible to use this function recursively, when we want to stop at a
335 matching internal ket rather than at the end.
336
337 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
338 a backward assertion. In that case, we have to find out the maximum amount to
339 move back, and set up each alternative appropriately. */
340
341 if (*first_op == OP_REVERSE)
342 {
343 int max_back = 0;
344 int gone_back;
345
346 end_code = this_start_code;
347 do
348 {
349 int back = GET(end_code, 2+LINK_SIZE);
350 if (back > max_back) max_back = back;
351 end_code += GET(end_code, 1);
352 }
353 while (*end_code == OP_ALT);
354
355 /* If we can't go back the amount required for the longest lookbehind
356 pattern, go back as far as we can; some alternatives may still be viable. */
357
358 #ifdef SUPPORT_UTF8
359 /* In character mode we have to step back character by character */
360
361 if (utf8)
362 {
363 for (gone_back = 0; gone_back < max_back; gone_back++)
364 {
365 if (current_subject <= start_subject) break;
366 current_subject--;
367 while (current_subject > start_subject &&
368 (*current_subject & 0xc0) == 0x80)
369 current_subject--;
370 }
371 }
372 else
373 #endif
374
375 /* In byte-mode we can do this quickly. */
376
377 {
378 gone_back = (current_subject - max_back < start_subject)?
379 current_subject - start_subject : max_back;
380 current_subject -= gone_back;
381 }
382
383 /* Now we can process the individual branches. */
384
385 end_code = this_start_code;
386 do
387 {
388 int back = GET(end_code, 2+LINK_SIZE);
389 if (back <= gone_back)
390 {
391 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
392 ADD_NEW_DATA(-bstate, 0, gone_back - back);
393 }
394 end_code += GET(end_code, 1);
395 }
396 while (*end_code == OP_ALT);
397 }
398
399 /* This is the code for a "normal" subpattern (not a backward assertion). The
400 start of a whole pattern is always one of these. If we are at the top level,
401 we may be asked to restart matching from the same point that we reached for a
402 previous partial match. We still have to scan through the top-level branches to
403 find the end state. */
404
405 else
406 {
407 end_code = this_start_code;
408
409 /* Restarting */
410
411 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
412 {
413 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
414 new_count = workspace[1];
415 if (!workspace[0])
416 memcpy(new_states, active_states, new_count * sizeof(stateblock));
417 }
418
419 /* Not restarting */
420
421 else
422 {
423 int length = 1 + LINK_SIZE +
424 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
425 do
426 {
427 ADD_NEW(end_code - start_code + length, 0);
428 end_code += GET(end_code, 1);
429 length = 1 + LINK_SIZE;
430 }
431 while (*end_code == OP_ALT);
432 }
433 }
434
435 workspace[0] = 0; /* Bit indicating which vector is current */
436
437 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
438
439 /* Loop for scanning the subject */
440
441 ptr = current_subject;
442 for (;;)
443 {
444 int i, j;
445 int clen, dlen;
446 unsigned int c, d;
447
448 /* Make the new state list into the active state list and empty the
449 new state list. */
450
451 temp_states = active_states;
452 active_states = new_states;
453 new_states = temp_states;
454 active_count = new_count;
455 new_count = 0;
456
457 workspace[0] ^= 1; /* Remember for the restarting feature */
458 workspace[1] = active_count;
459
460 #ifdef DEBUG
461 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
462 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
463 printf("\"\n");
464
465 printf("%.*sActive states: ", rlevel*2-2, SP);
466 for (i = 0; i < active_count; i++)
467 printf("%d/%d ", active_states[i].offset, active_states[i].count);
468 printf("\n");
469 #endif
470
471 /* Set the pointers for adding new states */
472
473 next_active_state = active_states + active_count;
474 next_new_state = new_states;
475
476 /* Load the current character from the subject outside the loop, as many
477 different states may want to look at it, and we assume that at least one
478 will. */
479
480 if (ptr < end_subject)
481 {
482 clen = 1; /* Number of bytes in the character */
483 #ifdef SUPPORT_UTF8
484 if (utf8) { GETCHARLEN(c, ptr, clen); } else
485 #endif /* SUPPORT_UTF8 */
486 c = *ptr;
487 }
488 else
489 {
490 clen = 0; /* This indicates the end of the subject */
491 c = NOTACHAR; /* This value should never actually be used */
492 }
493
494 /* Scan up the active states and act on each one. The result of an action
495 may be to add more states to the currently active list (e.g. on hitting a
496 parenthesis) or it may be to put states on the new list, for considering
497 when we move the character pointer on. */
498
499 for (i = 0; i < active_count; i++)
500 {
501 stateblock *current_state = active_states + i;
502 const uschar *code;
503 int state_offset = current_state->offset;
504 int count, codevalue;
505 #ifdef SUPPORT_UCP
506 int chartype, script;
507 #endif
508
509 #ifdef DEBUG
510 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
511 if (clen == 0) printf("EOL\n");
512 else if (c > 32 && c < 127) printf("'%c'\n", c);
513 else printf("0x%02x\n", c);
514 #endif
515
516 /* This variable is referred to implicitly in the ADD_xxx macros. */
517
518 ims = current_state->ims;
519
520 /* A negative offset is a special case meaning "hold off going to this
521 (negated) state until the number of characters in the data field have
522 been skipped". */
523
524 if (state_offset < 0)
525 {
526 if (current_state->data > 0)
527 {
528 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
529 ADD_NEW_DATA(state_offset, current_state->count,
530 current_state->data - 1);
531 continue;
532 }
533 else
534 {
535 current_state->offset = state_offset = -state_offset;
536 }
537 }
538
539 /* Check for a duplicate state with the same count, and skip if found. */
540
541 for (j = 0; j < i; j++)
542 {
543 if (active_states[j].offset == state_offset &&
544 active_states[j].count == current_state->count)
545 {
546 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
547 goto NEXT_ACTIVE_STATE;
548 }
549 }
550
551 /* The state offset is the offset to the opcode */
552
553 code = start_code + state_offset;
554 codevalue = *code;
555
556 /* If this opcode is followed by an inline character, load it. It is
557 tempting to test for the presence of a subject character here, but that
558 is wrong, because sometimes zero repetitions of the subject are
559 permitted.
560
561 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
562 argument that is not a data character - but is always one byte long.
563 Unfortunately, we have to take special action to deal with \P, \p, and
564 \X in this case. To keep the other cases fast, convert these ones to new
565 opcodes. */
566
567 if (coptable[codevalue] > 0)
568 {
569 dlen = 1;
570 #ifdef SUPPORT_UTF8
571 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
572 #endif /* SUPPORT_UTF8 */
573 d = code[coptable[codevalue]];
574 if (codevalue >= OP_TYPESTAR)
575 {
576 switch(d)
577 {
578 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
579 case OP_NOTPROP:
580 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
581 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
582 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
583 default: break;
584 }
585 }
586 }
587 else
588 {
589 dlen = 0; /* Not strictly necessary, but compilers moan */
590 d = NOTACHAR; /* if these variables are not set. */
591 }
592
593
594 /* Now process the individual opcodes */
595
596 switch (codevalue)
597 {
598
599 /* ========================================================================== */
600 /* Reached a closing bracket. If not at the end of the pattern, carry
601 on with the next opcode. Otherwise, unless we have an empty string and
602 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
603 matches so we always have the longest first. */
604
605 case OP_KET:
606 case OP_KETRMIN:
607 case OP_KETRMAX:
608 if (code != end_code)
609 {
610 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
611 if (codevalue != OP_KET)
612 {
613 ADD_ACTIVE(state_offset - GET(code, 1), 0);
614 }
615 }
616 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
617 {
618 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
619 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
620 match_count = 0;
621 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
622 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
623 if (offsetcount >= 2)
624 {
625 offsets[0] = current_subject - start_subject;
626 offsets[1] = ptr - start_subject;
627 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
628 offsets[1] - offsets[0], current_subject));
629 }
630 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
631 {
632 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
633 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
634 match_count, rlevel*2-2, SP));
635 return match_count;
636 }
637 }
638 break;
639
640 /* ========================================================================== */
641 /* These opcodes add to the current list of states without looking
642 at the current character. */
643
644 /*-----------------------------------------------------------------*/
645 case OP_ALT:
646 do { code += GET(code, 1); } while (*code == OP_ALT);
647 ADD_ACTIVE(code - start_code, 0);
648 break;
649
650 /*-----------------------------------------------------------------*/
651 case OP_BRA:
652 case OP_SBRA:
653 do
654 {
655 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
656 code += GET(code, 1);
657 }
658 while (*code == OP_ALT);
659 break;
660
661 /*-----------------------------------------------------------------*/
662 case OP_CBRA:
663 case OP_SCBRA:
664 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
665 code += GET(code, 1);
666 while (*code == OP_ALT)
667 {
668 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
669 code += GET(code, 1);
670 }
671 break;
672
673 /*-----------------------------------------------------------------*/
674 case OP_BRAZERO:
675 case OP_BRAMINZERO:
676 ADD_ACTIVE(state_offset + 1, 0);
677 code += 1 + GET(code, 2);
678 while (*code == OP_ALT) code += GET(code, 1);
679 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
680 break;
681
682 /*-----------------------------------------------------------------*/
683 case OP_CIRC:
684 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
685 ((ims & PCRE_MULTILINE) != 0 &&
686 ptr != end_subject &&
687 WAS_NEWLINE(ptr)))
688 { ADD_ACTIVE(state_offset + 1, 0); }
689 break;
690
691 /*-----------------------------------------------------------------*/
692 case OP_EOD:
693 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
694 break;
695
696 /*-----------------------------------------------------------------*/
697 case OP_OPT:
698 ims = code[1];
699 ADD_ACTIVE(state_offset + 2, 0);
700 break;
701
702 /*-----------------------------------------------------------------*/
703 case OP_SOD:
704 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
705 break;
706
707 /*-----------------------------------------------------------------*/
708 case OP_SOM:
709 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
710 break;
711
712
713 /* ========================================================================== */
714 /* These opcodes inspect the next subject character, and sometimes
715 the previous one as well, but do not have an argument. The variable
716 clen contains the length of the current character and is zero if we are
717 at the end of the subject. */
718
719 /*-----------------------------------------------------------------*/
720 case OP_ANY:
721 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
722 { ADD_NEW(state_offset + 1, 0); }
723 break;
724
725 /*-----------------------------------------------------------------*/
726 case OP_EODN:
727 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
728 { ADD_ACTIVE(state_offset + 1, 0); }
729 break;
730
731 /*-----------------------------------------------------------------*/
732 case OP_DOLL:
733 if ((md->moptions & PCRE_NOTEOL) == 0)
734 {
735 if (clen == 0 ||
736 (IS_NEWLINE(ptr) &&
737 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
738 ))
739 { ADD_ACTIVE(state_offset + 1, 0); }
740 }
741 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
742 { ADD_ACTIVE(state_offset + 1, 0); }
743 break;
744
745 /*-----------------------------------------------------------------*/
746
747 case OP_DIGIT:
748 case OP_WHITESPACE:
749 case OP_WORDCHAR:
750 if (clen > 0 && c < 256 &&
751 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
752 { ADD_NEW(state_offset + 1, 0); }
753 break;
754
755 /*-----------------------------------------------------------------*/
756 case OP_NOT_DIGIT:
757 case OP_NOT_WHITESPACE:
758 case OP_NOT_WORDCHAR:
759 if (clen > 0 && (c >= 256 ||
760 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
761 { ADD_NEW(state_offset + 1, 0); }
762 break;
763
764 /*-----------------------------------------------------------------*/
765 case OP_WORD_BOUNDARY:
766 case OP_NOT_WORD_BOUNDARY:
767 {
768 int left_word, right_word;
769
770 if (ptr > start_subject)
771 {
772 const uschar *temp = ptr - 1;
773 #ifdef SUPPORT_UTF8
774 if (utf8) BACKCHAR(temp);
775 #endif
776 GETCHARTEST(d, temp);
777 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
778 }
779 else left_word = 0;
780
781 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
782 else right_word = 0;
783
784 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
785 { ADD_ACTIVE(state_offset + 1, 0); }
786 }
787 break;
788
789
790 /*-----------------------------------------------------------------*/
791 /* Check the next character by Unicode property. We will get here only
792 if the support is in the binary; otherwise a compile-time error occurs.
793 */
794
795 #ifdef SUPPORT_UCP
796 case OP_PROP:
797 case OP_NOTPROP:
798 if (clen > 0)
799 {
800 BOOL OK;
801 int category = _pcre_ucp_findprop(c, &chartype, &script);
802 switch(code[1])
803 {
804 case PT_ANY:
805 OK = TRUE;
806 break;
807
808 case PT_LAMP:
809 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
810 break;
811
812 case PT_GC:
813 OK = category == code[2];
814 break;
815
816 case PT_PC:
817 OK = chartype == code[2];
818 break;
819
820 case PT_SC:
821 OK = script == code[2];
822 break;
823
824 /* Should never occur, but keep compilers from grumbling. */
825
826 default:
827 OK = codevalue != OP_PROP;
828 break;
829 }
830
831 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
832 }
833 break;
834 #endif
835
836
837
838 /* ========================================================================== */
839 /* These opcodes likewise inspect the subject character, but have an
840 argument that is not a data character. It is one of these opcodes:
841 OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
842 OP_NOT_WORDCHAR. The value is loaded into d. */
843
844 case OP_TYPEPLUS:
845 case OP_TYPEMINPLUS:
846 case OP_TYPEPOSPLUS:
847 count = current_state->count; /* Already matched */
848 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
849 if (clen > 0)
850 {
851 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
852 (c < 256 &&
853 (d != OP_ANY ||
854 (ims & PCRE_DOTALL) != 0 ||
855 !IS_NEWLINE(ptr)
856 ) &&
857 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
858 {
859 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
860 {
861 active_count--; /* Remove non-match possibility */
862 next_active_state--;
863 }
864 count++;
865 ADD_NEW(state_offset, count);
866 }
867 }
868 break;
869
870 /*-----------------------------------------------------------------*/
871 case OP_TYPEQUERY:
872 case OP_TYPEMINQUERY:
873 case OP_TYPEPOSQUERY:
874 ADD_ACTIVE(state_offset + 2, 0);
875 if (clen > 0)
876 {
877 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
878 (c < 256 &&
879 (d != OP_ANY ||
880 (ims & PCRE_DOTALL) != 0 ||
881 !IS_NEWLINE(ptr)
882 ) &&
883 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
884 {
885 if (codevalue == OP_TYPEPOSQUERY)
886 {
887 active_count--; /* Remove non-match possibility */
888 next_active_state--;
889 }
890 ADD_NEW(state_offset + 2, 0);
891 }
892 }
893 break;
894
895 /*-----------------------------------------------------------------*/
896 case OP_TYPESTAR:
897 case OP_TYPEMINSTAR:
898 case OP_TYPEPOSSTAR:
899 ADD_ACTIVE(state_offset + 2, 0);
900 if (clen > 0)
901 {
902 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
903 (c < 256 &&
904 (d != OP_ANY ||
905 (ims & PCRE_DOTALL) != 0 ||
906 !IS_NEWLINE(ptr)
907 ) &&
908 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
909 {
910 if (codevalue == OP_TYPEPOSSTAR)
911 {
912 active_count--; /* Remove non-match possibility */
913 next_active_state--;
914 }
915 ADD_NEW(state_offset, 0);
916 }
917 }
918 break;
919
920 /*-----------------------------------------------------------------*/
921 case OP_TYPEEXACT:
922 count = current_state->count; /* Number already matched */
923 if (clen > 0)
924 {
925 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
926 (c < 256 &&
927 (d != OP_ANY ||
928 (ims & PCRE_DOTALL) != 0 ||
929 !IS_NEWLINE(ptr)
930 ) &&
931 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
932 {
933 if (++count >= GET2(code, 1))
934 { ADD_NEW(state_offset + 4, 0); }
935 else
936 { ADD_NEW(state_offset, count); }
937 }
938 }
939 break;
940
941 /*-----------------------------------------------------------------*/
942 case OP_TYPEUPTO:
943 case OP_TYPEMINUPTO:
944 case OP_TYPEPOSUPTO:
945 ADD_ACTIVE(state_offset + 4, 0);
946 count = current_state->count; /* Number already matched */
947 if (clen > 0)
948 {
949 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
950 (c < 256 &&
951 (d != OP_ANY ||
952 (ims & PCRE_DOTALL) != 0 ||
953 !IS_NEWLINE(ptr)
954 ) &&
955 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
956 {
957 if (codevalue == OP_TYPEPOSUPTO)
958 {
959 active_count--; /* Remove non-match possibility */
960 next_active_state--;
961 }
962 if (++count >= GET2(code, 1))
963 { ADD_NEW(state_offset + 4, 0); }
964 else
965 { ADD_NEW(state_offset, count); }
966 }
967 }
968 break;
969
970 /* ========================================================================== */
971 /* These are virtual opcodes that are used when something like
972 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
973 argument. It keeps the code above fast for the other cases. The argument
974 is in the d variable. */
975
976 #ifdef SUPPORT_UCP
977 case OP_PROP_EXTRA + OP_TYPEPLUS:
978 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
979 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
980 count = current_state->count; /* Already matched */
981 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
982 if (clen > 0)
983 {
984 BOOL OK;
985 int category = _pcre_ucp_findprop(c, &chartype, &script);
986 switch(code[2])
987 {
988 case PT_ANY:
989 OK = TRUE;
990 break;
991
992 case PT_LAMP:
993 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
994 break;
995
996 case PT_GC:
997 OK = category == code[3];
998 break;
999
1000 case PT_PC:
1001 OK = chartype == code[3];
1002 break;
1003
1004 case PT_SC:
1005 OK = script == code[3];
1006 break;
1007
1008 /* Should never occur, but keep compilers from grumbling. */
1009
1010 default:
1011 OK = codevalue != OP_PROP;
1012 break;
1013 }
1014
1015 if (OK == (d == OP_PROP))
1016 {
1017 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1018 {
1019 active_count--; /* Remove non-match possibility */
1020 next_active_state--;
1021 }
1022 count++;
1023 ADD_NEW(state_offset, count);
1024 }
1025 }
1026 break;
1027
1028 /*-----------------------------------------------------------------*/
1029 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1030 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1031 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1032 count = current_state->count; /* Already matched */
1033 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1034 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1035 {
1036 const uschar *nptr = ptr + clen;
1037 int ncount = 0;
1038 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1039 {
1040 active_count--; /* Remove non-match possibility */
1041 next_active_state--;
1042 }
1043 while (nptr < end_subject)
1044 {
1045 int nd;
1046 int ndlen = 1;
1047 GETCHARLEN(nd, nptr, ndlen);
1048 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1049 ncount++;
1050 nptr += ndlen;
1051 }
1052 count++;
1053 ADD_NEW_DATA(-state_offset, count, ncount);
1054 }
1055 break;
1056 #endif
1057
1058 /*-----------------------------------------------------------------*/
1059 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1060 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1061 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1062 count = current_state->count; /* Already matched */
1063 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1064 if (clen > 0)
1065 {
1066 int ncount = 0;
1067 switch (c)
1068 {
1069 case 0x000d:
1070 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1071 /* Fall through */
1072 case 0x000a:
1073 case 0x000b:
1074 case 0x000c:
1075 case 0x0085:
1076 case 0x2028:
1077 case 0x2029:
1078 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1079 {
1080 active_count--; /* Remove non-match possibility */
1081 next_active_state--;
1082 }
1083 count++;
1084 ADD_NEW_DATA(-state_offset, count, ncount);
1085 break;
1086 default:
1087 break;
1088 }
1089 }
1090 break;
1091
1092 /*-----------------------------------------------------------------*/
1093 #ifdef SUPPORT_UCP
1094 case OP_PROP_EXTRA + OP_TYPEQUERY:
1095 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1096 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1097 count = 4;
1098 goto QS1;
1099
1100 case OP_PROP_EXTRA + OP_TYPESTAR:
1101 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1102 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1103 count = 0;
1104
1105 QS1:
1106
1107 ADD_ACTIVE(state_offset + 4, 0);
1108 if (clen > 0)
1109 {
1110 BOOL OK;
1111 int category = _pcre_ucp_findprop(c, &chartype, &script);
1112 switch(code[2])
1113 {
1114 case PT_ANY:
1115 OK = TRUE;
1116 break;
1117
1118 case PT_LAMP:
1119 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1120 break;
1121
1122 case PT_GC:
1123 OK = category == code[3];
1124 break;
1125
1126 case PT_PC:
1127 OK = chartype == code[3];
1128 break;
1129
1130 case PT_SC:
1131 OK = script == code[3];
1132 break;
1133
1134 /* Should never occur, but keep compilers from grumbling. */
1135
1136 default:
1137 OK = codevalue != OP_PROP;
1138 break;
1139 }
1140
1141 if (OK == (d == OP_PROP))
1142 {
1143 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1144 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1145 {
1146 active_count--; /* Remove non-match possibility */
1147 next_active_state--;
1148 }
1149 ADD_NEW(state_offset + count, 0);
1150 }
1151 }
1152 break;
1153
1154 /*-----------------------------------------------------------------*/
1155 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1156 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1157 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1158 count = 2;
1159 goto QS2;
1160
1161 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1162 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1163 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1164 count = 0;
1165
1166 QS2:
1167
1168 ADD_ACTIVE(state_offset + 2, 0);
1169 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1170 {
1171 const uschar *nptr = ptr + clen;
1172 int ncount = 0;
1173 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1174 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1175 {
1176 active_count--; /* Remove non-match possibility */
1177 next_active_state--;
1178 }
1179 while (nptr < end_subject)
1180 {
1181 int nd;
1182 int ndlen = 1;
1183 GETCHARLEN(nd, nptr, ndlen);
1184 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1185 ncount++;
1186 nptr += ndlen;
1187 }
1188 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1189 }
1190 break;
1191 #endif
1192
1193 /*-----------------------------------------------------------------*/
1194 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1195 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1196 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1197 count = 2;
1198 goto QS3;
1199
1200 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1201 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1202 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1203 count = 0;
1204
1205 QS3:
1206 ADD_ACTIVE(state_offset + 2, 0);
1207 if (clen > 0)
1208 {
1209 int ncount = 0;
1210 switch (c)
1211 {
1212 case 0x000d:
1213 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1214 /* Fall through */
1215 case 0x000a:
1216 case 0x000b:
1217 case 0x000c:
1218 case 0x0085:
1219 case 0x2028:
1220 case 0x2029:
1221 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1222 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1223 {
1224 active_count--; /* Remove non-match possibility */
1225 next_active_state--;
1226 }
1227 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1228 break;
1229 default:
1230 break;
1231 }
1232 }
1233 break;
1234
1235 /*-----------------------------------------------------------------*/
1236 #ifdef SUPPORT_UCP
1237 case OP_PROP_EXTRA + OP_TYPEEXACT:
1238 case OP_PROP_EXTRA + OP_TYPEUPTO:
1239 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1240 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1241 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1242 { ADD_ACTIVE(state_offset + 6, 0); }
1243 count = current_state->count; /* Number already matched */
1244 if (clen > 0)
1245 {
1246 BOOL OK;
1247 int category = _pcre_ucp_findprop(c, &chartype, &script);
1248 switch(code[4])
1249 {
1250 case PT_ANY:
1251 OK = TRUE;
1252 break;
1253
1254 case PT_LAMP:
1255 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1256 break;
1257
1258 case PT_GC:
1259 OK = category == code[5];
1260 break;
1261
1262 case PT_PC:
1263 OK = chartype == code[5];
1264 break;
1265
1266 case PT_SC:
1267 OK = script == code[5];
1268 break;
1269
1270 /* Should never occur, but keep compilers from grumbling. */
1271
1272 default:
1273 OK = codevalue != OP_PROP;
1274 break;
1275 }
1276
1277 if (OK == (d == OP_PROP))
1278 {
1279 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1280 {
1281 active_count--; /* Remove non-match possibility */
1282 next_active_state--;
1283 }
1284 if (++count >= GET2(code, 1))
1285 { ADD_NEW(state_offset + 6, 0); }
1286 else
1287 { ADD_NEW(state_offset, count); }
1288 }
1289 }
1290 break;
1291
1292 /*-----------------------------------------------------------------*/
1293 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1294 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1295 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1296 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1297 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1298 { ADD_ACTIVE(state_offset + 4, 0); }
1299 count = current_state->count; /* Number already matched */
1300 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1301 {
1302 const uschar *nptr = ptr + clen;
1303 int ncount = 0;
1304 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1305 {
1306 active_count--; /* Remove non-match possibility */
1307 next_active_state--;
1308 }
1309 while (nptr < end_subject)
1310 {
1311 int nd;
1312 int ndlen = 1;
1313 GETCHARLEN(nd, nptr, ndlen);
1314 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1315 ncount++;
1316 nptr += ndlen;
1317 }
1318 if (++count >= GET2(code, 1))
1319 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1320 else
1321 { ADD_NEW_DATA(-state_offset, count, ncount); }
1322 }
1323 break;
1324 #endif
1325
1326 /*-----------------------------------------------------------------*/
1327 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1328 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1329 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1330 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1331 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1332 { ADD_ACTIVE(state_offset + 4, 0); }
1333 count = current_state->count; /* Number already matched */
1334 if (clen > 0)
1335 {
1336 int ncount = 0;
1337 switch (c)
1338 {
1339 case 0x000d:
1340 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1341 /* Fall through */
1342 case 0x000a:
1343 case 0x000b:
1344 case 0x000c:
1345 case 0x0085:
1346 case 0x2028:
1347 case 0x2029:
1348 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1349 {
1350 active_count--; /* Remove non-match possibility */
1351 next_active_state--;
1352 }
1353 if (++count >= GET2(code, 1))
1354 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1355 else
1356 { ADD_NEW_DATA(-state_offset, count, ncount); }
1357 break;
1358 default:
1359 break;
1360 }
1361 }
1362 break;
1363
1364 /* ========================================================================== */
1365 /* These opcodes are followed by a character that is usually compared
1366 to the current subject character; it is loaded into d. We still get
1367 here even if there is no subject character, because in some cases zero
1368 repetitions are permitted. */
1369
1370 /*-----------------------------------------------------------------*/
1371 case OP_CHAR:
1372 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1373 break;
1374
1375 /*-----------------------------------------------------------------*/
1376 case OP_CHARNC:
1377 if (clen == 0) break;
1378
1379 #ifdef SUPPORT_UTF8
1380 if (utf8)
1381 {
1382 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1383 {
1384 unsigned int othercase;
1385 if (c < 128) othercase = fcc[c]; else
1386
1387 /* If we have Unicode property support, we can use it to test the
1388 other case of the character. */
1389
1390 #ifdef SUPPORT_UCP
1391 othercase = _pcre_ucp_othercase(c);
1392 #else
1393 othercase = NOTACHAR;
1394 #endif
1395
1396 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1397 }
1398 }
1399 else
1400 #endif /* SUPPORT_UTF8 */
1401
1402 /* Non-UTF-8 mode */
1403 {
1404 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1405 }
1406 break;
1407
1408
1409 #ifdef SUPPORT_UCP
1410 /*-----------------------------------------------------------------*/
1411 /* This is a tricky one because it can match more than one character.
1412 Find out how many characters to skip, and then set up a negative state
1413 to wait for them to pass before continuing. */
1414
1415 case OP_EXTUNI:
1416 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1417 {
1418 const uschar *nptr = ptr + clen;
1419 int ncount = 0;
1420 while (nptr < end_subject)
1421 {
1422 int nclen = 1;
1423 GETCHARLEN(c, nptr, nclen);
1424 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1425 ncount++;
1426 nptr += nclen;
1427 }
1428 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1429 }
1430 break;
1431 #endif
1432
1433 /*-----------------------------------------------------------------*/
1434 /* This is tricky, like EXTUNI, because it too can match more than one
1435 character (when CR is followed by LF). In this case, set up a negative
1436 state to wait for one character to pass before continuing. */
1437
1438 case OP_ANYNL:
1439 if (clen > 0) switch(c)
1440 {
1441 case 0x000a:
1442 case 0x000b:
1443 case 0x000c:
1444 case 0x0085:
1445 case 0x2028:
1446 case 0x2029:
1447 ADD_NEW(state_offset + 1, 0);
1448 break;
1449 case 0x000d:
1450 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1451 {
1452 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1453 }
1454 else
1455 {
1456 ADD_NEW(state_offset + 1, 0);
1457 }
1458 break;
1459 }
1460 break;
1461
1462 /*-----------------------------------------------------------------*/
1463 /* Match a negated single character. This is only used for one-byte
1464 characters, that is, we know that d < 256. The character we are
1465 checking (c) can be multibyte. */
1466
1467 case OP_NOT:
1468 if (clen > 0)
1469 {
1470 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1471 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1472 }
1473 break;
1474
1475 /*-----------------------------------------------------------------*/
1476 case OP_PLUS:
1477 case OP_MINPLUS:
1478 case OP_POSPLUS:
1479 case OP_NOTPLUS:
1480 case OP_NOTMINPLUS:
1481 case OP_NOTPOSPLUS:
1482 count = current_state->count; /* Already matched */
1483 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1484 if (clen > 0)
1485 {
1486 unsigned int otherd = NOTACHAR;
1487 if ((ims & PCRE_CASELESS) != 0)
1488 {
1489 #ifdef SUPPORT_UTF8
1490 if (utf8 && d >= 128)
1491 {
1492 #ifdef SUPPORT_UCP
1493 otherd = _pcre_ucp_othercase(d);
1494 #endif /* SUPPORT_UCP */
1495 }
1496 else
1497 #endif /* SUPPORT_UTF8 */
1498 otherd = fcc[d];
1499 }
1500 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1501 {
1502 if (count > 0 &&
1503 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1504 {
1505 active_count--; /* Remove non-match possibility */
1506 next_active_state--;
1507 }
1508 count++;
1509 ADD_NEW(state_offset, count);
1510 }
1511 }
1512 break;
1513
1514 /*-----------------------------------------------------------------*/
1515 case OP_QUERY:
1516 case OP_MINQUERY:
1517 case OP_POSQUERY:
1518 case OP_NOTQUERY:
1519 case OP_NOTMINQUERY:
1520 case OP_NOTPOSQUERY:
1521 ADD_ACTIVE(state_offset + dlen + 1, 0);
1522 if (clen > 0)
1523 {
1524 unsigned int otherd = NOTACHAR;
1525 if ((ims & PCRE_CASELESS) != 0)
1526 {
1527 #ifdef SUPPORT_UTF8
1528 if (utf8 && d >= 128)
1529 {
1530 #ifdef SUPPORT_UCP
1531 otherd = _pcre_ucp_othercase(d);
1532 #endif /* SUPPORT_UCP */
1533 }
1534 else
1535 #endif /* SUPPORT_UTF8 */
1536 otherd = fcc[d];
1537 }
1538 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1539 {
1540 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1541 {
1542 active_count--; /* Remove non-match possibility */
1543 next_active_state--;
1544 }
1545 ADD_NEW(state_offset + dlen + 1, 0);
1546 }
1547 }
1548 break;
1549
1550 /*-----------------------------------------------------------------*/
1551 case OP_STAR:
1552 case OP_MINSTAR:
1553 case OP_POSSTAR:
1554 case OP_NOTSTAR:
1555 case OP_NOTMINSTAR:
1556 case OP_NOTPOSSTAR:
1557 ADD_ACTIVE(state_offset + dlen + 1, 0);
1558 if (clen > 0)
1559 {
1560 unsigned int otherd = NOTACHAR;
1561 if ((ims & PCRE_CASELESS) != 0)
1562 {
1563 #ifdef SUPPORT_UTF8
1564 if (utf8 && d >= 128)
1565 {
1566 #ifdef SUPPORT_UCP
1567 otherd = _pcre_ucp_othercase(d);
1568 #endif /* SUPPORT_UCP */
1569 }
1570 else
1571 #endif /* SUPPORT_UTF8 */
1572 otherd = fcc[d];
1573 }
1574 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1575 {
1576 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1577 {
1578 active_count--; /* Remove non-match possibility */
1579 next_active_state--;
1580 }
1581 ADD_NEW(state_offset, 0);
1582 }
1583 }
1584 break;
1585
1586 /*-----------------------------------------------------------------*/
1587 case OP_EXACT:
1588 case OP_NOTEXACT:
1589 count = current_state->count; /* Number already matched */
1590 if (clen > 0)
1591 {
1592 unsigned int otherd = NOTACHAR;
1593 if ((ims & PCRE_CASELESS) != 0)
1594 {
1595 #ifdef SUPPORT_UTF8
1596 if (utf8 && d >= 128)
1597 {
1598 #ifdef SUPPORT_UCP
1599 otherd = _pcre_ucp_othercase(d);
1600 #endif /* SUPPORT_UCP */
1601 }
1602 else
1603 #endif /* SUPPORT_UTF8 */
1604 otherd = fcc[d];
1605 }
1606 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1607 {
1608 if (++count >= GET2(code, 1))
1609 { ADD_NEW(state_offset + dlen + 3, 0); }
1610 else
1611 { ADD_NEW(state_offset, count); }
1612 }
1613 }
1614 break;
1615
1616 /*-----------------------------------------------------------------*/
1617 case OP_UPTO:
1618 case OP_MINUPTO:
1619 case OP_POSUPTO:
1620 case OP_NOTUPTO:
1621 case OP_NOTMINUPTO:
1622 case OP_NOTPOSUPTO:
1623 ADD_ACTIVE(state_offset + dlen + 3, 0);
1624 count = current_state->count; /* Number already matched */
1625 if (clen > 0)
1626 {
1627 unsigned int otherd = NOTACHAR;
1628 if ((ims & PCRE_CASELESS) != 0)
1629 {
1630 #ifdef SUPPORT_UTF8
1631 if (utf8 && d >= 128)
1632 {
1633 #ifdef SUPPORT_UCP
1634 otherd = _pcre_ucp_othercase(d);
1635 #endif /* SUPPORT_UCP */
1636 }
1637 else
1638 #endif /* SUPPORT_UTF8 */
1639 otherd = fcc[d];
1640 }
1641 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1642 {
1643 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
1644 {
1645 active_count--; /* Remove non-match possibility */
1646 next_active_state--;
1647 }
1648 if (++count >= GET2(code, 1))
1649 { ADD_NEW(state_offset + dlen + 3, 0); }
1650 else
1651 { ADD_NEW(state_offset, count); }
1652 }
1653 }
1654 break;
1655
1656
1657 /* ========================================================================== */
1658 /* These are the class-handling opcodes */
1659
1660 case OP_CLASS:
1661 case OP_NCLASS:
1662 case OP_XCLASS:
1663 {
1664 BOOL isinclass = FALSE;
1665 int next_state_offset;
1666 const uschar *ecode;
1667
1668 /* For a simple class, there is always just a 32-byte table, and we
1669 can set isinclass from it. */
1670
1671 if (codevalue != OP_XCLASS)
1672 {
1673 ecode = code + 33;
1674 if (clen > 0)
1675 {
1676 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1677 ((code[1 + c/8] & (1 << (c&7))) != 0);
1678 }
1679 }
1680
1681 /* An extended class may have a table or a list of single characters,
1682 ranges, or both, and it may be positive or negative. There's a
1683 function that sorts all this out. */
1684
1685 else
1686 {
1687 ecode = code + GET(code, 1);
1688 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1689 }
1690
1691 /* At this point, isinclass is set for all kinds of class, and ecode
1692 points to the byte after the end of the class. If there is a
1693 quantifier, this is where it will be. */
1694
1695 next_state_offset = ecode - start_code;
1696
1697 switch (*ecode)
1698 {
1699 case OP_CRSTAR:
1700 case OP_CRMINSTAR:
1701 ADD_ACTIVE(next_state_offset + 1, 0);
1702 if (isinclass) { ADD_NEW(state_offset, 0); }
1703 break;
1704
1705 case OP_CRPLUS:
1706 case OP_CRMINPLUS:
1707 count = current_state->count; /* Already matched */
1708 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1709 if (isinclass) { count++; ADD_NEW(state_offset, count); }
1710 break;
1711
1712 case OP_CRQUERY:
1713 case OP_CRMINQUERY:
1714 ADD_ACTIVE(next_state_offset + 1, 0);
1715 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1716 break;
1717
1718 case OP_CRRANGE:
1719 case OP_CRMINRANGE:
1720 count = current_state->count; /* Already matched */
1721 if (count >= GET2(ecode, 1))
1722 { ADD_ACTIVE(next_state_offset + 5, 0); }
1723 if (isinclass)
1724 {
1725 int max = GET2(ecode, 3);
1726 if (++count >= max && max != 0) /* Max 0 => no limit */
1727 { ADD_NEW(next_state_offset + 5, 0); }
1728 else
1729 { ADD_NEW(state_offset, count); }
1730 }
1731 break;
1732
1733 default:
1734 if (isinclass) { ADD_NEW(next_state_offset, 0); }
1735 break;
1736 }
1737 }
1738 break;
1739
1740 /* ========================================================================== */
1741 /* These are the opcodes for fancy brackets of various kinds. We have
1742 to use recursion in order to handle them. */
1743
1744 case OP_ASSERT:
1745 case OP_ASSERT_NOT:
1746 case OP_ASSERTBACK:
1747 case OP_ASSERTBACK_NOT:
1748 {
1749 int rc;
1750 int local_offsets[2];
1751 int local_workspace[1000];
1752 const uschar *endasscode = code + GET(code, 1);
1753
1754 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1755
1756 rc = internal_dfa_exec(
1757 md, /* static match data */
1758 code, /* this subexpression's code */
1759 ptr, /* where we currently are */
1760 ptr - start_subject, /* start offset */
1761 local_offsets, /* offset vector */
1762 sizeof(local_offsets)/sizeof(int), /* size of same */
1763 local_workspace, /* workspace vector */
1764 sizeof(local_workspace)/sizeof(int), /* size of same */
1765 ims, /* the current ims flags */
1766 rlevel, /* function recursion level */
1767 recursing); /* pass on regex recursion */
1768
1769 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1770 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1771 }
1772 break;
1773
1774 /*-----------------------------------------------------------------*/
1775 case OP_COND:
1776 case OP_SCOND:
1777 {
1778 int local_offsets[1000];
1779 int local_workspace[1000];
1780 int condcode = code[LINK_SIZE+1];
1781
1782 /* Back reference conditions are not supported */
1783
1784 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
1785
1786 /* The DEFINE condition is always false */
1787
1788 if (condcode == OP_DEF)
1789 {
1790 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
1791 }
1792
1793 /* The only supported version of OP_RREF is for the value RREF_ANY,
1794 which means "test if in any recursion". We can't test for specifically
1795 recursed groups. */
1796
1797 else if (condcode == OP_RREF)
1798 {
1799 int value = GET2(code, LINK_SIZE+2);
1800 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
1801 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1802 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1803 }
1804
1805 /* Otherwise, the condition is an assertion */
1806
1807 else
1808 {
1809 int rc;
1810 const uschar *asscode = code + LINK_SIZE + 1;
1811 const uschar *endasscode = asscode + GET(asscode, 1);
1812
1813 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1814
1815 rc = internal_dfa_exec(
1816 md, /* fixed match data */
1817 asscode, /* this subexpression's code */
1818 ptr, /* where we currently are */
1819 ptr - start_subject, /* start offset */
1820 local_offsets, /* offset vector */
1821 sizeof(local_offsets)/sizeof(int), /* size of same */
1822 local_workspace, /* workspace vector */
1823 sizeof(local_workspace)/sizeof(int), /* size of same */
1824 ims, /* the current ims flags */
1825 rlevel, /* function recursion level */
1826 recursing); /* pass on regex recursion */
1827
1828 if ((rc >= 0) ==
1829 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1830 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1831 else
1832 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1833 }
1834 }
1835 break;
1836
1837 /*-----------------------------------------------------------------*/
1838 case OP_RECURSE:
1839 {
1840 int local_offsets[1000];
1841 int local_workspace[1000];
1842 int rc;
1843
1844 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1845 recursing + 1));
1846
1847 rc = internal_dfa_exec(
1848 md, /* fixed match data */
1849 start_code + GET(code, 1), /* this subexpression's code */
1850 ptr, /* where we currently are */
1851 ptr - start_subject, /* start offset */
1852 local_offsets, /* offset vector */
1853 sizeof(local_offsets)/sizeof(int), /* size of same */
1854 local_workspace, /* workspace vector */
1855 sizeof(local_workspace)/sizeof(int), /* size of same */
1856 ims, /* the current ims flags */
1857 rlevel, /* function recursion level */
1858 recursing + 1); /* regex recurse level */
1859
1860 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1861 recursing + 1, rc));
1862
1863 /* Ran out of internal offsets */
1864
1865 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1866
1867 /* For each successful matched substring, set up the next state with a
1868 count of characters to skip before trying it. Note that the count is in
1869 characters, not bytes. */
1870
1871 if (rc > 0)
1872 {
1873 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1874 {
1875 const uschar *p = start_subject + local_offsets[rc];
1876 const uschar *pp = start_subject + local_offsets[rc+1];
1877 int charcount = local_offsets[rc+1] - local_offsets[rc];
1878 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1879 if (charcount > 0)
1880 {
1881 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1882 }
1883 else
1884 {
1885 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1886 }
1887 }
1888 }
1889 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1890 }
1891 break;
1892
1893 /*-----------------------------------------------------------------*/
1894 case OP_ONCE:
1895 {
1896 int local_offsets[2];
1897 int local_workspace[1000];
1898
1899 int rc = internal_dfa_exec(
1900 md, /* fixed match data */
1901 code, /* this subexpression's code */
1902 ptr, /* where we currently are */
1903 ptr - start_subject, /* start offset */
1904 local_offsets, /* offset vector */
1905 sizeof(local_offsets)/sizeof(int), /* size of same */
1906 local_workspace, /* workspace vector */
1907 sizeof(local_workspace)/sizeof(int), /* size of same */
1908 ims, /* the current ims flags */
1909 rlevel, /* function recursion level */
1910 recursing); /* pass on regex recursion */
1911
1912 if (rc >= 0)
1913 {
1914 const uschar *end_subpattern = code;
1915 int charcount = local_offsets[1] - local_offsets[0];
1916 int next_state_offset, repeat_state_offset;
1917
1918 do { end_subpattern += GET(end_subpattern, 1); }
1919 while (*end_subpattern == OP_ALT);
1920 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1921
1922 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1923 arrange for the repeat state also to be added to the relevant list.
1924 Calculate the offset, or set -1 for no repeat. */
1925
1926 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1927 *end_subpattern == OP_KETRMIN)?
1928 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1929
1930 /* If we have matched an empty string, add the next state at the
1931 current character pointer. This is important so that the duplicate
1932 checking kicks in, which is what breaks infinite loops that match an
1933 empty string. */
1934
1935 if (charcount == 0)
1936 {
1937 ADD_ACTIVE(next_state_offset, 0);
1938 }
1939
1940 /* Optimization: if there are no more active states, and there
1941 are no new states yet set up, then skip over the subject string
1942 right here, to save looping. Otherwise, set up the new state to swing
1943 into action when the end of the substring is reached. */
1944
1945 else if (i + 1 >= active_count && new_count == 0)
1946 {
1947 ptr += charcount;
1948 clen = 0;
1949 ADD_NEW(next_state_offset, 0);
1950
1951 /* If we are adding a repeat state at the new character position,
1952 we must fudge things so that it is the only current state.
1953 Otherwise, it might be a duplicate of one we processed before, and
1954 that would cause it to be skipped. */
1955
1956 if (repeat_state_offset >= 0)
1957 {
1958 next_active_state = active_states;
1959 active_count = 0;
1960 i = -1;
1961 ADD_ACTIVE(repeat_state_offset, 0);
1962 }
1963 }
1964 else
1965 {
1966 const uschar *p = start_subject + local_offsets[0];
1967 const uschar *pp = start_subject + local_offsets[1];
1968 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1969 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1970 if (repeat_state_offset >= 0)
1971 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1972 }
1973
1974 }
1975 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1976 }
1977 break;
1978
1979
1980 /* ========================================================================== */
1981 /* Handle callouts */
1982
1983 case OP_CALLOUT:
1984 if (pcre_callout != NULL)
1985 {
1986 int rrc;
1987 pcre_callout_block cb;
1988 cb.version = 1; /* Version 1 of the callout block */
1989 cb.callout_number = code[1];
1990 cb.offset_vector = offsets;
1991 cb.subject = (PCRE_SPTR)start_subject;
1992 cb.subject_length = end_subject - start_subject;
1993 cb.start_match = current_subject - start_subject;
1994 cb.current_position = ptr - start_subject;
1995 cb.pattern_position = GET(code, 2);
1996 cb.next_item_length = GET(code, 2 + LINK_SIZE);
1997 cb.capture_top = 1;
1998 cb.capture_last = -1;
1999 cb.callout_data = md->callout_data;
2000 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2001 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2002 }
2003 break;
2004
2005
2006 /* ========================================================================== */
2007 default: /* Unsupported opcode */
2008 return PCRE_ERROR_DFA_UITEM;
2009 }
2010
2011 NEXT_ACTIVE_STATE: continue;
2012
2013 } /* End of loop scanning active states */
2014
2015 /* We have finished the processing at the current subject character. If no
2016 new states have been set for the next character, we have found all the
2017 matches that we are going to find. If we are at the top level and partial
2018 matching has been requested, check for appropriate conditions. */
2019
2020 if (new_count <= 0)
2021 {
2022 if (match_count < 0 && /* No matches found */
2023 rlevel == 1 && /* Top level match function */
2024 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2025 ptr >= end_subject && /* Reached end of subject */
2026 ptr > current_subject) /* Matched non-empty string */
2027 {
2028 if (offsetcount >= 2)
2029 {
2030 offsets[0] = current_subject - start_subject;
2031 offsets[1] = end_subject - start_subject;
2032 }
2033 match_count = PCRE_ERROR_PARTIAL;
2034 }
2035
2036 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2037 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2038 rlevel*2-2, SP));
2039 break; /* In effect, "return", but see the comment below */
2040 }
2041
2042 /* One or more states are active for the next character. */
2043
2044 ptr += clen; /* Advance to next subject character */
2045 } /* Loop to move along the subject string */
2046
2047 /* Control gets here from "break" a few lines above. We do it this way because
2048 if we use "return" above, we have compiler trouble. Some compilers warn if
2049 there's nothing here because they think the function doesn't return a value. On
2050 the other hand, if we put a dummy statement here, some more clever compilers
2051 complain that it can't be reached. Sigh. */
2052
2053 return match_count;
2054 }
2055
2056
2057
2058
2059 /*************************************************
2060 * Execute a Regular Expression - DFA engine *
2061 *************************************************/
2062
2063 /* This external function applies a compiled re to a subject string using a DFA
2064 engine. This function calls the internal function multiple times if the pattern
2065 is not anchored.
2066
2067 Arguments:
2068 argument_re points to the compiled expression
2069 extra_data points to extra data or is NULL
2070 subject points to the subject string
2071 length length of subject string (may contain binary zeros)
2072 start_offset where to start in the subject string
2073 options option bits
2074 offsets vector of match offsets
2075 offsetcount size of same
2076 workspace workspace vector
2077 wscount size of same
2078
2079 Returns: > 0 => number of match offset pairs placed in offsets
2080 = 0 => offsets overflowed; longest matches are present
2081 -1 => failed to match
2082 < -1 => some kind of unexpected problem
2083 */
2084
2085 PCRE_EXP_DEFN int
2086 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2087 const char *subject, int length, int start_offset, int options, int *offsets,
2088 int offsetcount, int *workspace, int wscount)
2089 {
2090 real_pcre *re = (real_pcre *)argument_re;
2091 dfa_match_data match_block;
2092 dfa_match_data *md = &match_block;
2093 BOOL utf8, anchored, startline, firstline;
2094 const uschar *current_subject, *end_subject, *lcc;
2095
2096 pcre_study_data internal_study;
2097 const pcre_study_data *study = NULL;
2098 real_pcre internal_re;
2099
2100 const uschar *req_byte_ptr;
2101 const uschar *start_bits = NULL;
2102 BOOL first_byte_caseless = FALSE;
2103 BOOL req_byte_caseless = FALSE;
2104 int first_byte = -1;
2105 int req_byte = -1;
2106 int req_byte2 = -1;
2107 int newline;
2108
2109 /* Plausibility checks */
2110
2111 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2112 if (re == NULL || subject == NULL || workspace == NULL ||
2113 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2114 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2115 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2116
2117 /* We need to find the pointer to any study data before we test for byte
2118 flipping, so we scan the extra_data block first. This may set two fields in the
2119 match block, so we must initialize them beforehand. However, the other fields
2120 in the match block must not be set until after the byte flipping. */
2121
2122 md->tables = re->tables;
2123 md->callout_data = NULL;
2124
2125 if (extra_data != NULL)
2126 {
2127 unsigned int flags = extra_data->flags;
2128 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2129 study = (const pcre_study_data *)extra_data->study_data;
2130 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2131 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2132 return PCRE_ERROR_DFA_UMLIMIT;
2133 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2134 md->callout_data = extra_data->callout_data;
2135 if ((flags & PCRE_EXTRA_TABLES) != 0)
2136 md->tables = extra_data->tables;
2137 }
2138
2139 /* Check that the first field in the block is the magic number. If it is not,
2140 test for a regex that was compiled on a host of opposite endianness. If this is
2141 the case, flipped values are put in internal_re and internal_study if there was
2142 study data too. */
2143
2144 if (re->magic_number != MAGIC_NUMBER)
2145 {
2146 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2147 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2148 if (study != NULL) study = &internal_study;
2149 }
2150
2151 /* Set some local values */
2152
2153 current_subject = (const unsigned char *)subject + start_offset;
2154 end_subject = (const unsigned char *)subject + length;
2155 req_byte_ptr = current_subject - 1;
2156
2157 #ifdef SUPPORT_UTF8
2158 utf8 = (re->options & PCRE_UTF8) != 0;
2159 #else
2160 utf8 = FALSE;
2161 #endif
2162
2163 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2164 (re->options & PCRE_ANCHORED) != 0;
2165
2166 /* The remaining fixed data for passing around. */
2167
2168 md->start_code = (const uschar *)argument_re +
2169 re->name_table_offset + re->name_count * re->name_entry_size;
2170 md->start_subject = (const unsigned char *)subject;
2171 md->end_subject = end_subject;
2172 md->moptions = options;
2173 md->poptions = re->options;
2174
2175 /* Handle different types of newline. The three bits give eight cases. If
2176 nothing is set at run time, whatever was used at compile time applies. */
2177
2178 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2179 PCRE_NEWLINE_BITS)
2180 {
2181 case 0: newline = NEWLINE; break; /* Compile-time default */
2182 case PCRE_NEWLINE_CR: newline = '\r'; break;
2183 case PCRE_NEWLINE_LF: newline = '\n'; break;
2184 case PCRE_NEWLINE_CR+
2185 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2186 case PCRE_NEWLINE_ANY: newline = -1; break;
2187 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2188 default: return PCRE_ERROR_BADNEWLINE;
2189 }
2190
2191 if (newline == -2)
2192 {
2193 md->nltype = NLTYPE_ANYCRLF;
2194 }
2195 else if (newline < 0)
2196 {
2197 md->nltype = NLTYPE_ANY;
2198 }
2199 else
2200 {
2201 md->nltype = NLTYPE_FIXED;
2202 if (newline > 255)
2203 {
2204 md->nllen = 2;
2205 md->nl[0] = (newline >> 8) & 255;
2206 md->nl[1] = newline & 255;
2207 }
2208 else
2209 {
2210 md->nllen = 1;
2211 md->nl[0] = newline;
2212 }
2213 }
2214
2215 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2216 back the character offset. */
2217
2218 #ifdef SUPPORT_UTF8
2219 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2220 {
2221 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2222 return PCRE_ERROR_BADUTF8;
2223 if (start_offset > 0 && start_offset < length)
2224 {
2225 int tb = ((uschar *)subject)[start_offset];
2226 if (tb > 127)
2227 {
2228 tb &= 0xc0;
2229 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2230 }
2231 }
2232 }
2233 #endif
2234
2235 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2236 is a feature that makes it possible to save compiled regex and re-use them
2237 in other programs later. */
2238
2239 if (md->tables == NULL) md->tables = _pcre_default_tables;
2240
2241 /* The lower casing table and the "must be at the start of a line" flag are
2242 used in a loop when finding where to start. */
2243
2244 lcc = md->tables + lcc_offset;
2245 startline = (re->options & PCRE_STARTLINE) != 0;
2246 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2247
2248 /* Set up the first character to match, if available. The first_byte value is
2249 never set for an anchored regular expression, but the anchoring may be forced
2250 at run time, so we have to test for anchoring. The first char may be unset for
2251 an unanchored pattern, of course. If there's no first char and the pattern was
2252 studied, there may be a bitmap of possible first characters. */
2253
2254 if (!anchored)
2255 {
2256 if ((re->options & PCRE_FIRSTSET) != 0)
2257 {
2258 first_byte = re->first_byte & 255;
2259 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2260 first_byte = lcc[first_byte];
2261 }
2262 else
2263 {
2264 if (startline && study != NULL &&
2265 (study->options & PCRE_STUDY_MAPPED) != 0)
2266 start_bits = study->start_bits;
2267 }
2268 }
2269
2270 /* For anchored or unanchored matches, there may be a "last known required
2271 character" set. */
2272
2273 if ((re->options & PCRE_REQCHSET) != 0)
2274 {
2275 req_byte = re->req_byte & 255;
2276 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2277 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2278 }
2279
2280 /* Call the main matching function, looping for a non-anchored regex after a
2281 failed match. Unless restarting, optimize by moving to the first match
2282 character if possible, when not anchored. Then unless wanting a partial match,
2283 check for a required later character. */
2284
2285 for (;;)
2286 {
2287 int rc;
2288
2289 if ((options & PCRE_DFA_RESTART) == 0)
2290 {
2291 const uschar *save_end_subject = end_subject;
2292
2293 /* Advance to a unique first char if possible. If firstline is TRUE, the
2294 start of the match is constrained to the first line of a multiline string.
2295 Implement this by temporarily adjusting end_subject so that we stop
2296 scanning at a newline. If the match fails at the newline, later code breaks
2297 this loop. */
2298
2299 if (firstline)
2300 {
2301 const uschar *t = current_subject;
2302 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2303 end_subject = t;
2304 }
2305
2306 if (first_byte >= 0)
2307 {
2308 if (first_byte_caseless)
2309 while (current_subject < end_subject &&
2310 lcc[*current_subject] != first_byte)
2311 current_subject++;
2312 else
2313 while (current_subject < end_subject && *current_subject != first_byte)
2314 current_subject++;
2315 }
2316
2317 /* Or to just after a linebreak for a multiline match if possible */
2318
2319 else if (startline)
2320 {
2321 if (current_subject > md->start_subject + start_offset)
2322 {
2323 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2324 current_subject++;
2325
2326 /* If we have just passed a CR and the newline option is ANY or
2327 ANYCRLF, and we are now at a LF, advance the match position by one more
2328 character. */
2329
2330 if (current_subject[-1] == '\r' &&
2331 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2332 current_subject < end_subject &&
2333 *current_subject == '\n')
2334 current_subject++;
2335 }
2336 }
2337
2338 /* Or to a non-unique first char after study */
2339
2340 else if (start_bits != NULL)
2341 {
2342 while (current_subject < end_subject)
2343 {
2344 register unsigned int c = *current_subject;
2345 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2346 else break;
2347 }
2348 }
2349
2350 /* Restore fudged end_subject */
2351
2352 end_subject = save_end_subject;
2353 }
2354
2355 /* If req_byte is set, we know that that character must appear in the subject
2356 for the match to succeed. If the first character is set, req_byte must be
2357 later in the subject; otherwise the test starts at the match point. This
2358 optimization can save a huge amount of work in patterns with nested unlimited
2359 repeats that aren't going to match. Writing separate code for cased/caseless
2360 versions makes it go faster, as does using an autoincrement and backing off
2361 on a match.
2362
2363 HOWEVER: when the subject string is very, very long, searching to its end can
2364 take a long time, and give bad performance on quite ordinary patterns. This
2365 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2366 don't do this when the string is sufficiently long.
2367
2368 ALSO: this processing is disabled when partial matching is requested.
2369 */
2370
2371 if (req_byte >= 0 &&
2372 end_subject - current_subject < REQ_BYTE_MAX &&
2373 (options & PCRE_PARTIAL) == 0)
2374 {
2375 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2376
2377 /* We don't need to repeat the search if we haven't yet reached the
2378 place we found it at last time. */
2379
2380 if (p > req_byte_ptr)
2381 {
2382 if (req_byte_caseless)
2383 {
2384 while (p < end_subject)
2385 {
2386 register int pp = *p++;
2387 if (pp == req_byte || pp == req_byte2) { p--; break; }
2388 }
2389 }
2390 else
2391 {
2392 while (p < end_subject)
2393 {
2394 if (*p++ == req_byte) { p--; break; }
2395 }
2396 }
2397
2398 /* If we can't find the required character, break the matching loop,
2399 which will cause a return or PCRE_ERROR_NOMATCH. */
2400
2401 if (p >= end_subject) break;
2402
2403 /* If we have found the required character, save the point where we
2404 found it, so that we don't search again next time round the loop if
2405 the start hasn't passed this character yet. */
2406
2407 req_byte_ptr = p;
2408 }
2409 }
2410
2411 /* OK, now we can do the business */
2412
2413 rc = internal_dfa_exec(
2414 md, /* fixed match data */
2415 md->start_code, /* this subexpression's code */
2416 current_subject, /* where we currently are */
2417 start_offset, /* start offset in subject */
2418 offsets, /* offset vector */
2419 offsetcount, /* size of same */
2420 workspace, /* workspace vector */
2421 wscount, /* size of same */
2422 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2423 0, /* function recurse level */
2424 0); /* regex recurse level */
2425
2426 /* Anything other than "no match" means we are done, always; otherwise, carry
2427 on only if not anchored. */
2428
2429 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2430
2431 /* Advance to the next subject character unless we are at the end of a line
2432 and firstline is set. */
2433
2434 if (firstline && IS_NEWLINE(current_subject)) break;
2435 current_subject++;
2436 if (utf8)
2437 {
2438 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2439 current_subject++;
2440 }
2441 if (current_subject > end_subject) break;
2442
2443 /* If we have just passed a CR and the newline option is CRLF or ANY or
2444 ANYCRLF, and we are now at a LF, advance the match position by one more
2445 character. */
2446
2447 if (current_subject[-1] == '\r' &&
2448 (md->nltype == NLTYPE_ANY ||
2449 md->nltype == NLTYPE_ANYCRLF ||
2450 md->nllen == 2) &&
2451 current_subject < end_subject &&
2452 *current_subject == '\n')
2453 current_subject++;
2454
2455 } /* "Bumpalong" loop */
2456
2457 return PCRE_ERROR_NOMATCH;
2458 }
2459
2460 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12