/[pcre]/code/tags/pcre-6.1/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/tags/pcre-6.1/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 80 - (hide annotations) (download)
Sat Feb 24 21:40:54 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 66541 byte(s)
Tag code/trunk as code/tags/pcre-6.1.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9     Copyright (c) 1997-2005 University of Cambridge
10    
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42     alternative matching function that uses a DFA algorithm. This is NOT Perl-
43     compatible, but it has advantages in certain applications. */
44    
45    
46     #include "pcre_internal.h"
47    
48    
49     /* For use to indent debugging output */
50    
51     #define SP " "
52    
53    
54    
55     /*************************************************
56     * Code parameters and static tables *
57     *************************************************/
58    
59     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
60     into others, under special conditions. A gap of 10 between the blocks should be
61     enough. */
62    
63     #define OP_PROP_EXTRA (EXTRACT_BASIC_MAX+1)
64     #define OP_EXTUNI_EXTRA (EXTRACT_BASIC_MAX+11)
65    
66    
67     /* This table identifies those opcodes that are followed immediately by a
68     character that is to be tested in some way. This makes is possible to
69     centralize the loading of these characters. In the case of Type * etc, the
70     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
71     small value. */
72    
73     static uschar coptable[] = {
74     0, /* End */
75     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
76     0, 0, /* Any, Anybyte */
77     0, 0, 0, /* NOTPROP, PROP, EXTUNI */
78     0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
79     1, /* Char */
80     1, /* Charnc */
81     1, /* not */
82     /* Positive single-char repeats */
83     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
84     3, 3, 3, /* upto, minupto, exact */
85     /* Negative single-char repeats - only for chars < 256 */
86     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
87     3, 3, 3, /* NOT upto, minupto, exact */
88     /* Positive type repeats */
89     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
90     3, 3, 3, /* Type upto, minupto, exact */
91     /* Character class & ref repeats */
92     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
93     0, 0, /* CRRANGE, CRMINRANGE */
94     0, /* CLASS */
95     0, /* NCLASS */
96     0, /* XCLASS - variable length */
97     0, /* REF */
98     0, /* RECURSE */
99     0, /* CALLOUT */
100     0, /* Alt */
101     0, /* Ket */
102     0, /* KetRmax */
103     0, /* KetRmin */
104     0, /* Assert */
105     0, /* Assert not */
106     0, /* Assert behind */
107     0, /* Assert behind not */
108     0, /* Reverse */
109     0, /* Once */
110     0, /* COND */
111     0, /* CREF */
112     0, 0, /* BRAZERO, BRAMINZERO */
113     0, /* BRANUMBER */
114     0 /* BRA */
115     };
116    
117     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
118     and \w */
119    
120     static uschar toptable1[] = {
121     0, 0, 0, 0, 0,
122     ctype_digit, ctype_digit,
123     ctype_space, ctype_space,
124     ctype_word, ctype_word,
125     0 /* OP_ANY */
126     };
127    
128     static uschar toptable2[] = {
129     0, 0, 0, 0, 0,
130     ctype_digit, 0,
131     ctype_space, 0,
132     ctype_word, 0,
133     1 /* OP_ANY */
134     };
135    
136    
137     /* Structure for holding data about a particular state, which is in effect the
138     current data for an active path through the match tree. It must consist
139     entirely of ints because the working vector we are passed, and which we put
140     these structures in, is a vector of ints. */
141    
142     typedef struct stateblock {
143     int offset; /* Offset to opcode */
144     int count; /* Count for repeats */
145     int ims; /* ims flag bits */
146     int data; /* Some use extra data */
147     } stateblock;
148    
149     #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
150    
151    
152     #ifdef DEBUG
153     /*************************************************
154     * Print character string *
155     *************************************************/
156    
157     /* Character string printing function for debugging.
158    
159     Arguments:
160     p points to string
161     length number of bytes
162     f where to print
163    
164     Returns: nothing
165     */
166    
167     static void
168     pchars(unsigned char *p, int length, FILE *f)
169     {
170     int c;
171     while (length-- > 0)
172     {
173     if (isprint(c = *(p++)))
174     fprintf(f, "%c", c);
175     else
176     fprintf(f, "\\x%02x", c);
177     }
178     }
179     #endif
180    
181    
182    
183     /*************************************************
184     * Execute a Regular Expression - DFA engine *
185     *************************************************/
186    
187     /* This internal function applies a compiled pattern to a subject string,
188     starting at a given point, using a DFA engine. This function is called from the
189     external one, possibly multiple times if the pattern is not anchored. The
190     function calls itself recursively for some kinds of subpattern.
191    
192     Arguments:
193     md the match_data block with fixed information
194     this_start_code the opening bracket of this subexpression's code
195     current_subject where we currently are in the subject string
196     start_offset start offset in the subject string
197     offsets vector to contain the matching string offsets
198     offsetcount size of same
199     workspace vector of workspace
200     wscount size of same
201     ims the current ims flags
202     rlevel function call recursion level
203     recursing regex recursive call level
204    
205     Returns: > 0 =>
206     = 0 =>
207     -1 => failed to match
208     < -1 => some kind of unexpected problem
209    
210     The following macros are used for adding states to the two state vectors (one
211     for the current character, one for the following character). */
212    
213     #define ADD_ACTIVE(x,y) \
214     if (active_count++ < wscount) \
215     { \
216     next_active_state->offset = (x); \
217     next_active_state->count = (y); \
218     next_active_state->ims = ims; \
219     next_active_state++; \
220     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
221     } \
222     else return PCRE_ERROR_DFA_WSSIZE
223    
224     #define ADD_ACTIVE_DATA(x,y,z) \
225     if (active_count++ < wscount) \
226     { \
227     next_active_state->offset = (x); \
228     next_active_state->count = (y); \
229     next_active_state->ims = ims; \
230     next_active_state->data = (z); \
231     next_active_state++; \
232     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
233     } \
234     else return PCRE_ERROR_DFA_WSSIZE
235    
236     #define ADD_NEW(x,y) \
237     if (new_count++ < wscount) \
238     { \
239     next_new_state->offset = (x); \
240     next_new_state->count = (y); \
241     next_new_state->ims = ims; \
242     next_new_state++; \
243     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
244     } \
245     else return PCRE_ERROR_DFA_WSSIZE
246    
247     #define ADD_NEW_DATA(x,y,z) \
248     if (new_count++ < wscount) \
249     { \
250     next_new_state->offset = (x); \
251     next_new_state->count = (y); \
252     next_new_state->ims = ims; \
253     next_new_state->data = (z); \
254     next_new_state++; \
255     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
256     } \
257     else return PCRE_ERROR_DFA_WSSIZE
258    
259     /* And now, here is the code */
260    
261     static int
262     internal_dfa_exec(
263     dfa_match_data *md,
264     const uschar *this_start_code,
265     const uschar *current_subject,
266     int start_offset,
267     int *offsets,
268     int offsetcount,
269     int *workspace,
270     int wscount,
271     int ims,
272     int rlevel,
273     int recursing)
274     {
275     stateblock *active_states, *new_states, *temp_states;
276     stateblock *next_active_state, *next_new_state;
277    
278     const uschar *ctypes, *lcc, *fcc;
279     const uschar *ptr;
280     const uschar *end_code;
281    
282     int active_count, new_count, match_count;
283    
284     /* Some fields in the md block are frequently referenced, so we load them into
285     independent variables in the hope that this will perform better. */
286    
287     const uschar *start_subject = md->start_subject;
288     const uschar *end_subject = md->end_subject;
289     const uschar *start_code = md->start_code;
290    
291     BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
292    
293     rlevel++;
294     offsetcount &= (-2);
295    
296     wscount -= 2;
297     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
298     (2 * INTS_PER_STATEBLOCK);
299    
300     DPRINTF(("\n%.*s---------------------\n"
301     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
302     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
303    
304     ctypes = md->tables + ctypes_offset;
305     lcc = md->tables + lcc_offset;
306     fcc = md->tables + fcc_offset;
307    
308     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
309    
310     active_states = (stateblock *)(workspace + 2);
311     next_new_state = new_states = active_states + wscount;
312     new_count = 0;
313    
314     /* The first thing in any (sub) pattern is a bracket of some sort. Push all
315     the alternative states onto the list, and find out where the end is. This
316     makes is possible to use this function recursively, when we want to stop at a
317     matching internal ket rather than at the end.
318    
319     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
320     a backward assertion. In that case, we have to find out the maximum amount to
321     move back, and set up each alternative appropriately. */
322    
323     if (this_start_code[1+LINK_SIZE] == OP_REVERSE)
324     {
325     int max_back = 0;
326     int gone_back;
327    
328     end_code = this_start_code;
329     do
330     {
331     int back = GET(end_code, 2+LINK_SIZE);
332     if (back > max_back) max_back = back;
333     end_code += GET(end_code, 1);
334     }
335     while (*end_code == OP_ALT);
336    
337     /* If we can't go back the amount required for the longest lookbehind
338     pattern, go back as far as we can; some alternatives may still be viable. */
339    
340     #ifdef SUPPORT_UTF8
341     /* In character mode we have to step back character by character */
342    
343     if (utf8)
344     {
345     for (gone_back = 0; gone_back < max_back; gone_back++)
346     {
347     if (current_subject <= start_subject) break;
348     current_subject--;
349     while (current_subject > start_subject &&
350     (*current_subject & 0xc0) == 0x80)
351     current_subject--;
352     }
353     }
354     else
355     #endif
356    
357     /* In byte-mode we can do this quickly. */
358    
359     {
360     gone_back = (current_subject - max_back < start_subject)?
361     current_subject - start_subject : max_back;
362     current_subject -= gone_back;
363     }
364    
365     /* Now we can process the individual branches. */
366    
367     end_code = this_start_code;
368     do
369     {
370     int back = GET(end_code, 2+LINK_SIZE);
371     if (back <= gone_back)
372     {
373     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
374     ADD_NEW_DATA(-bstate, 0, gone_back - back);
375     }
376     end_code += GET(end_code, 1);
377     }
378     while (*end_code == OP_ALT);
379     }
380    
381     /* This is the code for a "normal" subpattern (not a backward assertion). The
382     start of a whole pattern is always one of these. If we are at the top level,
383     we may be asked to restart matching from the same point that we reached for a
384     previous partial match. We still have to scan through the top-level branches to
385     find the end state. */
386    
387     else
388     {
389     end_code = this_start_code;
390    
391     /* Restarting */
392    
393     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
394     {
395     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
396     new_count = workspace[1];
397     if (!workspace[0])
398     memcpy(new_states, active_states, new_count * sizeof(stateblock));
399     }
400    
401     /* Not restarting */
402    
403     else
404     {
405     do
406     {
407     ADD_NEW(end_code - start_code + 1 + LINK_SIZE, 0);
408     end_code += GET(end_code, 1);
409     }
410     while (*end_code == OP_ALT);
411     }
412     }
413    
414     workspace[0] = 0; /* Bit indicating which vector is current */
415    
416     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
417    
418     /* Loop for scanning the subject */
419    
420     ptr = current_subject;
421     for (;;)
422     {
423     int i, j;
424     int c, d, clen, dlen;
425    
426     /* Make the new state list into the active state list and empty the
427     new state list. */
428    
429     temp_states = active_states;
430     active_states = new_states;
431     new_states = temp_states;
432     active_count = new_count;
433     new_count = 0;
434    
435     workspace[0] ^= 1; /* Remember for the restarting feature */
436     workspace[1] = active_count;
437    
438     #ifdef DEBUG
439     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
440     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
441     printf("\"\n");
442    
443     printf("%.*sActive states: ", rlevel*2-2, SP);
444     for (i = 0; i < active_count; i++)
445     printf("%d/%d ", active_states[i].offset, active_states[i].count);
446     printf("\n");
447     #endif
448    
449     /* Set the pointers for adding new states */
450    
451     next_active_state = active_states + active_count;
452     next_new_state = new_states;
453    
454     /* Load the current character from the subject outside the loop, as many
455     different states may want to look at it, and we assume that at least one
456     will. */
457    
458     if (ptr < end_subject)
459     {
460     clen = 1;
461     #ifdef SUPPORT_UTF8
462     if (utf8) { GETCHARLEN(c, ptr, clen); } else
463     #endif /* SUPPORT_UTF8 */
464     c = *ptr;
465     }
466     else
467     {
468     clen = 0; /* At end subject */
469     c = -1;
470     }
471    
472     /* Scan up the active states and act on each one. The result of an action
473     may be to add more states to the currently active list (e.g. on hitting a
474     parenthesis) or it may be to put states on the new list, for considering
475     when we move the character pointer on. */
476    
477     for (i = 0; i < active_count; i++)
478     {
479     stateblock *current_state = active_states + i;
480     const uschar *code;
481     int state_offset = current_state->offset;
482     int count, codevalue;
483     int chartype, othercase;
484    
485     #ifdef DEBUG
486     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
487     if (c < 0) printf("-1\n");
488     else if (c > 32 && c < 127) printf("'%c'\n", c);
489     else printf("0x%02x\n", c);
490     #endif
491    
492     /* This variable is referred to implicity in the ADD_xxx macros. */
493    
494     ims = current_state->ims;
495    
496     /* A negative offset is a special case meaning "hold off going to this
497     (negated) state until the number of characters in the data field have
498     been skipped". */
499    
500     if (state_offset < 0)
501     {
502     if (current_state->data > 0)
503     {
504     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
505     ADD_NEW_DATA(state_offset, current_state->count,
506     current_state->data - 1);
507     continue;
508     }
509     else
510     {
511     current_state->offset = state_offset = -state_offset;
512     }
513     }
514    
515     /* Check for a duplicate state with the same count, and skip if found. */
516    
517     for (j = 0; j < i; j++)
518     {
519     if (active_states[j].offset == state_offset &&
520     active_states[j].count == current_state->count)
521     {
522     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
523     goto NEXT_ACTIVE_STATE;
524     }
525     }
526    
527     /* The state offset is the offset to the opcode */
528    
529     code = start_code + state_offset;
530     codevalue = *code;
531     if (codevalue >= OP_BRA) codevalue = OP_BRA; /* All brackets are equal */
532    
533     /* If this opcode is followed by an inline character, load it. It is
534     tempting to test for the presence of a subject character here, but that
535     is wrong, because sometimes zero repetitions of the subject are
536     permitted.
537    
538     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
539     argument that is not a data character - but is always one byte long.
540     Unfortunately, we have to take special action to deal with \P, \p, and
541     \X in this case. To keep the other cases fast, convert these ones to new
542     opcodes. */
543    
544     if (coptable[codevalue] > 0)
545     {
546     dlen = 1;
547     #ifdef SUPPORT_UTF8
548     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
549     #endif /* SUPPORT_UTF8 */
550     d = code[coptable[codevalue]];
551     if (codevalue >= OP_TYPESTAR)
552     {
553     if (d == OP_ANYBYTE) return PCRE_ERROR_DFA_UITEM;
554     if (d >= OP_NOTPROP)
555     codevalue += (d == OP_EXTUNI)? OP_EXTUNI_EXTRA : OP_PROP_EXTRA;
556     }
557     }
558     else
559     {
560     dlen = 0; /* Not strictly necessary, but compilers moan */
561     d = -1; /* if these variables are not set. */
562     }
563    
564    
565     /* Now process the individual opcodes */
566    
567     switch (codevalue)
568     {
569    
570     /* ========================================================================== */
571     /* Reached a closing bracket. If not at the end of the pattern, carry
572     on with the next opcode. Otherwise, unless we have an empty string and
573     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
574     matches so we always have the longest first. */
575    
576     case OP_KET:
577     case OP_KETRMIN:
578     case OP_KETRMAX:
579     if (code != end_code)
580     {
581     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
582     if (codevalue != OP_KET)
583     {
584     ADD_ACTIVE(state_offset - GET(code, 1), 0);
585     }
586     }
587     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
588     {
589     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
590     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
591     match_count = 0;
592     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
593     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
594     if (offsetcount >= 2)
595     {
596     offsets[0] = current_subject - start_subject;
597     offsets[1] = ptr - start_subject;
598     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
599     offsets[1] - offsets[0], current_subject));
600     }
601     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
602     {
603     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
604     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
605     match_count, rlevel*2-2, SP));
606     return match_count;
607     }
608     }
609     break;
610    
611     /* ========================================================================== */
612     /* These opcodes add to the current list of states without looking
613     at the current character. */
614    
615     /*-----------------------------------------------------------------*/
616     case OP_ALT:
617     do { code += GET(code, 1); } while (*code == OP_ALT);
618     ADD_ACTIVE(code - start_code, 0);
619     break;
620    
621     /*-----------------------------------------------------------------*/
622     case OP_BRA:
623     do
624     {
625     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
626     code += GET(code, 1);
627     }
628     while (*code == OP_ALT);
629     break;
630    
631     /*-----------------------------------------------------------------*/
632     case OP_BRAZERO:
633     case OP_BRAMINZERO:
634     ADD_ACTIVE(state_offset + 1, 0);
635     code += 1 + GET(code, 2);
636     while (*code == OP_ALT) code += GET(code, 1);
637     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
638     break;
639    
640     /*-----------------------------------------------------------------*/
641     case OP_BRANUMBER:
642     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
643     break;
644    
645     /*-----------------------------------------------------------------*/
646     case OP_CIRC:
647     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
648     ((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE))
649     { ADD_ACTIVE(state_offset + 1, 0); }
650     break;
651    
652     /*-----------------------------------------------------------------*/
653     case OP_EOD:
654     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
655     break;
656    
657     /*-----------------------------------------------------------------*/
658     case OP_OPT:
659     ims = code[1];
660     ADD_ACTIVE(state_offset + 2, 0);
661     break;
662    
663     /*-----------------------------------------------------------------*/
664     case OP_SOD:
665     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
666     break;
667    
668     /*-----------------------------------------------------------------*/
669     case OP_SOM:
670     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
671     break;
672    
673    
674     /* ========================================================================== */
675     /* These opcodes inspect the next subject character, and sometimes
676     the previous one as well, but do not have an argument. The variable
677     clen contains the length of the current character and is zero if we are
678     at the end of the subject. */
679    
680     /*-----------------------------------------------------------------*/
681     case OP_ANY:
682     if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0))
683     { ADD_NEW(state_offset + 1, 0); }
684     break;
685    
686     /*-----------------------------------------------------------------*/
687     case OP_EODN:
688     if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject))
689     { ADD_ACTIVE(state_offset + 1, 0); }
690     break;
691    
692     /*-----------------------------------------------------------------*/
693     case OP_DOLL:
694     if ((md->moptions & PCRE_NOTEOL) == 0)
695     {
696     if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject ||
697     (ims & PCRE_MULTILINE) != 0)))
698     { ADD_ACTIVE(state_offset + 1, 0); }
699     }
700     else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0)
701     { ADD_ACTIVE(state_offset + 1, 0); }
702     break;
703    
704     /*-----------------------------------------------------------------*/
705    
706     case OP_DIGIT:
707     case OP_WHITESPACE:
708     case OP_WORDCHAR:
709     if (clen > 0 && c < 256 &&
710     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
711     { ADD_NEW(state_offset + 1, 0); }
712     break;
713    
714     /*-----------------------------------------------------------------*/
715     case OP_NOT_DIGIT:
716     case OP_NOT_WHITESPACE:
717     case OP_NOT_WORDCHAR:
718     if (clen > 0 && (c >= 256 ||
719     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
720     { ADD_NEW(state_offset + 1, 0); }
721     break;
722    
723     /*-----------------------------------------------------------------*/
724     case OP_WORD_BOUNDARY:
725     case OP_NOT_WORD_BOUNDARY:
726     {
727     int left_word, right_word;
728    
729     if (ptr > start_subject)
730     {
731     const uschar *temp = ptr - 1;
732     #ifdef SUPPORT_UTF8
733     if (utf8) BACKCHAR(temp);
734     #endif
735     GETCHARTEST(d, temp);
736     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
737     }
738     else left_word = 0;
739    
740     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
741     else right_word = 0;
742    
743     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
744     { ADD_ACTIVE(state_offset + 1, 0); }
745     }
746     break;
747    
748    
749     #ifdef SUPPORT_UCP
750    
751     /*-----------------------------------------------------------------*/
752     /* Check the next character by Unicode property. We will get here only
753     if the support is in the binary; otherwise a compile-time error occurs.
754     */
755    
756     case OP_PROP:
757     case OP_NOTPROP:
758     if (clen > 0)
759     {
760     int rqdtype, category;
761     category = ucp_findchar(c, &chartype, &othercase);
762     rqdtype = code[1];
763     if (rqdtype >= 128)
764     {
765     if ((rqdtype - 128 == category) == (codevalue == OP_PROP))
766     { ADD_NEW(state_offset + 2, 0); }
767     }
768     else
769     {
770     if ((rqdtype == chartype) == (codevalue == OP_PROP))
771     { ADD_NEW(state_offset + 2, 0); }
772     }
773     }
774     break;
775     #endif
776    
777    
778    
779     /* ========================================================================== */
780     /* These opcodes likewise inspect the subject character, but have an
781     argument that is not a data character. It is one of these opcodes:
782     OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
783     OP_NOT_WORDCHAR. The value is loaded into d. */
784    
785     case OP_TYPEPLUS:
786     case OP_TYPEMINPLUS:
787     count = current_state->count; /* Already matched */
788     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
789     if (clen > 0)
790     {
791     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
792     (c < 256 &&
793     (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
794     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
795     {
796     count++;
797     ADD_NEW(state_offset, count);
798     }
799     }
800     break;
801    
802     /*-----------------------------------------------------------------*/
803     case OP_TYPEQUERY:
804     case OP_TYPEMINQUERY:
805     ADD_ACTIVE(state_offset + 2, 0);
806     if (clen > 0)
807     {
808     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
809     (c < 256 &&
810     (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
811     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
812     {
813     ADD_NEW(state_offset + 2, 0);
814     }
815     }
816     break;
817    
818     /*-----------------------------------------------------------------*/
819     case OP_TYPESTAR:
820     case OP_TYPEMINSTAR:
821     ADD_ACTIVE(state_offset + 2, 0);
822     if (clen > 0)
823     {
824     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
825     (c < 256 &&
826     (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
827     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
828     {
829     ADD_NEW(state_offset, 0);
830     }
831     }
832     break;
833    
834     /*-----------------------------------------------------------------*/
835     case OP_TYPEEXACT:
836     case OP_TYPEUPTO:
837     case OP_TYPEMINUPTO:
838     if (codevalue != OP_TYPEEXACT)
839     { ADD_ACTIVE(state_offset + 4, 0); }
840     count = current_state->count; /* Number already matched */
841     if (clen > 0)
842     {
843     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
844     (c < 256 &&
845     (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&
846     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
847     {
848     if (++count >= GET2(code, 1))
849     { ADD_NEW(state_offset + 4, 0); }
850     else
851     { ADD_NEW(state_offset, count); }
852     }
853     }
854     break;
855    
856     /* ========================================================================== */
857     /* These are virtual opcodes that are used when something like
858     OP_TYPEPLUS has OP_PROP, OP_NOTPROP, or OP_EXTUNI as its argument. It
859     keeps the code above fast for the other cases. The argument is in the
860     d variable. */
861    
862     case OP_PROP_EXTRA + OP_TYPEPLUS:
863     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
864     count = current_state->count; /* Already matched */
865     if (count > 0) { ADD_ACTIVE(state_offset + 3, 0); }
866     if (clen > 0)
867     {
868     int category = ucp_findchar(c, &chartype, &othercase);
869     int rqdtype = code[2];
870     if ((d == OP_PROP) ==
871     (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))
872     { count++; ADD_NEW(state_offset, count); }
873     }
874     break;
875    
876     /*-----------------------------------------------------------------*/
877     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
878     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
879     count = current_state->count; /* Already matched */
880     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
881     if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M)
882     {
883     const uschar *nptr = ptr + clen;
884     int ncount = 0;
885     while (nptr < end_subject)
886     {
887     int nd;
888     int ndlen = 1;
889     GETCHARLEN(nd, nptr, ndlen);
890     if (ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;
891     ncount++;
892     nptr += ndlen;
893     }
894     count++;
895     ADD_NEW_DATA(-state_offset, count, ncount);
896     }
897     break;
898    
899     /*-----------------------------------------------------------------*/
900     case OP_PROP_EXTRA + OP_TYPEQUERY:
901     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
902     count = 3;
903     goto QS1;
904    
905     case OP_PROP_EXTRA + OP_TYPESTAR:
906     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
907     count = 0;
908    
909     QS1:
910    
911     ADD_ACTIVE(state_offset + 3, 0);
912     if (clen > 0)
913     {
914     int category = ucp_findchar(c, &chartype, &othercase);
915     int rqdtype = code[2];
916     if ((d == OP_PROP) ==
917     (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))
918     { ADD_NEW(state_offset + count, 0); }
919     }
920     break;
921    
922     /*-----------------------------------------------------------------*/
923     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
924     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
925     count = 2;
926     goto QS2;
927    
928     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
929     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
930     count = 0;
931    
932     QS2:
933    
934     ADD_ACTIVE(state_offset + 2, 0);
935     if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M)
936     {
937     const uschar *nptr = ptr + clen;
938     int ncount = 0;
939     while (nptr < end_subject)
940     {
941     int nd;
942     int ndlen = 1;
943     GETCHARLEN(nd, nptr, ndlen);
944     if (ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;
945     ncount++;
946     nptr += ndlen;
947     }
948     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
949     }
950     break;
951    
952     /*-----------------------------------------------------------------*/
953     case OP_PROP_EXTRA + OP_TYPEEXACT:
954     case OP_PROP_EXTRA + OP_TYPEUPTO:
955     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
956     if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
957     { ADD_ACTIVE(state_offset + 5, 0); }
958     count = current_state->count; /* Number already matched */
959     if (clen > 0)
960     {
961     int category = ucp_findchar(c, &chartype, &othercase);
962     int rqdtype = code[4];
963     if ((d == OP_PROP) ==
964     (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))
965     {
966     if (++count >= GET2(code, 1))
967     { ADD_NEW(state_offset + 5, 0); }
968     else
969     { ADD_NEW(state_offset, count); }
970     }
971     }
972     break;
973    
974     /*-----------------------------------------------------------------*/
975     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
976     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
977     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
978     if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
979     { ADD_ACTIVE(state_offset + 4, 0); }
980     count = current_state->count; /* Number already matched */
981     if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M)
982     {
983     const uschar *nptr = ptr + clen;
984     int ncount = 0;
985     while (nptr < end_subject)
986     {
987     int nd;
988     int ndlen = 1;
989     GETCHARLEN(nd, nptr, ndlen);
990     if (ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;
991     ncount++;
992     nptr += ndlen;
993     }
994     if (++count >= GET2(code, 1))
995     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
996     else
997     { ADD_NEW_DATA(-state_offset, count, ncount); }
998     }
999     break;
1000    
1001     /* ========================================================================== */
1002     /* These opcodes are followed by a character that is usually compared
1003     to the current subject character; it is loaded into d. We still get
1004     here even if there is no subject character, because in some cases zero
1005     repetitions are permitted. */
1006    
1007     /*-----------------------------------------------------------------*/
1008     case OP_CHAR:
1009     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1010     break;
1011    
1012     /*-----------------------------------------------------------------*/
1013     case OP_CHARNC:
1014     if (clen == 0) break;
1015    
1016     #ifdef SUPPORT_UTF8
1017     if (utf8)
1018     {
1019     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1020     {
1021     if (c < 128) othercase = fcc[c]; else
1022    
1023     /* If we have Unicode property support, we can use it to test the
1024     other case of the character, if there is one. The result of
1025     ucp_findchar() is < 0 if the char isn't found, and othercase is
1026     returned as zero if there isn't another case. */
1027    
1028     #ifdef SUPPORT_UCP
1029     if (ucp_findchar(c, &chartype, &othercase) < 0)
1030     #endif
1031     othercase = -1;
1032    
1033     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1034     }
1035     }
1036     else
1037     #endif /* SUPPORT_UTF8 */
1038    
1039     /* Non-UTF-8 mode */
1040     {
1041     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1042     }
1043     break;
1044    
1045    
1046     #ifdef SUPPORT_UCP
1047     /*-----------------------------------------------------------------*/
1048     /* This is a tricky one because it can match more than one character.
1049     Find out how many characters to skip, and then set up a negative state
1050     to wait for them to pass before continuing. */
1051    
1052     case OP_EXTUNI:
1053     if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M)
1054     {
1055     const uschar *nptr = ptr + clen;
1056     int ncount = 0;
1057     while (nptr < end_subject)
1058     {
1059     int nclen = 1;
1060     GETCHARLEN(c, nptr, nclen);
1061     if (ucp_findchar(c, &chartype, &othercase) != ucp_M) break;
1062     ncount++;
1063     nptr += nclen;
1064     }
1065     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1066     }
1067     break;
1068     #endif
1069    
1070     /*-----------------------------------------------------------------*/
1071     /* Match a negated single character. This is only used for one-byte
1072     characters, that is, we know that d < 256. The character we are
1073     checking (c) can be multibyte. */
1074    
1075     case OP_NOT:
1076     if (clen > 0)
1077     {
1078     int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1079     if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1080     }
1081     break;
1082    
1083     /*-----------------------------------------------------------------*/
1084     case OP_PLUS:
1085     case OP_MINPLUS:
1086     case OP_NOTPLUS:
1087     case OP_NOTMINPLUS:
1088     count = current_state->count; /* Already matched */
1089     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1090     if (clen > 0)
1091     {
1092     int otherd = -1;
1093     if ((ims & PCRE_CASELESS) != 0)
1094     {
1095     #ifdef SUPPORT_UTF8
1096     if (utf8 && c >= 128)
1097     {
1098     #ifdef SUPPORT_UCP
1099     if (ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1;
1100     #endif /* SUPPORT_UCP */
1101     }
1102     else
1103     #endif /* SUPPORT_UTF8 */
1104     otherd = fcc[d];
1105     }
1106     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1107     { count++; ADD_NEW(state_offset, count); }
1108     }
1109     break;
1110    
1111     /*-----------------------------------------------------------------*/
1112     case OP_QUERY:
1113     case OP_MINQUERY:
1114     case OP_NOTQUERY:
1115     case OP_NOTMINQUERY:
1116     ADD_ACTIVE(state_offset + dlen + 1, 0);
1117     if (clen > 0)
1118     {
1119     int otherd = -1;
1120     if ((ims && PCRE_CASELESS) != 0)
1121     {
1122     #ifdef SUPPORT_UTF8
1123     if (utf8 && c >= 128)
1124     {
1125     #ifdef SUPPORT_UCP
1126     if (ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1;
1127     #endif /* SUPPORT_UCP */
1128     }
1129     else
1130     #endif /* SUPPORT_UTF8 */
1131     otherd = fcc[d];
1132     }
1133     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1134     { ADD_NEW(state_offset + dlen + 1, 0); }
1135     }
1136     break;
1137    
1138     /*-----------------------------------------------------------------*/
1139     case OP_STAR:
1140     case OP_MINSTAR:
1141     case OP_NOTSTAR:
1142     case OP_NOTMINSTAR:
1143     ADD_ACTIVE(state_offset + dlen + 1, 0);
1144     if (clen > 0)
1145     {
1146     int otherd = -1;
1147     if ((ims && PCRE_CASELESS) != 0)
1148     {
1149     #ifdef SUPPORT_UTF8
1150     if (utf8 && c >= 128)
1151     {
1152     #ifdef SUPPORT_UCP
1153     if (ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1;
1154     #endif /* SUPPORT_UCP */
1155     }
1156     else
1157     #endif /* SUPPORT_UTF8 */
1158     otherd = fcc[d];
1159     }
1160     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1161     { ADD_NEW(state_offset, 0); }
1162     }
1163     break;
1164    
1165     /*-----------------------------------------------------------------*/
1166     case OP_EXACT:
1167     case OP_UPTO:
1168     case OP_MINUPTO:
1169     case OP_NOTEXACT:
1170     case OP_NOTUPTO:
1171     case OP_NOTMINUPTO:
1172     if (codevalue != OP_EXACT && codevalue != OP_NOTEXACT)
1173     { ADD_ACTIVE(state_offset + dlen + 3, 0); }
1174     count = current_state->count; /* Number already matched */
1175     if (clen > 0)
1176     {
1177     int otherd = -1;
1178     if ((ims & PCRE_CASELESS) != 0)
1179     {
1180     #ifdef SUPPORT_UTF8
1181     if (utf8 && c >= 128)
1182     {
1183     #ifdef SUPPORT_UCP
1184     if (ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1;
1185     #endif /* SUPPORT_UCP */
1186     }
1187     else
1188     #endif /* SUPPORT_UTF8 */
1189     otherd = fcc[d];
1190     }
1191     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1192     {
1193     if (++count >= GET2(code, 1))
1194     { ADD_NEW(state_offset + dlen + 3, 0); }
1195     else
1196     { ADD_NEW(state_offset, count); }
1197     }
1198     }
1199     break;
1200    
1201    
1202     /* ========================================================================== */
1203     /* These are the class-handling opcodes */
1204    
1205     case OP_CLASS:
1206     case OP_NCLASS:
1207     case OP_XCLASS:
1208     {
1209     BOOL isinclass = FALSE;
1210     int next_state_offset;
1211     const uschar *ecode;
1212    
1213     /* For a simple class, there is always just a 32-byte table, and we
1214     can set isinclass from it. */
1215    
1216     if (codevalue != OP_XCLASS)
1217     {
1218     ecode = code + 33;
1219     if (clen > 0)
1220     {
1221     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1222     ((code[1 + c/8] & (1 << (c&7))) != 0);
1223     }
1224     }
1225    
1226     /* An extended class may have a table or a list of single characters,
1227     ranges, or both, and it may be positive or negative. There's a
1228     function that sorts all this out. */
1229    
1230     else
1231     {
1232     ecode = code + GET(code, 1);
1233     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1234     }
1235    
1236     /* At this point, isinclass is set for all kinds of class, and ecode
1237     points to the byte after the end of the class. If there is a
1238     quantifier, this is where it will be. */
1239    
1240     next_state_offset = ecode - start_code;
1241    
1242     switch (*ecode)
1243     {
1244     case OP_CRSTAR:
1245     case OP_CRMINSTAR:
1246     ADD_ACTIVE(next_state_offset + 1, 0);
1247     if (isinclass) { ADD_NEW(state_offset, 0); }
1248     break;
1249    
1250     case OP_CRPLUS:
1251     case OP_CRMINPLUS:
1252     count = current_state->count; /* Already matched */
1253     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1254     if (isinclass) { count++; ADD_NEW(state_offset, count); }
1255     break;
1256    
1257     case OP_CRQUERY:
1258     case OP_CRMINQUERY:
1259     ADD_ACTIVE(next_state_offset + 1, 0);
1260     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1261     break;
1262    
1263     case OP_CRRANGE:
1264     case OP_CRMINRANGE:
1265     count = current_state->count; /* Already matched */
1266     if (count >= GET2(ecode, 1))
1267     { ADD_ACTIVE(next_state_offset + 5, 0); }
1268     if (isinclass)
1269     {
1270     if (++count >= GET2(ecode, 3))
1271     { ADD_NEW(next_state_offset + 5, 0); }
1272     else
1273     { ADD_NEW(state_offset, count); }
1274     }
1275     break;
1276    
1277     default:
1278     if (isinclass) { ADD_NEW(next_state_offset, 0); }
1279     break;
1280     }
1281     }
1282     break;
1283    
1284     /* ========================================================================== */
1285     /* These are the opcodes for fancy brackets of various kinds. We have
1286     to use recursion in order to handle them. */
1287    
1288     case OP_ASSERT:
1289     case OP_ASSERT_NOT:
1290     case OP_ASSERTBACK:
1291     case OP_ASSERTBACK_NOT:
1292     {
1293     int rc;
1294     int local_offsets[2];
1295     int local_workspace[1000];
1296     const uschar *endasscode = code + GET(code, 1);
1297    
1298     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1299    
1300     rc = internal_dfa_exec(
1301     md, /* static match data */
1302     code, /* this subexpression's code */
1303     ptr, /* where we currently are */
1304     ptr - start_subject, /* start offset */
1305     local_offsets, /* offset vector */
1306     sizeof(local_offsets)/sizeof(int), /* size of same */
1307     local_workspace, /* workspace vector */
1308     sizeof(local_workspace)/sizeof(int), /* size of same */
1309     ims, /* the current ims flags */
1310     rlevel, /* function recursion level */
1311     recursing); /* pass on regex recursion */
1312    
1313     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1314     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1315     }
1316     break;
1317    
1318     /*-----------------------------------------------------------------*/
1319     case OP_COND:
1320     {
1321     int local_offsets[1000];
1322     int local_workspace[1000];
1323     int condcode = code[LINK_SIZE+1];
1324    
1325     /* The only supported version of OP_CREF is for the value 0xffff, which
1326     means "test if in a recursion". */
1327    
1328     if (condcode == OP_CREF)
1329     {
1330     int value = GET2(code, LINK_SIZE+2);
1331     if (value != 0xffff) return PCRE_ERROR_DFA_UCOND;
1332     if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1333     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1334     }
1335    
1336     /* Otherwise, the condition is an assertion */
1337    
1338     else
1339     {
1340     int rc;
1341     const uschar *asscode = code + LINK_SIZE + 1;
1342     const uschar *endasscode = asscode + GET(asscode, 1);
1343    
1344     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1345    
1346     rc = internal_dfa_exec(
1347     md, /* fixed match data */
1348     asscode, /* this subexpression's code */
1349     ptr, /* where we currently are */
1350     ptr - start_subject, /* start offset */
1351     local_offsets, /* offset vector */
1352     sizeof(local_offsets)/sizeof(int), /* size of same */
1353     local_workspace, /* workspace vector */
1354     sizeof(local_workspace)/sizeof(int), /* size of same */
1355     ims, /* the current ims flags */
1356     rlevel, /* function recursion level */
1357     recursing); /* pass on regex recursion */
1358    
1359     if ((rc >= 0) ==
1360     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1361     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1362     else
1363     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1364     }
1365     }
1366     break;
1367    
1368     /*-----------------------------------------------------------------*/
1369     case OP_RECURSE:
1370     {
1371     int local_offsets[1000];
1372     int local_workspace[1000];
1373     int rc;
1374    
1375     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1376     recursing + 1));
1377    
1378     rc = internal_dfa_exec(
1379     md, /* fixed match data */
1380     start_code + GET(code, 1), /* this subexpression's code */
1381     ptr, /* where we currently are */
1382     ptr - start_subject, /* start offset */
1383     local_offsets, /* offset vector */
1384     sizeof(local_offsets)/sizeof(int), /* size of same */
1385     local_workspace, /* workspace vector */
1386     sizeof(local_workspace)/sizeof(int), /* size of same */
1387     ims, /* the current ims flags */
1388     rlevel, /* function recursion level */
1389     recursing + 1); /* regex recurse level */
1390    
1391     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1392     recursing + 1, rc));
1393    
1394     /* Ran out of internal offsets */
1395    
1396     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1397    
1398     /* For each successful matched substring, set up the next state with a
1399     count of characters to skip before trying it. Note that the count is in
1400     characters, not bytes. */
1401    
1402     if (rc > 0)
1403     {
1404     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1405     {
1406     const uschar *p = start_subject + local_offsets[rc];
1407     const uschar *pp = start_subject + local_offsets[rc+1];
1408     int charcount = local_offsets[rc+1] - local_offsets[rc];
1409     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1410     if (charcount > 0)
1411     {
1412     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1413     }
1414     else
1415     {
1416     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1417     }
1418     }
1419     }
1420     else if (rc != PCRE_ERROR_NOMATCH) return rc;
1421     }
1422     break;
1423    
1424     /*-----------------------------------------------------------------*/
1425     case OP_ONCE:
1426     {
1427     const uschar *endcode;
1428     int local_offsets[2];
1429     int local_workspace[1000];
1430    
1431     int rc = internal_dfa_exec(
1432     md, /* fixed match data */
1433     code, /* this subexpression's code */
1434     ptr, /* where we currently are */
1435     ptr - start_subject, /* start offset */
1436     local_offsets, /* offset vector */
1437     sizeof(local_offsets)/sizeof(int), /* size of same */
1438     local_workspace, /* workspace vector */
1439     sizeof(local_workspace)/sizeof(int), /* size of same */
1440     ims, /* the current ims flags */
1441     rlevel, /* function recursion level */
1442     recursing); /* pass on regex recursion */
1443    
1444     if (rc >= 0)
1445     {
1446     const uschar *end_subpattern = code;
1447     int charcount = local_offsets[1] - local_offsets[0];
1448     int next_state_offset, repeat_state_offset;
1449     BOOL is_repeated;
1450    
1451     do { end_subpattern += GET(end_subpattern, 1); }
1452     while (*end_subpattern == OP_ALT);
1453     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1454    
1455     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1456     arrange for the repeat state also to be added to the relevant list.
1457     Calculate the offset, or set -1 for no repeat. */
1458    
1459     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1460     *end_subpattern == OP_KETRMIN)?
1461     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1462    
1463     /* If we have matched an empty string, add the next state at the
1464     current character pointer. This is important so that the duplicate
1465     checking kicks in, which is what breaks infinite loops that match an
1466     empty string. */
1467    
1468     if (charcount == 0)
1469     {
1470     ADD_ACTIVE(next_state_offset, 0);
1471     }
1472    
1473     /* Optimization: if there are no more active states, and there
1474     are no new states yet set up, then skip over the subject string
1475     right here, to save looping. Otherwise, set up the new state to swing
1476     into action when the end of the substring is reached. */
1477    
1478     else if (i + 1 >= active_count && new_count == 0)
1479     {
1480     ptr += charcount;
1481     clen = 0;
1482     ADD_NEW(next_state_offset, 0);
1483    
1484     /* If we are adding a repeat state at the new character position,
1485     we must fudge things so that it is the only current state.
1486     Otherwise, it might be a duplicate of one we processed before, and
1487     that would cause it to be skipped. */
1488    
1489     if (repeat_state_offset >= 0)
1490     {
1491     next_active_state = active_states;
1492     active_count = 0;
1493     i = -1;
1494     ADD_ACTIVE(repeat_state_offset, 0);
1495     }
1496     }
1497     else
1498     {
1499     const uschar *p = start_subject + local_offsets[0];
1500     const uschar *pp = start_subject + local_offsets[1];
1501     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1502     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1503     if (repeat_state_offset >= 0)
1504     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1505     }
1506    
1507     }
1508     else if (rc != PCRE_ERROR_NOMATCH) return rc;
1509     }
1510     break;
1511    
1512    
1513     /* ========================================================================== */
1514     /* Handle callouts */
1515    
1516     case OP_CALLOUT:
1517     if (pcre_callout != NULL)
1518     {
1519     int rrc;
1520     pcre_callout_block cb;
1521     cb.version = 1; /* Version 1 of the callout block */
1522     cb.callout_number = code[1];
1523     cb.offset_vector = offsets;
1524     cb.subject = (char *)start_subject;
1525     cb.subject_length = end_subject - start_subject;
1526     cb.start_match = current_subject - start_subject;
1527     cb.current_position = ptr - start_subject;
1528     cb.pattern_position = GET(code, 2);
1529     cb.next_item_length = GET(code, 2 + LINK_SIZE);
1530     cb.capture_top = 1;
1531     cb.capture_last = -1;
1532     cb.callout_data = md->callout_data;
1533     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
1534     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
1535     }
1536     break;
1537    
1538    
1539     /* ========================================================================== */
1540     default: /* Unsupported opcode */
1541     return PCRE_ERROR_DFA_UITEM;
1542     }
1543    
1544     NEXT_ACTIVE_STATE: continue;
1545    
1546     } /* End of loop scanning active states */
1547    
1548     /* We have finished the processing at the current subject character. If no
1549     new states have been set for the next character, we have found all the
1550     matches that we are going to find. If we are at the top level and partial
1551     matching has been requested, check for appropriate conditions. */
1552    
1553     if (new_count <= 0)
1554     {
1555     if (match_count < 0 && /* No matches found */
1556     rlevel == 1 && /* Top level match function */
1557     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
1558     ptr >= end_subject && /* Reached end of subject */
1559     ptr > current_subject) /* Matched non-empty string */
1560     {
1561     if (offsetcount >= 2)
1562     {
1563     offsets[0] = current_subject - start_subject;
1564     offsets[1] = end_subject - start_subject;
1565     }
1566     match_count = PCRE_ERROR_PARTIAL;
1567     }
1568    
1569     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
1570     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
1571     rlevel*2-2, SP));
1572     return match_count;
1573     }
1574    
1575     /* One or more states are active for the next character. */
1576    
1577     ptr += clen; /* Advance to next subject character */
1578     } /* Loop to move along the subject string */
1579    
1580     /* Control never gets here, but we must keep the compiler happy. */
1581    
1582     DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n"
1583     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP));
1584     return PCRE_ERROR_NOMATCH;
1585     }
1586    
1587    
1588    
1589    
1590     /*************************************************
1591     * Execute a Regular Expression - DFA engine *
1592     *************************************************/
1593    
1594     /* This external function applies a compiled re to a subject string using a DFA
1595     engine. This function calls the internal function multiple times if the pattern
1596     is not anchored.
1597    
1598     Arguments:
1599     argument_re points to the compiled expression
1600     extra_data points to extra data or is NULL (not currently used)
1601     subject points to the subject string
1602     length length of subject string (may contain binary zeros)
1603     start_offset where to start in the subject string
1604     options option bits
1605     offsets vector of match offsets
1606     offsetcount size of same
1607     workspace workspace vector
1608     wscount size of same
1609    
1610     Returns: > 0 => number of match offset pairs placed in offsets
1611     = 0 => offsets overflowed; longest matches are present
1612     -1 => failed to match
1613     < -1 => some kind of unexpected problem
1614     */
1615    
1616     EXPORT int
1617     pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
1618     const char *subject, int length, int start_offset, int options, int *offsets,
1619     int offsetcount, int *workspace, int wscount)
1620     {
1621     real_pcre *re = (real_pcre *)argument_re;
1622     dfa_match_data match_block;
1623     BOOL utf8, anchored, startline, firstline;
1624     const uschar *current_subject, *end_subject, *lcc;
1625    
1626     pcre_study_data internal_study;
1627     const pcre_study_data *study = NULL;
1628     real_pcre internal_re;
1629    
1630     const uschar *req_byte_ptr;
1631     const uschar *start_bits = NULL;
1632     BOOL first_byte_caseless = FALSE;
1633     BOOL req_byte_caseless = FALSE;
1634     int first_byte = -1;
1635     int req_byte = -1;
1636     int req_byte2 = -1;
1637    
1638     /* Plausibility checks */
1639    
1640     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
1641     if (re == NULL || subject == NULL || workspace == NULL ||
1642     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
1643     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
1644     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
1645    
1646     /* We need to find the pointer to any study data before we test for byte
1647     flipping, so we scan the extra_data block first. This may set two fields in the
1648     match block, so we must initialize them beforehand. However, the other fields
1649     in the match block must not be set until after the byte flipping. */
1650    
1651     match_block.tables = re->tables;
1652     match_block.callout_data = NULL;
1653    
1654     if (extra_data != NULL)
1655     {
1656     unsigned int flags = extra_data->flags;
1657     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
1658     study = (const pcre_study_data *)extra_data->study_data;
1659     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
1660     if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
1661     match_block.callout_data = extra_data->callout_data;
1662     if ((flags & PCRE_EXTRA_TABLES) != 0)
1663     match_block.tables = extra_data->tables;
1664     }
1665    
1666     /* Check that the first field in the block is the magic number. If it is not,
1667     test for a regex that was compiled on a host of opposite endianness. If this is
1668     the case, flipped values are put in internal_re and internal_study if there was
1669     study data too. */
1670    
1671     if (re->magic_number != MAGIC_NUMBER)
1672     {
1673     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
1674     if (re == NULL) return PCRE_ERROR_BADMAGIC;
1675     if (study != NULL) study = &internal_study;
1676     }
1677    
1678     /* Set some local values */
1679    
1680     current_subject = (const unsigned char *)subject + start_offset;
1681     end_subject = (const unsigned char *)subject + length;
1682     req_byte_ptr = current_subject - 1;
1683    
1684     utf8 = (re->options & PCRE_UTF8) != 0;
1685     anchored = (options & PCRE_ANCHORED) != 0 || (re->options & PCRE_ANCHORED) != 0;
1686    
1687     /* The remaining fixed data for passing around. */
1688    
1689     match_block.start_code = (const uschar *)argument_re +
1690     re->name_table_offset + re->name_count * re->name_entry_size;
1691     match_block.start_subject = (const unsigned char *)subject;
1692     match_block.end_subject = end_subject;
1693     match_block.moptions = options;
1694     match_block.poptions = re->options;
1695    
1696     /* Check a UTF-8 string if required. Unfortunately there's no way of passing
1697     back the character offset. */
1698    
1699     #ifdef SUPPORT_UTF8
1700     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
1701     {
1702     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
1703     return PCRE_ERROR_BADUTF8;
1704     if (start_offset > 0 && start_offset < length)
1705     {
1706     int tb = ((uschar *)subject)[start_offset];
1707     if (tb > 127)
1708     {
1709     tb &= 0xc0;
1710     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
1711     }
1712     }
1713     }
1714     #endif
1715    
1716     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
1717     is a feature that makes it possible to save compiled regex and re-use them
1718     in other programs later. */
1719    
1720     if (match_block.tables == NULL) match_block.tables = _pcre_default_tables;
1721    
1722     /* The lower casing table and the "must be at the start of a line" flag are
1723     used in a loop when finding where to start. */
1724    
1725     lcc = match_block.tables + lcc_offset;
1726     startline = (re->options & PCRE_STARTLINE) != 0;
1727     firstline = (re->options & PCRE_FIRSTLINE) != 0;
1728    
1729     /* Set up the first character to match, if available. The first_byte value is
1730     never set for an anchored regular expression, but the anchoring may be forced
1731     at run time, so we have to test for anchoring. The first char may be unset for
1732     an unanchored pattern, of course. If there's no first char and the pattern was
1733     studied, there may be a bitmap of possible first characters. */
1734    
1735     if (!anchored)
1736     {
1737     if ((re->options & PCRE_FIRSTSET) != 0)
1738     {
1739     first_byte = re->first_byte & 255;
1740     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
1741     first_byte = lcc[first_byte];
1742     }
1743     else
1744     {
1745     if (startline && study != NULL &&
1746     (study->options & PCRE_STUDY_MAPPED) != 0)
1747     start_bits = study->start_bits;
1748     }
1749     }
1750    
1751     /* For anchored or unanchored matches, there may be a "last known required
1752     character" set. */
1753    
1754     if ((re->options & PCRE_REQCHSET) != 0)
1755     {
1756     req_byte = re->req_byte & 255;
1757     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
1758     req_byte2 = (match_block.tables + fcc_offset)[req_byte]; /* case flipped */
1759     }
1760    
1761     /* Call the main matching function, looping for a non-anchored regex after a
1762     failed match. Unless restarting, optimize by moving to the first match
1763     character if possible, when not anchored. Then unless wanting a partial match,
1764     check for a required later character. */
1765    
1766     for (;;)
1767     {
1768     int rc;
1769    
1770     if ((options & PCRE_DFA_RESTART) == 0)
1771     {
1772     const uschar *save_end_subject = end_subject;
1773    
1774     /* Advance to a unique first char if possible. If firstline is TRUE, the
1775     start of the match is constrained to the first line of a multiline string.
1776     Implement this by temporarily adjusting end_subject so that we stop scanning
1777     at a newline. If the match fails at the newline, later code breaks this loop.
1778     */
1779    
1780     if (firstline)
1781     {
1782     const uschar *t = current_subject;
1783     while (t < save_end_subject && *t != '\n') t++;
1784     end_subject = t;
1785     }
1786    
1787     if (first_byte >= 0)
1788     {
1789     if (first_byte_caseless)
1790     while (current_subject < end_subject &&
1791     lcc[*current_subject] != first_byte)
1792     current_subject++;
1793     else
1794     while (current_subject < end_subject && *current_subject != first_byte)
1795     current_subject++;
1796     }
1797    
1798     /* Or to just after \n for a multiline match if possible */
1799    
1800     else if (startline)
1801     {
1802     if (current_subject > match_block.start_subject + start_offset)
1803     {
1804     while (current_subject < end_subject && current_subject[-1] != NEWLINE)
1805     current_subject++;
1806     }
1807     }
1808    
1809     /* Or to a non-unique first char after study */
1810    
1811     else if (start_bits != NULL)
1812     {
1813     while (current_subject < end_subject)
1814     {
1815     register unsigned int c = *current_subject;
1816     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
1817     else break;
1818     }
1819     }
1820    
1821     /* Restore fudged end_subject */
1822    
1823     end_subject = save_end_subject;
1824     }
1825    
1826     /* If req_byte is set, we know that that character must appear in the subject
1827     for the match to succeed. If the first character is set, req_byte must be
1828     later in the subject; otherwise the test starts at the match point. This
1829     optimization can save a huge amount of work in patterns with nested unlimited
1830     repeats that aren't going to match. Writing separate code for cased/caseless
1831     versions makes it go faster, as does using an autoincrement and backing off
1832     on a match.
1833    
1834     HOWEVER: when the subject string is very, very long, searching to its end can
1835     take a long time, and give bad performance on quite ordinary patterns. This
1836     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
1837     don't do this when the string is sufficiently long.
1838    
1839     ALSO: this processing is disabled when partial matching is requested.
1840     */
1841    
1842     if (req_byte >= 0 &&
1843     end_subject - current_subject < REQ_BYTE_MAX &&
1844     (options & PCRE_PARTIAL) == 0)
1845     {
1846     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
1847    
1848     /* We don't need to repeat the search if we haven't yet reached the
1849     place we found it at last time. */
1850    
1851     if (p > req_byte_ptr)
1852     {
1853     if (req_byte_caseless)
1854     {
1855     while (p < end_subject)
1856     {
1857     register int pp = *p++;
1858     if (pp == req_byte || pp == req_byte2) { p--; break; }
1859     }
1860     }
1861     else
1862     {
1863     while (p < end_subject)
1864     {
1865     if (*p++ == req_byte) { p--; break; }
1866     }
1867     }
1868    
1869     /* If we can't find the required character, break the matching loop,
1870     which will cause a return or PCRE_ERROR_NOMATCH. */
1871    
1872     if (p >= end_subject) break;
1873    
1874     /* If we have found the required character, save the point where we
1875     found it, so that we don't search again next time round the loop if
1876     the start hasn't passed this character yet. */
1877    
1878     req_byte_ptr = p;
1879     }
1880     }
1881    
1882     /* OK, now we can do the business */
1883    
1884     rc = internal_dfa_exec(
1885     &match_block, /* fixed match data */
1886     match_block.start_code, /* this subexpression's code */
1887     current_subject, /* where we currently are */
1888     start_offset, /* start offset in subject */
1889     offsets, /* offset vector */
1890     offsetcount, /* size of same */
1891     workspace, /* workspace vector */
1892     wscount, /* size of same */
1893     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
1894     0, /* function recurse level */
1895     0); /* regex recurse level */
1896    
1897     /* Anything other than "no match" means we are done, always; otherwise, carry
1898     on only if not anchored. */
1899    
1900     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
1901    
1902     /* Advance to the next subject character unless we are at the end of a line
1903     and firstline is set. */
1904    
1905     if (firstline && *current_subject == NEWLINE) break;
1906     current_subject++;
1907    
1908     #ifdef SUPPORT_UTF8
1909     if (utf8)
1910     {
1911     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
1912     current_subject++;
1913     }
1914     #endif
1915    
1916     if (current_subject > end_subject) break;
1917     }
1918    
1919     return PCRE_ERROR_NOMATCH;
1920     }
1921    
1922     /* End of pcre_dfa_exec.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12